In [1]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
import rdkit

# Reference: RDKit blog tutorial on the fingerprint generator API:
# https://greglandrum.github.io/rdkit-blog/posts/2023-01-18-fingerprint-generator-tutorial.html

In [2]:
# Parse the example molecule from its SMILES string.
m = Chem.MolFromSmiles('CC(C)(C)P(N[Si](C)(C)C)C(C)(C)C')

# Morgan fingerprint generator (radius=2, fpSize=2048); later cells reuse it.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(
    radius=2,
    fpSize=2048,
)

# Bit-vector fingerprint of the example molecule; the bare name displays it.
fp = mfpgen.GetFingerprint(m)
fp

<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x13673cacba0>

In [3]:
import numpy as np

# Two further representations of the same molecule from the same generator.
np_bits = mfpgen.GetFingerprintAsNumPy(m)
sparse = mfpgen.GetSparseFingerprint(m)

# Length of the NumPy fingerprint, then the number of set bits in the sparse one.
print(len(np_bits))
print(sparse.GetNumOnBits())

# Cell result: the identifiers of the bits that are set in the sparse fingerprint.
[bit_id for bit_id in sparse.GetOnBits()]

2048
15


[-2049689486,
 -2048238559,
 -905989766,
 -757843576,
 -634266200,
 -403210415,
 -160313689,
 -100366592,
 -15168503,
 -6712373,
 2782958,
 392737424,
 847961216,
 984473645,
 1798518767]

In [4]:
import pandas as pd

# Join the DFT targets onto the Kraken SMILES strings. pd.merge defaults to an
# inner join, so only molecule_ids present in both files are kept.
kraken_df = pd.read_csv('kraken_data/ml_8_210.csv')[['molecule_id', 'smiles']]
dft_df = pd.read_csv('dft_data/grand_lambda_max_data.csv')
fp_df = pd.merge(dft_df, kraken_df, on='molecule_id')

# Chem.MolFromSmiles returns None (it does not raise) for SMILES it cannot
# parse. Fail here and name the offending ids, rather than passing None into
# the fingerprint generator and getting an opaque error from there.
fp_df['Mol'] = fp_df['smiles'].apply(Chem.MolFromSmiles)
unparsed_ids = fp_df.loc[fp_df['Mol'].isna(), 'molecule_id'].tolist()
if unparsed_ids:
    raise ValueError(
        f'RDKit could not parse the SMILES for molecule_id(s): {unparsed_ids}'
    )

# One NumPy fingerprint array per molecule, stored as an object column.
fp_df['morgan_fp'] = fp_df['Mol'].apply(mfpgen.GetFingerprintAsNumPy)
print(fp_df.shape)
fp_df.head()

(70, 6)


Unnamed: 0,molecule_id,lambda_max,shift,smiles,Mol,morgan_fp
0,8,395.4,0.0,CC(C)(C)P(C(C)(C)C)C(C)(C)C,<rdkit.Chem.rdchem.Mol object at 0x0000013673C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,12,421.5,26.1,CC(C)(C)CP(C(C)(C)C)C(C)(C)C,<rdkit.Chem.rdchem.Mol object at 0x00000136748...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1136,342.5,-52.9,COc1ccccc1P(C(C)(C)C)C(C)(C)C,<rdkit.Chem.rdchem.Mol object at 0x00000136748...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,1648,378.0,-17.4,CC(C)(C)P(c1cccc2ccccc12)C(C)(C)C,<rdkit.Chem.rdchem.Mol object at 0x00000136748...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1656,414.7,19.3,Cc1cc(C)c(P(C(C)(C)C)C(C)(C)C)c(C)c1,<rdkit.Chem.rdchem.Mol object at 0x00000136748...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [5]:
# One row per molecule, one column per fingerprint bit. Column labels are left
# to pandas' default 0..n-1 range, so the width follows the fingerprint length
# instead of repeating the generator's fpSize as a hard-coded number.
X = pd.DataFrame(fp_df['morgan_fp'].to_list())

# Select the regression target by name rather than by position, so a changed
# column order in the input CSV cannot silently swap the target.
target = 'lambda_max'
y = fp_df[target].values
print(X.shape)
print(y.shape)

(70, 2048)
(70,)


In [24]:
from sklearn.model_selection import LeaveOneOut

def loocv(X, y, model, verbose=False):
    """Return leave-one-out cross-validated predictions of ``model``.

    For every row of ``X``, ``model`` is refit on all remaining rows and then
    asked to predict the held-out row.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix with one row per sample; rows are selected
        positionally through ``.iloc``.
    y : array-like
        Target values aligned positionally with the rows of ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refit in place
        on every fold, so after the call it holds the fit from the last fold.
    verbose : bool, default False
        When true, print the held-out target and its prediction for each fold.

    Returns
    -------
    list
        One prediction per fold, in the order the folds are produced.
    """
    # Coerce to an array so the positional fold indices never get interpreted
    # as labels of a pandas index on ``y``.
    y = np.asarray(y)
    n_samples = len(X)

    y_preds = []
    for train_indices, test_index in LeaveOneOut().split(X):
        model.fit(X.iloc[train_indices], y[train_indices])
        # predict() returns one value per test row; each fold holds out one row.
        y_pred = model.predict(X.iloc[test_index])[0]
        y_preds.append(y_pred)
        if verbose:
            held_out = test_index[0]
            print(
                f'fold {held_out + 1}/{n_samples}: '
                f'y_true={y[held_out]}, y_pred={y_pred}'
            )
    return y_preds

In [25]:
from sklearn.linear_model import LinearRegression

# Leave-one-out predictions from a LinearRegression model fit on the fingerprint columns.
y_preds = loocv(X, y, model=LinearRegression(), verbose=True)

In [29]:
# Pair each observed target with its leave-one-out prediction and write the
# table to disk (the default integer index is written as the first column).
preds_df = pd.DataFrame({'y': y, 'y_preds': y_preds})
preds_df.to_csv('preds_data/ecfp_preds.csv')