In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from autogluon.tabular import TabularPredictor
from skfp.preprocessing import *
from skfp.fingerprints import MordredFingerprint, ECFPFingerprint, PharmacophoreFingerprint, PhysiochemicalPropertiesFingerprint

In [46]:
# First cell of the single-entry file named after the acid; kept as a plain string.
# NOTE(review): presumably the parent compound's SMILES - it is not used in the visible cells.
origin = pd.read_csv(
    "../data/derivatives/3-(5-Nitro-2-furyl)acrylic acid.txt",
    header=None,
    sep="\t",
    dtype="object",
).values[0][0]

# Candidate derivatives: headerless, tab-separated text read as strings.
# The single column is named "smiles" and "No." is a 1-based running number.
derivatives = pd.read_csv(
    "../data/derivatives/Combination products without salts.txt",
    header=None,
    sep="\t",
    dtype="object",
).set_axis(["smiles"], axis=1)
derivatives["No."] = range(1, len(derivatives) + 1)

# Training set: keep the structure and the class column, renamed to "label"
# and encoded as low -> 0 / high -> 1. Any other class value maps to NaN.
label_dict = {"low": 0, "high": 1}
data = (
    pd.read_csv("../data/entry_dataset/merged_cleaned_dataset.csv")
    .loc[:, ["smiles", "Accum_class"]]
    .rename(columns={"Accum_class": "label"})
    .assign(label=lambda frame: frame["label"].map(label_dict))
)

In [48]:
# Display the derivatives table: one SMILES per row plus its 1-based running number "No.".
derivatives

Unnamed: 0,smiles,No.
0,CCNC(/C=C/C1=CC=C(O1)[N+]([O-])=O)=O,1
1,CN(C(/C=C/C1=CC=C(O1)[N+]([O-])=O)=O)N,2
2,CNNC(/C=C/C1=CC=C(O1)[N+]([O-])=O)=O,3
3,O=C(NC1CC1)/C=C/C2=CC=C(O2)[N+]([O-])=O,4
4,CC(NC(/C=C/C1=CC=C(O1)[N+]([O-])=O)=O)C,5
...,...,...
187,O=C(N1CCN(c(ccc2)c3c2scc3)CC1)/C=C/C4=CC=C(O4)[N+]([O-])=O,188
188,N(C(/C=C/C1=CC=C(O1)[N+]([O-])=O)=O)Cc2ccccc2n3c(C)ncc3,189
189,O=C(N(Cc1ccncc1)CC2OCCC2)/C=C/C3=CC=C(O3)[N+]([O-])=O,190
190,[O-][N+](C(O1)=CC=C1/C=C/C(NC(C)c2ccc(N3CCCCC3)cc2)=O)=O,191


In [49]:
def smiles2smiles(smiles):
    """Round-trip a SMILES string through RDKit and return the re-serialised SMILES.

    Parameters
    ----------
    smiles : str
        Input SMILES. Any non-string value (e.g. ``None`` or a pandas NaN)
        is treated as unparsable.

    Returns
    -------
    str or None
        The SMILES written by ``Chem.MolToSmiles`` for the parsed molecule,
        or ``None`` when the input is not a string, RDKit cannot parse it,
        or RDKit raises while converting it.
    """
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        # RDKit signals a parse failure by returning None rather than raising.
        if mol is None:
            return None
        return str(Chem.MolToSmiles(mol))
    except Exception:
        # Best-effort cleaning: RDKit errors mean "unparsable". Unlike a bare
        # ``except:``, KeyboardInterrupt and SystemExit still propagate.
        return None

In [50]:
# Clean smiles
#
# Every SMILES is passed through smiles2smiles; entries it cannot process come
# back as None and are dropped together with any other row containing a
# missing value.

data["smiles"] = data["smiles"].map(smiles2smiles)
# Duplicates are judged on the full (smiles, label) pair, so one structure that
# carries two different labels is kept twice.
data = data.dropna().drop_duplicates().reset_index(drop=True)

derivatives["smiles"] = derivatives["smiles"].map(smiles2smiles)
# "No." is unique per row, so de-duplicating on all columns would never remove
# anything. Compare on the structure only and keep the first occurrence.
derivatives = (
    derivatives
    .dropna()
    .drop_duplicates(subset="smiles")
    .reset_index(drop=True)
)

[16:48:24] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7
[16:48:24] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7
[16:48:24] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8
[16:48:24] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7 8 9 10 11
[16:48:24] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 11 12 13
[16:48:24] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19


In [52]:
# Fingerprint generators, keyed by the name later used as the prediction column.
# NOTE(review): the positional 1024 is presumably the fingerprint size (the disabled
# Pharmacophore entry spelled it fp_size=1024) - confirm against the skfp signatures.
# NOTE(review): Mordred runs with use_3D=True while later cells pass plain SMILES;
# confirm that 3D descriptors are meaningful without generated conformers.
_shared_kwargs = {"n_jobs": 32}
fp_gen = dict(
    Mordred=MordredFingerprint(use_3D=True, **_shared_kwargs),
    ECFP=ECFPFingerprint(1024, **_shared_kwargs),
    # Disabled: "Pharmacophore": PharmacophoreFingerprint(fp_size=1024, n_jobs=32),
    Physiochemical=PhysiochemicalPropertiesFingerprint(1024, **_shared_kwargs),
)

In [53]:
# Featurise both molecule sets with every configured generator.
# Results are stored per fingerprint name: training set in data_fp_dict,
# candidates in derivatives_fp_dict.
data_fp_dict, derivatives_fp_dict = {}, {}
for fp_name, gen in fp_gen.items():
    train_features = gen.transform(data["smiles"])
    data_fp_dict[fp_name] = train_features

    candidate_features = gen.transform(derivatives["smiles"])
    derivatives_fp_dict[fp_name] = candidate_features

    # Each molecule must yield exactly one feature row, otherwise the later
    # label and prediction assignments would not line up. NaN checks are
    # deliberately not enforced here.
    n_train_rows = train_features.shape[0]
    n_candidate_rows = candidate_features.shape[0]
    assert n_train_rows == len(data)
    assert n_candidate_rows == len(derivatives)

In [54]:
# Fit one AutoGluon model per fingerprint type and score the derivatives with it.
for fp_type in fp_gen:
    print(fp_type)

    # Training table: fingerprint columns plus the binary target.
    train_df = pd.DataFrame(data_fp_dict[fp_type])
    train_df["label"] = data["label"]

    # Report how many cells are missing in the training table.
    missing_cells = train_df.isna().sum().sum()
    print(missing_cells)

    predictor = TabularPredictor(
        label="label",
        eval_metric="average_precision",
        verbosity=0,
    ).fit(train_df)

    # Class probabilities for the candidates. Column 1 is the class encoded
    # as 1 ("high" in label_dict).
    pred_df = pd.DataFrame(derivatives_fp_dict[fp_type])
    preds = predictor.predict_proba(pred_df)

    n_missing_preds = preds.isna().sum().sum()
    assert n_missing_preds == 0
    derivatives[fp_type] = preds[1]

Mordred
47389


		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'


ECFP
0


		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'


Physiochemical
0


		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'


In [55]:
# Consensus score: unweighted mean of the per-fingerprint probabilities,
# then rank the candidates from highest to lowest.
score_columns = list(fp_gen)
derivatives["total"] = derivatives[score_columns].mean(axis=1)
derivatives = derivatives.sort_values(by="total", ascending=False)

In [57]:
# Write the ranked table to the working directory, without the DataFrame index.
derivatives.to_csv(path_or_buf="derivatives_predictions.csv", index=False)