In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.model_selection import train_test_split



In [2]:
# Notebook-wide settings.
# `path` is the directory prefix (trailing slash included) that every
# read/write below is formatted against; `seed` is the fixed random seed.
seed = 0
path = "data_corona/"

In [3]:
# Read the training SMILES and the matching activity labels.
x = pd.read_csv("{}smiles_train.csv".format(path))
y = pd.read_csv("{}y_train.csv".format(path))

# Join both tables side by side. ignore_index=True relabels the resulting
# columns 0, 1, ... so they can be given fixed names in the same expression.
z = pd.concat([x, y], axis=1, ignore_index=True).rename(
    columns={0: "Smiles", 1: "Activity"}
)

# Number of training molecules.
num_mol = x.shape[0]

# Train Data

In [5]:
def get_Descriptors(df, desc_names):
    """Compute RDKit molecular descriptors for every entry of ``df["Smiles"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "Smiles" column. Rows are processed in their
        positional order, so any index (not only 0..n-1) is supported.
    desc_names : list of str
        Descriptor names handed to ``MolecularDescriptorCalculator``; they
        are also used as the column labels of the result.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same order, same index) and one column
        per descriptor. Rows whose SMILES is not a string or cannot be
        parsed are filled with NaN and reported on stdout.
    """
    smiles = df["Smiles"].tolist()
    num_mol = len(smiles)
    mol_desc_calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_names)
    desc_mat = np.zeros((num_mol, len(desc_names)))
    # Report progress about every 5% of the rows. Clamp to 1 so that inputs
    # with fewer than 20 rows do not end up with a modulo-by-zero below.
    verbosity = max(1, int(num_mol * 0.05))
    # Iterate by position instead of df["Smiles"][i]: the latter is a label
    # lookup and fails (or reads the wrong row) on a non-default index.
    for i, smile in enumerate(smiles):
        if i % verbosity == 0:
            print("{} out of {} = {}%".format(i, num_mol, int(i*100/num_mol)))
        # Missing values (NaN/None) are not valid parser input; treat them
        # like an unparsable SMILES instead of letting the parser raise.
        mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
        if mol is None:
            print("Invalid Molecule at index {}, {}".format(i, smile))
            desc_mat[i] = np.nan
        else:
            desc_mat[i] = mol_desc_calc.CalcDescriptors(mol)
    # Carry the input index so a column-wise concat with `df` stays aligned.
    return pd.DataFrame(desc_mat, columns=desc_names, index=df.index)

In [7]:
def get_MorganFP(df, fp_len, radius=2):
    """Compute Morgan fingerprint bit vectors for every entry of ``df["Smiles"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "Smiles" column. Rows are processed in their
        positional order, so any index (not only 0..n-1) is supported.
    fp_len : int
        Number of bits per fingerprint, i.e. the number of output columns.
    radius : int, optional
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``
        (default 2).

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same order, same index) with ``fp_len``
        columns labelled 0..fp_len-1. Rows whose SMILES is not a string or
        cannot be parsed are filled with NaN and reported on stdout.
    """
    smiles = df["Smiles"].tolist()
    num_mol = len(smiles)
    fp_mat = np.zeros((num_mol, fp_len))
    # Report progress about every 5% of the rows. Clamp to 1 so that inputs
    # with fewer than 20 rows do not end up with a modulo-by-zero below.
    verbosity = max(1, int(num_mol * 0.05))
    # Iterate by position instead of df["Smiles"][i]: the latter is a label
    # lookup and fails (or reads the wrong row) on a non-default index.
    for i, smile in enumerate(smiles):
        if i % verbosity == 0:
            print("{} out of {} = {}%".format(i, num_mol, int(i*100/num_mol)))
        # Missing values (NaN/None) are not valid parser input; treat them
        # like an unparsable SMILES instead of letting the parser raise.
        mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
        if mol is None:
            print("Invalid Molecule at index {}, {}".format(i, smile))
            fp_mat[i] = np.nan
        else:
            fp_mat[i] = AllChem.GetMorganFingerprintAsBitVect(mol, radius, fp_len)
    # Carry the input index so a column-wise concat with `df` stays aligned.
    return pd.DataFrame(fp_mat, index=df.index)

In [8]:
# Featurize the training set: RDKit descriptors, optionally followed by
# Morgan fingerprint bits, appended to the Smiles/Activity columns.
calculate_MorganFP = True  # set to False to skip the fingerprint columns
# Use the explicitly imported Descriptors module: `Chem.Descriptors` only
# resolves as an attribute when some other import has already loaded that
# submodule. Each _descList entry is indexed for its first element, the name.
desc_names = [desc[0] for desc in Descriptors._descList]
fp_len = 1024

# Collect all pieces first and concatenate once, so the wide descriptor
# frame is not copied a second time when the fingerprints are appended.
train_frames = [z, get_Descriptors(z, desc_names)]
train_columns = ["Smiles", "Activity"] + desc_names
if calculate_MorganFP:
    train_frames.append(get_MorganFP(z, fp_len))
    # Fingerprint bit columns are labelled with their integer position.
    train_columns = train_columns + list(range(fp_len))

# ignore_index=True discards the per-frame column labels; the final labels
# are assigned explicitly right after.
data = pd.concat(train_frames, axis=1, ignore_index=True)
data.columns = train_columns
# A column-wise concat of misaligned indexes would silently add rows.
assert len(data) == len(z), "feature rows are not aligned with the input rows"
data.head()

0 out of 189001 = 0%
9450 out of 189001 = 4%
18900 out of 189001 = 9%
28350 out of 189001 = 14%
37800 out of 189001 = 19%
47250 out of 189001 = 24%
56700 out of 189001 = 29%
66150 out of 189001 = 34%
75600 out of 189001 = 39%
85050 out of 189001 = 44%
94500 out of 189001 = 49%
103950 out of 189001 = 54%
113400 out of 189001 = 59%
122850 out of 189001 = 64%
132300 out of 189001 = 69%
141750 out of 189001 = 74%
151200 out of 189001 = 79%
160650 out of 189001 = 84%
170100 out of 189001 = 89%
179550 out of 189001 = 94%
189000 out of 189001 = 99%
0 out of 189001 = 0%
9450 out of 189001 = 4%
18900 out of 189001 = 9%
28350 out of 189001 = 14%
37800 out of 189001 = 19%
47250 out of 189001 = 24%
56700 out of 189001 = 29%
66150 out of 189001 = 34%
75600 out of 189001 = 39%
85050 out of 189001 = 44%
94500 out of 189001 = 49%
103950 out of 189001 = 54%
113400 out of 189001 = 59%
122850 out of 189001 = 64%
132300 out of 189001 = 69%
141750 out of 189001 = 74%
151200 out of 189001 = 79%
160650 out o

Unnamed: 0,Smiles,Activity,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)CO)O)O)N,0,9.950184,-1.18913,9.950184,0.218049,0.49052,267.245,254.141,267.096754,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,C1(C(=O)NC(=O)N1)NC(=O)N,0,10.655463,-1.041667,10.655463,0.616852,0.325138,158.117,152.069,158.04399,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,C1C2C(C(S1)CCCCC(=O)O)NC(=O)N2,0,11.112311,-0.728901,11.112311,0.064041,0.493478,244.316,228.188,244.088163,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,C1C2C(C(S1)CCCCC(=O)O)NC(=O)N2,0,11.112311,-0.728901,11.112311,0.064041,0.493478,244.316,228.188,244.088163,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,C1C2C(C(S1)CCCCC(=O)O)NC(=O)N2,0,11.112311,-0.728901,11.112311,0.064041,0.493478,244.316,228.188,244.088163,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
# Persist the featurized training table; `appendix` tags the file name as
# containing both descriptors and fingerprints. No index column is written.
appendix = "desc_fp_"
data.to_csv(
    path_or_buf="{}{}train.csv".format(path, appendix),
    index=None,
)

# Test Data

In [10]:
# Read the test-set SMILES and preview the first five rows.
x_test = pd.read_csv(filepath_or_buffer="{}smiles_test.csv".format(path))
x_test.head(n=5)

Unnamed: 0,Smiles
0,C1=CC(=C(C=C1C(=O)O)O)O
1,C(CC(=O)O)CN
2,C1C2C(C(S1)CCCCC(=O)O)NC(=O)N2
3,C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[O-]
4,C1=CC=C(C(=C1)C(=O)O)O


In [11]:
# Featurize the test set the same way as the training set: RDKit
# descriptors, optionally followed by Morgan fingerprint bits.
calculate_MorganFP = True  # set to False to skip the fingerprint columns
# Use the explicitly imported Descriptors module: `Chem.Descriptors` only
# resolves as an attribute when some other import has already loaded that
# submodule. Each _descList entry is indexed for its first element, the name.
desc_names = [desc[0] for desc in Descriptors._descList]
fp_len = 1024

# Collect all pieces first and concatenate once, so the wide descriptor
# frame is not copied a second time when the fingerprints are appended.
test_frames = [x_test, get_Descriptors(x_test, desc_names)]
test_columns = ["Smiles"] + desc_names
if calculate_MorganFP:
    test_frames.append(get_MorganFP(x_test, fp_len))
    # Fingerprint bit columns are labelled with their integer position.
    test_columns = test_columns + list(range(fp_len))

# ignore_index=True discards the per-frame column labels; the final labels
# are assigned explicitly right after.
test_data = pd.concat(test_frames, axis=1, ignore_index=True)
test_data.columns = test_columns
# A column-wise concat of misaligned indexes would silently add rows.
assert len(test_data) == len(x_test), "feature rows are not aligned with the input rows"
test_data.head()

0 out of 93088 = 0%
4654 out of 93088 = 4%
9308 out of 93088 = 9%
13962 out of 93088 = 14%
18616 out of 93088 = 19%
23270 out of 93088 = 24%
27924 out of 93088 = 29%
32578 out of 93088 = 34%
37232 out of 93088 = 39%
41886 out of 93088 = 44%
46540 out of 93088 = 49%
51194 out of 93088 = 54%
55848 out of 93088 = 59%
60502 out of 93088 = 64%
65156 out of 93088 = 69%
69810 out of 93088 = 74%
74464 out of 93088 = 79%
79118 out of 93088 = 84%
83772 out of 93088 = 89%
88426 out of 93088 = 94%
93080 out of 93088 = 99%
0 out of 93088 = 0%
4654 out of 93088 = 4%
9308 out of 93088 = 9%
13962 out of 93088 = 14%
18616 out of 93088 = 19%
23270 out of 93088 = 24%
27924 out of 93088 = 29%
32578 out of 93088 = 34%
37232 out of 93088 = 39%
41886 out of 93088 = 44%
46540 out of 93088 = 49%
51194 out of 93088 = 54%
55848 out of 93088 = 59%
60502 out of 93088 = 64%
65156 out of 93088 = 69%
69810 out of 93088 = 74%
74464 out of 93088 = 79%
79118 out of 93088 = 84%
83772 out of 93088 = 89%
88426 out of 93088

Unnamed: 0,Smiles,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,C1=CC(=C(C=C1C(=O)O)O)O,10.271704,-1.138611,10.271704,0.055278,0.522491,154.121,148.073,154.026609,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C(CC(=O)O)CN,9.702639,-0.772731,9.702639,0.190972,0.520009,103.121,94.049,103.063329,42.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C1C2C(C(S1)CCCCC(=O)O)NC(=O)N2,11.112311,-0.728901,11.112311,0.064041,0.493478,244.316,228.188,244.088163,90.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[O-],11.304786,-1.325823,11.304786,0.132301,0.409071,323.132,311.036,322.012327,110.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,C1=CC=C(C(=C1)C(=O)O)O,10.261759,-1.11287,10.261759,0.06713,0.610259,138.122,132.074,138.031694,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Persist the featurized test table next to the training one, with a header
# row and without an index column.
appendix = "desc_fp_"
test_data.to_csv(
    path_or_buf="{}{}test.csv".format(path, appendix),
    header=True,
    index=None,
)

In [3]:
# Load the descriptor+fingerprint tables for the train and test splits from
# the CSV files under `path`.
df_train, df_test = [
    pd.read_csv(template.format(path))
    for template in ("{}desc_fp_train.csv", "{}desc_fp_test.csv")
]

In [None]:
# Write descriptor-only versions of both tables. The column labelled "0" is
# looked up by its string name (presumably the first fingerprint bit, whose
# integer label became text in the CSV round-trip); every column to its left
# is kept.
index = df_train.columns.get_loc("0")
df_train.iloc[:, :index].to_csv(
    path_or_buf="{}desc_train.csv".format(path),
    header=True,
    index=None,
)

index = df_test.columns.get_loc("0")
df_test.iloc[:, :index].to_csv(
    path_or_buf="{}desc_test.csv".format(path),
    header=True,
    index=None,
)