In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import pandas as pd
import numpy as np
from mordred import Calculator, descriptors

In [2]:
train_data = pd.read_csv('train_II.csv')
test_data = pd.read_csv('test_II.csv')

In [3]:
test_data = test_data.rename(columns = {"x": "Id"})
test_data.head()

Unnamed: 0,Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...


In [4]:
train_data[["Chemical_Id", "Assay_Id"]] = train_data.Id.str.split(";", expand = True)
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16
4,[Na+].[I-];1856,2,[Na+].[I-],1856


In [5]:
test_data[["Chemical_Id", "Assay_Id"]] = test_data.Id.str.split(";", expand = True)
test_data.head()

Unnamed: 0,Id,Chemical_Id,Assay_Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30


In [6]:
train_data['Assay_Id'] = train_data['Assay_Id'].astype(int)
test_data['Assay_Id'] = test_data['Assay_Id'].astype(int)

In [7]:
train_data = train_data[train_data['Chemical_Id'] != 'F[Si-2](F)(F)(F)(F)F.[Na+].[Na+]']
train_data.shape, test_data.shape

((75377, 4), (10994, 3))

In [8]:
def canonical_smiles(smiles):
    mols = [Chem.MolFromSmiles(smi) for smi in smiles] 
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    return smiles

In [9]:
Canon = canonical_smiles(train_data.Chemical_Id)
len(Canon)

75377

In [10]:
train_data['Chemical_Id'] = Canon
train_data

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1644
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(C)=O,2451
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,O=[N+]([O-])NC1=NCCN1Cc1ccc(Cl)nc1,16
4,[Na+].[I-];1856,2,[I-].[Na+],1856
...,...,...,...,...
75378,COC(=O)NS(=O)(=O)C1=CC=C(C=C1)N;33,2,COC(=O)NS(=O)(=O)c1ccc(N)cc1,33
75379,CCOP(=S)(OCC)OC1=NN(C(=N1)Cl)C(C)C;1632,1,CCOP(=S)(OCC)Oc1nc(Cl)n(C(C)C)n1,1632
75380,C1=CC=C2C(=C1)NC(=S)S2;1373,1,S=c1[nH]c2ccccc2s1,1373
75381,CCCCC(CC)C=O;2,2,CCCCC(C=O)CC,2


In [11]:
Canon_test = canonical_smiles(test_data.Chemical_Id)
len(Canon_test)

10994

In [12]:
test_data['Chemical_Id'] = Canon_test
test_data

Unnamed: 0,Id,Chemical_Id,Assay_Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,Cc1ccc(C(C)(C)C)c(O)c1,1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)c1cccnc1S(=O)(=O)NC(=O)Nc1nc(OC)cc(...,1656
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CNS(=O)(=O)c1cc(C(=O)N2CCC(CCN3C4CCC3CC(n3c(C)...,36
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@@H](C)C=C2C=C[C@H](C)...,30
...,...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,CON=C(C(=O)OC)c1ccccc1CON=C(C)c1cccc(C(F)(F)F)c1,38
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,O=C(O)c1ccccc1C(c1ccc(O)cc1)c1ccc(O)cc1,34
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,COC(=O)c1ccccc1S(=O)(=O)NC(=O)Nc1nc(C)nc(OC)n1,1640
10992,COP(=O)(OC)OC=C(Cl)Cl;28,COP(=O)(OC)OC=C(Cl)Cl,28


In [13]:
duplicates_smiles = train_data[train_data['Chemical_Id'].duplicated()]['Chemical_Id'].values
len(duplicates_smiles)

72153

In [14]:
train_data[train_data['Chemical_Id'].isin(duplicates_smiles)].sort_values(by=['Chemical_Id'])

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id
34245,CN1[C@@H]2CC(C[C@H]1[C@H]3[C@@H]2O3)OC(=O)[C@H...,2,Br.CN1[C@@H]2CC(OC(=O)[C@H](CO)c3ccccc3)C[C@H]...,2452
47399,CN1[C@@H]2CC(C[C@H]1[C@H]3[C@@H]2O3)OC(=O)[C@H...,2,Br.CN1[C@@H]2CC(OC(=O)[C@H](CO)c3ccccc3)C[C@H]...,2453
21121,C(Br)(Br)Br;2451,2,BrC(Br)Br,2451
46968,C(Br)(Br)Br;2453,2,BrC(Br)Br,2453
72450,C(Br)(Br)Br;2452,2,BrC(Br)Br,2452
...,...,...,...,...
57807,C1=NC=NN1;1388,2,c1nc[nH]n1,1388
33330,C1=NC=NN1;37,2,c1nc[nH]n1,37
57257,C1=NC=NN1;1389,2,c1nc[nH]n1,1389
61146,C1=NC=NN1;1373,2,c1nc[nH]n1,1373


In [15]:
train_data = train_data.drop_duplicates(subset=['Chemical_Id'])
len(train_data)

3224

In [16]:
train_data

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1644
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(C)=O,2451
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,O=[N+]([O-])NC1=NCCN1Cc1ccc(Cl)nc1,16
4,[Na+].[I-];1856,2,[I-].[Na+],1856
...,...,...,...,...
74085,C1=CC=C(C=C1)NC(=S)N;1852,1,NC(=S)Nc1ccccc1,1852
74550,CN1CN(CN(C1)C)C;2,2,CN1CN(C)CN(C)C1,2
74760,CCCCC1CCC(=O)O1;1852,1,CCCCC1CCC(=O)O1,1852
74936,CCOC(=O)CCC1=CC=CC=C1;2,2,CCOC(=O)CCc1ccccc1,2


In [17]:
from rdkit.Chem import Descriptors
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
        # add hydrogens to molecules
            mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(train_data['Chemical_Id'])

In [18]:
df_descriptors_train = pd.DataFrame(Mol_descriptors,columns=desc_names)
df_descriptors_train

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.874650,-2.987783,8.874650,0.765809,0.794714,317.599,306.511,315.982463,100,0,...,0,0,0,0,0,0,0,0,0,0
1,11.875100,-4.547647,11.875100,2.480493,0.516641,156.269,136.109,156.151415,66,0,...,0,0,0,0,0,0,0,0,0,0
2,8.736945,-6.030543,8.736945,0.000000,0.251327,362.086,313.702,361.347528,148,0,...,0,0,0,0,0,0,0,0,0,0
3,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,245.585,255.052302,90,0,...,0,0,0,0,0,0,0,0,0,0
4,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,149.894,149.894242,8,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3219,7.565278,-0.630990,7.565278,0.052521,0.596319,152.222,144.158,152.040819,52,0,...,0,0,0,0,0,0,0,0,0,0
3220,7.765833,-3.703125,7.765833,0.590208,0.443158,129.207,114.087,129.126597,54,0,...,0,0,0,0,0,0,0,0,0,0
3221,11.496092,-4.068542,11.496092,1.975352,0.562255,142.198,128.086,142.099380,58,0,...,0,0,0,0,0,0,0,0,0,0
3222,12.026002,-3.792040,12.026002,0.869053,0.660429,178.231,164.119,178.099380,70,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_descriptors_train.columns[df_descriptors_train.isnull().any()].tolist()

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [20]:
df_descriptors_train = df_descriptors_train.replace([np.inf, -np.inf], np.nan)

In [21]:
df_descriptors_train = df_descriptors_train.fillna(df_descriptors_train.mean())

In [22]:
df_descriptors_train.columns[df_descriptors_train.isnull().any()].tolist()

[]

In [23]:
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, df_descriptors_train], axis = 1)

In [24]:
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1644,8.87465,-2.987783,8.87465,0.765809,0.794714,317.599,...,0,0,0,0,0,0,0,0,0,0
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(C)=O,2451,11.8751,-4.547647,11.8751,2.480493,0.516641,156.269,...,0,0,0,0,0,0,0,0,0,0
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384,8.736945,-6.030543,8.736945,0.0,0.251327,362.086,...,0,0,0,0,0,0,0,0,0,0
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,O=[N+]([O-])NC1=NCCN1Cc1ccc(Cl)nc1,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,0,0,0,0,0,0,0,0,0,0
4,[Na+].[I-];1856,2,[I-].[Na+],1856,0.0,0.0,0.0,0.0,0.237972,149.894,...,0,0,0,0,0,0,0,0,0,0


In [25]:
from rdkit.Chem import Descriptors
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
        # add hydrogens to molecules
            mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(test_data['Chemical_Id'])

In [26]:
df_descriptors_test = pd.DataFrame(Mol_descriptors,columns=desc_names)
df_descriptors_test

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,164.120115,66,0,...,0,0,0,0,0,0,0,0,0,0
1,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,431.056940,152,0,...,0,1,1,0,0,0,0,0,0,1
2,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,695.250845,254,0,...,0,1,0,0,0,0,0,0,0,0
3,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,200.949810,56,0,...,0,0,0,0,0,0,0,0,0,0
4,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,418.271924,168,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,13.677486,-5.541970,13.677486,1.263580,0.391778,408.376,389.224,408.129692,154,0,...,0,0,0,0,0,0,0,0,0,0
10990,12.722294,-3.388339,12.722294,0.917255,0.635519,320.344,304.216,320.104859,120,0,...,0,0,0,0,0,0,0,0,0,0
10991,13.186679,-5.902467,13.186679,0.638923,0.705120,381.370,366.250,381.074304,138,0,...,0,1,0,0,0,0,0,0,0,1
10992,11.701188,-5.111667,11.701188,0.820825,0.540194,220.976,213.920,219.945901,66,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df_descriptors_test.columns[df_descriptors_test.isnull().any()].tolist()

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [28]:
df_descriptors_test = df_descriptors_test.replace([np.inf, -np.inf], np.nan)

In [29]:
df_descriptors_test = df_descriptors_test.fillna(df_descriptors_test.mean())

In [30]:
df_descriptors_test.columns[df_descriptors_test.isnull().any()].tolist()

[]

In [31]:
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, df_descriptors_test], axis = 1)

In [32]:
test_data

Unnamed: 0,Id,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,Cc1ccc(C(C)(C)C)c(O)c1,1682,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,...,0,0,0,0,0,0,0,0,0,0
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)c1cccnc1S(=O)(=O)NC(=O)Nc1nc(OC)cc(...,1656,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,...,0,1,1,0,0,0,0,0,0,1
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CNS(=O)(=O)c1cc(C(=O)N2CCC(CCN3C4CCC3CC(n3c(C)...,36,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,...,0,1,0,0,0,0,0,0,0,0
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,...,0,0,0,0,0,0,0,0,0,0
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@@H](C)C=C2C=C[C@H](C)...,30,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,CON=C(C(=O)OC)c1ccccc1CON=C(C)c1cccc(C(F)(F)F)c1,38,13.677486,-5.541970,13.677486,1.263580,0.391778,408.376,389.224,...,0,0,0,0,0,0,0,0,0,0
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,O=C(O)c1ccccc1C(c1ccc(O)cc1)c1ccc(O)cc1,34,12.722294,-3.388339,12.722294,0.917255,0.635519,320.344,304.216,...,0,0,0,0,0,0,0,0,0,0
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,COC(=O)c1ccccc1S(=O)(=O)NC(=O)Nc1nc(C)nc(OC)n1,1640,13.186679,-5.902467,13.186679,0.638923,0.705120,381.370,366.250,...,0,1,0,0,0,0,0,0,0,1
10992,COP(=O)(OC)OC=C(Cl)Cl;28,COP(=O)(OC)OC=C(Cl)Cl,28,11.701188,-5.111667,11.701188,0.820825,0.540194,220.976,213.920,...,0,0,0,0,0,0,0,0,0,0


In [33]:
train_data.shape, test_data.shape

((3224, 212), (10994, 211))

In [34]:
from rdkit.Chem import AllChem
def smiles_to_fingerprints(smiles_list):
    fingerprints = []
    for smiles in smiles_list:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048)
        mfpts = np.array(fp)
        fingerprints.append(mfpts)
    return np.array(fingerprints)

In [35]:
chemical_Id_list_train = train_data['Chemical_Id'].to_list()

In [36]:
train_fingerprints = smiles_to_fingerprints(chemical_Id_list_train)

In [37]:
train_Mfingerprints = pd.DataFrame(train_fingerprints,columns=['V{}'.format(i) for i in range(train_fingerprints.shape[1])])

In [38]:
train_Mfingerprints.columns[train_Mfingerprints.isnull().any()].tolist()

[]

In [39]:
train_Mfingerprints.columns[train_Mfingerprints.isna().any()].tolist()

[]

In [40]:
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, train_Mfingerprints], axis = 1)

In [41]:
train_data.shape

(3224, 2260)

In [42]:
chemical_Id_list_test = test_data['Chemical_Id'].to_list()

In [43]:
test_fingerprints = smiles_to_fingerprints(chemical_Id_list_test)

In [44]:
test_Mfingerprints = pd.DataFrame(test_fingerprints,columns=['V{}'.format(i) for i in range(test_fingerprints.shape[1])])

In [45]:
test_Mfingerprints.columns[test_Mfingerprints.isnull().any()].tolist()

[]

In [46]:
test_Mfingerprints.columns[test_Mfingerprints.isna().any()].tolist()

[]

In [47]:
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, test_Mfingerprints], axis = 1)

In [48]:
test_data.shape

(10994, 2259)

In [49]:
test_data.columns[test_data.isna().any()].tolist()

[]

In [50]:
train_data.columns[train_data.isna().any()].tolist()

[]

In [51]:
from mordred import Calculator, descriptors
def All_Mordred_descriptors(data):
    calc = Calculator(descriptors, ignore_3D=False)
    mols = [Chem.MolFromSmiles(smi) for smi in data]
    
    # pandas df
    df = calc.pandas(mols)
    return df

In [52]:
train_mordred_descriptors = All_Mordred_descriptors(train_data['Chemical_Id'])

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [53]:
train_mordred_descriptors

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,23.128032,2.424173,4.848346,23.128032,1.217265,3.862863,...,9.861675,52.418777,315.982463,10.532749,678,28,98.0,112.0,7.618056,4.055556
1,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,12.784906,1.975377,3.950753,12.784906,1.162264,3.210676,...,7.832411,37.843620,156.151415,5.037142,212,8,40.0,38.0,4.861111,2.833333
2,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,1,1,multiple fragments (SpAbs_A/SpAbs),multiple fragments (SpMax_A/SpMax),multiple fragments (SpDiam_A/SpDiam),multiple fragments (SpAD_A/SpAD),multiple fragments (SpMAD_A/SpMAD),multiple fragments (LogEE_A/LogEE),...,9.034200,55.641246,361.347528,5.018716,2300001804,22,92.0,92.0,divide by zero encountered in power (mZagreb1),5.750000
3,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,3,21.30202,2.337073,4.596189,21.30202,1.25306,3.748471,...,9.430840,62.679417,255.052302,9.446382,559,20,84.0,94.0,5.805556,3.777778
4,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,1,1,multiple fragments (SpAbs_A/SpAbs),multiple fragments (SpMax_A/SpMax),multiple fragments (SpDiam_A/SpDiam),multiple fragments (SpAD_A/SpAD),multiple fragments (SpMAD_A/SpMAD),multiple fragments (LogEE_A/LogEE),...,0.000000,2.000000,149.894242,74.947121,100000000,0,0.0,0.0,divide by zero encountered in power (mZagreb1),0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3219,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,12.066822,2.175328,4.350655,12.066822,1.206682,3.188073,...,8.564268,38.519688,152.040819,8.446712,126,9,44.0,46.0,3.722222,2.333333
3220,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,3,10.12899,2.236068,4.472136,10.12899,1.125443,3.099014,...,8.760767,37.736937,129.126597,5.380275,84,9,42.0,45.0,4.083333,2.000000
3221,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,12.4268,2.236324,4.251712,12.4268,1.24268,3.191982,...,8.497195,51.903908,142.099380,5.920807,132,8,44.0,47.0,3.722222,2.416667
3222,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,16.330127,2.175328,4.350655,16.330127,1.256164,3.436887,...,8.743053,42.596927,178.099380,6.596273,290,13,56.0,59.0,4.472222,3.166667


In [54]:
train_mordred_descriptors.columns[train_mordred_descriptors.isnull().any()].tolist()

[]

In [55]:
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, train_mordred_descriptors], axis = 1)

In [56]:
train_data

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1644,8.874650,-2.987783,8.874650,0.765809,0.794714,317.599,...,9.861675,52.418777,315.982463,10.532749,678,28,98.0,112.0,7.618056,4.055556
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(C)=O,2451,11.875100,-4.547647,11.875100,2.480493,0.516641,156.269,...,7.832411,37.843620,156.151415,5.037142,212,8,40.0,38.0,4.861111,2.833333
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384,8.736945,-6.030543,8.736945,0.000000,0.251327,362.086,...,9.034200,55.641246,361.347528,5.018716,2300001804,22,92.0,92.0,divide by zero encountered in power (mZagreb1),5.750000
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,O=[N+]([O-])NC1=NCCN1Cc1ccc(Cl)nc1,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,9.430840,62.679417,255.052302,9.446382,559,20,84.0,94.0,5.805556,3.777778
4,[Na+].[I-];1856,2,[I-].[Na+],1856,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,...,0.000000,2.000000,149.894242,74.947121,100000000,0,0.0,0.0,divide by zero encountered in power (mZagreb1),0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3219,C1=CC=C(C=C1)NC(=S)N;1852,1,NC(=S)Nc1ccccc1,1852,7.565278,-0.630990,7.565278,0.052521,0.596319,152.222,...,8.564268,38.519688,152.040819,8.446712,126,9,44.0,46.0,3.722222,2.333333
3220,CN1CN(CN(C1)C)C;2,2,CN1CN(C)CN(C)C1,2,7.765833,-3.703125,7.765833,0.590208,0.443158,129.207,...,8.760767,37.736937,129.126597,5.380275,84,9,42.0,45.0,4.083333,2.000000
3221,CCCCC1CCC(=O)O1;1852,1,CCCCC1CCC(=O)O1,1852,11.496092,-4.068542,11.496092,1.975352,0.562255,142.198,...,8.497195,51.903908,142.099380,5.920807,132,8,44.0,47.0,3.722222,2.416667
3222,CCOC(=O)CCC1=CC=CC=C1;2,2,CCOC(=O)CCc1ccccc1,2,12.026002,-3.792040,12.026002,0.869053,0.660429,178.231,...,8.743053,42.596927,178.099380,6.596273,290,13,56.0,59.0,4.472222,3.166667


In [57]:
test_mordred_descriptors = All_Mordred_descriptors(test_data['Chemical_Id'])

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [58]:
test_mordred_descriptors

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,13.258897,2.364437,4.728875,13.258897,1.104908,3.390642,...,9.394743,43.032166,164.120115,5.861433,186,16,60.0,67.0,6.145833,2.527778
1,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,34.163378,2.459149,4.918298,34.163378,1.220121,4.234561,...,10.273636,63.339280,431.056940,9.579043,2119,45,142.0,164.0,11.791667,6.277778
2,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,1,62.343433,2.532977,5.019557,62.343433,1.298822,4.825487,...,11.049603,103.100919,695.250845,7.900578,9290,90,272.0,333.0,15.402778,10.194444
3,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,1,1,multiple fragments (SpAbs_A/SpAbs),multiple fragments (SpMax_A/SpMax),multiple fragments (SpDiam_A/SpDiam),multiple fragments (SpAD_A/SpAD),multiple fragments (SpMAD_A/SpMAD),multiple fragments (LogEE_A/LogEE),...,9.169623,41.037455,200.949810,13.396654,1000000109,11,50.0,54.0,divide by zero encountered in power (mZagreb1),2.083333
4,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,36.874803,2.481037,4.962075,36.874803,1.22916,4.321583,...,10.398641,65.900498,418.271924,6.151058,2440,50,158.0,185.0,11.923611,6.486111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,36.020897,2.367932,4.735863,36.020897,1.2421,4.260749,...,10.137808,64.086011,408.129692,8.502702,2512,44,142.0,162.0,11.340278,6.611111
10990,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,1,0,30.949962,2.445978,4.891956,30.949962,1.289582,4.103177,...,10.072090,58.538924,320.104859,8.002621,1242,38,124.0,145.0,7.888889,5.305556
10991,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,31.748747,2.422802,4.845603,31.748747,1.221106,4.159123,...,10.112329,60.757403,381.074304,9.294495,1753,40,130.0,149.0,10.590278,5.847222
10992,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,12.540144,2.224278,4.448556,12.540144,1.140013,3.23728,...,8.788898,40.102163,219.945901,12.219217,168,12,46.0,48.0,6.173611,2.708333


In [59]:
test_mordred_descriptors.columns[test_mordred_descriptors.isnull().any()].tolist()

[]

In [60]:
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, test_mordred_descriptors], axis = 1)

In [61]:
test_data

Unnamed: 0,Id,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,Cc1ccc(C(C)(C)C)c(O)c1,1682,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,...,9.394743,43.032166,164.120115,5.861433,186,16,60.0,67.0,6.145833,2.527778
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)c1cccnc1S(=O)(=O)NC(=O)Nc1nc(OC)cc(...,1656,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,...,10.273636,63.339280,431.056940,9.579043,2119,45,142.0,164.0,11.791667,6.277778
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CNS(=O)(=O)c1cc(C(=O)N2CCC(CCN3C4CCC3CC(n3c(C)...,36,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,...,11.049603,103.100919,695.250845,7.900578,9290,90,272.0,333.0,15.402778,10.194444
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,...,9.169623,41.037455,200.949810,13.396654,1000000109,11,50.0,54.0,divide by zero encountered in power (mZagreb1),2.083333
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@@H](C)C=C2C=C[C@H](C)...,30,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,...,10.398641,65.900498,418.271924,6.151058,2440,50,158.0,185.0,11.923611,6.486111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,CON=C(C(=O)OC)c1ccccc1CON=C(C)c1cccc(C(F)(F)F)c1,38,13.677486,-5.541970,13.677486,1.263580,0.391778,408.376,389.224,...,10.137808,64.086011,408.129692,8.502702,2512,44,142.0,162.0,11.340278,6.611111
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,O=C(O)c1ccccc1C(c1ccc(O)cc1)c1ccc(O)cc1,34,12.722294,-3.388339,12.722294,0.917255,0.635519,320.344,304.216,...,10.072090,58.538924,320.104859,8.002621,1242,38,124.0,145.0,7.888889,5.305556
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,COC(=O)c1ccccc1S(=O)(=O)NC(=O)Nc1nc(C)nc(OC)n1,1640,13.186679,-5.902467,13.186679,0.638923,0.705120,381.370,366.250,...,10.112329,60.757403,381.074304,9.294495,1753,40,130.0,149.0,10.590278,5.847222
10992,COP(=O)(OC)OC=C(Cl)Cl;28,COP(=O)(OC)OC=C(Cl)Cl,28,11.701188,-5.111667,11.701188,0.820825,0.540194,220.976,213.920,...,8.788898,40.102163,219.945901,12.219217,168,12,46.0,48.0,6.173611,2.708333


In [62]:
train_data.shape, test_data.shape

((3224, 4086), (10994, 4085))

# Feature selection using variance threshold

In [63]:
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing

In [64]:
le = preprocessing.LabelEncoder()
train_chemical_id =  list(train_data.Chemical_Id.unique())
le.fit(train_chemical_id)
train_data['Chemical_Id']=le.transform(train_data['Chemical_Id'])

In [65]:
#split target variable from rest of the data
train_Y = train_data["Expected"]

In [66]:
train_Y

0       2
1       2
2       2
3       2
4       2
       ..
3219    1
3220    2
3221    1
3222    2
3223    1
Name: Expected, Length: 3224, dtype: int64

In [67]:
train_data.drop(['Expected'], axis = 1, inplace=True) 
train_X = pd.get_dummies(train_data)

In [68]:
train_X

Unnamed: 0,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,...,mZagreb1_divide by zero encountered in power (mZagreb1),mZagreb1_divide by zero encountered in power (mZagreb1).1,mZagreb1_8.645833333333332,mZagreb1_5.819444444444445,mZagreb1_9.756944444444445,mZagreb1_14.1875,mZagreb1_5.527777777777777,mZagreb1_divide by zero encountered in power (mZagreb1).2,mZagreb1_4.166666666666666,mZagreb1_10.756944444444443
0,3097,1644,8.874650,-2.987783,8.874650,0.765809,0.794714,317.599,306.511,315.982463,...,0,0,0,0,0,0,0,0,0,0
1,997,2451,11.875100,-4.547647,11.875100,2.480493,0.516641,156.269,136.109,156.151415,...,0,0,0,0,0,0,0,0,0,0
2,1096,1384,8.736945,-6.030543,8.736945,0.000000,0.251327,362.086,313.702,361.347528,...,0,0,0,0,0,0,0,0,0,0
3,2940,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,245.585,255.052302,...,0,0,0,0,0,0,0,0,0,0
4,3148,1856,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,149.894,149.894242,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3219,2462,1852,7.565278,-0.630990,7.565278,0.052521,0.596319,152.222,144.158,152.040819,...,0,0,0,0,0,0,0,0,0,0
3220,1647,2,7.765833,-3.703125,7.765833,0.590208,0.443158,129.207,114.087,129.126597,...,0,0,0,0,0,0,0,0,0,0
3221,931,1852,11.496092,-4.068542,11.496092,1.975352,0.562255,142.198,128.086,142.099380,...,0,0,0,0,0,0,0,0,0,0
3222,1382,2,12.026002,-3.792040,12.026002,0.869053,0.660429,178.231,164.119,178.099380,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# Setting up variance threshold
sel = VarianceThreshold(threshold=0.25)
X_new = sel.fit_transform(train_X)

In [70]:
# Retrieve list of features from the index of features selected by variance threshold.
list_x = []
col_names = []
selected_features = []

In [71]:
list_x = sel.get_support(indices=True)
col_names = train_X.columns.tolist()

In [72]:
for i in list_x: 
    selected_features.append(col_names[i])

print("Selected features:")
print(selected_features)

Selected features:
['Chemical_Id', 'Assay_Id', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_MRHI', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EStat

In [73]:
le = preprocessing.LabelEncoder()
test_chemical_id =  list(test_data.Chemical_Id.unique())
le.fit(test_chemical_id)
test_data['Chemical_Id']=le.transform(test_data['Chemical_Id'])

In [74]:
features = ['Chemical_Id', 'Assay_Id', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_MRHI', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'HeavyAtomCount', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds', 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount', 'MolLogP', 'MolMR', 'fr_Al_OH', 'fr_Ar_N', 'fr_Ar_OH', 'fr_C_O', 'fr_C_O_noCOO', 'fr_NH0', 'fr_NH1', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_aniline', 'fr_benzene', 'fr_bicyclic', 'fr_ester', 'fr_ether', 'fr_halogen', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'nAcid', 'nBase', 'nAromAtom', 'nAromBond', 'nAtom', 'nHeavyAtom', 'nBridgehead', 'nHetero', 'nH', 'nC', 'nN', 'nO', 'nS', 'nF', 'nCl', 'nX', 'ATS0dv', 'ATS1dv', 'ATS2dv', 'ATS3dv', 'ATS4dv', 'ATS5dv', 'ATS6dv', 'ATS7dv', 'ATS8dv', 'ATS0d', 'ATS1d', 'ATS2d', 'ATS3d', 'ATS4d', 'ATS5d', 'ATS6d', 'ATS7d', 'ATS8d', 'ATS0Z', 'ATS1Z', 'ATS2Z', 'ATS3Z', 'ATS4Z', 'ATS5Z', 'ATS6Z', 'ATS7Z', 'ATS8Z', 'ATS0m', 'ATS1m', 'ATS2m', 'ATS3m', 'ATS4m', 'ATS5m', 'ATS6m', 'ATS7m', 'ATS8m', 'ATS0v', 'ATS1v', 'ATS2v', 'ATS3v', 'ATS4v', 'ATS5v', 'ATS6v', 'ATS7v', 'ATS8v', 'ATS0p', 'ATS1p', 'ATS2p', 'ATS3p', 'ATS4p', 'ATS5p', 'ATS6p', 'ATS7p', 'ATS8p', 'ATS0i', 'ATS1i', 'ATS2i', 'ATS3i', 'ATS4i', 'ATS5i', 'ATS6i', 'ATS7i', 'ATS8i', 'AATS0dv', 'AATS0d', 'AATS0Z', 'AATS0m', 'AATS0v', 'AATS0p', 'AATS0i', 'ATSC0dv', 'ATSC1dv', 'ATSC2dv', 'ATSC3dv', 'ATSC4dv', 'ATSC5dv', 'ATSC6dv', 'ATSC7dv', 'ATSC8dv', 'ATSC0d', 'ATSC1d', 'ATSC2d', 'ATSC3d', 'ATSC4d', 'ATSC5d', 'ATSC6d', 'ATSC7d', 'ATSC8d', 'ATSC0Z', 'ATSC1Z', 'ATSC2Z', 'ATSC3Z', 'ATSC4Z', 'ATSC5Z', 'ATSC6Z', 'ATSC7Z', 'ATSC8Z', 'ATSC0m', 'ATSC1m', 'ATSC2m', 'ATSC3m', 'ATSC4m', 'ATSC5m', 'ATSC6m', 'ATSC7m', 'ATSC8m', 'ATSC0v', 'ATSC1v', 'ATSC2v', 'ATSC3v', 'ATSC4v', 'ATSC5v', 'ATSC6v', 'ATSC7v', 'ATSC8v', 'ATSC0p', 'ATSC1p', 'ATSC2p', 'ATSC3p', 'ATSC4p', 'ATSC5p', 'ATSC6p', 'ATSC7p', 'ATSC8p', 'ATSC0i', 'ATSC1i', 'ATSC2i', 'ATSC3i', 'ATSC4i', 'ATSC5i', 'ATSC6i', 'ATSC7i', 'ATSC8i', 'AATSC0dv', 'AATSC0Z', 'AATSC0m', 'AATSC0v', 'AATSC0p', 'AATSC0i', 'BalabanJ', 'BertzCT', 'nBonds', 'nBondsO', 'nBondsS', 'nBondsD', 'nBondsA', 'nBondsM', 'nBondsKS', 'nBondsKD', 'C1SP2', 'C2SP2', 'C3SP2', 'C1SP3', 'C2SP3', 'C3SP3', 'Xc-3d', 'Xc-5d', 'Xc-3dv', 'Xpc-4d', 'Xpc-5d', 'Xpc-6d', 'Xpc-4dv', 'Xpc-5dv', 'Xpc-6dv', 'Xp-1d', 'Xp-2d', 'Xp-3d', 'Xp-4d', 'Xp-5d', 'Xp-6d', 'Xp-7d', 'Xp-1dv', 'Xp-2dv', 'Xp-3dv', 'Xp-4dv', 'Xp-5dv', 'Xp-6dv', 'Xp-7dv', 'SZ', 'Sm', 'Sv', 'Sp', 'Si', 'Mm', 'NsCH3', 'NssCH2', 'NdsCH', 'NaaCH', 'NsssCH', 'NdssC', 'NaasC', 'NaaaC', 'NssssC', 'NssNH', 'NaaN', 'NsssN', 'NsOH', 'NdO', 'NssO', 'NsF', 'NsCl', 'SsCH3', 'SdCH2', 'SssCH2', 'SdsCH', 'SaaCH', 'SsssCH', 'StsC', 'SdssC', 'SaasC', 'SaaaC', 'SssssC', 'SsNH2', 'SssNH', 'SaaNH', 'StN', 'SdsN', 'SaaN', 'SsssN', 'SsOH', 'SdO', 'SssO', 'SaaO', 'SsF', 'SdsssP', 'SdS', 'SddssS', 'SsCl', 'SsBr', 'ECIndex', 'fragCpx', 'nHBAcc', 'nHBDon', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5', 'TIC0', 'TIC1', 'TIC2', 'TIC3', 'TIC4', 'TIC5', 'CIC0', 'CIC1', 'CIC2', 'CIC3', 'CIC4', 'CIC5', 'MIC0', 'MIC1', 'MIC2', 'MIC3', 'MIC4', 'MIC5', 'ZMIC0', 'ZMIC1', 'ZMIC2', 'ZMIC3', 'ZMIC4', 'ZMIC5', 'FilterItLogS', 'VMcGowan', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'SMR_VSA1', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA10', 'SlogP_VSA11', 'EState_VSA1', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'EState_VSA10', 'VSA_EState1', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'MPC2', 'MPC3', 'MPC4', 'MPC5', 'MPC6', 'MPC7', 'MPC8', 'MPC9', 'MPC10', 'TMPC10', 'piPC1', 'piPC2', 'piPC3', 'piPC4', 'piPC5', 'piPC6', 'piPC7', 'piPC8', 'piPC9', 'piPC10', 'TpiPC10', 'apol', 'bpol', 'nRing', 'n5Ring', 'n6Ring', 'nHRing', 'n6HRing', 'naRing', 'n6aRing', 'nARing', 'n6ARing', 'nAHRing', 'nRot', 'SLogP', 'SMR', 'TopoPSA(NO)', 'TopoPSA', 'GGI1', 'GGI2', 'GGI3', 'GGI4', 'GGI5', 'Diameter', 'Radius', 'MWC01', 'MWC02', 'MWC03', 'MWC04', 'MWC05', 'MWC06', 'MWC07', 'MWC08', 'MWC09', 'MWC10', 'TMWC10', 'SRW02', 'SRW04', 'SRW05', 'SRW06', 'SRW07', 'SRW08', 'SRW09', 'SRW10', 'TSRW10', 'MW', 'AMW', 'WPath', 'WPol', 'Zagreb1', 'Zagreb2', 'mZagreb2']

In [75]:
X_train = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

In [76]:
X_train.columns[X_train.isna().any()].tolist()

[]

In [77]:
X_test.columns[X_test.isna().any()].tolist()

[]

# Feature selection using correlation

In [78]:
corr_coeffs = X_train.corrwith(train_Y)
corr_coeffs = corr_coeffs.sort_values(ascending=False)
print('Feature rankings based on correlation with', train_Y)
for i, (feature, corr) in enumerate(corr_coeffs.items()):
    print(f'{i+1}. {feature}: {corr:.3f}')

Feature rankings based on correlation with 0       2
1       2
2       2
3       2
4       2
       ..
3219    1
3220    2
3221    1
3222    2
3223    1
Name: Expected, Length: 3224, dtype: int64
1. AATSC0dv: 0.074
2. FilterItLogS: 0.069
3. AATS0dv: 0.055
4. SdO: 0.052
5. NdO: 0.052
6. TopoPSA(NO): 0.047
7. TPSA: 0.047
8. FpDensityMorgan2: 0.044
9. fr_amide: 0.044
10. fr_NH1: 0.042
11. NOCount: 0.042
12. PEOE_VSA3: 0.042
13. PEOE_VSA3: 0.042
14. PEOE_VSA3: 0.042
15. PEOE_VSA3: 0.042
16. EState_VSA10: 0.042
17. EState_VSA10: 0.042
18. EState_VSA10: 0.041
19. EState_VSA10: 0.041
20. AATSC0i: 0.040
21. nO: 0.040
22. NssNH: 0.040
23. VSA_EState2: 0.039
24. VSA_EState2: 0.039
25. C1SP2: 0.038
26. AATSC0p: 0.038
27. SMR_VSA3: 0.037
28. SMR_VSA3: 0.037
29. SMR_VSA3: 0.037
30. SMR_VSA3: 0.037
31. ATSC2dv: 0.037
32. FpDensityMorgan3: 0.037
33. SMR_VSA1: 0.036
34. SMR_VSA1: 0.036
35. TopoPSA: 0.036
36. NumHAcceptors: 0.035
37. VSA_EState2: 0.035
38. VSA_EState2: 0.035
39. ATSC2p: 0.035
40. PEOE_

In [79]:
features = ['AATSC0dv', 'FilterItLogS', 'AATS0dv', 'SdO', 'NdO', 'Xpc-4dv', 'SlogP_VSA6', 'SlogP_VSA6', 'SlogP_VSA6', 'SlogP_VSA6', 'ZMIC1', 'NdsCH', 'Chi0n', 'PEOE_VSA8', 'PEOE_VSA8', 'C2SP2', 'Xc-3dv', 'Chi1n', 'ATS3p', 'MolMR', 'SMR', 'SaaCH', 'nC', 'Xpc-5dv', 'nH', 'Xp-6dv', 'ATS1p', 'VSA_EState6', 'VSA_EState6', 'SdssC', 'CIC0', 'ATS2p', 'Xp-1dv', 'CIC2', 'PEOE_VSA6', 'PEOE_VSA6', 'PEOE_VSA6', 'PEOE_VSA6', 'VSA_EState7', 'VSA_EState7', 'Assay_Id', 'CIC1', 'Xp-5dv', 'SssCH2', 'Xp-3dv', 'Xp-2dv', 'EState_VSA9', 'EState_VSA9', 'Xp-4dv', 'SLogP', 'MolLogP', 'EState_VSA8', 'EState_VSA8']

In [80]:
X_train = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

# Classifiers

In [81]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier

In [82]:
# from sklearn import tree
# model = BaggingClassifier(estimator=tree.DecisionTreeClassifier(random_state=1,
#                                                                      criterion='entropy',
#                                                                      max_depth=35,
#                                                                      class_weight='balanced'),
#                                                                      random_state=1,
#                                                                      n_estimators=20)

In [83]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# # RandomForestClassifier
# model = RandomForestClassifier(random_state=1, max_depth=35, criterion='entropy')

In [86]:
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier

clf1 = xgb.XGBClassifier(random_state=1,booster="gbtree",learning_rate=0.25,n_estimators=250,max_depth=12, min_child_weight=4)
clf2 = lgb.LGBMClassifier(num_threads=1,boosting_type= 'goss',learning_rate=0.1,n_estimators=1000,max_depth=10,num_leaves=100,max_bin = 5000)

model = VotingClassifier(estimators=[('XGB',clf1),('LGBM',clf2)],voting='hard')

In [87]:
# from catboost import CatBoostClassifier
# model = CatBoostClassifier(learning_rate=0.1, boosting_type= 'Ordered',max_depth=10,max_bin = 5000, iterations=50) 

In [88]:
print(np.isnan(X_train).any())

print(np.isinf(X_train).any())
X_train.shape, train_Y.shape

AATSC0dv        False
FilterItLogS    False
AATS0dv         False
SdO             False
NdO             False
                ...  
MolLogP         False
EState_VSA8     False
EState_VSA8     False
EState_VSA8     False
EState_VSA8     False
Length: 71, dtype: bool
AATSC0dv        False
FilterItLogS    False
AATS0dv         False
SdO             False
NdO             False
                ...  
MolLogP         False
EState_VSA8     False
EState_VSA8     False
EState_VSA8     False
EState_VSA8     False
Length: 71, dtype: bool


((3224, 71), (3224,))

In [89]:
X_train

Unnamed: 0,AATSC0dv,FilterItLogS,AATS0dv,SdO,NdO,Xpc-4dv,SlogP_VSA6,SlogP_VSA6.1,SlogP_VSA6.2,SlogP_VSA6.3,...,EState_VSA9,EState_VSA9.1,EState_VSA9.2,Xp-4dv,SLogP,MolLogP,EState_VSA8,EState_VSA8.1,EState_VSA8.2,EState_VSA8.3
0,3.182222,-4.478931,7.093827,0.000000,0,2.112428,48.339350,48.530937,48.339350,48.530937,...,34.802820,50.002046,34.802820,2.659887,4.5999,4.5999,10.217616,0.000000,10.217616,0.000000
1,1.941727,-3.422244,2.645161,10.532611,1,0.102062,0.000000,0.000000,0.000000,0.000000,...,0.000000,27.415171,0.000000,1.154276,3.3260,3.3260,0.000000,6.923737,0.000000,6.923737
2,0.900053,-9.341691,1.288752,0.000000,0,1.207107,0.000000,0.000000,0.000000,0.000000,...,0.000000,65.796411,0.000000,3.599874,4.3482,4.3482,0.000000,27.942818,0.000000,27.942818
3,4.776033,-2.854764,10.355738,10.355080,1,0.733696,23.250137,23.321982,23.250137,23.321982,...,11.600940,25.349609,11.600940,1.675092,0.6879,0.6879,9.976383,15.402175,9.976383,15.402175
4,0.007901,-0.886729,0.015802,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,-5.9920,-5.9920,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3219,2.788752,-1.467969,5.691358,0.000000,0,0.226280,30.212094,30.331835,30.212094,30.331835,...,5.733667,11.089319,5.733667,0.781596,1.3421,1.3421,12.217873,17.534662,12.217873,17.534662
3220,2.750000,0.157293,3.750000,0.000000,0,0.600000,0.000000,0.000000,0.000000,0.000000,...,0.000000,20.561378,0.000000,1.370560,-0.3321,-0.3321,0.000000,35.842745,0.000000,35.842745
3221,3.354167,-1.963890,4.916667,10.594398,1,0.301280,0.000000,0.000000,0.000000,0.000000,...,4.736863,19.190620,4.736863,1.217641,1.8822,1.8822,4.736863,6.923737,4.736863,6.923737
3222,3.580247,-2.928905,6.000000,10.994151,1,0.302749,30.212094,30.331835,30.212094,30.331835,...,4.736863,19.190620,4.736863,1.124233,2.1823,2.1823,4.736863,0.000000,4.736863,0.000000


In [90]:
X_test

Unnamed: 0,AATSC0dv,FilterItLogS,AATS0dv,SdO,NdO,Xpc-4dv,SlogP_VSA6,SlogP_VSA6.1,SlogP_VSA6.2,SlogP_VSA6.3,...,EState_VSA9,EState_VSA9.1,EState_VSA9.2,Xp-4dv,SLogP,MolLogP,EState_VSA8,EState_VSA8.1,EState_VSA8.2,EState_VSA8.3
0,2.811224,-2.956344,4.285714,0.000000,0,1.421503,18.127256,18.199101,18.127256,18.199101,...,0.000000,21.992578,0.000000,1.275888,2.99812,2.99812,5.108808,20.771212,5.108808,20.771212
1,5.397311,-4.035809,11.086420,61.249160,5,3.045142,34.221595,34.317388,34.221595,34.317388,...,9.473726,23.385062,9.473726,4.486481,0.19280,0.19280,24.425661,20.268724,24.425661,20.268724
2,4.070267,-9.148849,7.477834,38.450935,3,5.907306,65.319671,65.559154,65.319671,65.559154,...,16.584918,34.944919,16.584918,9.648527,6.61662,6.61662,4.983979,39.311943,4.983979,39.311943
3,6.738765,-0.591591,14.829630,31.228542,3,0.536245,11.811237,11.835185,11.811237,11.835185,...,0.000000,5.483034,0.000000,1.146072,-2.93050,-2.93050,8.905180,8.905180,8.905180,8.905180
4,3.346021,-3.561542,5.176471,24.565087,2,3.595500,23.729320,23.801165,23.729320,23.801165,...,9.473726,48.661578,9.473726,5.079222,4.58560,4.58560,9.845671,32.075535,9.845671,32.075535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,5.576389,-5.427800,11.416667,11.941280,1,1.316209,58.650775,58.842362,58.650775,58.842362,...,14.412040,26.044413,14.412040,2.433466,4.16970,4.16970,24.723466,10.311425,24.723466,10.311425
10990,3.660000,-3.866220,8.500000,11.617073,1,1.288359,72.509024,72.796405,72.509024,72.796405,...,0.000000,20.742702,0.000000,2.828610,3.97620,3.97620,15.326425,0.000000,15.326425,0.000000
10991,5.301474,-3.698481,11.571816,48.527108,4,1.742432,29.065158,29.160952,29.065158,29.160952,...,4.736863,20.643545,4.736863,2.991776,0.48562,0.48562,24.425661,25.005587,24.425661,25.005587
10992,5.779454,-1.223539,9.584362,11.026605,1,0.857359,10.729600,10.753549,10.729600,10.753549,...,23.201880,32.797190,23.201880,1.034023,2.68040,2.68040,13.571241,13.571241,13.571241,13.571241


In [91]:
train_Y

0       2
1       2
2       2
3       2
4       2
       ..
3219    1
3220    2
3221    1
3222    2
3223    1
Name: Expected, Length: 3224, dtype: int64

In [92]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)

In [93]:
model = model.fit(X_train, train_Y)



In [94]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
X_test = scaler.fit_transform(X_test)

In [95]:
val_predictions = model.predict(X_test)

In [None]:
accuracy_scores = cross_val_score(model, X_train, train_Y, cv=5)
f1_macro_scores = cross_val_score(model, X_train, train_Y, cv=5, scoring='f1_macro')

print("%0.4f accuracy with a standard deviation of %0.4f" % (accuracy_scores.mean(), accuracy_scores.std()))
print("%0.4f f1_score with a standard deviation of %0.4f" % (f1_macro_scores.mean(), f1_macro_scores.std()))



In [None]:
output = pd.DataFrame({'Id': test_data.Id, 'Predicted': val_predictions})
output.to_csv('Output.csv', index=False)