In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors

In [2]:
train_data = pd.read_csv('train_II.csv')
test_data = pd.read_csv('test_II.csv')

In [3]:
test_data = test_data.rename(columns = {"x": "Id"})
test_data.head()

Unnamed: 0,Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...


In [4]:
train_data[["Chemical_Id", "Assay_Id"]] = train_data.Id.str.split(";", expand = True)
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16
4,[Na+].[I-];1856,2,[Na+].[I-],1856


In [5]:
test_data[["Chemical_Id", "Assay_Id"]] = test_data.Id.str.split(";", expand = True)
test_data.head()

Unnamed: 0,Id,Chemical_Id,Assay_Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30


In [6]:
train_data['Assay_Id'] = train_data['Assay_Id'].astype(int)
test_data['Assay_Id'] = test_data['Assay_Id'].astype(int)

In [7]:
from rdkit.Chem import Descriptors
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
            mol=Chem.AddHs(mol)
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(train_data['Chemical_Id'])

[21:05:12] Explicit valence for atom # 1 Si, 8, is greater than permitted
[21:05:16] Explicit valence for atom # 1 Si, 8, is greater than permitted
[21:05:19] Explicit valence for atom # 1 Si, 8, is greater than permitted
[21:05:21] Explicit valence for atom # 1 Si, 8, is greater than permitted
[21:05:24] Explicit valence for atom # 1 Si, 8, is greater than permitted
[21:05:25] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [8]:
df_descriptors_train = pd.DataFrame(Mol_descriptors,columns=desc_names)
df_descriptors_train

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.874650,-2.987783,8.874650,0.765809,0.794714,317.599,306.511,315.982463,100,0,...,0,0,0,0,0,0,0,0,0,0
1,11.875100,-4.547647,11.875100,2.480493,0.516641,156.269,136.109,156.151415,66,0,...,0,0,0,0,0,0,0,0,0,0
2,8.736945,-6.030543,8.736945,0.000000,0.251327,362.086,313.702,361.347528,148,0,...,0,0,0,0,0,0,0,0,0,0
3,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,245.585,255.052302,90,0,...,0,0,0,0,0,0,0,0,0,0
4,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,149.894,149.894242,8,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,12.226619,-5.339907,12.226619,0.265375,0.712426,230.245,220.165,230.036128,82,0,...,0,1,0,0,0,0,0,0,0,0
75379,8.048957,-5.193498,8.048957,0.110750,0.720533,313.747,296.611,313.041677,104,0,...,0,0,0,0,0,0,0,0,0,0
75380,7.612361,-0.332130,7.612361,0.131389,0.596343,167.258,162.218,166.986341,50,0,...,0,0,0,0,0,1,0,0,0,0
75381,11.548424,-4.352546,11.548424,2.482554,0.519485,128.215,112.087,128.120115,54,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df_descriptors_train.columns[df_descriptors_train.isnull().any()].tolist()

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [10]:
df_descriptors_train = df_descriptors_train.replace([np.inf, -np.inf], np.nan)

In [11]:
df_descriptors_train = df_descriptors_train.fillna(df_descriptors_train.mean())

In [12]:
df_descriptors_train.columns[df_descriptors_train.isnull().any()].tolist()

[]

In [13]:
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, df_descriptors_train], axis = 1)

In [14]:
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644,8.87465,-2.987783,8.87465,0.765809,0.794714,317.599,...,0,0,0,0,0,0,0,0,0,0
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451,11.8751,-4.547647,11.8751,2.480493,0.516641,156.269,...,0,0,0,0,0,0,0,0,0,0
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384,8.736945,-6.030543,8.736945,0.0,0.251327,362.086,...,0,0,0,0,0,0,0,0,0,0
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,0,0,0,0,0,0,0,0,0,0
4,[Na+].[I-];1856,2,[Na+].[I-],1856,0.0,0.0,0.0,0.0,0.237972,149.894,...,0,0,0,0,0,0,0,0,0,0


In [15]:
from rdkit.Chem import Descriptors
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
            mol=Chem.AddHs(mol)
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(test_data['Chemical_Id'])

In [16]:
df_descriptors_test = pd.DataFrame(Mol_descriptors,columns=desc_names)
df_descriptors_test

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,164.120115,66,0,...,0,0,0,0,0,0,0,0,0,0
1,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,431.056940,152,0,...,0,1,1,0,0,0,0,0,0,1
2,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,695.250845,254,0,...,0,1,0,0,0,0,0,0,0,0
3,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,200.949810,56,0,...,0,0,0,0,0,0,0,0,0,0
4,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,418.271924,168,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,13.677486,-5.541970,13.677486,1.263580,0.391778,408.376,389.224,408.129692,154,0,...,0,0,0,0,0,0,0,0,0,0
10990,12.722294,-3.388339,12.722294,0.917255,0.635519,320.344,304.216,320.104859,120,0,...,0,0,0,0,0,0,0,0,0,0
10991,13.186679,-5.902467,13.186679,0.638923,0.705120,381.370,366.250,381.074304,138,0,...,0,1,0,0,0,0,0,0,0,1
10992,11.701188,-5.111667,11.701188,0.820825,0.540194,220.976,213.920,219.945901,66,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df_descriptors_test.columns[df_descriptors_test.isnull().any()].tolist()

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [18]:
df_descriptors_test = df_descriptors_test.replace([np.inf, -np.inf], np.nan)

In [19]:
df_descriptors_test = df_descriptors_test.fillna(df_descriptors_test.mean())

In [20]:
df_descriptors_test.columns[df_descriptors_test.isnull().any()].tolist()

[]

In [21]:
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, df_descriptors_test], axis = 1)

In [22]:
test_data

Unnamed: 0,Id,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,...,0,0,0,0,0,0,0,0,0,0
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,...,0,1,1,0,0,0,0,0,0,1
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,...,0,1,0,0,0,0,0,0,0,0
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,...,0,0,0,0,0,0,0,0,0,0
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,38,13.677486,-5.541970,13.677486,1.263580,0.391778,408.376,389.224,...,0,0,0,0,0,0,0,0,0,0
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,34,12.722294,-3.388339,12.722294,0.917255,0.635519,320.344,304.216,...,0,0,0,0,0,0,0,0,0,0
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,1640,13.186679,-5.902467,13.186679,0.638923,0.705120,381.370,366.250,...,0,1,0,0,0,0,0,0,0,1
10992,COP(=O)(OC)OC=C(Cl)Cl;28,COP(=O)(OC)OC=C(Cl)Cl,28,11.701188,-5.111667,11.701188,0.820825,0.540194,220.976,213.920,...,0,0,0,0,0,0,0,0,0,0


In [23]:
train_data.shape

(75383, 212)

In [24]:
test_data.shape

(10994, 211)

In [26]:
from rdkit.Chem import AllChem
def smiles_to_fingerprints(smiles_list):
    fingerprints = []
    for smiles in smiles_list:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048)
        mfpts = np.array(fp)
        fingerprints.append(mfpts)
    return np.array(fingerprints)

In [27]:
chemical_Id_list_train = train_data['Chemical_Id'].to_list()

In [28]:
train_fingerprints = smiles_to_fingerprints(chemical_Id_list_train)

[21:29:24] Explicit valence for atom # 1 Si, 8, is greater than permitted
[21:29:44] Explicit valence for atom # 1 Si, 8, is greater than permitted
[21:30:03] Explicit valence for atom # 1 Si, 8, is greater than permitted
[21:30:09] Explicit valence for atom # 1 Si, 8, is greater than permitted
[21:30:28] Explicit valence for atom # 1 Si, 8, is greater than permitted
[21:30:39] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [29]:
train_Mfingerprints = pd.DataFrame(train_fingerprints,columns=['V{}'.format(i) for i in range(train_fingerprints.shape[1])])

In [30]:
train_Mfingerprints.columns[train_Mfingerprints.isnull().any()].tolist()

[]

In [31]:
train_Mfingerprints.columns[train_Mfingerprints.isna().any()].tolist()

[]

In [32]:
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, train_Mfingerprints], axis = 1)

In [33]:
train_data.shape

(75383, 2260)

In [34]:
chemical_Id_list_test = test_data['Chemical_Id'].to_list()

In [35]:
test_fingerprints = smiles_to_fingerprints(chemical_Id_list_test)

In [36]:
test_Mfingerprints = pd.DataFrame(test_fingerprints,columns=['V{}'.format(i) for i in range(test_fingerprints.shape[1])])

In [37]:
test_Mfingerprints.columns[test_Mfingerprints.isnull().any()].tolist()

[]

In [38]:
test_Mfingerprints.columns[test_Mfingerprints.isna().any()].tolist()

[]

In [39]:
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, test_Mfingerprints], axis = 1)

In [40]:
test_data.shape

(10994, 2259)

In [41]:
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,V2038,V2039,V2040,V2041,V2042,V2043,V2044,V2045,V2046,V2047
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644,8.87465,-2.987783,8.87465,0.765809,0.794714,317.599,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451,11.8751,-4.547647,11.8751,2.480493,0.516641,156.269,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384,8.736945,-6.030543,8.736945,0.0,0.251327,362.086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,[Na+].[I-];1856,2,[Na+].[I-],1856,0.0,0.0,0.0,0.0,0.237972,149.894,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
test_data.head()

Unnamed: 0,Id,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,V2038,V2039,V2040,V2041,V2042,V2043,V2044,V2045,V2046,V2047
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.12,...,0,0,0,0,0,0,0,0,0,0
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,...,0,0,0,0,0,0,0,0,0,0
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,...,0,0,0,0,0,0,0,0,0,0
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10.745579,-4.559583,10.745579,0.0,0.393203,201.244,197.212,...,0,0,0,0,0,0,0,0,0,0
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.27,...,0,1,0,0,0,0,0,0,0,0


In [43]:
test_data.columns[test_data.isna().any()].tolist()

[]

In [44]:
train_data.columns[train_data.isna().any()].tolist()

['V0',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',
 'V88',
 'V89',
 'V90',
 'V91',
 'V92',
 'V93',
 'V94',
 'V95',
 'V96',
 'V97',
 'V98',
 'V99',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V120',
 'V121',
 'V122',
 'V

In [45]:
train_data = train_data.fillna(train_data.mean())

  train_data = train_data.fillna(train_data.mean())


In [46]:
train_data.columns[train_data.isna().any()].tolist()

[]

In [50]:
train_data.columns[train_data.isna().any()].tolist()

[]

feature selection using variance threshold

In [47]:
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing

In [48]:
le = preprocessing.LabelEncoder()
train_chemical_id =  list(train_data.Chemical_Id.unique())
le.fit(train_chemical_id)
train_data['Chemical_Id']=le.transform(train_data['Chemical_Id'])

In [49]:
#split target variable from rest of the data
train_Y = train_data["Expected"]

In [50]:
train_Y

0        2
1        2
2        2
3        2
4        2
        ..
75378    2
75379    1
75380    1
75381    2
75382    1
Name: Expected, Length: 75383, dtype: int64

In [51]:
train_data_1 = train_data
train_data_1

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,V2038,V2039,V2040,V2041,V2042,V2043,V2044,V2045,V2046,V2047
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,392,1644,8.874650,-2.987783,8.874650,0.765809,0.794714,317.599,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,CCCCCCCCC(=O)C;2451,2,2230,2451,11.875100,-4.547647,11.875100,2.480493,0.516641,156.269,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,2344,1384,8.736945,-6.030543,8.736945,0.000000,0.251327,362.086,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,909,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,[Na+].[I-];1856,2,3220,1856,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,COC(=O)NS(=O)(=O)C1=CC=C(C=C1)N;33,2,2926,33,12.226619,-5.339907,12.226619,0.265375,0.712426,230.245,...,0.005904,0.004882,0.005612,0.003794,0.008517,0.006912,0.012537,0.009486,0.003224,0.003038
75379,CCOP(=S)(OCC)OC1=NN(C(=N1)Cl)C(C)C;1632,1,2707,1632,8.048957,-5.193498,8.048957,0.110750,0.720533,313.747,...,0.005904,0.004882,0.005612,0.003794,0.008517,0.006912,0.012537,0.009486,0.003224,0.003038
75380,C1=CC=C2C(=C1)NC(=S)S2;1373,1,737,1373,7.612361,-0.332130,7.612361,0.131389,0.596343,167.258,...,0.005904,0.004882,0.005612,0.003794,0.008517,0.006912,0.012537,0.009486,0.003224,0.003038
75381,CCCCC(CC)C=O;2,2,2105,2,11.548424,-4.352546,11.548424,2.482554,0.519485,128.215,...,0.005904,0.004882,0.005612,0.003794,0.008517,0.006912,0.012537,0.009486,0.003224,0.003038


In [52]:
train_data.drop(['Expected'], axis = 1, inplace=True) 
train_X = pd.get_dummies(train_data)

In [53]:
train_X

Unnamed: 0,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,...,Id_[O-]S(=O)(=O)OOS(=O)(=O)[O-].[Na+].[Na+];1852,Id_[O-]S(=O)(=O)OOS(=O)(=O)[O-].[Na+].[Na+];1855,Id_[O-]S(=O)(=O)OOS(=O)(=O)[O-].[Na+].[Na+];1856,Id_[O-]S(=O)(=O)OOS(=O)(=O)[O-].[Na+].[Na+];1857,Id_[O-]S(=O)(=O)OOS(=O)(=O)[O-].[Na+].[Na+];2,Id_[O-]S(=O)(=O)[O-].[Na+].[Na+];1850,Id_[O-]S(=O)(=O)[O-].[Na+].[Na+];1852,Id_[O-]S(=O)(=O)[O-].[Na+].[Na+];1855,Id_[O-]S(=O)(=O)[O-].[Na+].[Na+];1856,Id_[O-]S(=O)(=O)[O-].[Na+].[Na+];2
0,392,1644,8.874650,-2.987783,8.874650,0.765809,0.794714,317.599,306.511,315.982463,...,0,0,0,0,0,0,0,0,0,0
1,2230,2451,11.875100,-4.547647,11.875100,2.480493,0.516641,156.269,136.109,156.151415,...,0,0,0,0,0,0,0,0,0,0
2,2344,1384,8.736945,-6.030543,8.736945,0.000000,0.251327,362.086,313.702,361.347528,...,0,0,0,0,0,0,0,0,0,0
3,909,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,245.585,255.052302,...,0,0,0,0,0,0,0,0,0,0
4,3220,1856,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,149.894,149.894242,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,2926,33,12.226619,-5.339907,12.226619,0.265375,0.712426,230.245,220.165,230.036128,...,0,0,0,0,0,0,0,0,0,0
75379,2707,1632,8.048957,-5.193498,8.048957,0.110750,0.720533,313.747,296.611,313.041677,...,0,0,0,0,0,0,0,0,0,0
75380,737,1373,7.612361,-0.332130,7.612361,0.131389,0.596343,167.258,162.218,166.986341,...,0,0,0,0,0,0,0,0,0,0
75381,2105,2,11.548424,-4.352546,11.548424,2.482554,0.519485,128.215,112.087,128.120115,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# Setting up variance threshold
sel = VarianceThreshold(threshold=0.25)
X_new = sel.fit_transform(train_X)

In [55]:
# Retrieve list of features from the index of features selected by variance threshold.
list_x = []
col_names = []
selected_features = []

In [56]:
list_x = sel.get_support(indices=True)
col_names = train_X.columns.tolist()

In [57]:
for i in list_x: 
    selected_features.append(col_names[i])

print("Selected features:")
print(selected_features)

Selected features:
['Chemical_Id', 'Assay_Id', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_MRHI', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EStat

In [58]:
le = preprocessing.LabelEncoder()
test_chemical_id =  list(test_data.Chemical_Id.unique())
le.fit(test_chemical_id)
test_data['Chemical_Id']=le.transform(test_data['Chemical_Id'])

In [59]:
features = ['Chemical_Id', 'Assay_Id', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_MRHI', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'HeavyAtomCount', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds', 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount', 'MolLogP', 'MolMR', 'fr_Al_OH', 'fr_Ar_N', 'fr_Ar_OH', 'fr_C_O', 'fr_C_O_noCOO', 'fr_NH0', 'fr_NH1', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_aniline', 'fr_benzene', 'fr_bicyclic', 'fr_ester', 'fr_ether', 'fr_halogen', 'fr_methoxy', 'fr_phenol', 'fr_phenol_noOrthoHbond']

In [60]:
temp_X = train_data[features]

In [61]:
print(np.isnan(train_data[features]).any())

Chemical_Id               False
Assay_Id                  False
MaxEStateIndex            False
MinEStateIndex            False
MaxAbsEStateIndex         False
                          ...  
fr_ether                  False
fr_halogen                False
fr_methoxy                False
fr_phenol                 False
fr_phenol_noOrthoHbond    False
Length: 129, dtype: bool


In [62]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# create feature and target variables
X = train_data[features]
y = train_Y

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.22, random_state=1)

# Setting up SFS for DecisionTree
clf = DecisionTreeClassifier()

sfs1 = sfs(clf,
           k_features=50,
           forward=True,
           floating=False,
           verbose=2,
           scoring='f1_macro',
           cv=5, 
           n_jobs=-1)

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

sfs1.fit(X_train_scaled, y_train)

print('Best accuracy score: %.2f' % sfs1.k_score_)   # k_score_ shows the best score
print('Best subset (indices):', sfs1.k_feature_idx_) # k_feature_idx_ shows the index of features
print('Best subset (corresponding names):', sfs1.k_feature_names_) # k_feature_names_ shows the feature names

feature_cols = pd.DataFrame(sfs1.subsets_).transpose()
print(feature_cols)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 129 out of 129 | elapsed:    8.1s finished

[2023-04-08 21:57:19] Features: 1/50 -- score: 0.6395938933709667[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 128 out of 128 | elapsed:    5.3s finished

[2023-04-08 21:57:24] Features: 2/50 -- score: 0.6905568718824409[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 127 out of 127 | elapsed:    9.1s finished

[2023-04-08 21:57:34] Features: 3/50 -- score: 0.7111420569080866[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 126 o

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done  99 out of  99 | elapsed:   40.5s finished

[2023-04-08 22:11:29] Features: 31/50 -- score: 0.7315697563997545[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  98 out of  98 | elapsed:   39.2s finished

[2023-04-08 22:12:08] Features: 32/50 -- score: 0.7303598510820837[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  97 out of  97 | elapsed:   42.0s finished

[2023-04-08 22:12:50] Features: 33/50 -- score: 0.7303839577188407[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:   42.3s finished

[2023-04-08 22:13:33] Features: 34/

Best accuracy score: 0.74
Best subset (indices): (1, 6, 10, 11, 14, 15, 17, 20, 27, 30, 38, 39, 44, 45, 49, 50, 57, 58, 65, 68, 69, 72, 73, 75, 76, 77, 78, 85, 91, 94, 95, 96, 99, 101, 103, 104, 107, 109, 110, 112, 113, 114, 118, 119, 120, 122, 123, 126, 127, 128)
Best subset (corresponding names): ('Assay_Id', 'MolWt', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MRHI', 'BalabanJ', 'Chi0', 'Chi1', 'Chi4n', 'Ipc', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA5', 'PEOE_VSA6', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA5', 'SlogP_VSA8', 'TPSA', 'EState_VSA11', 'EState_VSA2', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'VSA_EState4', 'HeavyAtomCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticRings', 'NumHDonors', 'NumRotatableBonds', 'NumSaturatedCarbocycles', 'RingCount', 'MolMR', 'fr_Al_OH', 'fr_Ar_OH', 'fr_C_O', 'fr_C_O_noCOO', 'fr_allylic_oxid', 'fr_amide', 'fr_aniline', 'fr_bicyclic', 'fr_ester', 'fr_methoxy'


[2023-04-08 22:52:38] Features: 50/50 -- score: 0.735822710926638

In [63]:
feature_cols['feature_names'][50]

('Assay_Id',
 'MolWt',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'BCUT2D_MRHI',
 'BalabanJ',
 'Chi0',
 'Chi1',
 'Chi4n',
 'Ipc',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA9',
 'SlogP_VSA1',
 'SlogP_VSA5',
 'SlogP_VSA8',
 'TPSA',
 'EState_VSA11',
 'EState_VSA2',
 'EState_VSA4',
 'EState_VSA5',
 'EState_VSA6',
 'EState_VSA7',
 'VSA_EState4',
 'HeavyAtomCount',
 'NumAliphaticCarbocycles',
 'NumAliphaticHeterocycles',
 'NumAliphaticRings',
 'NumAromaticRings',
 'NumHDonors',
 'NumRotatableBonds',
 'NumSaturatedCarbocycles',
 'RingCount',
 'MolMR',
 'fr_Al_OH',
 'fr_Ar_OH',
 'fr_C_O',
 'fr_C_O_noCOO',
 'fr_allylic_oxid',
 'fr_amide',
 'fr_aniline',
 'fr_bicyclic',
 'fr_ester',
 'fr_methoxy',
 'fr_phenol',
 'fr_phenol_noOrthoHbond')

In [65]:
selected_features = ['Assay_Id',
 'MolWt',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'BCUT2D_MRHI',
 'BalabanJ',
 'Chi0',
 'Chi1',
 'Chi4n',
 'Ipc',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA9',
 'SlogP_VSA1',
 'SlogP_VSA5',
 'SlogP_VSA8',
 'TPSA',
 'EState_VSA11',
 'EState_VSA2',
 'EState_VSA4',
 'EState_VSA5',
 'EState_VSA6',
 'EState_VSA7',
 'VSA_EState4',
 'HeavyAtomCount',
 'NumAliphaticCarbocycles',
 'NumAliphaticHeterocycles',
 'NumAliphaticRings',
 'NumAromaticRings',
 'NumHDonors',
 'NumRotatableBonds',
 'NumSaturatedCarbocycles',
 'RingCount',
 'MolMR',
 'fr_Al_OH',
 'fr_Ar_OH',
 'fr_C_O',
 'fr_C_O_noCOO',
 'fr_allylic_oxid',
 'fr_amide',
 'fr_aniline',
 'fr_bicyclic',
 'fr_ester',
 'fr_methoxy',
 'fr_phenol',
 'fr_phenol_noOrthoHbond']

In [66]:
X_train = train_data[selected_features]
X_test = test_data[selected_features]

In [67]:
X_train.shape, train_Y.shape

((75383, 50), (75383,))

In [68]:
X_test.shape

(10994, 50)

In [69]:
X_test

Unnamed: 0,Assay_Id,MolWt,FpDensityMorgan2,FpDensityMorgan3,BCUT2D_MRHI,BalabanJ,Chi0,Chi1,Chi4n,Ipc,...,fr_C_O,fr_C_O_noCOO,fr_allylic_oxid,fr_amide,fr_aniline,fr_bicyclic,fr_ester,fr_methoxy,fr_phenol,fr_phenol_noOrthoHbond
0,1682,164.248,2.333333,2.916667,-4.008192e-17,4.430176,22.671208,12.174756,0.653093,2.224332e+05,...,0,0,0,0,0,0,0,0,1,1
1,1656,431.452,1.964286,2.571429,1.991416e+00,2.797891,35.463737,19.880570,1.428363,5.961442e+08,...,1,1,0,2,1,0,0,2,0,0
2,36,696.264,1.895833,2.625000,1.059760e+00,1.691562,69.063513,38.735510,4.276783,1.770047e+21,...,1,1,0,1,0,3,0,0,0,0
3,1850,201.244,2.454545,3.000000,1.417918e+00,0.000002,11.146264,6.133646,0.366265,6.442779e+02,...,1,1,1,1,0,0,0,0,0,0
4,30,418.574,2.400000,3.133333,-1.028560e-16,3.006114,55.085422,29.164391,2.121737,4.956791e+12,...,2,2,3,0,0,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,38,408.376,2.000000,2.724138,2.345787e+00,3.046077,37.695788,21.347583,1.609244,2.206533e+09,...,1,1,0,0,0,0,1,1,0,0
10990,34,320.344,1.416667,1.916667,2.509654e+00,2.850779,30.590975,18.550977,1.880155,2.533823e+08,...,1,0,0,0,0,0,0,0,2,2
10991,1640,381.370,2.038462,2.769231,2.103991e+00,2.704189,32.041087,18.291254,1.385173,1.283830e+08,...,2,2,0,2,1,0,1,2,0,0
10992,28,220.976,2.181818,2.545455,8.925823e-01,5.181297,14.776021,7.741399,0.146307,2.077625e+03,...,0,0,0,0,0,0,0,0,0,0


In [105]:
# from sklearn.impute import SimpleImputer
# from sklearn.model_selection import train_test_split
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import mutual_info_classif

# selected_features = list(train_X.columns[train_X.columns.isin(train_X.columns[SelectKBest(mutual_info_classif, k=50).fit(train_X, train_Y).get_support()])])

In [106]:
# selected_features

In [107]:
# X_train = pd.get_dummies(train_data[selected_features])
# X_test = pd.get_dummies(test_data[selected_features])

Feature selection using correlation

In [108]:
# corr_coeffs = X_train.corrwith(train_Y)
# corr_coeffs = corr_coeffs.sort_values(ascending=False)
# print('Feature rankings based on correlation with', train_Y)
# for i, (feature, corr) in enumerate(corr_coeffs.items()):
#     print(f'{i+1}. {feature}: {corr:.3f}')

In [109]:
# features = ['SlogP_VSA5', 'VSA_EState1', 'PEOE_VSA6', 'NumAromaticRings', 'NumValenceElectrons', 'EState_VSA9', 'Chi0n', 'Chi0', 'RingCount', 'Chi1n', 'HeavyAtomCount', 'Chi1', 'PEOE_VSA7', 'HeavyAtomMolWt', 'ExactMolWt', 'MolWt', 'Chi4v', 'BertzCT', 'EState_VSA1', 'SlogP_VSA6', 'LabuteASA', 'SMR_VSA7', 'Chi2n', 'Chi4n', 'Chi3n', 'Chi3v', 'Chi0v', 'fr_benzene', 'NumAromaticCarbocycles', 'MolLogP', 'Chi2v', 'MolMR', 'Chi1v', 'Assay_Id', 'VSA_EState6', 'BalabanJ', 'VSA_EState4', 'FpDensityMorgan2']

In [110]:
# X_train = pd.get_dummies(train_data[features])
# X_test = pd.get_dummies(test_data[features])

Classifiers

In [70]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier

In [112]:
# from sklearn import tree
# model = BaggingClassifier(estimator=tree.DecisionTreeClassifier(random_state=1,
#                                                                      criterion='entropy',
#                                                                      max_depth=35,
#                                                                      class_weight='balanced'),
#                                                                      random_state=1,
#                                                                      n_estimators=20)

In [113]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# # RandomForestClassifier
# model = RandomForestClassifier(random_state=1, max_depth=35, criterion='entropy')

In [114]:
# !pip install lightgbm

In [115]:
# !pip install catboost

In [71]:
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier

clf1 = xgb.XGBClassifier(random_state=1,booster="gbtree",learning_rate=0.25,n_estimators=250,max_depth=12, min_child_weight=4)
clf2 = lgb.LGBMClassifier(num_threads=1,boosting_type= 'goss',learning_rate=0.1,n_estimators=1000,max_depth=10,num_leaves=100,max_bin = 5000)

model = VotingClassifier(estimators=[('XGB',clf1),('LGBM',clf2)],voting='hard')

In [117]:
# from catboost import CatBoostClassifier
# model = CatBoostClassifier(learning_rate=0.1, boosting_type= 'Ordered',max_depth=10,max_bin = 5000, iterations=50) 

In [72]:
print(np.isnan(X_train).any())

print(np.isinf(X_train).any())
X_train.shape, train_Y.shape

Assay_Id                    False
MolWt                       False
FpDensityMorgan2            False
FpDensityMorgan3            False
BCUT2D_MRHI                 False
BalabanJ                    False
Chi0                        False
Chi1                        False
Chi4n                       False
Ipc                         False
PEOE_VSA12                  False
PEOE_VSA13                  False
PEOE_VSA5                   False
PEOE_VSA6                   False
SMR_VSA1                    False
SMR_VSA10                   False
SMR_VSA9                    False
SlogP_VSA1                  False
SlogP_VSA5                  False
SlogP_VSA8                  False
TPSA                        False
EState_VSA11                False
EState_VSA2                 False
EState_VSA4                 False
EState_VSA5                 False
EState_VSA6                 False
EState_VSA7                 False
VSA_EState4                 False
HeavyAtomCount              False
NumAliphaticCa

((75383, 50), (75383,))

In [73]:
X_train

Unnamed: 0,Assay_Id,MolWt,FpDensityMorgan2,FpDensityMorgan3,BCUT2D_MRHI,BalabanJ,Chi0,Chi1,Chi4n,Ipc,...,fr_C_O,fr_C_O_noCOO,fr_allylic_oxid,fr_amide,fr_aniline,fr_bicyclic,fr_ester,fr_methoxy,fr_phenol,fr_phenol_noOrthoHbond
0,1644,317.599,1.263158,1.526316,2.880028e+00,3.161117,23.342417,13.676863,1.260535,1.627086e+06,...,0,0,0,0,0,0,0,0,2,2
1,2451,156.269,2.000000,2.727273,-4.807584e-16,6.874718,26.077350,12.904701,0.213016,4.317587e+05,...,1,1,0,0,0,0,0,0,0,0
2,1384,362.086,0.875000,1.208333,-4.520307e-16,0.000000,59.500000,29.500000,0.626558,3.515819e+12,...,0,0,0,0,0,0,0,0,0,0
3,16,255.665,2.705882,3.470588,2.650535e+00,2.779782,21.110366,12.069109,0.882774,3.384906e+05,...,0,0,0,0,0,0,0,0,0,0
4,1856,149.894,1.000000,1.000000,1.427374e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,33,230.245,2.333333,2.866667,1.729313e+00,3.575561,19.903259,11.124621,0.585813,9.474901e+04,...,1,1,0,1,1,0,0,1,0,0
75379,1632,313.747,2.222222,2.777778,4.155244e-01,3.659032,28.267585,15.008367,0.724082,4.154661e+06,...,0,0,0,0,0,0,0,0,0,0
75380,1373,167.258,2.400000,3.300000,3.680342e+00,3.144664,11.325909,6.947265,0.648428,2.670736e+03,...,0,0,0,0,0,1,0,0,0,0
75381,2,128.215,2.888889,3.444444,-6.085678e-04,6.849077,21.077350,10.443376,0.176031,3.799028e+04,...,1,1,0,0,0,0,0,0,0,0


In [74]:
X_test

Unnamed: 0,Assay_Id,MolWt,FpDensityMorgan2,FpDensityMorgan3,BCUT2D_MRHI,BalabanJ,Chi0,Chi1,Chi4n,Ipc,...,fr_C_O,fr_C_O_noCOO,fr_allylic_oxid,fr_amide,fr_aniline,fr_bicyclic,fr_ester,fr_methoxy,fr_phenol,fr_phenol_noOrthoHbond
0,1682,164.248,2.333333,2.916667,-4.008192e-17,4.430176,22.671208,12.174756,0.653093,2.224332e+05,...,0,0,0,0,0,0,0,0,1,1
1,1656,431.452,1.964286,2.571429,1.991416e+00,2.797891,35.463737,19.880570,1.428363,5.961442e+08,...,1,1,0,2,1,0,0,2,0,0
2,36,696.264,1.895833,2.625000,1.059760e+00,1.691562,69.063513,38.735510,4.276783,1.770047e+21,...,1,1,0,1,0,3,0,0,0,0
3,1850,201.244,2.454545,3.000000,1.417918e+00,0.000002,11.146264,6.133646,0.366265,6.442779e+02,...,1,1,1,1,0,0,0,0,0,0
4,30,418.574,2.400000,3.133333,-1.028560e-16,3.006114,55.085422,29.164391,2.121737,4.956791e+12,...,2,2,3,0,0,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,38,408.376,2.000000,2.724138,2.345787e+00,3.046077,37.695788,21.347583,1.609244,2.206533e+09,...,1,1,0,0,0,0,1,1,0,0
10990,34,320.344,1.416667,1.916667,2.509654e+00,2.850779,30.590975,18.550977,1.880155,2.533823e+08,...,1,0,0,0,0,0,0,0,2,2
10991,1640,381.370,2.038462,2.769231,2.103991e+00,2.704189,32.041087,18.291254,1.385173,1.283830e+08,...,2,2,0,2,1,0,1,2,0,0
10992,28,220.976,2.181818,2.545455,8.925823e-01,5.181297,14.776021,7.741399,0.146307,2.077625e+03,...,0,0,0,0,0,0,0,0,0,0


In [75]:
train_Y

0        2
1        2
2        2
3        2
4        2
        ..
75378    2
75379    1
75380    1
75381    2
75382    1
Name: Expected, Length: 75383, dtype: int64

In [76]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)

In [None]:
model = model.fit(X_train, train_Y)



In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
X_test = scaler.fit_transform(X_test)

In [None]:
val_predictions = model.predict(X_test)

In [None]:
accuracy_scores = cross_val_score(model, X_train, train_Y, cv=5)
f1_macro_scores = cross_val_score(model, X_train, train_Y, cv=5, scoring='f1_macro')

print("%0.4f accuracy with a standard deviation of %0.4f" % (accuracy_scores.mean(), accuracy_scores.std()))
print("%0.4f f1_score with a standard deviation of %0.4f" % (f1_macro_scores.mean(), f1_macro_scores.std()))

In [None]:
output = pd.DataFrame({'Id': test_data.Id, 'Predicted': val_predictions})
output.to_csv('Output.csv', index=False)