In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors

In [2]:
train_data = pd.read_csv('train_II.csv')
test_data = pd.read_csv('test_II.csv')

In [3]:
test_data = test_data.rename(columns = {"x": "Id"})
test_data.head()

Unnamed: 0,Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...


In [4]:
train_data[["Chemical_Id", "Assay_Id"]] = train_data.Id.str.split(";", expand = True)
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16
4,[Na+].[I-];1856,2,[Na+].[I-],1856


In [5]:
test_data[["Chemical_Id", "Assay_Id"]] = test_data.Id.str.split(";", expand = True)
test_data.head()

Unnamed: 0,Id,Chemical_Id,Assay_Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30


In [6]:
train_data['Assay_Id'] = train_data['Assay_Id'].astype(int)
test_data['Assay_Id'] = test_data['Assay_Id'].astype(int)

In [7]:
train_data = train_data[train_data['Chemical_Id'] != 'F[Si-2](F)(F)(F)(F)F.[Na+].[Na+]']
train_data.shape

(75377, 4)

In [8]:
from rdkit.Chem import Descriptors
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
        # add hydrogens to molecules
            mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(train_data['Chemical_Id'])

In [9]:
df_descriptors_train = pd.DataFrame(Mol_descriptors,columns=desc_names)
df_descriptors_train

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.874650,-2.987783,8.874650,0.765809,0.794714,317.599,306.511,315.982463,100,0,...,0,0,0,0,0,0,0,0,0,0
1,11.875100,-4.547647,11.875100,2.480493,0.516641,156.269,136.109,156.151415,66,0,...,0,0,0,0,0,0,0,0,0,0
2,8.736945,-6.030543,8.736945,0.000000,0.251327,362.086,313.702,361.347528,148,0,...,0,0,0,0,0,0,0,0,0,0
3,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,245.585,255.052302,90,0,...,0,0,0,0,0,0,0,0,0,0
4,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,149.894,149.894242,8,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75372,12.226619,-5.339907,12.226619,0.265375,0.712426,230.245,220.165,230.036128,82,0,...,0,1,0,0,0,0,0,0,0,0
75373,8.048957,-5.193498,8.048957,0.110750,0.720533,313.747,296.611,313.041677,104,0,...,0,0,0,0,0,0,0,0,0,0
75374,7.612361,-0.332130,7.612361,0.131389,0.596343,167.258,162.218,166.986341,50,0,...,0,0,0,0,0,1,0,0,0,0
75375,11.548424,-4.352546,11.548424,2.482554,0.519485,128.215,112.087,128.120115,54,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
df_descriptors_train.columns[df_descriptors_train.isnull().any()].tolist()

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [11]:
df_descriptors_train = df_descriptors_train.replace([np.inf, -np.inf], np.nan)

In [12]:
df_descriptors_train = df_descriptors_train.fillna(df_descriptors_train.mean())

In [13]:
df_descriptors_train.columns[df_descriptors_train.isnull().any()].tolist()

[]

In [14]:
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, df_descriptors_train], axis = 1)

In [15]:
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644,8.87465,-2.987783,8.87465,0.765809,0.794714,317.599,...,0,0,0,0,0,0,0,0,0,0
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451,11.8751,-4.547647,11.8751,2.480493,0.516641,156.269,...,0,0,0,0,0,0,0,0,0,0
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384,8.736945,-6.030543,8.736945,0.0,0.251327,362.086,...,0,0,0,0,0,0,0,0,0,0
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,0,0,0,0,0,0,0,0,0,0
4,[Na+].[I-];1856,2,[Na+].[I-],1856,0.0,0.0,0.0,0.0,0.237972,149.894,...,0,0,0,0,0,0,0,0,0,0


In [16]:
from rdkit.Chem import Descriptors
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
        # add hydrogens to molecules
            mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(test_data['Chemical_Id'])

In [17]:
df_descriptors_test = pd.DataFrame(Mol_descriptors,columns=desc_names)
df_descriptors_test

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,164.120115,66,0,...,0,0,0,0,0,0,0,0,0,0
1,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,431.056940,152,0,...,0,1,1,0,0,0,0,0,0,1
2,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,695.250845,254,0,...,0,1,0,0,0,0,0,0,0,0
3,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,200.949810,56,0,...,0,0,0,0,0,0,0,0,0,0
4,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,418.271924,168,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,13.677486,-5.541970,13.677486,1.263580,0.391778,408.376,389.224,408.129692,154,0,...,0,0,0,0,0,0,0,0,0,0
10990,12.722294,-3.388339,12.722294,0.917255,0.635519,320.344,304.216,320.104859,120,0,...,0,0,0,0,0,0,0,0,0,0
10991,13.186679,-5.902467,13.186679,0.638923,0.705120,381.370,366.250,381.074304,138,0,...,0,1,0,0,0,0,0,0,0,1
10992,11.701188,-5.111667,11.701188,0.820825,0.540194,220.976,213.920,219.945901,66,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df_descriptors_test.columns[df_descriptors_test.isnull().any()].tolist()

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [19]:
df_descriptors_test = df_descriptors_test.replace([np.inf, -np.inf], np.nan)

In [20]:
df_descriptors_test = df_descriptors_test.fillna(df_descriptors_test.mean())

In [21]:
df_descriptors_test.columns[df_descriptors_test.isnull().any()].tolist()

[]

In [22]:
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, df_descriptors_test], axis = 1)

In [23]:
test_data

Unnamed: 0,Id,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,...,0,0,0,0,0,0,0,0,0,0
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,...,0,1,1,0,0,0,0,0,0,1
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,...,0,1,0,0,0,0,0,0,0,0
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,...,0,0,0,0,0,0,0,0,0,0
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,38,13.677486,-5.541970,13.677486,1.263580,0.391778,408.376,389.224,...,0,0,0,0,0,0,0,0,0,0
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,34,12.722294,-3.388339,12.722294,0.917255,0.635519,320.344,304.216,...,0,0,0,0,0,0,0,0,0,0
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,1640,13.186679,-5.902467,13.186679,0.638923,0.705120,381.370,366.250,...,0,1,0,0,0,0,0,0,0,1
10992,COP(=O)(OC)OC=C(Cl)Cl;28,COP(=O)(OC)OC=C(Cl)Cl,28,11.701188,-5.111667,11.701188,0.820825,0.540194,220.976,213.920,...,0,0,0,0,0,0,0,0,0,0


In [24]:
train_data.shape

(75377, 212)

In [25]:
test_data.shape

(10994, 211)

In [26]:
from rdkit.Chem import AllChem
def smiles_to_fingerprints(smiles_list):
    fingerprints = []
    for smiles in smiles_list:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048)
        mfpts = np.array(fp)
        fingerprints.append(mfpts)
    return np.array(fingerprints)

In [27]:
chemical_Id_list_train = train_data['Chemical_Id'].to_list()

In [28]:
train_fingerprints = smiles_to_fingerprints(chemical_Id_list_train)

In [29]:
train_Mfingerprints = pd.DataFrame(train_fingerprints,columns=['V{}'.format(i) for i in range(train_fingerprints.shape[1])])

In [30]:
train_Mfingerprints.columns[train_Mfingerprints.isnull().any()].tolist()

[]

In [31]:
train_Mfingerprints.columns[train_Mfingerprints.isna().any()].tolist()

[]

In [32]:
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, train_Mfingerprints], axis = 1)

In [33]:
train_data.shape

(75377, 2260)

In [34]:
chemical_Id_list_test = test_data['Chemical_Id'].to_list()

In [35]:
test_fingerprints = smiles_to_fingerprints(chemical_Id_list_test)

In [36]:
test_Mfingerprints = pd.DataFrame(test_fingerprints,columns=['V{}'.format(i) for i in range(test_fingerprints.shape[1])])

In [37]:
test_Mfingerprints.columns[test_Mfingerprints.isnull().any()].tolist()

[]

In [38]:
test_Mfingerprints.columns[test_Mfingerprints.isna().any()].tolist()

[]

In [39]:
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, test_Mfingerprints], axis = 1)

In [40]:
test_data.shape

(10994, 2259)

In [41]:
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,V2038,V2039,V2040,V2041,V2042,V2043,V2044,V2045,V2046,V2047
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644,8.87465,-2.987783,8.87465,0.765809,0.794714,317.599,...,0,0,0,0,0,0,0,0,0,0
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451,11.8751,-4.547647,11.8751,2.480493,0.516641,156.269,...,0,0,0,0,0,0,0,0,0,0
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384,8.736945,-6.030543,8.736945,0.0,0.251327,362.086,...,0,0,0,0,0,0,0,0,0,0
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,0,0,0,0,0,0,0,0,0,0
4,[Na+].[I-];1856,2,[Na+].[I-],1856,0.0,0.0,0.0,0.0,0.237972,149.894,...,0,0,0,0,0,0,0,0,0,0


In [42]:
test_data.head()

Unnamed: 0,Id,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,V2038,V2039,V2040,V2041,V2042,V2043,V2044,V2045,V2046,V2047
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.12,...,0,0,0,0,0,0,0,0,0,0
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,...,0,0,0,0,0,0,0,0,0,0
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,...,0,0,0,0,0,0,0,0,0,0
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10.745579,-4.559583,10.745579,0.0,0.393203,201.244,197.212,...,0,0,0,0,0,0,0,0,0,0
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.27,...,0,1,0,0,0,0,0,0,0,0


In [43]:
test_data.columns[test_data.isna().any()].tolist()

[]

In [44]:
train_data.columns[train_data.isna().any()].tolist()

[]

In [45]:
train_data = train_data.fillna(train_data.mean())

  train_data = train_data.fillna(train_data.mean())


In [46]:
train_data.columns[train_data.isna().any()].tolist()

[]

In [47]:
from mordred import Calculator, descriptors
def All_Mordred_descriptors(data):
    calc = Calculator(descriptors, ignore_3D=False)
    mols = [Chem.MolFromSmiles(smi) for smi in data]
    
    # pandas df
    df = calc.pandas(mols)
    return df

In [48]:
train_mordred_descriptors = All_Mordred_descriptors(train_data['Chemical_Id'])

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [49]:
train_mordred_descriptors

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,23.128032,2.424173,4.848346,23.128032,1.217265,3.862863,...,9.861675,52.418777,315.982463,10.532749,678,28,98.0,112.0,7.618056,4.055556
1,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,12.784906,1.975377,3.950753,12.784906,1.162264,3.210676,...,7.832411,37.843620,156.151415,5.037142,212,8,40.0,38.0,4.861111,2.833333
2,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,1,1,multiple fragments (SpAbs_A/SpAbs),multiple fragments (SpMax_A/SpMax),multiple fragments (SpDiam_A/SpDiam),multiple fragments (SpAD_A/SpAD),multiple fragments (SpMAD_A/SpMAD),multiple fragments (LogEE_A/LogEE),...,9.034200,55.641246,361.347528,5.018716,2300001804,22,92.0,92.0,divide by zero encountered in power (mZagreb1),5.750000
3,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,3,21.30202,2.337073,4.596189,21.30202,1.25306,3.748471,...,9.430840,62.679417,255.052302,9.446382,559,20,84.0,94.0,5.805556,3.777778
4,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,1,1,multiple fragments (SpAbs_A/SpAbs),multiple fragments (SpMax_A/SpMax),multiple fragments (SpDiam_A/SpDiam),multiple fragments (SpAD_A/SpAD),multiple fragments (SpMAD_A/SpMAD),multiple fragments (LogEE_A/LogEE),...,0.000000,2.000000,149.894242,74.947121,100000000,0,0.0,0.0,divide by zero encountered in power (mZagreb1),0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75372,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,17.63789,2.357084,4.714168,17.63789,1.175859,3.598213,...,9.461955,46.722121,230.036128,9.201445,378,20,72.0,80.0,6.895833,3.375000
75373,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,20.82664,2.393853,4.671424,20.82664,1.157036,3.777197,...,9.599473,64.450105,313.041677,8.944048,630,24,86.0,97.0,8.256944,4.180556
75374,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,13.043049,2.363609,4.557136,13.043049,1.304305,3.261033,...,9.085117,53.938362,166.986341,11.132423,108,11,52.0,60.0,2.833333,2.194444
75375,module 'numpy' has no attribute 'float' (ABC),module 'numpy' has no attribute 'float' (ABCGG...,0,0,10.565187,2.042079,4.084158,10.565187,1.17391,3.008457,...,7.884953,35.263065,128.120115,5.124805,104,8,32.0,32.0,4.361111,2.500000


In [50]:
train_mordred_descriptors.columns[train_mordred_descriptors.isnull().any()].tolist()

[]

In [51]:
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, train_mordred_descriptors], axis = 1)

In [52]:
train_data

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644,8.874650,-2.987783,8.874650,0.765809,0.794714,317.599,...,9.861675,52.418777,315.982463,10.532749,678,28,98.0,112.0,7.618056,4.055556
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451,11.875100,-4.547647,11.875100,2.480493,0.516641,156.269,...,7.832411,37.843620,156.151415,5.037142,212,8,40.0,38.0,4.861111,2.833333
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384,8.736945,-6.030543,8.736945,0.000000,0.251327,362.086,...,9.034200,55.641246,361.347528,5.018716,2300001804,22,92.0,92.0,divide by zero encountered in power (mZagreb1),5.750000
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,9.430840,62.679417,255.052302,9.446382,559,20,84.0,94.0,5.805556,3.777778
4,[Na+].[I-];1856,2,[Na+].[I-],1856,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,...,0.000000,2.000000,149.894242,74.947121,100000000,0,0.0,0.0,divide by zero encountered in power (mZagreb1),0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75372,COC(=O)NS(=O)(=O)C1=CC=C(C=C1)N;33,2,COC(=O)NS(=O)(=O)C1=CC=C(C=C1)N,33,12.226619,-5.339907,12.226619,0.265375,0.712426,230.245,...,9.461955,46.722121,230.036128,9.201445,378,20,72.0,80.0,6.895833,3.375000
75373,CCOP(=S)(OCC)OC1=NN(C(=N1)Cl)C(C)C;1632,1,CCOP(=S)(OCC)OC1=NN(C(=N1)Cl)C(C)C,1632,8.048957,-5.193498,8.048957,0.110750,0.720533,313.747,...,9.599473,64.450105,313.041677,8.944048,630,24,86.0,97.0,8.256944,4.180556
75374,C1=CC=C2C(=C1)NC(=S)S2;1373,1,C1=CC=C2C(=C1)NC(=S)S2,1373,7.612361,-0.332130,7.612361,0.131389,0.596343,167.258,...,9.085117,53.938362,166.986341,11.132423,108,11,52.0,60.0,2.833333,2.194444
75375,CCCCC(CC)C=O;2,2,CCCCC(CC)C=O,2,11.548424,-4.352546,11.548424,2.482554,0.519485,128.215,...,7.884953,35.263065,128.120115,5.124805,104,8,32.0,32.0,4.361111,2.500000


In [53]:
test_mordred_descriptors = All_Mordred_descriptors(test_data['Chemical_Id'])

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


KeyboardInterrupt: 

In [None]:
test_mordred_descriptors

In [None]:
test_mordred_descriptors.columns[test_mordred_descriptors.isnull().any()].tolist()

In [None]:
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, test_mordred_descriptors], axis = 1)

In [None]:
test_data

# Feature selection using variance threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold
from
sklearn import preprocessing

In [None]:
le = preprocessing.LabelEncoder()
train_chemical_id =  list(train_data.Chemical_Id.unique())
le.fit(train_chemical_id)
train_data['Chemical_Id']=le.transform(train_data['Chemical_Id'])

In [None]:
#split target variable from rest of the data
train_Y = train_data["Expected"]

In [None]:
train_Y

In [None]:
train_data.drop(['Expected'], axis = 1, inplace=True) 
train_X = pd.get_dummies(train_data)

In [None]:
train_X

In [None]:
# Setting up variance threshold
sel = VarianceThreshold(threshold=0.25)
X_new = sel.fit_transform(train_X)

In [None]:
# Retrieve list of features from the index of features selected by variance threshold.
list_x = []
col_names = []
selected_features = []

In [None]:
list_x = sel.get_support(indices=True)
col_names = train_X.columns.tolist()

In [None]:
for i in list_x: 
    selected_features.append(col_names[i])

print("Selected features:")
print(selected_features)

In [None]:
le = preprocessing.LabelEncoder()
test_chemical_id =  list(test_data.Chemical_Id.unique())
le.fit(test_chemical_id)
test_data['Chemical_Id']=le.transform(test_data['Chemical_Id'])

In [None]:
features = ['Chemical_Id', 'Assay_Id', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_MRHI', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1']

In [None]:
X_train = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])