In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors

In [2]:
train_data = pd.read_csv('train_II.csv')
test_data = pd.read_csv('test_II.csv')

In [3]:
test_data = test_data.rename(columns = {"x": "Id"})
test_data.head()

Unnamed: 0,Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...


In [4]:
train_data[["Chemical_Id", "Assay_Id"]] = train_data.Id.str.split(";", expand = True)
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16
4,[Na+].[I-];1856,2,[Na+].[I-],1856


In [5]:
test_data[["Chemical_Id", "Assay_Id"]] = test_data.Id.str.split(";", expand = True)
test_data.head()

Unnamed: 0,Id,Chemical_Id,Assay_Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30


In [6]:
train_data['Assay_Id'] = train_data['Assay_Id'].astype(int)
test_data['Assay_Id'] = test_data['Assay_Id'].astype(int)

In [7]:
from rdkit.Chem import Descriptors
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
        # add hydrogens to molecules
            mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(train_data['Chemical_Id'])

[00:14:34] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:14:37] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:14:39] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:14:40] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:14:42] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:14:43] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [8]:
df_descriptors_train = pd.DataFrame(Mol_descriptors,columns=desc_names)
df_descriptors_train

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.874650,-2.987783,8.874650,0.765809,0.794714,317.599,306.511,315.982463,100,0,...,0,0,0,0,0,0,0,0,0,0
1,11.875100,-4.547647,11.875100,2.480493,0.516641,156.269,136.109,156.151415,66,0,...,0,0,0,0,0,0,0,0,0,0
2,8.736945,-6.030543,8.736945,0.000000,0.251327,362.086,313.702,361.347528,148,0,...,0,0,0,0,0,0,0,0,0,0
3,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,245.585,255.052302,90,0,...,0,0,0,0,0,0,0,0,0,0
4,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,149.894,149.894242,8,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,12.226619,-5.339907,12.226619,0.265375,0.712426,230.245,220.165,230.036128,82,0,...,0,1,0,0,0,0,0,0,0,0
75379,8.048957,-5.193498,8.048957,0.110750,0.720533,313.747,296.611,313.041677,104,0,...,0,0,0,0,0,0,0,0,0,0
75380,7.612361,-0.332130,7.612361,0.131389,0.596343,167.258,162.218,166.986341,50,0,...,0,0,0,0,0,1,0,0,0,0
75381,11.548424,-4.352546,11.548424,2.482554,0.519485,128.215,112.087,128.120115,54,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df_descriptors_train.columns[df_descriptors_train.isnull().any()].tolist()

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [10]:
df_descriptors_train = df_descriptors_train.replace([np.inf, -np.inf], np.nan)

In [11]:
df_descriptors_train = df_descriptors_train.fillna(df_descriptors_train.mean())

In [12]:
df_descriptors_train.columns[df_descriptors_train.isnull().any()].tolist()

[]

In [13]:
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, df_descriptors_train], axis = 1)

In [14]:
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644,8.87465,-2.987783,8.87465,0.765809,0.794714,317.599,...,0,0,0,0,0,0,0,0,0,0
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451,11.8751,-4.547647,11.8751,2.480493,0.516641,156.269,...,0,0,0,0,0,0,0,0,0,0
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384,8.736945,-6.030543,8.736945,0.0,0.251327,362.086,...,0,0,0,0,0,0,0,0,0,0
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,0,0,0,0,0,0,0,0,0,0
4,[Na+].[I-];1856,2,[Na+].[I-],1856,0.0,0.0,0.0,0.0,0.237972,149.894,...,0,0,0,0,0,0,0,0,0,0


In [15]:
from rdkit.Chem import Descriptors
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
        # add hydrogens to molecules
            mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(test_data['Chemical_Id'])

In [16]:
df_descriptors_test = pd.DataFrame(Mol_descriptors,columns=desc_names)
df_descriptors_test

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,164.120115,66,0,...,0,0,0,0,0,0,0,0,0,0
1,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,431.056940,152,0,...,0,1,1,0,0,0,0,0,0,1
2,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,695.250845,254,0,...,0,1,0,0,0,0,0,0,0,0
3,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,200.949810,56,0,...,0,0,0,0,0,0,0,0,0,0
4,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,418.271924,168,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,13.677486,-5.541970,13.677486,1.263580,0.391778,408.376,389.224,408.129692,154,0,...,0,0,0,0,0,0,0,0,0,0
10990,12.722294,-3.388339,12.722294,0.917255,0.635519,320.344,304.216,320.104859,120,0,...,0,0,0,0,0,0,0,0,0,0
10991,13.186679,-5.902467,13.186679,0.638923,0.705120,381.370,366.250,381.074304,138,0,...,0,1,0,0,0,0,0,0,0,1
10992,11.701188,-5.111667,11.701188,0.820825,0.540194,220.976,213.920,219.945901,66,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df_descriptors_test.columns[df_descriptors_test.isnull().any()].tolist()

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [18]:
df_descriptors_test = df_descriptors_test.replace([np.inf, -np.inf], np.nan)

In [19]:
df_descriptors_test = df_descriptors_test.fillna(df_descriptors_test.mean())

In [20]:
df_descriptors_test.columns[df_descriptors_test.isnull().any()].tolist()

[]

In [21]:
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, df_descriptors_test], axis = 1)

In [22]:
test_data

Unnamed: 0,Id,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,...,0,0,0,0,0,0,0,0,0,0
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,...,0,1,1,0,0,0,0,0,0,1
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,...,0,1,0,0,0,0,0,0,0,0
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,...,0,0,0,0,0,0,0,0,0,0
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,38,13.677486,-5.541970,13.677486,1.263580,0.391778,408.376,389.224,...,0,0,0,0,0,0,0,0,0,0
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,34,12.722294,-3.388339,12.722294,0.917255,0.635519,320.344,304.216,...,0,0,0,0,0,0,0,0,0,0
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,1640,13.186679,-5.902467,13.186679,0.638923,0.705120,381.370,366.250,...,0,1,0,0,0,0,0,0,0,1
10992,COP(=O)(OC)OC=C(Cl)Cl;28,COP(=O)(OC)OC=C(Cl)Cl,28,11.701188,-5.111667,11.701188,0.820825,0.540194,220.976,213.920,...,0,0,0,0,0,0,0,0,0,0


In [23]:
train_data.shape

(75383, 212)

In [24]:
test_data.shape

(10994, 211)

In [25]:
from rdkit.Chem import AllChem
def smiles_to_fingerprints(smiles_list):
    fingerprints = []
    for smiles in smiles_list:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048)
        mfpts = np.array(fp)
        fingerprints.append(mfpts)
    return np.array(fingerprints)

In [26]:
chemical_Id_list_train = train_data['Chemical_Id'].to_list()

In [27]:
train_fingerprints = smiles_to_fingerprints(chemical_Id_list_train)

[00:34:08] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:34:25] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:34:41] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:34:46] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:35:03] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:35:13] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [28]:
train_Mfingerprints = pd.DataFrame(train_fingerprints,columns=['V{}'.format(i) for i in range(train_fingerprints.shape[1])])

In [29]:
train_Mfingerprints.columns[train_Mfingerprints.isnull().any()].tolist()

[]

In [30]:
train_Mfingerprints.columns[train_Mfingerprints.isna().any()].tolist()

[]

In [31]:
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, train_Mfingerprints], axis = 1)

In [32]:
train_data.shape

(75383, 2260)

In [33]:
chemical_Id_list_test = test_data['Chemical_Id'].to_list()

In [34]:
test_fingerprints = smiles_to_fingerprints(chemical_Id_list_test)

In [35]:
test_Mfingerprints = pd.DataFrame(test_fingerprints,columns=['V{}'.format(i) for i in range(test_fingerprints.shape[1])])

In [36]:
test_Mfingerprints.columns[test_Mfingerprints.isnull().any()].tolist()

[]

In [37]:
test_Mfingerprints.columns[test_Mfingerprints.isna().any()].tolist()

[]

In [38]:
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, test_Mfingerprints], axis = 1)

In [39]:
test_data.shape

(10994, 2259)

In [40]:
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,V2038,V2039,V2040,V2041,V2042,V2043,V2044,V2045,V2046,V2047
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644,8.87465,-2.987783,8.87465,0.765809,0.794714,317.599,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451,11.8751,-4.547647,11.8751,2.480493,0.516641,156.269,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384,8.736945,-6.030543,8.736945,0.0,0.251327,362.086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,[Na+].[I-];1856,2,[Na+].[I-],1856,0.0,0.0,0.0,0.0,0.237972,149.894,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
test_data.head()

Unnamed: 0,Id,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,V2038,V2039,V2040,V2041,V2042,V2043,V2044,V2045,V2046,V2047
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.12,...,0,0,0,0,0,0,0,0,0,0
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,...,0,0,0,0,0,0,0,0,0,0
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,...,0,0,0,0,0,0,0,0,0,0
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10.745579,-4.559583,10.745579,0.0,0.393203,201.244,197.212,...,0,0,0,0,0,0,0,0,0,0
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.27,...,0,1,0,0,0,0,0,0,0,0


In [42]:
test_data.columns[test_data.isna().any()].tolist()

[]

In [43]:
train_data.columns[train_data.isna().any()].tolist()

['V0',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',
 'V88',
 'V89',
 'V90',
 'V91',
 'V92',
 'V93',
 'V94',
 'V95',
 'V96',
 'V97',
 'V98',
 'V99',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V120',
 'V121',
 'V122',
 'V

In [44]:
train_data = train_data.fillna(train_data.mean())

  train_data = train_data.fillna(train_data.mean())


In [45]:
train_data.columns[train_data.isna().any()].tolist()

[]

feature selection using variance threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing

In [None]:
le = preprocessing.LabelEncoder()
train_chemical_id =  list(train_data.Chemical_Id.unique())
le.fit(train_chemical_id)
train_data['Chemical_Id']=le.transform(train_data['Chemical_Id'])

In [None]:
#split target variable from rest of the data
train_Y = train_data["Expected"]

In [None]:
train_Y

In [None]:
train_data.drop(['Expected'], axis = 1, inplace=True) 
train_X = pd.get_dummies(train_data)

In [None]:
train_X

In [None]:
# Setting up variance threshold
sel = VarianceThreshold(threshold=0.25)
X_new = sel.fit_transform(train_X)

In [None]:
# Retrieve list of features from the index of features selected by variance threshold.
list_x = []
col_names = []
selected_features = []

In [None]:
list_x = sel.get_support(indices=True)
col_names = train_X.columns.tolist()

In [None]:
for i in list_x: 
    selected_features.append(col_names[i])

print("Selected features:")
print(selected_features)

In [None]:
le = preprocessing.LabelEncoder()
test_chemical_id =  list(test_data.Chemical_Id.unique())
le.fit(test_chemical_id)
test_data['Chemical_Id']=le.transform(test_data['Chemical_Id'])

In [None]:
features = ['Chemical_Id', 'Assay_Id', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_MRHI', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1']

In [None]:
X_train = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

Feature selection using correlation

In [None]:
# corr_coeffs = X_train.corrwith(train_Y)
# corr_coeffs = corr_coeffs.sort_values(ascending=False)
# print('Feature rankings based on correlation with', train_Y)
# for i, (feature, corr) in enumerate(corr_coeffs.items()):
#     print(f'{i+1}. {feature}: {corr:.3f}')

In [None]:
# features = ['SlogP_VSA5', 'VSA_EState1', 'PEOE_VSA6', 'NumAromaticRings', 'NumValenceElectrons', 'EState_VSA9', 'Chi0n', 'Chi0', 'RingCount', 'Chi1n', 'HeavyAtomCount', 'Chi1', 'PEOE_VSA7', 'HeavyAtomMolWt', 'ExactMolWt', 'MolWt', 'Chi4v', 'BertzCT', 'EState_VSA1', 'SlogP_VSA6', 'LabuteASA', 'SMR_VSA7', 'Chi2n', 'Chi4n', 'Chi3n', 'Chi3v', 'Chi0v', 'fr_benzene', 'NumAromaticCarbocycles', 'MolLogP', 'Chi2v', 'MolMR', 'Chi1v', 'Assay_Id', 'VSA_EState6', 'BalabanJ', 'VSA_EState4', 'FpDensityMorgan2']

In [None]:
# X_train = pd.get_dummies(train_data[features])
# X_test = pd.get_dummies(test_data[features])

Classifiers

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier

In [None]:
# from sklearn import tree
# model = BaggingClassifier(estimator=tree.DecisionTreeClassifier(random_state=1,
#                                                                      criterion='entropy',
#                                                                      max_depth=35,
#                                                                      class_weight='balanced'),
#                                                                      random_state=1,
#                                                                      n_estimators=20)

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# # RandomForestClassifier
# model = RandomForestClassifier(random_state=1, max_depth=35, criterion='entropy')

In [None]:
# !pip install lightgbm

In [None]:
# !pip install catboost

In [None]:
# from sklearn.ensemble import VotingClassifier
# import xgboost as xgb
# import lightgbm as lgb/
# from sklearn.tree import DecisionTreeClassifier

# clf1 = xgb.XGBClassifier(random_state=1,booster="gbtree",learning_rate=0.25,n_estimators=250,max_depth=12, min_child_weight=4)
# clf1 = lgb.LGBMClassifier(num_threads=1,boosting_type= 'goss',learning_rate=0.1,n_estimators=1000,max_depth=10,num_leaves=100,max_bin = 5000)

# model = VotingClassifier(estimators=[('XGB',clf1),('CAT',clf2)],voting='hard')

In [None]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(learning_rate=0.1, boosting_type= 'Ordered',max_depth=10,max_bin = 5000, iterations=50) 

In [None]:
print(np.isnan(X_train).any())

print(np.isinf(X_train).any())
X_train.shape, train_Y.shape

In [None]:
X_train

In [None]:
X_test

In [None]:
train_Y

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)

In [None]:
model = model.fit(X_train, train_Y)

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
X_test = scaler.fit_transform(X_test)

In [None]:
val_predictions = model.predict(X_test)

In [None]:
accuracy_scores = cross_val_score(model, X_train, train_Y, cv=5)
f1_macro_scores = cross_val_score(model, X_train, train_Y, cv=5, scoring='f1_macro')

print("%0.4f accuracy with a standard deviation of %0.4f" % (accuracy_scores.mean(), accuracy_scores.std()))
print("%0.4f f1_score with a standard deviation of %0.4f" % (f1_macro_scores.mean(), f1_macro_scores.std()))

In [None]:
output = pd.DataFrame({'Id': test_data.Id, 'Predicted': val_predictions})
output.to_csv('Output.csv', index=False)