In [1]:
import pandas as pd
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit import Chem
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
from tqdm import tqdm

In [5]:
pd.set_option('display.max_rows', None)

In [2]:
base_path = Path().cwd().joinpath('content/datasets/')
train_dataset = pd.read_csv(base_path.joinpath('train_II.csv'))
test_dataset = pd.read_csv(base_path.joinpath('test_II.csv'))

# Reference - https://stackoverflow.com/questions/14745022/how-to-split-a-dataframe-string-column-into-two-columns
# Dated - 03/02/2023
train_dataset[["Chemical Id", "Assay Id"]] = train_dataset.Id.str.split(
    ";", expand=True
)
train_dataset.drop("Id", axis=1, inplace=True)

# Reference - https://stackoverflow.com/questions/35321812/move-column-in-pandas-dataframe
# Dated - 03/02/2023
train_dataset = train_dataset[["Assay Id", "Chemical Id", "Expected"]]

test_dataset[["Chemical Id", "Assay Id"]] = test_dataset.x.str.split(";", expand=True)
test_dataset.drop("x", axis=1, inplace=True)
test_dataset = test_dataset[["Assay Id", "Chemical Id"]]


In [3]:
def get_molecule_descriptors(smiles):
    
    '''
        Generate molecular descriptors and returning descriptor names and descriptor values
        Reference - https://www.youtube.com/watch?v=9i9SY6Nd1Zw
    '''
    
    mols = [Chem.MolFromSmiles(smile) for smile in smiles]
    cal_desc = MoleculeDescriptors.MolecularDescriptorCalculator([d[0] for d in Descriptors._descList])
    desc_names = cal_desc.GetDescriptorNames()
    desc_data = []
    for mol in tqdm(mols):
        descriptors = cal_desc.CalcDescriptors(mol)
        desc_data.append(descriptors)
  
    return desc_names, desc_data

In [4]:
train_desc_names, train_desc_data = get_molecule_descriptors(train_dataset['Chemical Id'])

[18:32:40] Explicit valence for atom # 1 Si, 8, is greater than permitted
[18:32:41] Explicit valence for atom # 1 Si, 8, is greater than permitted
[18:32:41] Explicit valence for atom # 1 Si, 8, is greater than permitted
[18:32:41] Explicit valence for atom # 1 Si, 8, is greater than permitted
[18:32:42] Explicit valence for atom # 1 Si, 8, is greater than permitted
[18:32:42] Explicit valence for atom # 1 Si, 8, is greater than permitted
 13%|████▊                               | 10128/75383 [01:18<07:58, 136.25it/s]Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 76, in MaxEStateIndex
    return max(EStateIndices(mol, force))
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/ESta

Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/Crippen.py", line 170, in <lambda>
    MolLogP = lambda *x, **y: rdMolDescriptors.CalcCrippenDescriptors(*x, **y)[0]
Boost.Python.ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.CalcCrippenDescriptors(NoneType)
did not match C++ signature:
    CalcCrippenDescriptors(RDKit::ROMol mol, bool includeHs=True, bool force=False)
Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/Crippen.py", line 186, in <lambda>
    MolMR = lambda *x, **y: rdMolDe

 35%|████████████▌                       | 26293/75383 [03:28<06:20, 129.01it/s]Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 76, in MaxEStateIndex
    return max(EStateIndices(mol, force))
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 46, in EStateIndices
    nAtoms = mol.GetNumAtoms()
AttributeError: 'NoneType' object has no attribute 'GetNumAtoms'
Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line

 56%|████████████████████▏               | 42321/75383 [05:35<04:13, 130.40it/s]Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 76, in MaxEStateIndex
    return max(EStateIndices(mol, force))
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 46, in EStateIndices
    nAtoms = mol.GetNumAtoms()
AttributeError: 'NoneType' object has no attribute 'GetNumAtoms'
Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line

 63%|██████████████████████▌             | 47218/75383 [06:14<03:43, 126.24it/s]Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 76, in MaxEStateIndex
    return max(EStateIndices(mol, force))
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 46, in EStateIndices
    nAtoms = mol.GetNumAtoms()
AttributeError: 'NoneType' object has no attribute 'GetNumAtoms'
Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line

Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/Lipinski.py", line 57, in <lambda>
    NumHeteroatoms = lambda x: rdMolDescriptors.CalcNumHeteroatoms(x)
Boost.Python.ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.CalcNumHeteroatoms(NoneType)
did not match C++ signature:
    CalcNumHeteroatoms(RDKit::ROMol mol)
Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/Lipinski.py", line 61, in <lambda>
    NumRotatableBonds = lambda x: rdMolDescriptors.CalcNumRotatableBonds(x)
Boost.Python.Argum

 83%|██████████████████████████████      | 62931/75383 [08:22<01:31, 135.49it/s]Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 76, in MaxEStateIndex
    return max(EStateIndices(mol, force))
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 46, in EStateIndices
    nAtoms = mol.GetNumAtoms()
AttributeError: 'NoneType' object has no attribute 'GetNumAtoms'
Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line

Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/GraphDescriptors.py", line 405, in <lambda>
    Chi1v = lambda x: rdMolDescriptors.CalcChi1v(x)
Boost.Python.ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.CalcChi1v(NoneType)
did not match C++ signature:
    CalcChi1v(RDKit::ROMol mol, bool force=False)
Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/GraphDescriptors.py", line 420, in <lambda>
    Chi2n = lambda x: rdMolDescriptors.CalcChi2n(x)
Boost.Python.ArgumentError: Python argumen

 96%|██████████████████████████████████▍ | 71995/75383 [09:35<00:26, 125.99it/s]Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 76, in MaxEStateIndex
    return max(EStateIndices(mol, force))
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line 46, in EStateIndices
    nAtoms = mol.GetNumAtoms()
AttributeError: 'NoneType' object has no attribute 'GetNumAtoms'
Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/EState/EState.py", line

Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/Lipinski.py", line 90, in <lambda>
    _fn = lambda x, y=_cfn: y(x)
Boost.Python.ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.CalcFractionCSP3(NoneType)
did not match C++ signature:
    CalcFractionCSP3(RDKit::ROMol mol)
Traceback (most recent call last):
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/ML/Descriptors/MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
  File "/Users/rahuldev/miniforge3/envs/toxic/lib/python3.10/site-packages/rdkit/Chem/Lipinski.py", line 78, in HeavyAtomCount
    return mol.GetNumHeavyAtoms()
AttributeError: 'NoneType' object has no attribute 'GetNumHeavyAtoms'
Traceback (most recent ca

100%|████████████████████████████████████| 75383/75383 [10:02<00:00, 125.08it/s]


In [6]:
train_data = pd.DataFrame(data=train_desc_data, columns=train_desc_names)

train_data.insert(loc=0, column='Assay Id', value=train_dataset['Assay Id'])

train_data['Expected'] = train_dataset['Expected']

In [7]:
train_data.to_csv('train_molecular_data.csv', index=False)

In [8]:
train_data.isna().sum()

Assay Id                       0
MaxEStateIndex                 0
MinEStateIndex                 0
MaxAbsEStateIndex              0
MinAbsEStateIndex              0
qed                            0
MolWt                          0
HeavyAtomMolWt                 0
ExactMolWt                     0
NumValenceElectrons            0
NumRadicalElectrons            0
MaxPartialCharge             420
MinPartialCharge             420
MaxAbsPartialCharge          420
MinAbsPartialCharge          420
FpDensityMorgan1               0
FpDensityMorgan2               0
FpDensityMorgan3               0
BCUT2D_MWHI                 3819
BCUT2D_MWLOW                3819
BCUT2D_CHGHI                3819
BCUT2D_CHGLO                3819
BCUT2D_LOGPHI               3819
BCUT2D_LOGPLOW              3819
BCUT2D_MRHI                 3819
BCUT2D_MRLOW                3819
BalabanJ                       0
BertzCT                        0
Chi0                           0
Chi0n                          0
Chi0v     

In [10]:
train_data.dropna(inplace=True, axis=0)
train_data = train_data.reset_index(drop=True)

In [11]:
train_data.isna().sum()

Assay Id                    0
MaxEStateIndex              0
MinEStateIndex              0
MaxAbsEStateIndex           0
MinAbsEStateIndex           0
qed                         0
MolWt                       0
HeavyAtomMolWt              0
ExactMolWt                  0
NumValenceElectrons         0
NumRadicalElectrons         0
MaxPartialCharge            0
MinPartialCharge            0
MaxAbsPartialCharge         0
MinAbsPartialCharge         0
FpDensityMorgan1            0
FpDensityMorgan2            0
FpDensityMorgan3            0
BCUT2D_MWHI                 0
BCUT2D_MWLOW                0
BCUT2D_CHGHI                0
BCUT2D_CHGLO                0
BCUT2D_LOGPHI               0
BCUT2D_LOGPLOW              0
BCUT2D_MRHI                 0
BCUT2D_MRLOW                0
BalabanJ                    0
BertzCT                     0
Chi0                        0
Chi0n                       0
Chi0v                       0
Chi1                        0
Chi1n                       0
Chi1v     

In [12]:
y_train = train_data.loc[:, 'Expected']
X_train = train_data.drop('Expected', axis=1)

In [13]:
X_train.head()

Unnamed: 0,Assay Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1644,9.3162,-1.533785,9.3162,0.150485,0.794714,317.599,306.511,315.982463,100,...,0,0,0,0,0,0,0,0,0,0
1,2451,10.532611,0.333788,10.532611,0.333788,0.516641,156.269,136.109,156.151415,66,...,0,0,0,0,0,0,0,0,4,0
2,1384,2.433032,0.0,2.433032,0.0,0.251327,362.086,313.702,361.347528,148,...,0,0,0,0,0,0,0,0,12,0
3,16,10.35508,-0.613825,10.35508,0.282361,0.487998,255.665,245.585,255.052302,90,...,0,0,0,0,0,0,0,0,0,0
4,1646,12.034676,-2.419051,12.034676,0.546336,0.588579,242.346,223.194,242.056408,80,...,0,0,0,0,0,0,0,0,0,0


In [14]:
X_train.tail()

Unnamed: 0,Assay Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
71559,33,11.460021,-3.868472,11.460021,0.053611,0.712426,230.245,220.165,230.036128,82,...,0,1,0,0,0,0,0,0,0,0
71560,1632,5.928972,-2.841623,5.928972,0.082346,0.720533,313.747,296.611,313.041677,104,...,0,0,0,0,0,0,0,0,0,0
71561,1373,4.975926,0.848333,4.975926,0.848333,0.596343,167.258,162.218,166.986341,50,...,0,0,0,0,0,1,0,0,0,0
71562,2,10.241948,0.324028,10.241948,0.324028,0.519485,128.215,112.087,128.120115,54,...,0,0,0,0,0,0,0,0,0,0
71563,2,12.552405,-0.188577,12.552405,0.048913,0.165457,636.962,572.45,636.486609,258,...,0,0,0,0,0,0,0,0,5,0


In [15]:
test_dataset.describe()

Unnamed: 0,Assay Id,Chemical Id
count,10994,10994
unique,150,1999
top,1852,CC(C)(C1=CC=C(C=C1)O)C2=CC=C(C=C2)O
freq,400,39


In [16]:
test_desc_names, test_desc_data = get_molecule_descriptors(test_dataset.loc[:, 'Chemical Id'])

100%|████████████████████████████████████| 10994/10994 [01:28<00:00, 123.95it/s]


In [17]:
test_mol_frame = pd.DataFrame(data=test_desc_data, columns=test_desc_names)
test_mol_frame.insert(loc=0, column='Assay Id', value=test_dataset['Assay Id'])
test_mol_frame.to_csv('test_molecular_data.csv', index=False)

In [18]:
test_mol_frame.head()

Unnamed: 0,Assay Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1682,9.626968,0.025579,9.626968,0.025579,0.624614,164.248,148.12,164.120115,66,...,0,0,0,0,0,0,0,0,0,0
1,1656,12.473362,-4.605249,12.473362,0.061165,0.617511,431.452,414.316,431.05694,152,...,0,1,1,0,0,0,0,0,0,1
2,36,14.627193,-4.140552,14.627193,0.064351,0.224134,696.264,655.944,695.250845,254,...,0,1,0,0,0,0,0,0,0,0
3,1850,10.420833,-3.973958,10.420833,0.0,0.393203,201.244,197.212,200.94981,56,...,0,0,0,0,0,0,0,0,0,0
4,30,12.865865,-0.601027,12.865865,0.094949,0.639062,418.574,380.27,418.271924,168,...,0,0,0,0,0,0,0,0,0,0


In [19]:
test_mol_frame.isnull().sum()

Assay Id                      0
MaxEStateIndex                0
MinEStateIndex                0
MaxAbsEStateIndex             0
MinAbsEStateIndex             0
qed                           0
MolWt                         0
HeavyAtomMolWt                0
ExactMolWt                    0
NumValenceElectrons           0
NumRadicalElectrons           0
MaxPartialCharge             48
MinPartialCharge             48
MaxAbsPartialCharge          48
MinAbsPartialCharge          48
FpDensityMorgan1              0
FpDensityMorgan2              0
FpDensityMorgan3              0
BCUT2D_MWHI                 493
BCUT2D_MWLOW                493
BCUT2D_CHGHI                493
BCUT2D_CHGLO                493
BCUT2D_LOGPHI               493
BCUT2D_LOGPLOW              493
BCUT2D_MRHI                 493
BCUT2D_MRLOW                493
BalabanJ                      0
BertzCT                       0
Chi0                          0
Chi0n                         0
Chi0v                         0
Chi1    

In [20]:
test_mol_frame.fillna(value=0, inplace=True, axis=0)
# test_mol_frame = test_mol_frame.reset_index(drop=True)

In [21]:
test_mol_frame.head()

Unnamed: 0,Assay Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1682,9.626968,0.025579,9.626968,0.025579,0.624614,164.248,148.12,164.120115,66,...,0,0,0,0,0,0,0,0,0,0
1,1656,12.473362,-4.605249,12.473362,0.061165,0.617511,431.452,414.316,431.05694,152,...,0,1,1,0,0,0,0,0,0,1
2,36,14.627193,-4.140552,14.627193,0.064351,0.224134,696.264,655.944,695.250845,254,...,0,1,0,0,0,0,0,0,0,0
3,1850,10.420833,-3.973958,10.420833,0.0,0.393203,201.244,197.212,200.94981,56,...,0,0,0,0,0,0,0,0,0,0
4,30,12.865865,-0.601027,12.865865,0.094949,0.639062,418.574,380.27,418.271924,168,...,0,0,0,0,0,0,0,0,0,0


In [22]:
test_mol_frame.isna().sum()

Assay Id                    0
MaxEStateIndex              0
MinEStateIndex              0
MaxAbsEStateIndex           0
MinAbsEStateIndex           0
qed                         0
MolWt                       0
HeavyAtomMolWt              0
ExactMolWt                  0
NumValenceElectrons         0
NumRadicalElectrons         0
MaxPartialCharge            0
MinPartialCharge            0
MaxAbsPartialCharge         0
MinAbsPartialCharge         0
FpDensityMorgan1            0
FpDensityMorgan2            0
FpDensityMorgan3            0
BCUT2D_MWHI                 0
BCUT2D_MWLOW                0
BCUT2D_CHGHI                0
BCUT2D_CHGLO                0
BCUT2D_LOGPHI               0
BCUT2D_LOGPLOW              0
BCUT2D_MRHI                 0
BCUT2D_MRLOW                0
BalabanJ                    0
BertzCT                     0
Chi0                        0
Chi0n                       0
Chi0v                       0
Chi1                        0
Chi1n                       0
Chi1v     

In [23]:
rfc = RandomForestClassifier(n_estimators=200)
model = rfc.fit(X_train, y_train)

In [24]:
pred = model.predict(test_mol_frame)
pred.reshape(-1, 1)

array([[2],
       [2],
       [2],
       ...,
       [2],
       [2],
       [1]])

In [26]:
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = pred
final_df.columns = ['Id', 'Predicted']

In [27]:
final_df.head()

Unnamed: 0,Id,Predicted
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,2
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,2
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,2
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,2
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,1


In [28]:
final_df.shape

(10994, 2)

In [29]:
final_df.to_csv('Initial_submission.csv', index=False)