In [33]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [21]:
# Load the three source tables from CSV, in the order chem_data, hopv15, qm9.
# Paths are relative to the notebook's working directory.
chem_data, hopv15, qm9 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/chem_data/cleaned_data.csv',
        'data/hopv15/hopv15.csv',
        'data/qm9/qm9.csv',
    )
)

In [22]:
# SMILES strings that are filtered out of every dataset before the train/test
# split (matched exactly, as plain strings, against each frame's 'SMILES' column).
# NOTE(review): the criterion that makes these molecules "invalid" is not
# recorded in this notebook — confirm the provenance of this list.
# Every entry is listed exactly once.
invalid_mols = [
    # SMILES written with explicit formal charges (a cationic N and a [C-])
    '[NH2+]=C1O[C-](C=O)C=C1',
    '[NH2+]=C1NC=C[C-]1C#N',
    'C[NH2+]C[C-]1OC(=O)C=C1',
    '[NH3+]C[C-]1OC(=O)C=C1',
    '[NH3+]CC[C-]1OC(=O)C=C1',
    'CN1C=C[C-](C#N)C1=[NH2+]',
    'NC1=[NH+][C-](C=O)C(=N)N1',
    'NC1=CC(=[NH2+])O[C-]1C=O',
    'CC1=CC(=[NH2+])O[C-]1C#N',
    '[NH3+]C1=CC(=O)O[C-]1C=O',
    'NC1=CC(=O)O[C-]1C[NH3+]',
    'NC(=[NH2+])[C-]1OC(=O)C=C1',
    'C[NH2+]C[C-]1NC=CC1=O',
    'CC1=C[C-](C#N)C(=[NH2+])N1',
    'NC(=[NH2+])[C-]1NC=CC1=O',
    'C[NH+]=C1NC=C[C-]1C#N',
    '[NH2+]=C1OC(=O)O[C-]1C=O',
    'CC1=CC(=[NH2+])O[C-]1C=O',
    '[NH2+]=C1O[C-](C=C1)C#N',
    'NC(=[NH2+])[C-]1N=COC1=O',
    'NC1=[NH+][C-](C#N)C(=O)N1',
    'CC1=C[C-](OC1=[NH2+])C#N',
    'CC1=C[C-](OC1=[NH2+])C=O',
    'N=C1NC(=[NH2+])N[C-]1C#N',
    'NC1=[NH+][C-](C#N)C(=N)N1',
    'CC(=O)[C-]1OC(=[NH2+])C=C1',
    'CC1=CC(=O)O[C-]1C[NH3+]',
    # uncharged SMILES with several ring closures
    'O=C1C2OC1C1CC2C1',
    'O=C1C2OC1C1CC2O1',
    'OC1C2NC1C1CC2O1',
    'OC1C2CC1C1CC2C1',
    'C1C2OC1C1CC2O1',
    'C1C2CC1C1CC(C2)O1',
    'CC1C2OC1C1CC2O1',
    'O=C1C2NC1C1CC2C1',
    'C1C2CC1C1CC2O1',
    'C1C2CC1C1CC2C1',
    'O=C1C2NC1C1CC2O1',
    'O=C1C2CC1C1CC2C1',
    'C1C2CC1C1CC(C1)O2',
    'CC1C2CC1C1CC2O1',
    'C1C2CC1C1CC(C1)C2',
    'OC1C2OC1C1CC2C1',
    'CC1C2CC1C1CC2C1',
    'C1C2CC3CC(O3)C1O2',
    'OC1C2CC1C1CC2O1',
    'O=C1C2CC1C1CC2O1',
    'CC1C2OC1C1CC2C1',
    'CN1C2CC1C1CC2O1',
    'OC1C2OC1C1CC2O1',
    # single-atom SMILES
    'C', 'N', 'O',
]

In [23]:
# clean chem_data, printing the frame's shape before and after each step
print('Original data shape:', chem_data.shape)

# discard rows that contain any missing value, then renumber the index from 0
chem_data = chem_data.dropna()
chem_data = chem_data.reset_index(drop=True)
print('Data shape after filtering null rows:', chem_data.shape)

# keep only the rows whose SMILES string is not listed in invalid_mols
# (the index is not renumbered after this filter)
chem_data = chem_data.loc[~chem_data['SMILES'].isin(invalid_mols)]
print('Data shape after filtering out invalid mols:', chem_data.shape)

Original data shape: (60, 3)
Data shape after filtering null rows: (46, 3)
Data shape after filtering out invalid mols: (46, 3)


In [29]:
# clean hopv15 data, printing the frame's shape before and after each step
print('Original data shape:', hopv15.shape)

# NOTE(review): an upper-casing step for the SMILES column used to sit here and
# is disabled — presumably because letter case is significant in SMILES; confirm.

# discard rows that contain any missing value, then renumber the index from 0
hopv15 = hopv15.dropna()
hopv15 = hopv15.reset_index(drop=True)
print('Data shape after filtering null rows:', hopv15.shape)

# keep only the rows whose SMILES string is not listed in invalid_mols
# (the index is not renumbered after this filter)
hopv15 = hopv15.loc[~hopv15['SMILES'].isin(invalid_mols)]
print('Data shape after filtering out invalid mols:', hopv15.shape)

Original data shape: (350, 3)
Data shape after filtering null rows: (350, 3)
Data shape after filtering out invalid mols: (350, 3)


In [30]:
# clean qm9 data, printing the frame's shape before and after each step
print('Original data shape:', qm9.shape)

# discard rows that contain any missing value, then renumber the index from 0
qm9 = qm9.dropna()
qm9 = qm9.reset_index(drop=True)
print('Data shape after filtering null rows:', qm9.shape)

# keep only the rows whose SMILES string is not listed in invalid_mols
# (the index is not renumbered after this filter)
qm9 = qm9.loc[~qm9['SMILES'].isin(invalid_mols)]
print('Data shape after filtering out invalid mols:', qm9.shape)

Original data shape: (133845, 3)
Data shape after filtering null rows: (133845, 3)
Data shape after filtering out invalid mols: (133832, 3)


In [45]:
# fraction of each dataset held out as test data,
# listed in the order chem_data, hopv15, qm9
ratios = [0.5, 0.3, 0.1]

# split every dataset on its own with a fixed seed so the split is repeatable;
# each call yields a (train, test) pair
splits = [
    train_test_split(frame, test_size=test_fraction, random_state=1)
    for frame, test_fraction in zip([chem_data, hopv15, qm9], ratios)
]

# regroup the pairs into one list of train parts and one list of test parts
train_data = [train_part for train_part, _ in splits]
test_data = [test_part for _, test_part in splits]

In [46]:
# merge data: stack the per-dataset parts into one frame each and renumber
# the rows from 0.
# NOTE(review): train_data / test_data go from lists of frames to single
# DataFrames here, so re-running this cell on its own (without re-running the
# split cell first) will fail.
train_data = pd.concat(train_data, ignore_index=True)
test_data = pd.concat(test_data, ignore_index=True)

# display train and test data shape
shape_report = 'Shape: train_data {} and test_data {}'.format(train_data.shape, test_data.shape)
print(shape_report)

Shape: train_data (120716, 3) and test_data (13512, 3)


In [47]:
# save data: write the merged train and test frames as CSV files in the
# current working directory, leaving the row index out of the files
for frame, csv_name in (
    (train_data, 'trio_chem_supervised_train_data.csv'),
    (test_data, 'trio_chem_supervised_test_data.csv'),
):
    frame.to_csv(csv_name, index=False)