## Data cleaning and splitting of data into train and test

In [1]:
import pandas as pd
from rdkit import Chem
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

In [2]:
# Read the raw labelled data set.
df = pd.read_csv('../data/b_g_labels.csv')
n_initial = len(df)
print(f'Total initial of the data is {n_initial}')

# Step 1: discard rows that are exact copies of an earlier row (all columns equal).
ndf = df.drop_duplicates()
n_exact_duplicates = n_initial - len(ndf)
print(f'Total duplicates removed {n_exact_duplicates}')

# Step 2: keep a single row per SMILES string. When the same SMILES occurs on
# several rows (for example with different labels) only the first occurrence
# survives, so that first row's label is the one retained.
# Note: the original row labels are preserved here; the index is not reset.
ndf = ndf.drop_duplicates(subset=['Smiles'], keep='first')
print(f'Final length of dataset {len(ndf)}')

Total initial of the data is 437
Total duplicates removed 59
Final length of dataset 376


### Conversion of SMILES into Canonical SMILES

In [3]:
%%time
smiles = ndf['Smiles'].tolist()
not_converted = []
Canonical_smiles= []
from rdkit import Chem
for i in tqdm_notebook(range(len(smiles))):
    try:
        mol = Chem.MolFromSmiles(smiles[i])
        Cano_Smiles = Chem.MolToSmiles(mol, True)
        Canonical_smiles.append(Cano_Smiles)
    except:
        Canonical_smiles.append('-')
        not_converted.append(i)
        pass

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=376.0), HTML(value='')))


CPU times: user 477 ms, sys: 33.5 ms, total: 510 ms
Wall time: 537 ms


In [4]:
# Attach the canonical SMILES as the second column (right after 'Smiles').
ndf.insert(loc=1, column='Canonical_Smiles', value=Canonical_smiles)
print(ndf.head(3))

# Remove rows whose conversion failed. They are identified by the '-' sentinel
# written during canonicalisation rather than by a list of row numbers:
# `ndf` still carries its original (non-contiguous) index labels at this point,
# so dropping by loop position would remove the wrong rows or raise KeyError.
conversion_failed = ndf['Canonical_Smiles'] == '-'
ndf = ndf.loc[~conversion_failed].reset_index(drop=True)

# NOTE(review): duplicates were removed on the raw 'Smiles' column only; two
# different raw strings can share one canonical form, so 'Canonical_Smiles' is
# not guaranteed unique here - confirm whether a second de-duplication is wanted.

# Saving the data containing Canonical Smiles (the fresh 0..n-1 index is written
# as the first CSV column).
ndf.to_csv('data/b_g_labels_clean.csv')

                                              Smiles  \
0  BrC1=CC=C(C(C)N2CCC(CC2)N3C(NC4=CC=CC=C43)=O)C...   
1  BrC1=CC=C(CN2CCC(CC2)N3C(NC4=C(C(Cl)=CC(Cl)=C3...   
2  BrC1=CC=C(CN2CCC(CC2)N3C(NC4=C(C=C(C=C34)Cl)Cl...   

                                    Canonical_Smiles  Labels  
0    CC(c1ccc(Br)cc1F)N1CCC(n2c(=O)[nH]c3ccccc32)CC1       0  
1  O=c1[nH]c2c(Cl)c(Cl)cc(Cl)c2n1C1CCN(Cc2ccc(Br)...       0  
2  O=c1[nH]c2c(Cl)cc(Cl)cc2n1C1CCN(Cc2ccc(Br)cc2F...       0  


### Stratified splitting

In [5]:
# Reload the cleaned table; its first CSV column is the saved row index.
data = pd.read_csv('data/b_g_labels_clean.csv', index_col = 0)
print('total data', len(data))

# Features are every column except the last; the last column holds the label.
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

# 80/20 split, stratified on the label and seeded for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=1, stratify=y)
print('type', type(x_train))

# Sanity check of the stratification: the share of each class that landed in
# the test split. sum(y) counts positives - assumes labels are 0/1.
total = sum(y_train)+sum(y_test)
total_neg = len(y)-total
print('y_test_positive_percentage', (sum(y_test)/total)*100)
print('y_test_negative_percentage', ((len(y_test)-sum(y_test))/total_neg)*100)

# Locate the canonical SMILES column inside X by name. X holds both the raw
# 'Smiles' and 'Canonical_Smiles' columns, and column 0 is the raw string, so
# a hard-coded [0] would export raw SMILES under a 'Canonical_Smiles' header.
canonical_col = data.columns[:-1].get_loc('Canonical_Smiles')

newTrain = x_train[:, canonical_col].tolist()
print('Training length', len(newTrain))

newdf = pd.DataFrame(list(zip(newTrain, y_train)), columns = ['Canonical_Smiles', 'Label'])
newdf.to_csv('data/b_g_labels_clean_train.csv', index = False)

newTest = x_test[:, canonical_col].tolist()
print('Test length', len(newTest))

newdf = pd.DataFrame(list(zip(newTest, y_test)), columns = ['Canonical_Smiles', 'Label'])
newdf.to_csv('data/b_g_labels_clean_test.csv', index = False)

print('Final total', len(newTrain)+len(newTest))



total data 376
type <class 'numpy.ndarray'>
y_test_positive_percentage 19.841269841269842
y_test_negative_percentage 20.4
Training length 300
Test length 76
Final total 376
