In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestClassifier


In [3]:
df = pd.read_csv('cmpd.csv')
df.describe()

Unnamed: 0,inchikey,smiles,group,activity
count,5530,5530,5530,5530
unique,5489,5507,2,4
top,PHXJVRSECIGDHY-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,train,active
freq,4,3,3977,2704


In [4]:
df.isnull().sum()

inchikey    0
smiles      0
group       0
activity    0
dtype: int64

In [5]:
df.head()

Unnamed: 0,inchikey,smiles,group,activity
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,train,active
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,train,active
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,test,active
3,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,train,active
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,active


In [6]:
train_set = pd.DataFrame(columns=['inchikey', 'smiles', 'activity'])
test_set = pd.DataFrame(columns=['ichikey', 'smiles', 'activity'])
train_set
test_set

Unnamed: 0,ichikey,smiles,activity


In [7]:
df.iloc[0, [0, 1, 3]].values

array(['FNHKPVJBJVTLMP-UHFFFAOYSA-N',
       'CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)c3)c(F)c2)ccn1',
       'active'], dtype=object)

In [8]:
for idx in range(len(df)):
    if df.iloc[idx, 2] == 'train':
        train_set.loc[idx] = df.iloc[idx, [0, 1, 3]].values
    else:
        test_set.loc[idx] = df.iloc[idx, [0, 1, 3]].values

In [9]:
train_set.reset_index(drop=True)

Unnamed: 0,inchikey,smiles,activity
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,active
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,active
2,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,active
3,IFPPYSWJNWHOLQ-UHFFFAOYSA-N,CCN(CC)CCOc1ccc(Nc2ncc3cc(-c4c(Cl)cccc4Cl)c(=O...,active
4,WOSKHXYHFSIKNG-UHFFFAOYSA-N,COc1cc2nccc(Oc3ccc(NC(=O)NC4CC4)c(Cl)c3)c2cc1C...,active
...,...,...,...
3972,INSBKYCYLCEBOD-UHFFFAOYSA-N,O=C(O)Cc1c2ccccc2n2c1[nH]c(=O)c1ccccc12,active
3973,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,inactive
3974,SVRAGOOKTLUHES-UHFFFAOYSA-N,CN1CCC(n2cnc(-c3ccc(F)cc3)c2-c2ccnc(N)n2)CC1,unknown
3975,QQJUCFIPZAVTEU-UHFFFAOYSA-N,CC1(C)CC(n2cnc(-c3ccc(F)cc3)c2-c2ccnc(N)n2)CC(...,unknown


In [10]:
train_set.describe()

Unnamed: 0,inchikey,smiles,activity
count,3977,3977,3977
unique,3950,3962,4
top,PHXJVRSECIGDHY-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,active
freq,4,3,2146


In [11]:
test_set.reset_index(drop=True)

Unnamed: 0,ichikey,smiles,activity
0,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,active
1,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,active
2,JMGXJHWTVBGOKG-UHFFFAOYSA-N,Cc1cc(-c2cc(OC(=O)c3ccccc3)ccc2Cl)cc2nnc(Nc3cc...,active
3,DXCUKNQANPLTEJ-UHFFFAOYSA-N,CCN(CC)CCCCNc1ncc2cc(-c3cc(OC)cc(OC)c3)c(NC(=O...,active
4,PIQCTGMSNWUMAF-UHFFFAOYSA-N,CN1CCN(c2ccc3nc(-c4c(N)c5c(F)cccc5[nH]c4=O)[nH...,active
...,...,...,...
1548,UBAHPEHGSJRHGA-UHFFFAOYSA-N,Cc1cccc2nc(-c3ccc(-c4cccc(CN5CCC(C(N)=O)CC5)c4...,inactive
1549,RTTIKBHDHKOSNI-UHFFFAOYSA-N,Cc1c2ccccc2nc2c1c1cc(NCCN(C)C)ccc1n2CCN(C)C,inactive
1550,HVUOSZANYULBJR-UHFFFAOYSA-N,Cc1ccc(-c2ccn(-c3ccc4c5c(n(C)c4c3)CCCNC5)c(=O)...,inactive
1551,SNFWCJIVWUVRNO-UHFFFAOYSA-N,N#Cc1c(-c2ccccc2C(F)(F)F)nc(SCc2ccc(OC(F)(F)F)...,inactive


In [12]:
test_set.describe()

Unnamed: 0,ichikey,smiles,activity
count,1553,1553,1553
unique,1541,1546,4
top,DXCUKNQANPLTEJ-UHFFFAOYSA-N,Cc1cc(CNc2nccc(Nc3cc(CCc4ccccc4)[nH]n3)n2)on1,inactive
freq,3,2,811


In [13]:
# data split
x_train = pd.DataFrame(train_set.iloc[:, :-1])
y_train = pd.DataFrame(train_set.iloc[:, -1])
x_test = pd.DataFrame(test_set.iloc[:, :-1])
y_test = pd.DataFrame(test_set.iloc[:, -1])

In [14]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

In [15]:
x_train.shape

(3181, 2)

In [16]:
x_train.values

array([['UOOZSMNSBYVUGZ-UHFFFAOYSA-N',
        'Cn1cc(-c2cnn3c(N)c(-c4ccc(NC(=O)Nc5cccc(C(F)(F)F)c5)cc4)cnc23)cn1'],
       ['JRVSFZKYQCETAH-UHFFFAOYSA-N',
        'COc1cccc(C(C)NC(=O)c2ccc(-c3ccncc3)cc2)c1'],
       ['CVLRLLPCDQDBIQ-UHFFFAOYSA-N', 'O=Cc1ccc2c(c1)ncn2-c1ccccc1'],
       ...,
       ['GNZPMJDEUIVORU-UHFFFAOYSA-N',
        'O=C(c1cc(-c2ccc3[nH]ncc3c2)on1)N1CCCC(O)C1'],
       ['ZBNZXTGUTAYRHI-UHFFFAOYSA-N',
        'Cc1nc(Nc2ncc(C(=O)Nc3c(C)cccc3Cl)s2)cc(N2CCN(CCO)CC2)n1'],
       ['DAHUEDOCTJXJLO-UHFFFAOYSA-N', 'CC(C)Nc1n[nH]c2ccc(N)cc12']],
      dtype=object)

In [17]:
print(x_train.dtypes)
print(x_val.dtypes)
print(x_test.dtypes)
print(y_train.dtypes)
print(y_val.dtypes)
print(y_test.dtypes)

inchikey    object
smiles      object
dtype: object
inchikey    object
smiles      object
dtype: object
ichikey    object
smiles     object
dtype: object
activity    object
dtype: object
activity    object
dtype: object
activity    object
dtype: object


In [18]:
x_train.values.shape

(3181, 2)

In [19]:
# input = Input(shape=x_train.shape[1])
# dense = Dense(32, activation='relu')(input)
# dense = Dense(64, activation='relu')(dense)
# dense = Dense(32, activation='relu')(dense)
# dense = Dense(16, activation='relu')(dense)
# output = Dense(1)(dense)

# model = Model(input, output)

In [20]:
# es = EarlyStopping(monitor='val_loss', patience=20, mode='auto')
# rl = ReduceLROnPlateau(monitor='val_loss', patience=15, mode='auto')
# mc = ModelCheckpoint(monitor='val_loss', mode='auto')

In [21]:
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='acc')
# model.fit(x_train.values, y_train, epochs=500, validation_data=(x_val.values, y_val), callbacks=[es, rl])

# loss = model.evaluate(x_test, y_test)
# y_pred = model.predict(x_test)

In [22]:
# print(loss)
# print(y_pred[:5])

In [23]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
model.score(x_test, y_test)

ValueError: could not convert string to float: 'UOOZSMNSBYVUGZ-UHFFFAOYSA-N'