In [2]:
# general
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# rdkits
from rdkit.Chem import AllChem
from rdkit import Chem,DataStructs
from rdkit.Chem import MACCSkeys
# ml
import tensorflow as tf
# etcs
import warnings 
warnings.filterwarnings(action='ignore')

In [3]:
def _read_indexed_csv(path):
    """Read a CSV file, using its first column as the DataFrame index."""
    return pd.read_csv(path, index_col=0)


# Training table, test table and the submission template.
train_df = _read_indexed_csv('./data/train_set.ReorgE.csv')
test_df = _read_indexed_csv('./data/test_set.csv')
submission = _read_indexed_csv('./data/sample_submission.csv')

In [4]:
# Display the training DataFrame as this cell's output.
train_df

Unnamed: 0_level_0,SMILES,Reorg_g,Reorg_ex
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
train_0,CC[C@H]1CCCCN1C(=O)[C@@H](C)OC(=O)c1c(C)oc(-n2...,0.631486,0.535060
train_1,O[C@@H](CNC1CC1)CN1CCc2sccc2C1,0.825901,1.116781
train_2,N#CCCNC(=O)[C@@]1(O)CCSC1,1.463943,0.964848
train_3,COC[C@H]1CN(c2ccc(OCC[C@@H](C)O)cc2)C(=O)O1,0.166669,0.161458
train_4,N#Cc1c(-c2ccccc2OCC(N)=O)[nH]c(C(N)=O)c1N,0.313820,0.338862
...,...,...,...
train_18152,CC(=O)Nc1ccc2ccc3cccc4ccc1c2c34,0.146917,0.143084
train_18153,CC(C)(C)c1ccccc1N(c1ccccc1)c1ccc(S(=O)(=O)c2cc...,0.612898,0.500668
train_18154,CN(C)c1ccc(C(=O)Nc2ccccc2)cc1,1.218777,1.048954
train_18155,c1ccc(N(c2ccccc2)c2ccc(-c3ncc(-c4ccc(-c5cnc(-c...,0.145292,0.182589


In [6]:
# Fingerprint type to compute; must be one of the keys of `fp_builders` below.
ffpp = 'pattern'

# Maps the `ffpp` selector to the RDKit call that builds the fingerprint.
# Lambdas keep the attribute lookups lazy, so only the selected generator is
# touched. 'layerd' is the spelling originally used by this notebook and is
# kept for compatibility; 'layered' is accepted as an alias.
fp_builders = {
    'maccs': lambda m: MACCSkeys.GenMACCSKeys(m),
    'morgan': lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 4),
    'rdkit': lambda m: Chem.RDKFingerprint(m),
    'pattern': lambda m: Chem.rdmolops.PatternFingerprint(m),
    'layerd': lambda m: Chem.rdmolops.LayeredFingerprint(m),
    'layered': lambda m: Chem.rdmolops.LayeredFingerprint(m),
}
if ffpp not in fp_builders:
    # Fail loudly: previously an unknown selector was swallowed by a bare
    # `except`, which either skipped every row or re-used a stale `fp`.
    raise ValueError(f"Unknown fingerprint type {ffpp!r}; expected one of {sorted(fp_builders)}")
make_fp = fp_builders[ffpp]

train_fps = []
train_y_g = []
train_y_ex = []
train_skipped = []  # index labels of rows whose SMILES could not be fingerprinted

# Column access happens outside the try block so that a missing column raises
# immediately instead of silently emptying the training set.
for index, smiles, y_g, y_ex in zip(train_df.index, train_df['SMILES'],
                                    train_df['Reorg_g'], train_df['Reorg_ex']):
    try:
        mol = Chem.MolFromSmiles(smiles)
        fp = make_fp(mol) if mol is not None else None
    except Exception:
        # Best-effort: a row that RDKit cannot process is skipped, but recorded.
        fp = None
    if fp is None:
        train_skipped.append(index)
        continue

    # Features and both targets are appended together so they stay aligned.
    train_fps.append(fp)
    train_y_g.append(y_g)
    train_y_ex.append(y_ex)

if train_skipped:
    # print() rather than warnings.warn(): the notebook silences all warnings.
    print(f"Skipped {len(train_skipped)} training rows with unusable SMILES: {train_skipped[:10]}")

# Convert every RDKit bit vector into a numpy array (filled in place by
# ConvertToNumpyArray) and stack them into one 2-D feature matrix.
np_train_fps = []
for fp in train_fps:
    arr = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(fp,arr)
    np_train_fps.append(arr)

np_train_fps_array = np.array(np_train_fps)

In [7]:
def _bitvect_to_array(bitvect):
    """Return a numpy array holding the contents of one RDKit fingerprint."""
    buffer = np.zeros((0,))
    # ConvertToNumpyArray writes the fingerprint into `buffer` in place.
    DataStructs.ConvertToNumpyArray(bitvect, buffer)
    return buffer


np_sum = []
# One numpy row per training fingerprint, then stacked into a single array.
np_train_fps = [_bitvect_to_array(bitvect) for bitvect in train_fps]
np_train_fps_array = np.array(np_train_fps)

In [8]:
# Wrap the training fingerprint matrix in a DataFrame and persist it to disk
# without writing the row index.
train_data = pd.DataFrame(data=np_train_fps_array)
train_data.to_csv(path_or_buf='train_data.csv', index=False)

In [9]:
# Preview the first rows of the training fingerprint feature frame.
train_data.head()

In [10]:
# Shape of the stacked training fingerprint array.
np_train_fps_array.shape

In [11]:
# Count how often each value occurs in column 0 of the training fingerprints.
pd.Series(np_train_fps_array[:,0]).value_counts()

In [12]:
# Fingerprint type to compute; must be one of the keys of `fp_builders` below
# and should match the type used for the training set.
ffpp = 'pattern'

# Maps the `ffpp` selector to the RDKit call that builds the fingerprint.
# 'layerd' is the spelling originally used by this notebook and is kept for
# compatibility; 'layered' is accepted as an alias.
fp_builders = {
    'maccs': lambda m: MACCSkeys.GenMACCSKeys(m),
    'morgan': lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 4),
    'rdkit': lambda m: Chem.RDKFingerprint(m),
    'pattern': lambda m: Chem.rdmolops.PatternFingerprint(m),
    'layerd': lambda m: Chem.rdmolops.LayeredFingerprint(m),
    'layered': lambda m: Chem.rdmolops.LayeredFingerprint(m),
}
if ffpp not in fp_builders:
    # Fail loudly instead of letting a bare `except` hide the problem.
    raise ValueError(f"Unknown fingerprint type {ffpp!r}; expected one of {sorted(fp_builders)}")
make_fp = fp_builders[ffpp]

test_fps = []
test_y = []
test_skipped = []  # index labels of rows whose SMILES could not be fingerprinted

# Labels are optional: `test_y` is only filled when the test frame actually
# has a 'y' column. Previously a missing column raised a KeyError on every row
# that was silently swallowed by the bare `except`.
has_labels = 'y' in test_df.columns

for index,row in test_df.iterrows():
    smiles = row['SMILES']  # outside the try: a missing column must fail loudly
    try:
        mol = Chem.MolFromSmiles(smiles)
        fp = make_fp(mol) if mol is not None else None
    except Exception:
        # Best-effort: a row that RDKit cannot process is skipped, but recorded.
        fp = None
    if fp is None:
        test_skipped.append(index)
        continue

    test_fps.append(fp)
    if has_labels:
        test_y.append(row['y'])

if test_skipped:
    # print() rather than warnings.warn(): the notebook silences all warnings.
    # Dropped test rows mean the prediction array will be shorter than the
    # submission template, so this must not go unnoticed.
    print(f"Skipped {len(test_skipped)} test rows with unusable SMILES: {test_skipped[:10]} "
          "- predictions will not line up with the submission rows")

# Convert every RDKit bit vector into a numpy array (filled in place by
# ConvertToNumpyArray) and stack them into one 2-D feature matrix.
np_test_fps = []
np_test_sum = []
for fp in test_fps:
    arr = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(fp,arr)
    np_test_fps.append(arr)

np_test_fps_array = np.array(np_test_fps)

In [13]:
# Shape of the stacked test fingerprint array.
np_test_fps_array.shape

In [14]:
# Wrap the test fingerprint matrix in a DataFrame and persist it to disk
# without writing the row index.
test_data = pd.DataFrame(data=np_test_fps_array)
test_data.to_csv(path_or_buf='test_data.csv', index=False)

In [15]:
# Count how often each value occurs in column 0 of the test fingerprints.
pd.Series(np_test_fps_array[:,0]).value_counts()

In [16]:
from keras.layers import Dense,Dropout,BatchNormalization,LeakyReLU,PReLU,Embedding,LSTM,GRU,SimpleRNN
from tensorflow_addons.layers import Maxout,GELU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential,Model
from keras.optimizers import Adam,SGD
from tensorflow.keras.losses import MAE
from tensorflow.nn import gelu

from sklearn.model_selection import KFold,cross_val_score,cross_validate,train_test_split,StratifiedKFold
from sklearn.metrics import mean_absolute_error

In [17]:
# Features are the stacked training fingerprints; targets come from `train_y`.
# NOTE(review): `train_y` is not assigned in any cell visible in this notebook
# (the fingerprinting cell builds `train_y_g` and `train_y_ex`). On a fresh
# kernel this line raises NameError - confirm which target is intended.
X_train = np_train_fps_array
Y_train = np.array(train_y)
print(X_train.shape, Y_train.shape)

In [18]:
# Insert a length-1 "time step" axis so the GRU layers receive
# (samples, 1, n_bits). The sample count and bit width are derived from the
# array itself instead of being hard-coded, so the cell works for any dataset
# size or fingerprint length and is safe to re-run on an already reshaped array.
X_train = X_train.reshape(-1, 1, X_train.shape[-1])
X_train.shape

In [19]:
# Insert a length-1 "time step" axis so the test features match the
# (samples, 1, n_bits) layout fed to the GRU. Sizes are derived from the array
# rather than hard-coded, and re-running the cell leaves the shape unchanged.
np_test_fps_array = np_test_fps_array.reshape(-1, 1, np_test_fps_array.shape[-1])
np_test_fps_array.shape

## Model

### KFold validation

In [20]:
# 5-fold cross-validation with shuffling. A fixed random_state makes the fold
# assignment reproducible across kernel restarts.
kf = KFold(n_splits=5,shuffle=True,random_state=42)

In [23]:
def build_model(input_shape):
    """Build and compile the GRU + dense regression network.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, i.e. (time_steps, n_features).

    Returns
    -------
    A compiled Sequential model with a single linear output unit, trained
    with mean-absolute-error loss.
    """
    model = Sequential()
    # Recurrent stack; only the last GRU collapses the sequence axis.
    model.add(GRU(2048,input_shape=input_shape,return_sequences=True,activation='swish'))
    model.add(Dropout(0.1))
    model.add(GRU(2048,return_sequences=True,activation='swish'))
    model.add(Dropout(0.1))
    model.add(GRU(1024,return_sequences=True,activation='swish'))
    model.add(Dropout(0.1))
    model.add(GRU(1024))
    model.add(Dropout(0.2))

    # Dense head: each Dense layer is followed by a GELU layer, with dropout
    # after all but the last (256-unit) block.
    for units, drop in ((1024, 0.1), (2048, 0.1), (1024, 0.1), (512, 0.1), (256, None)):
        model.add(Dense(units,activation='swish'))
        model.add(GELU(True))
        if drop is not None:
            model.add(Dropout(drop))

    model.add(Dense(1,kernel_initializer='normal'))
    model.compile(optimizer=Adam(learning_rate=0.00035),
             loss='mean_absolute_error',
             metrics=['mae'])
    return model


MAE_score = []      # validation MAE of the best checkpoint, one entry per fold
history_loss = []   # per-fold validation loss curves
train_loss = []     # per-fold training loss curves
epochs = 100

for fold_num, (train_idx, valid_idx) in enumerate(kf.split(X_train,Y_train), start=1):
    print(f"------------------fold{fold_num}----------------------")
    x_train, x_val = X_train[train_idx], X_train[valid_idx]
    y_train, y_val = Y_train[train_idx], Y_train[valid_idx]

    # Drop graph state left over from the previous fold so memory does not
    # accumulate across the large per-fold models.
    tf.keras.backend.clear_session()
    model = build_model(X_train.shape[1:])

    earlystopper = tf.keras.callbacks.EarlyStopping(patience=10)
    # save_best_only=True is required for `monitor` to have any effect:
    # without it the file is overwritten every epoch and ends up holding the
    # last epoch rather than the best one.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        f'my_model_{fold_num}.h5',
        monitor='val_mae',
        mode='min',
        save_best_only=True
    )
    scheduler = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_mae',
        factor=0.6,
        patience=5,
        verbose=1,
        mode='auto',
        min_lr=0.0001,
        cooldown=3
    )

    history = model.fit(x_train,y_train,validation_data=(x_val,y_val),
                       batch_size=32,epochs=epochs,callbacks=[earlystopper,checkpoint,scheduler])
    history_loss.append(history.history['val_loss'])
    train_loss.append(history.history['loss'])

    # Reload the best checkpoint of this fold and record its validation MAE
    # (evaluate returns [loss, mae] for the compiled metrics).
    model = tf.keras.models.load_model(f'./my_model_{fold_num}.h5')
    _, fold_mae = model.evaluate(x_val,y_val,verbose=0)
    MAE_score.append(fold_mae)
    print(f"fold{fold_num} best val_mae: {fold_mae:.5f}")

if MAE_score:
    print(f"mean CV val_mae: {np.mean(MAE_score):.5f}")
    

In [25]:
# One panel per trained fold, showing validation and training loss per epoch.
# The panel count follows the number of recorded histories instead of a
# hard-coded 5, so an interrupted run or a different n_splits still plots.
n_panels = len(history_loss)
plt.figure(figsize=(20,15))
for panel, (val_curve, train_curve) in enumerate(zip(history_loss, train_loss), start=1):
    plt.subplot(n_panels,1,panel)
    plt.plot(val_curve,label='deep_val_loss')
    plt.plot(train_curve,label='deep_train_loss')
    plt.title(f'fold {panel}')
    plt.xlabel('epochs')
    # The axis carries both the training and the validation curve.
    plt.ylabel('loss (MAE)')
    plt.legend()
    plt.grid()
plt.tight_layout()

In [26]:
# Predict the test set with each of the five saved fold models, in fold order.
fold_preds = []
for fold in range(1, 6):
    model = tf.keras.models.load_model(f'my_model_{fold}.h5')
    fold_preds.append(model.predict(np_test_fps_array))

# Keep the individual fold predictions available under their usual names.
pred_1, pred_2, pred_3, pred_4, pred_5 = fold_preds

# Ensemble by averaging the five predictions (summed left to right, then / 5).
final_pred = sum(fold_preds) / 5
print(final_pred.shape)
    

In [27]:
# The ensemble output comes from a single-unit output layer, so it is flattened
# to 1-D before being stored as a column.
flat_pred = np.asarray(final_pred).reshape(-1)
if len(flat_pred) != len(submission):
    # A mismatch means test rows were dropped (or the model has more than one
    # output); assigning anyway would misalign or broadcast the predictions.
    raise ValueError(
        f"{len(flat_pred)} predictions for {len(submission)} submission rows"
    )
# NOTE(review): the training table shown in this notebook has targets
# 'Reorg_g' / 'Reorg_ex', while the column written here is 'ST1_GAP(eV)' -
# confirm this is the column the submission template expects.
submission['ST1_GAP(eV)'] = flat_pred
submission

In [28]:
# The submission template is read with index_col=0, so its first column (the
# row identifiers) lives in the index. Writing with index=False would drop it;
# keep the index so the saved file retains one identifier per prediction.
submission.to_csv('scheduler_leakyrelu_kfold_submission.csv',index=True)

In [29]:
# Display the submission frame as this cell's output.
submission