# Imports

In [88]:
import os
import pickle
from typing import Tuple

import pandas as pd
from pandas import DataFrame

import numpy as np
from numpy import ndarray

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import (
    Conv2D,
    Dense,
    Dropout,
    MaxPooling2D,
    Flatten
)
from keras.optimizers import Adam

# Model Definition

In [89]:
def get_model(
    input_shape: Tuple[int, ...],
    learning_rate: float = 0.01,
    dropout_rate: float = 0.1
) -> "Sequential":
    """
    Build and compile a small convolutional regression network.

    Layer stack, in order: Conv2D (64 filters, 3x3, ReLU) -> MaxPooling2D (2x2)
    -> Flatten -> Dense(512, ReLU) -> Dropout -> Dense(256, ReLU)
    -> Dense(128, ReLU) -> Dense(1, linear).

    The model is compiled with mean-squared-error loss and the Adam optimizer.

    :param input_shape: Shape of a single input sample (without the batch
        axis), forwarded to the first Conv2D layer.
    :param learning_rate: Learning rate handed to Adam. Defaults to 0.01, the
        value that was previously hard-coded.
    :param dropout_rate: Rate of the Dropout layer that follows the first
        Dense layer. Defaults to 0.1, the value that was previously hard-coded.
    :return: The compiled Sequential model (not yet trained).
    """

    _model: Sequential = Sequential(
        [
            # Convolution + pooling front end
            Conv2D(
                filters=64,
                kernel_size=(3, 3),
                input_shape=input_shape,
                activation='relu'
            ),
            MaxPooling2D(pool_size=(2, 2)),

            # Fully connected head
            Flatten(),
            Dense(512, activation='relu'),
            Dropout(dropout_rate),
            Dense(256, activation='relu'),
            Dense(128, activation='relu'),

            # Single linear unit: the network outputs one unbounded scalar
            Dense(1, activation='linear')
        ]
    )

    _model.compile(
        loss='mean_squared_error',
        optimizer=Adam(learning_rate=learning_rate)
    )

    return _model

# Load Data

In [57]:
# The project root is taken to be the parent of the current working directory,
# so these paths depend on where the notebook kernel was started.
CROSS_COLUMN_QSRR_PATH: str = os.path.dirname(os.getcwd())
DATA_PATH: str = os.path.join(CROSS_COLUMN_QSRR_PATH, "data")

# Dataset files located directly inside the data directory
SMRT_DATASET: str = os.path.join(
    DATA_PATH,
    "2023-11-18-smrt_dataset.csv"
)
SMRT_DATASET_SMILES: str = os.path.join(
    DATA_PATH,
    "2023-11-18-smrt_dataset_smiles.csv"
)
SMRT_DATASET_ENCODED_SMILES: str = os.path.join(
    DATA_PATH,
    "2023-11-18-smrt_dataset_encoded_smiles.pkl"
)

In [58]:
# Read the SMILES / rt table from CSV.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which suggests
# the CSV was saved together with its index - confirm whether it should be
# consumed via index_col=0.
_data_df: DataFrame = pd.read_csv(SMRT_DATASET_SMILES, sep=',')

In [59]:
# Preview the loaded table; long frames are rendered truncated (head and tail only).
display(_data_df)

Unnamed: 0,smiles,rt
0,COC(=O)N1CCN(C(=O)Cc2ccc(Cl)c(Cl)c2)[C@H](CN2C...,687.8
1,CCN1CCC[C@@H]1CN=C(O)c1cc(S(=O)(=O)CC)c(N)cc1OC,590.7
2,Oc1cccc2c(O)nccc12,583.6
3,OC1=NCC2(CCN(CCc3ccccc3)CC2)O1,579.0
4,CC(C)(C)NC[C@H](O)COc1cccc2nc(O)[nH]c12,603.1
...,...,...
77890,CCOc1ccc(OCC)c(NS(=O)(=O)c2ccc(/C=C/c3onc(C)c3...,946.4
77891,COCCCN=C(O)N1C[C@@H]2CN(S(=O)(=O)c3cccc(F)c3)C...,653.1
77892,COCCN=C(O)N1C[C@@H]2CN(S(=O)(=O)c3cccc(C)c3)C[...,648.2
77893,Cc1ccc(S(=O)(=O)N2C[C@@H]3CN(C(O)=Nc4ccccc4C)C...,783.9


In [60]:
# Load the pre-computed encoded SMILES array from its pickle file.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file when it comes from a trusted source.
with open(SMRT_DATASET_ENCODED_SMILES, "rb") as f:
    _encoded_smiles_arr: ndarray = pickle.load(f)

In [61]:
# Inspect the loaded array; the repr of a large array is abbreviated with "...".
_encoded_smiles_arr

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

# Train/Test Split

In [62]:
# Fraction of the samples held out as the test split (passed as test_size).
TEST_RATIO: float = 0.3
# Seed passed to train_test_split so the partition is reproducible.
RANDOM_STATE: int = 12345

In [63]:
# Split features and targets in a single call so both share the same row
# partition. The outputs come back as (train, test) pairs in the order the
# arrays are passed in: features first, then targets.
_x_train, _x_test, _y_train, _y_test = train_test_split(
    _encoded_smiles_arr,
    _data_df["rt"],
    test_size=TEST_RATIO,
    random_state=RANDOM_STATE
)

In [64]:
# Show the shapes of the feature arrays (train, then test).
display(_x_train.shape, _x_test.shape)

(54526, 90, 54)

(23369, 90, 54)

In [65]:
# Show the shapes of the target vectors (train, then test).
display(_y_train.shape, _y_test.shape)

(54526,)

(23369,)

In [69]:
# Read the two encoding axes (axes 1 and 2) of the feature array.
# Slicing [1:3] instead of [1:] keeps this cell re-runnable: once the reshape
# cell has appended a trailing channel axis to `_x_train`, shape[1:] would hold
# three values and the two-name unpacking would raise ValueError.
max_length, vocabulary_size = _x_train.shape[1:3]

In [76]:
# Per-sample shape with a trailing channel axis of size 1; this is the
# input_shape later handed to the model's first Conv2D layer.
_input_shape = (max_length, vocabulary_size, 1)

# Give both splits that per-sample shape while keeping their sample counts.
_x_train = _x_train.reshape((_x_train.shape[0],) + _input_shape)
_x_test = _x_test.reshape((_x_test.shape[0],) + _input_shape)

In [77]:
# Confirm the per-sample input shape that will be passed to the model.
display(_input_shape)

(90, 54, 1)

# Get & Train Model

In [90]:
# Build and compile a fresh model for inputs of shape `_input_shape`.
_model = get_model(input_shape=_input_shape)

In [95]:
# Print each layer's output shape and parameter count.
_model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_10 (Conv2D)          (None, 88, 52, 64)        640       
                                                                 
 max_pooling2d_6 (MaxPoolin  (None, 44, 26, 64)        0         
 g2D)                                                            
                                                                 
 flatten_6 (Flatten)         (None, 73216)             0         
                                                                 
 dense_15 (Dense)            (None, 512)               37487104  
                                                                 
 dropout_3 (Dropout)         (None, 512)               0         
                                                                 
 dense_16 (Dense)            (None, 256)               131328    
                                                     

In [None]:
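# NOTE(review): unfinished scratch tally "10 + 10 + 20 + " - presumably the epochs accumulated over repeated fit() runs on the same model; confirm or remove.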

In [94]:
# Train on the training split only; no validation data is supplied to fit().
_history = _model.fit(
    x=_x_train,
    y=_y_train,
    batch_size=64,
    epochs=20
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
