<a href="https://colab.research.google.com/github/pojo-25/drugProject/blob/main/SequenceTemelModelCalisan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, confusion_matrix
import numpy as np

In [None]:
# data_path = '/content/data/HIV.csv'
# col_smiles = 'smiles'
# col_target = 'HIV_active'


# # read data
# df = pd.read_csv(data_path, sep=',')
# df_no_na = df[[col_smiles, col_target]].dropna()

# X = df_no_na[col_smiles]
# y = df_no_na[col_target].values

# print(X)

In [None]:
# --- Input data -------------------------------------------------------------
data_path = '/content/data/HIV.csv'
col_smiles = 'smiles'       # CSV column holding the SMILES strings
col_target = 'HIV_active'   # CSV column holding the label

# --- Train/test split -------------------------------------------------------
TEST_RATIO = 0.2   # fraction of the rows held out as test set
SEED = 0           # random_state for the split

# NOTE(review): CLASSES is not referenced in the visible cells, and the label
# names do not obviously match an HIV-activity target -- confirm or remove.
CLASSES = ['benign', 'malignant']

# --- Keys of the score dictionary / columns of the summary table ------------
METRIC_ACCURACY = 'accuracy'
METRIC_F1_SCORE = 'f1-score'
METRIC_COHEN_KAPPA = 'Cohen kappa'
METRIC_CONFUSION_MATRIX = 'Confusion Matrix'

In [None]:
def read_data(data_path, col_smiles='smiles', col_target='HIV_active'):
    """Load the smiles column and the target column from a CSV file.

    Only the two requested columns are kept, and every row with a missing
    value in either of them is dropped. No train/test split happens here.

    :param data_path: str, path to a comma-separated data file
    :param col_smiles: str, name of smiles column
    :param col_target: str, name of target column
    :return (X, y): X is the smiles column as a pandas Series (row labels of
        the surviving rows are kept as-is), y is a numpy array with the
        matching target values
    """
    wanted_columns = [col_smiles, col_target]
    complete_rows = pd.read_csv(data_path, sep=',')[wanted_columns].dropna()

    return complete_rows[col_smiles], complete_rows[col_target].values

                
def get_prediction_score(y_label, y_predict):
    """Score a set of predictions with several classification metrics.

    :param y_label: list, contains true label
    :param y_predict: list, contains predicted label
    :return scores: dict keyed by the METRIC_* constants, holding accuracy,
        macro-averaged F1, Cohen's kappa and the confusion matrix
    """
    return {
        METRIC_ACCURACY: accuracy_score(y_label, y_predict),
        METRIC_F1_SCORE: f1_score(y_label, y_predict, average='macro'),
        METRIC_COHEN_KAPPA: cohen_kappa_score(y_label, y_predict),
        METRIC_CONFUSION_MATRIX: confusion_matrix(y_label, y_predict),
    }

In [None]:
import os
import sys
sys.path.insert(0, os.getcwd()) # add current working directory to pythonpath



import tensorflow as tf

# Keras backend handle; used below to register a session and later in the
# notebook to clear the backend state between models.
# NOTE(review): `tensorflow.python.keras` is a private module path rather than
# the public `tensorflow.keras` used for every other import in this cell --
# confirm both resolve to the same backend on the installed TF version.
from tensorflow.python.keras import backend as K
#os.environ["CUDA_VISIBLE_DEVICES"]="0" # Use only the 1st GPU
# Create a TF1-compat session from a default ConfigProto and make it the
# session the Keras backend uses.
tf_config = tf.compat.v1.ConfigProto()
sess = tf.compat.v1.Session(config=tf_config)
K.set_session(sess)


from tensorflow.keras import callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Flatten
from tensorflow.keras.layers import Embedding, LSTM, GlobalMaxPooling1D, SpatialDropout1D, Conv1D, MaxPooling1D, GRU, Bidirectional
from tensorflow.keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight


import warnings
import gc


In [None]:
# import os
# import sys
# sys.path.insert(0, os.getcwd()) # add current working directory to pythonpath



# import keras.backend as K
# import numpy as np
# import tensorflow as tf
# from keras import callbacks
# from keras.optimizers import Adam
# from keras.models import load_model
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.models import Sequential
# from keras.models import Model
# from keras.layers import Input, Dense, Dropout, BatchNormalization, Flatten
# from keras.layers import Embedding, LSTM, GlobalMaxPooling1D, SpatialDropout1D, Conv1D, MaxPooling1D, GRU, Bidirectional
# from keras import optimizers
# from sklearn.model_selection import train_test_split
# from sklearn.utils import class_weight

# import warnings
# import argparse
# import gc


In [None]:
def generate_tokens(smiles, len_percentile=100):
    """
    Encode smiles as fixed-length sequences of character ids.

    Every distinct character receives an integer id. Id 0 is the padding value
    and the out-of-vocabulary token '_' takes the first id. Characters are kept
    case-sensitive: the Keras Tokenizer lower-cases text by default, which
    would merge symbols that mean different things in SMILES (e.g. 'C' and
    'c'), so ``lower=False`` is passed explicitly.

    :param smiles: Pandas series, containing smiles
    :param len_percentile: percentile (0-100) of smiles length to set as max
        length; longer smiles are cut at the end, shorter ones are padded with
        0 at the end
    :return tokens: 2-D integer array, one row of ``max_phrase_len`` ids per smiles
    :return num_words: int, largest token id that can appear in ``tokens``
        (distinct characters plus the OOV token), so an embedding layer needs
        ``num_words + 1`` rows to also cover the padding id 0
    :return max_phrase_len: int, length every sequence is padded/truncated to
    """

    # Get max length of smiles
    smiles_len = smiles.apply(len)
    max_phrase_len = int(np.percentile(smiles_len, len_percentile))
    print('True max length is ' + str(np.max(smiles_len)) + ', ' + str(max_phrase_len) + ' is set the length cutoff.')

    tokenizer = Tokenizer(
        num_words=None,   # keep every character seen during fitting
        filters='$',
        lower=False,      # SMILES is case-sensitive
        char_level=True,
        oov_token='_'
    )
    tokenizer.fit_on_texts(smiles)

    # Take the vocabulary size from the fitted tokenizer so that it always
    # agrees with the ids actually emitted by texts_to_sequences.
    num_words = int(max(tokenizer.word_index.values()))
    print('Vocab size is ' + str(num_words))

    sequences = tokenizer.texts_to_sequences(smiles)
    tokens = pad_sequences(sequences, maxlen=max_phrase_len, padding='post', truncating='post')

    return tokens, num_words, max_phrase_len

In [None]:
# Load the smiles/label pairs, then encode every smiles as a padded id sequence
# (percentile 100 -> the longest smiles sets the sequence length).
smiles, y = read_data(data_path, col_smiles=col_smiles, col_target=col_target)
tokens, num_words, max_phrase_len = generate_tokens(smiles, len_percentile=100)

True max length is 580, 580 is set the length cutoff.
Vocab size is 56


In [None]:
def create_model(model_type, num_words, input_length, output_dim=1, dropout_rate=0.0):
    """Build one of the supported sequence models.

    Every variant shares the same skeleton: a 50-dimensional embedding with
    ``num_words + 1`` rows, a model-specific encoder, then
    Dense(128, relu) -> Dropout -> Dense(output_dim, sigmoid).
    The model summary is printed before returning.

    :param model_type: str, can be 'cnn-gru', 'cnn', 'gru', 'lstm'
    :param num_words: int, largest token id in the input (embedding gets one extra row)
    :param input_length: int, length of the input sequences
    :param output_dim: int, number of sigmoid output units
    :param dropout_rate: float, dropout rate applied before the output layer
    :return model: Keras model (not compiled)
    :raises ValueError: if ``model_type`` is not one of the supported names
    """
    supported_types = ('lstm', 'gru', 'cnn-gru', 'cnn')
    # Fail early with a clear message instead of returning an empty model.
    if model_type not in supported_types:
        raise ValueError('{!r} is not supported; choose one of {}.'.format(
            model_type, ', '.join(supported_types)))

    model = Sequential()
    model.add(Embedding(num_words + 1, 50, input_length=input_length))

    # Model-specific encoder
    if model_type == 'lstm':  # LSTM - LSTM
        model.add(Bidirectional(LSTM(128, return_sequences=True)))
        model.add(Bidirectional(LSTM(128)))
    elif model_type == 'gru':  # GRU - GRU
        model.add(Bidirectional(GRU(128, return_sequences=True)))
        model.add(Bidirectional(GRU(128)))
    elif model_type == 'cnn-gru':  # 1D CNN - GRU
        model.add(Conv1D(192, 3, activation='relu'))
        model.add(Bidirectional(GRU(224, return_sequences=True)))
        model.add(Bidirectional(GRU(384)))
    else:  # 'cnn': 1D CNN
        model.add(Conv1D(192, 10, activation='relu'))
        model.add(BatchNormalization())
        model.add(Conv1D(192, 3, activation='relu'))
        model.add(Flatten())

    # Shared classification head
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(output_dim, activation='sigmoid'))

    model.summary()
    return model

In [None]:
# def build_sequence_model(trainset, testset, model_type, num_words, input_length, output_dim=1, dropout_rate=0.0,
#                      batch_size=32, nb_epochs=100, lr=0.001):
#     """Train and evaluate CNN model
#     :param trainset: (X_train, y_train)
#     :param testset: (X_test, y_test)
#     :param model_type: str, can be 'cnn-gru', 'cnn', 'gru', 'lstm'
#     :param num_words: int
#     :param input_length: int
#     :param output_dim: int
#     :param batch_size: int, batch size for model training
#     :param nb_epochs: int, number of training epoches
#     :param lr: float, learning rate
#     :param save_path: path to save model
#     :return model: fitted Keras model
#     :return scores: dict, scores on test set for the fitted Keras model
#     """
    
#     # Create model
#     model = create_model(model_type=model_type, num_words=num_words, input_length=input_length, output_dim=output_dim,
#                          dropout_rate=dropout_rate)
    
#     # Callback list
#     callback_list = []
#     # monitor val_loss and terminate training if no improvement
#     early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.00001, \
#                 patience=20, verbose=2, mode='auto', restore_best_weights=True)
#     callback_list.append(early_stop)
    
#     # if save_path is not None:
#     #     # save best model based on val_acc during training
#     #     checkpoint = callbacks.ModelCheckpoint(os.path.join(save_path, '.h5'), monitor='val_acc', \
#     #                 verbose=0, save_best_only=True, save_weights_only=False, mode='auto')
#     #     callback_list.append(checkpoint)
        
#     # Get train and test set
#     (X_train, y_train) = trainset
#     (X_test, y_test) = testset
    
#     # Compute class weights
#     weight_list = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
#     weight_dict = {}
#     for i in range(len(np.unique(y_train))):
#         weight_dict[np.unique(y_train)[i]] = weight_list[i]
    
#     # Train only classification head
#     optimizer = Adam(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)
#     model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
#     model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=nb_epochs, \
#                         class_weight=weight_dict, callbacks=callback_list, verbose=2)
    
#     # Evaluate model    
#     prediction = model.predict(X_test)
#     y_val_predict = (prediction > 0.5).astype('uint8')
#     with warnings.catch_warnings():
#         warnings.simplefilter('ignore')  # disable the warning on f1-score with not all labels
#         scores = get_prediction_score(y_val, y_val_predict)
        
#     return model, scores

In [None]:
def build_sequence_model(trainset, testset, model_type, num_words, input_length, output_dim=1, dropout_rate=0.0,
                     batch_size=32, nb_epochs=100, lr=0.001,
                     save_path=None):
    """Train a sequence model and score it on the test set.

    The model is trained with class weights balanced on ``y_train``, with early
    stopping on ``val_loss`` (best weights restored). Predictions on the test
    set are thresholded at 0.5 before scoring.

    NOTE(review): ``testset`` is used both as validation data for early
    stopping / checkpointing and for the returned scores, so the scores are not
    measured on data the training loop never saw.

    :param trainset: (X_train, y_train)
    :param testset: (X_test, y_test)
    :param model_type: str, can be 'cnn-gru', 'cnn', 'gru', 'lstm'
    :param num_words: int
    :param input_length: int
    :param output_dim: int
    :param dropout_rate: float, dropout rate forwarded to create_model
    :param batch_size: int, batch size for model training
    :param nb_epochs: int, number of training epoches
    :param lr: float, learning rate
    :param save_path: where to checkpoint the best model; either a directory
        (the file is then named '<model_type>.h5') or a full path ending in
        '.h5'. Missing directories are created. None disables checkpointing.
    :return model: fitted Keras model
    :return scores: dict, scores on test set for the fitted Keras model
    """

    # Create model
    model = create_model(model_type=model_type, num_words=num_words, input_length=input_length, output_dim=output_dim,
                         dropout_rate=dropout_rate)

    # monitor val_loss and terminate training if no improvement
    early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.00001,
                                         patience=20, verbose=2, mode='auto', restore_best_weights=True)
    callback_list = [early_stop]

    if save_path is not None:
        save_path = str(save_path)
        # Accept both a target directory and a ready-made '.h5' file path.
        if save_path.endswith('.h5'):
            checkpoint_path = save_path
        else:
            checkpoint_path = os.path.join(save_path, model_type + '.h5')
        os.makedirs(os.path.dirname(checkpoint_path) or '.', exist_ok=True)

        # Save the best model seen so far. With metrics=['accuracy'] the
        # validation metric is logged as 'val_accuracy'.
        checkpoint = callbacks.ModelCheckpoint(checkpoint_path, monitor='val_accuracy',
                                               verbose=0, save_best_only=True, save_weights_only=False, mode='auto')
        callback_list.append(checkpoint)

    # Get train and test set
    (X_train, y_train) = trainset
    (X_test, y_test) = testset

    # Compute class weights (keyword arguments: newer scikit-learn releases
    # reject the positional form)
    classes = np.unique(y_train)
    weight_list = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    weight_dict = dict(zip(classes, weight_list))

    # Train the whole model
    optimizer = Adam(lr=lr, decay=1e-6)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=batch_size, epochs=nb_epochs,
              class_weight=weight_dict, callbacks=callback_list, verbose=2)

    # Evaluate model
    prediction = model.predict(X_test)
    y_test_predict = (prediction > 0.5).astype('uint8')
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')  # disable the warning on f1-score with not all labels
        scores = get_prediction_score(y_test, y_test_predict)

    return model, scores

In [None]:
data_path = '/content/data/HIV.csv'


model_list = ['cnn', 'cnn-gru', 'gru', 'lstm']
batch_size = 16
nb_epochs = 1
lr = 0.001
save_path = '/content/data'

WORK_DIRECTORY = '/content/data'

# Directory that receives one '<model_type>.h5' checkpoint per model.
# build_sequence_model appends the file name itself, so it must be given the
# directory, not a file path.
checkpoint_dir = None
if save_path is not None:
    checkpoint_dir = os.path.join(save_path, 'sequence_models')
    os.makedirs(checkpoint_dir, exist_ok=True)

# Read data
smiles, y = read_data(data_path, col_smiles='smiles', col_target='HIV_active')
tokens, num_words, max_phrase_len = generate_tokens(smiles, len_percentile=100)

# Get train and test set (stratified so both parts keep the label ratio)
X_train, X_test, y_train, y_test = train_test_split(tokens, y, test_size=TEST_RATIO, shuffle=True, stratify=y,
                                                  random_state=SEED)

# Build and evaluate sequence models
model_scores = []
for model_type in model_list:
    model, scores = build_sequence_model((X_train, y_train), (X_test, y_test), model_type, num_words, max_phrase_len,
                                          output_dim=1, dropout_rate=0.0,
                                          batch_size=batch_size, nb_epochs=nb_epochs, lr=lr,
                                          save_path=checkpoint_dir)
    model_scores.append(scores)

    # force release memory
    K.clear_session()
    del model
    gc.collect()

# One row per model, one column per metric
metric_columns = [METRIC_ACCURACY, METRIC_F1_SCORE, METRIC_COHEN_KAPPA, METRIC_CONFUSION_MATRIX]
summary = {'model': model_list}
for metric in metric_columns:
    summary[metric] = [score[metric] for score in model_scores]
model_df = pd.DataFrame(summary)[['model'] + metric_columns]

# The CSV keeps the training order; the in-memory table is then ranked.
model_df.to_csv(os.path.join(WORK_DIRECTORY, 'summary_sequence_model.csv'), index=False)
model_df = model_df.sort_values(by=[METRIC_ACCURACY, METRIC_F1_SCORE, METRIC_COHEN_KAPPA],
                                ascending=False)
print('Best model:\n' + str(model_df.iloc[0]))





True max length is 580, 580 is set the length cutoff.
Vocab size is 56
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 580, 50)           2850      
_________________________________________________________________
conv1d (Conv1D)              (None, 571, 192)          96192     
_________________________________________________________________
batch_normalization (BatchNo (None, 571, 192)          768       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 569, 192)          110784    
_________________________________________________________________
flatten (Flatten)            (None, 109248)            0         
_________________________________________________________________
dense (Dense)                (None, 128)               13983872  
___________________________________________________

In [None]:
# Show the summary table of test-set metrics, one row per trained model.
print(model_df)

     model  accuracy  f1-score  Cohen kappa            Confusion Matrix
0      cnn  0.683929  0.464175     0.061529  [[5447, 2490], [110, 179]]
3     lstm  0.268417  0.232817     0.000468   [[1990, 5947], [71, 218]]
2      gru  0.067469  0.067467     0.000981     [[272, 7665], [6, 283]]
1  cnn-gru  0.035133  0.033940     0.000000       [[0, 7937], [0, 289]]
