# GRU (CITEseq)

In [None]:
import gc
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.utils import plot_model

from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold

## Import Raw Data

In [None]:
# Locations of the raw competition files and of the pre-computed feature files.
DATA_DIR = "../data/open-problems-multimodal"
feature_path = '../data/single_cell_features'

# feature_path carries no trailing separator, so one is inserted explicitly
# when building each file name; plain `feature_path + name` would glue the
# directory name and the file name together.
train_df = pd.read_feather(f'{feature_path}/train_cite_inputs_id.feather')
test_df = pd.read_feather(f'{feature_path}/test_cite_inputs_id.feather')

# Feature matrix and targets for training, stored as .npy arrays.
train_cite_X = np.load(f'{feature_path}/train_cite_X.npy')
train_cite_y = np.load(f'{feature_path}/train_cite_targets.npy')

# Feature matrix for the test set.
test_cite_X = np.load(f'{feature_path}/test_cite_X.npy')

## Evaluation Metric

In [None]:
def correlation_score(y_true, y_pred):
    """Return the mean row-wise Pearson correlation between two 2-D inputs.

    Row ``i`` of ``y_true`` is correlated with row ``i`` of ``y_pred`` and the
    per-row coefficients are averaged over the rows of ``y_true``.

    Args:
        y_true: 2-D array or ``pandas.DataFrame`` of reference values.
        y_pred: 2-D array or ``pandas.DataFrame`` of predictions, with at
            least as many rows as ``y_true``.

    Returns:
        The average of the per-row correlation coefficients.

    Raises:
        ZeroDivisionError: if ``y_true`` has no rows.
    """
    # isinstance (rather than an exact type comparison) also unwraps DataFrame
    # subclasses; left wrapped, `frame[i]` would select a column, not a row.
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values

    n_rows = len(y_true)
    # corrcoef returns the 2x2 correlation matrix; [1, 0] is the cross term.
    corrsum = sum(
        np.corrcoef(y_true[i], y_pred[i])[1, 0] for i in range(n_rows)
    )
    return corrsum / n_rows

## Helper Functions

In [None]:
# function to standardize each row of the dataset
def zscore(x):
    """Standardize every row of a 2-D array to zero mean and unit std.

    Args:
        x: 2-D array-like; each row is standardized independently using its
            own mean and (population) standard deviation.

    Returns:
        A new ``numpy.ndarray`` with the same shape as ``x``. A row with zero
        standard deviation produces non-finite values, because the division
        by the row's std is not guarded.
    """
    x = np.asarray(x)
    # keepdims=True keeps the statistics as column vectors so they broadcast
    # across each row; this replaces the former per-row Python loop.
    row_mean = x.mean(axis=1, keepdims=True)
    row_std = x.std(axis=1, keepdims=True)
    return (x - row_mean) / row_std

In [None]:
# function to compute the cosine-similarity loss between predictions and the truth
def cosine_similarity_loss(y_true, y_pred):
    """Cosine-similarity loss between row-centred targets and predictions.

    Each row of both tensors is mean-centred before the cosine similarity is
    taken, so the per-row similarity corresponds to the Pearson correlation
    of that row.

    Args:
        y_true: tensor of target values; reductions run over axis 1.
        y_pred: tensor of predicted values, same layout as ``y_true``.

    Returns:
        The value produced by ``tf.keras.losses.CosineSimilarity(axis=1)``.
        NOTE(review): Keras documents this loss as the *negative* cosine
        similarity, so lower is better -- confirm against the installed
        version.
    """
    x = y_true
    y = y_pred
    # obtain the row means
    mx = tf.reduce_mean(x, axis=1, keepdims=True)
    my = tf.reduce_mean(y, axis=1, keepdims=True)
    # zero-mean transform the data
    xm, ym = x - mx, y - my
    # l2 normalization: divide each element by the l2 norm of its row
    t1_norm = tf.math.l2_normalize(xm, axis=1)
    t2_norm = tf.math.l2_normalize(ym, axis=1)
    # compute the cosine similarity loss
    cosine = tf.keras.losses.CosineSimilarity(axis=1)(t1_norm, t2_norm)
    return cosine

## Data Generator

In [None]:
# use to generate batches of data for training the NN
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves fixed-size batches from in-memory arrays.

    Args:
        train_X: indexable feature array; rows are selected with a list of IDs.
        train_y: indexable target array, indexed the same way as ``train_X``.
        list_IDs: sequence of row IDs into ``train_X`` / ``train_y``.
        shuffle: if true, the order in which IDs are visited is reshuffled at
            construction time and whenever ``on_epoch_end`` is called.
        batch_size: number of samples per batch.
        labels: if true, ``__getitem__`` returns ``(X, y)``; otherwise only ``X``.
    """

    def __init__(self, train_X, train_y, list_IDs, shuffle, batch_size, labels, ):
        super().__init__()
        self.train_X = train_X
        self.train_y = train_y
        self.list_IDs = list_IDs
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.labels = labels
        # Build the initial (possibly shuffled) visiting order.
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Floor division: a trailing partial batch is not served.
        ct = len(self.list_IDs) // self.batch_size
        return ct

    def __getitem__(self, idx):
        'Generate one batch of data'
        # Take this batch's positions from self.indexes (the order prepared by
        # on_epoch_end) so that shuffle=True actually changes the batches;
        # slicing list_IDs directly would ignore the shuffled order.
        start = idx * self.batch_size
        batch_positions = self.indexes[start:start + self.batch_size]
        # find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in batch_positions]
        # generate data
        X, y = self.__data_generation(list_IDs_temp)
        if self.labels:
            return X, y
        else:
            return X

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples'
        X = self.train_X[list_IDs_temp]
        y = self.train_y[list_IDs_temp]
        return X, y

## K-Fold Evaluation

In [None]:
def nn_kfold(train_df, train_cite_X, train_cite_y, test_df, test_cite_X, network, folds, model_name):
    """Train ``network`` with K-fold cross-validation and predict the test set.

    For every fold a fresh model is built, fitted with checkpointing, early
    stopping and learning-rate reduction, and the best checkpoint is reloaded
    to produce out-of-fold and test predictions. Test predictions are averaged
    over the folds.

    Reads the module-level settings ``BATCH_SIZE``, ``EPOCHS`` and
    ``LR_FACTOR``. Writes one weights file per fold, named
    ``<model_name>_<fold>.h5``.

    Args:
        train_df: frame with one row per training sample; used for the split
            and for the number of rows of the out-of-fold array.
        train_cite_X: training feature array, indexed by row.
        train_cite_y: training target array, indexed by row.
        test_df: frame with one row per test sample; only its row count is used.
        test_cite_X: test feature array.
        network: callable taking the number of input features and returning a
            compiled Keras model.
        folds: splitter object exposing ``split`` and ``n_splits``.
        model_name: prefix of the per-fold weights files.

    Returns:
        Tuple ``(oof_preds, sub_preds)`` holding the out-of-fold predictions
        for the training rows and the fold-averaged test predictions.
    """
    # Output width is taken from the targets instead of being hard-coded.
    n_targets = train_cite_y.shape[1]
    oof_preds = np.zeros((train_df.shape[0], n_targets))
    sub_preds = np.zeros((test_df.shape[0], n_targets))

    cv_corr = []

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df)):

        print(n_fold)

        train_x = train_cite_X[train_idx]
        valid_x = train_cite_X[valid_idx]
        train_y = train_cite_y[train_idx]
        valid_y = train_cite_y[valid_idx]
        # The generators index into the fold-local arrays, so their IDs are
        # simply 0..n-1; no DataFrame copy is needed to obtain them.
        train_x_index = np.arange(len(train_idx))
        valid_x_index = np.arange(len(valid_idx))

        model = network(train_cite_X.shape[1])
        filepath = model_name + '_' + str(n_fold) + '.h5'
        # No monitor is passed to EarlyStopping, so the Keras default applies.
        es = tf.keras.callbacks.EarlyStopping(patience=10, mode='min', verbose=1)
        checkpoint = tf.keras.callbacks.ModelCheckpoint(monitor='val_loss', filepath=filepath,
                                                        save_best_only=True, save_weights_only=True, mode='min')
        reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=LR_FACTOR, patience=6, verbose=1)

        train_dataset = DataGenerator(
            train_x,
            train_y,
            list_IDs=train_x_index,
            shuffle=True,
            batch_size=BATCH_SIZE,
            labels=True,
        )

        valid_dataset = DataGenerator(
            valid_x,
            valid_y,
            list_IDs=valid_x_index,
            shuffle=False,
            batch_size=BATCH_SIZE,
            labels=True,
        )

        model.fit(train_dataset,
                  validation_data=valid_dataset,
                  epochs=EPOCHS,
                  callbacks=[checkpoint, es, reduce_lr_loss],
                  workers=4,
                  verbose=1)

        # Restore the weights saved by the checkpoint callback before predicting.
        model.load_weights(filepath)

        oof_preds[valid_idx] = model.predict(valid_x, batch_size=BATCH_SIZE, verbose=1)

        oof_corr = correlation_score(valid_y, oof_preds[valid_idx])
        cv_corr.append(oof_corr)
        print(cv_corr)

        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += model.predict(test_cite_X, batch_size=BATCH_SIZE, verbose=1) / folds.n_splits

        # Drop the model and reset the Keras session before the next fold.
        del model
        gc.collect()
        tf.keras.backend.clear_session()

    cv = correlation_score(train_cite_y, oof_preds)
    print('Overall:', cv)

    return oof_preds, sub_preds

## GRU Models

In [5]:
def cite_cos_sim_model(len_num):
    """Build and compile the GRU model trained with the cosine-similarity loss.

    Architecture: the input vector is reshaped to a length-1 sequence and fed
    to a bidirectional GRU, followed by two dense layers. The Gaussian-dropout
    outputs of all three stages are concatenated and mapped to 140 linear
    outputs.

    Side effects: prints the model summary and writes the architecture diagram
    to ``model1.png``.

    Args:
        len_num: number of input features per sample.

    Returns:
        The compiled ``tf.keras`` model.
    """
    # The shape is given as a real 1-tuple; `(len_num)` is just an int.
    input_num = tf.keras.Input(shape=(len_num,))

    x = input_num
    # Treat the feature vector as a sequence with a single time step.
    x0 = tf.keras.layers.Reshape((1, x.shape[1]))(x)
    x0 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(1800, activation='elu',
                                                           kernel_initializer='Identity', return_sequences=False))(x0)
    x1 = tf.keras.layers.GaussianDropout(0.2)(x0)
    x2 = tf.keras.layers.Dense(1800, activation='elu', kernel_initializer='Identity',)(x1)
    x3 = tf.keras.layers.GaussianDropout(0.2)(x2)
    x4 = tf.keras.layers.Dense(1800, activation='elu', kernel_initializer='Identity',)(x3)
    x5 = tf.keras.layers.GaussianDropout(0.2)(x4)
    # Skip-style head: the output layer sees all three dropout outputs.
    x = tf.keras.layers.Concatenate()([x1, x3, x5])

    output = tf.keras.layers.Dense(140, activation='linear')(x)

    model = tf.keras.models.Model(input_num, output)
    adam = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=None)
    model.compile(loss=cosine_similarity_loss, optimizer=adam)
    model.summary()

    plot_model(model, to_file='model1.png', show_shapes=True)

    return model

In [None]:
def cite_mse_model(len_num):
    """Build and compile the dense + GRU model trained with mean squared error.

    Architecture: three swish dense layers of 1500 units, each followed by
    Gaussian dropout; the result is reshaped to a length-1 sequence, passed
    through a bidirectional GRU with dropout, and mapped to 140 linear outputs.

    Side effects: prints the model summary and writes the architecture diagram
    to ``model2.png``.

    Args:
        len_num: number of input features per sample.

    Returns:
        The compiled ``tf.keras`` model.
    """
    # The shape is given as a real 1-tuple; `(len_num)` is just an int.
    input_num = tf.keras.Input(shape=(len_num,))

    x = input_num
    x = tf.keras.layers.Dense(1500, activation='swish',)(x)
    x = tf.keras.layers.GaussianDropout(0.1)(x)
    x = tf.keras.layers.Dense(1500, activation='swish',)(x)
    x = tf.keras.layers.GaussianDropout(0.1)(x)
    x = tf.keras.layers.Dense(1500, activation='swish',)(x)
    x = tf.keras.layers.GaussianDropout(0.1)(x)
    # Treat the dense representation as a sequence with a single time step.
    x = tf.keras.layers.Reshape((1, x.shape[1]))(x)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(700, activation='swish', return_sequences=False))(x)
    x = tf.keras.layers.GaussianDropout(0.1)(x)

    output = tf.keras.layers.Dense(140, activation='linear')(x)

    model = tf.keras.models.Model(input_num, output)
    opt = tfa.optimizers.AdamW(learning_rate=0.0005, weight_decay=0.0001)
    model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=opt)
    model.summary()

    plot_model(model, to_file='model2.png', show_shapes=True)

    return model

## Evaluate the Models and Make Predictions

In [None]:
# Settings for the cosine-similarity model. BATCH_SIZE, EPOCHS and LR_FACTOR
# are read as module-level globals inside nn_kfold.
SEED = 666
N_FOLD = 5
BATCH_SIZE = 620
EPOCHS = 100
LR_FACTOR = 0.05

# Shuffled K-fold split with a fixed seed.
folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

# Cross-validate the cosine-loss GRU model and predict the test set.
oof_preds_cos, sub_preds_cos = nn_kfold(
    train_df=train_df,
    train_cite_X=train_cite_X,
    train_cite_y=train_cite_y,
    test_df=test_df,
    test_cite_X=test_cite_X,
    network=cite_cos_sim_model,
    folds=folds,
    model_name='cite_cos_model',
)

In [None]:
# Settings for the MSE model. BATCH_SIZE, EPOCHS and LR_FACTOR are read as
# module-level globals inside nn_kfold.
BATCH_SIZE = 600
EPOCHS = 100
LR_FACTOR = 0.1
SEED = 666
# Named fold count, matching how the cosine-model cell configures its split.
N_FOLD = 5

folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

# standardize the training targets row by row before fitting with MSE
train_cite_y = zscore(train_cite_y)

oof_preds_mse, sub_preds_mse = nn_kfold(train_df,
                                        train_cite_X, train_cite_y,
                                        test_df, test_cite_X,
                                        cite_mse_model, folds, 'cite_mse_model')

In [None]:
# Put both models' out-of-fold predictions on the same per-row scale.
oof_preds_cos, oof_preds_mse = zscore(oof_preds_cos), zscore(oof_preds_mse)

# Weighted blend of the two models' training predictions.
oof_preds = 0.55 * oof_preds_cos + 0.45 * oof_preds_mse

# Compute the blended CV correlation score.
cv = correlation_score(train_cite_y, oof_preds)
print('Blend:', cv)

In [None]:
# Put both models' test predictions on the same per-row scale.
sub_preds_cos, sub_preds_mse = zscore(sub_preds_cos), zscore(sub_preds_mse)

# Weighted blend of the two models' test predictions.
sub_preds = 0.55 * sub_preds_cos + 0.45 * sub_preds_mse

In [None]:
# Release the inputs and the per-model predictions, then force a collection.
del (
    train_df, test_df,
    train_cite_X, test_cite_X, train_cite_y,
    oof_preds_cos, oof_preds_mse,
    sub_preds_cos, sub_preds_mse,
)
gc.collect()

In [None]:
# Write the blended predictions into the leading rows of the sample submission.
submission = pd.read_csv(DATA_DIR + '/sample_submission.csv')
# The number of rows to fill is derived from the predictions rather than
# hard-coded. `.loc` label slices are end-inclusive, hence the `- 1`.
# NOTE(review): assumes the predictions belong to the first rows of the
# submission file, in flattened row order -- confirm against the file layout.
n_pred_rows = sub_preds.size
submission.loc[:n_pred_rows - 1, 'target'] = sub_preds.reshape(-1)
submission.to_csv('../result/cite/GRU_submission.csv', index=False)