This notebook is a machine-learning project that develops a model to predict the degradation rates of RNA molecules in current mRNA vaccines against COVID-19.
* First, we pre-process the training data.
* Then, we use all of the training data to train a model.
* Finally, we run the model on the public test set and obtain its error rate.

In [None]:
import pandas as pd
import numpy as np
import sklearn
import lightgbm as lgb
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
print(tf.__version__)

# **File Path**

In [None]:
# Directory the notebook reads train.json, test.json and sample_submission.csv from.
root_dir = '../input/stanford-covid-vaccine/'
# Directory the notebook reads the augmentation table (aug_data.csv) from.
aug_data_dir = '../input/how-to-generate-augmentation-data/'

# **Parameter**

In [None]:
# Input length (number of positions) used when building the training model.
sequence_length = 107
# Number of leading positions the training model keeps in its output.
predicted_length = 68
# Output dimension of the token embedding layer.
embed_dim = 100
# Units of each recurrent layer (per direction, since the layers are bidirectional).
hidden_dim = 256
# Layer-count argument passed to the model builders.
n_layers = 2
# tf.random.set_seed(2020)
# np.random.seed(2020)
# y_true = tf.random.normal((32, 68, 3))
# y_pred = tf.random.normal((32, 68, 3))

In [None]:
# Target columns; their order is the column order of the model output and of
# the prediction frames built in Post_process.
predict_columns = ['reactivity', 'deg_Mg_50C', 'deg_Mg_pH10', 'deg_pH10', 'deg_50C']
# Per-position string columns that are tokenised and fed to the models.
input_columns = ['sequence', 'structure', 'predicted_loop_type']

# **Helper functions for processing data and building models**

In [None]:
def aug_data(df, aug_df):
    """Return ``df`` with the matching augmented rows from ``aug_df`` appended.

    Rows of ``aug_df`` whose ``id`` does not occur in ``df`` are ignored.  The
    remaining augmented rows keep their own ``structure`` and
    ``predicted_loop_type`` and receive every other column of ``df`` through a
    left merge on ``id`` and ``sequence``.

    The original rows get three extra columns so that both parts share one
    schema: ``cnt`` (looked up per ``id`` from the augmented rows, NaN when an
    id has no augmentation), ``log_gamma`` (constant 100) and ``score``
    (constant 1.0).

    Args:
        df: frame with at least ``id``, ``sequence``, ``structure`` and
            ``predicted_loop_type`` columns.
        aug_df: augmentation frame with ``id``, ``sequence``, ``structure``,
            ``predicted_loop_type``, ``cnt``, ``log_gamma`` and ``score``.

    Returns:
        A new DataFrame: the original rows followed by the augmented rows,
        with the original index labels preserved.  ``df`` itself is left
        untouched.
    """
    # Everything except the two columns that the augmentation replaces.
    target_df = df.drop(columns=['structure', 'predicted_loop_type'])

    new_df = aug_df[aug_df['id'].isin(target_df['id'])]
    new_df = new_df.merge(target_df, on=['id', 'sequence'], how='left')

    # Work on a copy: the caller may pass a filtered slice (e.g. the result of
    # DataFrame.query), and adding columns to it in place would both mutate
    # the caller's data and trigger chained-assignment warnings.
    result = df.copy()
    # For ids with several augmented rows the last one wins, as before.
    cnt_by_id = dict(zip(new_df['id'], new_df['cnt']))
    result['cnt'] = result['id'].map(cnt_by_id)
    result['log_gamma'] = 100
    result['score'] = 1.0

    # DataFrame.append no longer exists in pandas 2.x; concat is the equivalent.
    return pd.concat([result, new_df[result.columns]])

In [None]:
def pandas_list_to_array(df):
    """Stack a frame of equal-length lists into a 3-D array.

    Input: dataframe of shape (x, y) whose cells are lists of length l.
    Return: np.array of shape (x, l, y), i.e. rows, list position, column.
    """
    # Nested lists give an array laid out as (row, column, position) ...
    stacked = np.array(df.values.tolist())
    # ... so swap the last two axes to get (row, position, column).
    return stacked.transpose(0, 2, 1)

In [None]:
def preprocess_inputs(df, token2int, cols):
    """Tokenise the string columns ``cols`` of ``df`` into an integer array.

    Args:
        df: frame whose ``cols`` cells are iterables of tokens (strings here).
        token2int: mapping from a single token to its integer id.
        cols: names of the columns to encode.

    Returns:
        Whatever ``pandas_list_to_array`` produces for the encoded frame: an
        array of shape (rows, sequence length, len(cols)).

    Raises:
        KeyError: if a cell contains a token missing from ``token2int``.
    """
    def encode(seq):
        return [token2int[x] for x in seq]

    selected = df[cols]
    # DataFrame.applymap is deprecated since pandas 2.1, where it was renamed
    # to DataFrame.map.  Check the class (not the instance) so that a column
    # that happens to be called "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, 'map'):
        encoded = selected.map(encode)
    else:
        encoded = selected.applymap(encode)
    return pandas_list_to_array(encoded)

In [None]:
def Post_process(name, submission_df, public_df, public_preds, private_df, private_preds):
    """Turn per-sequence predictions into a submission CSV written to ``name``.

    Each prediction matrix (positions x targets) becomes a small frame keyed by
    ``id_seqpos`` = ``"<id>_<position>"``.  Keys that occur more than once are
    averaged, and the result is inner-joined onto the ``id_seqpos`` column of
    ``submission_df`` before being written without the index.
    """
    def per_sequence_frames(df, predictions):
        # predictions[i] belongs to the i-th id of df.
        for row_idx, seq_id in enumerate(df.id):
            frame = pd.DataFrame(predictions[row_idx], columns=predict_columns)
            frame['id_seqpos'] = [f'{seq_id}_{pos}' for pos in range(len(frame))]
            yield frame

    frames = [
        *per_sequence_frames(public_df, public_preds),
        *per_sequence_frames(private_df, private_preds),
    ]
    averaged = pd.concat(frames).groupby('id_seqpos').mean().reset_index()
    merged = submission_df[['id_seqpos']].merge(averaged, on=['id_seqpos'])
    merged.to_csv(name, index=False)
    print('done')

In [None]:
def MCRMSE_func(y_true, y_pred):
    """Mean column-wise RMSE loss on tensors.

    The squared error is averaged over axis 1, square-rooted, and the result
    is averaged over its own axis 1.
    # assumes inputs are (batch, positions, targets), giving one value per
    # batch element - TODO confirm against the training targets
    """
    squared_error = tf.square(y_true - y_pred)
    per_column_rmse = tf.sqrt(tf.reduce_mean(squared_error, axis=1))
    return tf.reduce_mean(per_column_rmse, axis=1)

In [None]:
def MCRMSE_numpy(y_true, y_pred):
    """NumPy counterpart of ``MCRMSE_func``.

    Averages the squared error over axis 1, takes the square root, then
    averages the result over its own axis 1.
    """
    squared_error = np.square(y_true - y_pred)
    per_column_rmse = np.sqrt(np.mean(squared_error, axis=1))
    return np.mean(per_column_rmse, axis=1)

In [None]:
def Single_GRU(embed_size, sequence_length, predicted_length, embed_dim, hidden_dim, n_layers, dropout=0.5, sp_dropout=0.2):
    """Build and compile a stack of ``n_layers`` bidirectional GRU layers.

    The model takes integer inputs of shape (sequence_length, 3), embeds every
    entry, merges the last two axes of the embedding, applies spatial dropout,
    runs the recurrent stack, keeps the first ``predicted_length`` positions
    and maps each of them to 5 linear outputs.  It is compiled with Adam and
    ``MCRMSE_func`` as the loss.
    """
    token_ids = keras.layers.Input(shape=(sequence_length, 3))
    embedded = keras.layers.Embedding(input_dim=embed_size, output_dim=embed_dim)(token_ids)

    # Merge the two trailing axes of the embedding output into one feature axis.
    steps = embedded.shape[1]
    merged_width = embedded.shape[2] * embedded.shape[3]
    features = tf.reshape(embedded, shape=(-1, steps, merged_width))
    features = keras.layers.SpatialDropout1D(sp_dropout)(features)

    for _ in range(n_layers):
        recurrent = keras.layers.GRU(
            hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal')
        features = keras.layers.Bidirectional(recurrent)(features)

    # Predictions are only made for the first part of each sequence,
    # so the remaining positions are cut off here.
    head_input = features[:, :predicted_length]
    predictions = keras.layers.Dense(5, activation='linear')(head_input)

    model = tf.keras.Model(inputs=token_ids, outputs=predictions)
    model.compile(tf.optimizers.Adam(), loss=MCRMSE_func)

    return model

In [None]:
def Single_LSTM(embed_size, sequence_length, predicted_length, embed_dim, hidden_dim, n_layers, dropout=0.5, sp_dropout=0.2):
    """Build and compile a stack of ``n_layers`` bidirectional LSTM layers.

    The model takes integer inputs of shape (sequence_length, 3), embeds every
    entry, merges the last two axes of the embedding, applies spatial dropout,
    runs the recurrent stack, keeps the first ``predicted_length`` positions
    and maps each of them to 5 linear outputs.  It is compiled with Adam and
    ``MCRMSE_func`` as the loss.
    """
    token_ids = keras.layers.Input(shape=(sequence_length, 3))
    embedded = keras.layers.Embedding(input_dim=embed_size, output_dim=embed_dim)(token_ids)

    # Merge the two trailing axes of the embedding output into one feature axis.
    steps = embedded.shape[1]
    merged_width = embedded.shape[2] * embedded.shape[3]
    features = tf.reshape(embedded, shape=(-1, steps, merged_width))
    features = keras.layers.SpatialDropout1D(sp_dropout)(features)

    for _ in range(n_layers):
        recurrent = keras.layers.LSTM(
            hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal')
        features = keras.layers.Bidirectional(recurrent)(features)

    # Predictions are only made for the first part of each sequence,
    # so the remaining positions are cut off here.
    head_input = features[:, :predicted_length]
    predictions = keras.layers.Dense(5, activation='linear')(head_input)

    model = tf.keras.Model(inputs=token_ids, outputs=predictions)
    model.compile(tf.optimizers.Adam(), loss=MCRMSE_func)

    return model

In [None]:
def Conv_Lstm(embed_size, sequence_length, predicted_length,
              embed_dim, hidden_dim, n_layers, dropout=0.5, sp_dropout=0.2):
    """Build and compile a convolution + bidirectional-LSTM model.

    After embedding, axis merging and spatial dropout, the model applies
    ``n_layers - 1`` blocks of Conv1D (kernel size 3, 'same' padding, ReLU,
    ``2 ** i * hidden_dim`` filters for block ``i``) followed by batch
    normalisation, then ``n_layers - 1`` bidirectional LSTM layers.  The first
    ``predicted_length`` positions are mapped to 5 linear outputs, and the
    model is compiled with Adam and ``MCRMSE_func`` as the loss.
    """
    token_ids = keras.layers.Input(shape=(sequence_length, 3))
    embedded = keras.layers.Embedding(input_dim=embed_size, output_dim=embed_dim)(token_ids)

    # Merge the two trailing axes of the embedding output into one feature axis.
    steps = embedded.shape[1]
    merged_width = embedded.shape[2] * embedded.shape[3]
    features = tf.reshape(embedded, shape=(-1, steps, merged_width))
    features = keras.layers.SpatialDropout1D(sp_dropout)(features)

    for depth in range(n_layers - 1):
        n_filters = 2 ** depth * hidden_dim
        features = keras.layers.Conv1D(
            n_filters, kernel_size=3, padding='same', activation='relu',
            kernel_initializer='glorot_uniform')(features)
        features = keras.layers.BatchNormalization()(features)

    for _ in range(n_layers - 1):
        recurrent = keras.layers.LSTM(
            hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal')
        features = keras.layers.Bidirectional(recurrent)(features)

    # Predictions are only made for the first part of each sequence,
    # so the remaining positions are cut off here.
    head_input = features[:, :predicted_length]
    predictions = keras.layers.Dense(5, activation='linear')(head_input)

    model = tf.keras.Model(inputs=token_ids, outputs=predictions)
    model.compile(tf.optimizers.Adam(), loss=MCRMSE_func)

    return model
    

In [None]:
def GRU_Lstm(embed_size, sequence_length, predicted_length,
              embed_dim, hidden_dim, n_layers, dropout=0.5, sp_dropout=0.2):
    """Build and compile a bidirectional GRU stack followed by an LSTM stack.

    After embedding, axis merging and spatial dropout, the model applies
    ``n_layers - 1`` bidirectional GRU layers and then ``n_layers - 1``
    bidirectional LSTM layers.  The first ``predicted_length`` positions are
    mapped to 5 linear outputs, and the model is compiled with Adam and
    ``MCRMSE_func`` as the loss.
    """
    token_ids = keras.layers.Input(shape=(sequence_length, 3))
    embedded = keras.layers.Embedding(input_dim=embed_size, output_dim=embed_dim)(token_ids)

    # Merge the two trailing axes of the embedding output into one feature axis.
    steps = embedded.shape[1]
    merged_width = embedded.shape[2] * embedded.shape[3]
    features = tf.reshape(embedded, shape=(-1, steps, merged_width))
    features = keras.layers.SpatialDropout1D(sp_dropout)(features)

    for _ in range(n_layers - 1):
        gru = keras.layers.GRU(
            hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal')
        features = keras.layers.Bidirectional(gru)(features)

    for _ in range(n_layers - 1):
        lstm = keras.layers.LSTM(
            hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal')
        features = keras.layers.Bidirectional(lstm)(features)

    # Predictions are only made for the first part of each sequence,
    # so the remaining positions are cut off here.
    head_input = features[:, :predicted_length]
    predictions = keras.layers.Dense(5, activation='linear')(head_input)

    model = tf.keras.Model(inputs=token_ids, outputs=predictions)
    model.compile(tf.optimizers.Adam(), loss=MCRMSE_func)

    return model
    

In [None]:
def xgboost(estimator_number, learning_rate):
    """Create an (unfitted) XGBRegressor with the notebook's fixed settings.

    Args:
        estimator_number: number of boosting rounds (``n_estimators``).
        learning_rate: shrinkage applied to each boosting step.

    Returns:
        The configured ``XGBRegressor``.
    """
    # ``eta`` is XGBoost's alias for ``learning_rate``.  The previous version
    # passed a hard-coded eta=0.3 next to the caller's learning_rate, leaving
    # two conflicting values for one setting; only the caller's is kept now.
    return XGBRegressor(
        max_depth=8,
        n_estimators=estimator_number,
        min_child_weight=300,
        learning_rate=learning_rate,
        colsample_bytree=0.8,
        subsample=0.8,
        seed=42,
    )

In [None]:
def light_lgb(estimator_number, learning_rate):
    """Create an (unfitted) LGBMRegressor.

    ``estimator_number`` and ``learning_rate`` are forwarded as
    ``n_estimators`` and ``learning_rate``; ``feature_fraction`` is fixed
    at 0.8.
    """
    settings = {
        'n_estimators': estimator_number,
        'learning_rate': learning_rate,
        'feature_fraction': 0.8,
    }
    return lgb.LGBMRegressor(**settings)

In [None]:
def fit_fuc(model, X_train, y_train, X_val, y_val, func_type):
    """Fit ``model`` with early stopping, using one of two fit configurations.

    Args:
        model: estimator exposing a scikit-learn style ``fit``.
        X_train, y_train: training features and target.
        X_val, y_val: validation features and target used for early stopping.
        func_type: 0 evaluates on the validation set only (100 early-stopping
            rounds, log every 1000 iterations) - the configuration this
            notebook uses for the LightGBM model; 1 evaluates RMSE on both the
            training and validation sets (10 early-stopping rounds, verbose) -
            the configuration used for the XGBoost model.

    Raises:
        ValueError: if ``func_type`` is neither 0 nor 1.  Previously such a
            value returned silently without fitting the model.
    """
    if func_type == 0:
        model.fit(X_train, y_train,
                  eval_set=(X_val, y_val),
                  early_stopping_rounds=100,
                  verbose=1000)
    elif func_type == 1:
        model.fit(
            X_train,
            y_train,
            eval_metric="rmse",
            eval_set=[(X_train, y_train), (X_val, y_val)],
            verbose=True,
            early_stopping_rounds=10)
    else:
        raise ValueError(f'func_type must be 0 or 1, got {func_type!r}')

In [None]:
def _add_position_features(frame, per_column, name, dtype, n_positions=107):
    """Return ``frame`` plus one ``{name}_{column}_{position}`` column per input column and position."""
    new_cols = {}
    for n in range(n_positions):
        for info in ('structure', 'predicted_loop_type', 'sequence'):
            # n=n binds the current position; a plain closure would see the last one.
            new_cols[f'{name}_{info}_{n}'] = (
                per_column[info].apply(lambda x, n=n: x[n]).astype(dtype)
            )
    # Attach all columns in one go instead of inserting them one at a time.
    return pd.concat([frame, pd.DataFrame(new_cols, index=frame.index)], axis=1)


def model_process(model, name, columns, fit_func, func_type=0):
    """Fit a tabular model per target on per-position features and write ``{name}.csv``.

    The function reads train.json, test.json and sample_submission.csv from
    '../input/stanford-covid-vaccine/'.  For every target in the module-level
    ``predict_columns`` it takes the per-sequence mean of the training values
    as the label, fits ``model`` through ``fit_func`` on a random
    train/validation split (no fixed random_state, so runs are not
    reproducible) and predicts one mean value per test sequence.  That value
    is then written for every ``id_seqpos`` row of the sequence.

    Features are the first 107 positions of ``sequence``, ``structure`` and
    ``predicted_loop_type``: categorical characters for ``func_type == 0`` and
    integer ids from the module-level ``token2int`` for ``func_type == 1``.

    Args:
        model: estimator with scikit-learn style ``fit``/``predict``; it is
            refitted once per target.
        name: prefix of the generated feature columns and stem of the output
            file.
        columns: accepted for interface compatibility but not used; the
            targets always come from ``predict_columns``.
        fit_func: callable ``(model, X_train, y_train, X_val, y_val, func_type)``
            that fits the model.
        func_type: 0 or 1, selects the feature encoding and is forwarded to
            ``fit_func``.

    Raises:
        ValueError: if ``func_type`` is neither 0 nor 1 (previously this built
            an empty feature matrix and failed later).
    """
    if func_type not in (0, 1):
        raise ValueError(f'func_type must be 0 or 1, got {func_type!r}')

    train = pd.read_json('../input/stanford-covid-vaccine/train.json',lines=True)
    test = pd.read_json('../input/stanford-covid-vaccine/test.json', lines=True)
    sample_df = pd.read_csv('../input/stanford-covid-vaccine/sample_submission.csv')
    # Remember the submission layout now, before the helper 'id' column is added,
    # so the CSV does not have to be read a second time.
    submission_columns = list(sample_df.columns)

    for target in predict_columns:
        train[f'mean_{target}'] = train[target].apply(np.mean)

    if func_type == 1:
        def encode(seq):
            return [token2int[x] for x in seq]

        # Encode each input column exactly once per split.
        train_source = {info: train[info].apply(encode) for info in input_columns}
        test_source = {info: test[info].apply(encode) for info in input_columns}
        feature_dtype = 'int'
    else:
        train_source = {info: train[info] for info in input_columns}
        test_source = {info: test[info] for info in input_columns}
        feature_dtype = 'category'

    train = _add_position_features(train, train_source, name, feature_dtype)
    test = _add_position_features(test, test_source, name, feature_dtype)

    SEQUENCE_COLS = [c for c in train.columns if f'{name}_sequence_' in c]
    STRUCTURE_COLS = [c for c in train.columns if f'{name}_structure_' in c]
    PLT_COLS = [c for c in train.columns if f'{name}_predicted_loop_type_' in c]
    feature_cols = SEQUENCE_COLS + STRUCTURE_COLS + PLT_COLS

    X = train[feature_cols]
    X_test = test[feature_cols]

    for target in predict_columns:
        y = train[f'mean_{target}']
        X_train, X_val, y_train, y_val = train_test_split(X, y)
        fit_func(model, X_train, y_train, X_val, y_val, func_type)
        test[f'mean_{target}_pred'] = model.predict(X_test)

    # 'id_seqpos' looks like 'id_<hash>_<position>'; rebuild the sequence id from it.
    sample_df['id'] = 'id_' + sample_df['id_seqpos'].str.split('_', expand=True)[1]

    # Merge the predicted per-sequence means onto every position of that sequence.
    pred_to_target = {f'mean_{target}_pred': target for target in predict_columns}
    per_sequence = test[['id', *pred_to_target]].rename(columns=pred_to_target)
    ss_new = sample_df.drop(predict_columns, axis=1).merge(
        per_sequence, on='id', validate='m:1')
    ss_new[submission_columns].to_csv(f'{name}.csv', index=False)

# **Read the original data and the augmented data**

In [None]:
# Load the augmentation table; aug_data() reads its 'id', 'sequence' and 'cnt' columns.
aug_df = pd.read_csv(aug_data_dir + 'aug_data.csv')
aug_df.head()

In [None]:
# Load the test and train sets (JSON-lines files) and the sample submission.
test  = pd.read_json(root_dir + "test.json", lines=True)
train  = pd.read_json(root_dir + "train.json", lines=True)
# Keep only training rows whose signal_to_noise is at least 1.
train = train.query("signal_to_noise >= 1")
sample_df = pd.read_csv(root_dir + 'sample_submission.csv')
train

In [None]:
# Append the matching augmented rows to both splits (see aug_data).
train = aug_data(train, aug_df)
test = aug_data(test, aug_df)

In [None]:
# Map every character of the token alphabet to its position in the string;
# the dictionary size is later passed to the model builders as embed_size.
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}
len(token2int)

# **Several Inputs and labels**

In [None]:
# Training inputs (tokenised input columns) and training targets (stacked predict_columns).
train_inputs = preprocess_inputs(train, token2int, input_columns)
train_targets = pandas_list_to_array(train[predict_columns])

# Split the test set by sequence length: 107 is treated as the public part,
# 130 as the private part.
public_df = test.query("seq_length == 107")
private_df = test.query("seq_length == 130")

public_inputs = preprocess_inputs(public_df, token2int, input_columns)
private_inputs = preprocess_inputs(private_df, token2int, input_columns)

# **Process training data**

In [None]:
# Hold out 10% of the training data, stratified on SN_filter, with a fixed seed.
# Note: x_test / y_test are this held-out validation split, not the competition test set.
x_train, x_test, y_train, y_test = train_test_split(train_inputs, train_targets, test_size=.1, random_state=7,
                                                    stratify=train.SN_filter)

# **Deep Learning Model**

In [None]:
# Build the training model from the notebook-level hyper-parameters and print its layers.
model = Single_GRU(embed_size=len(token2int), 
                    sequence_length=sequence_length, 
                    predicted_length=predicted_length,
                    embed_dim=embed_dim,
                    hidden_dim=hidden_dim,
                    n_layers=n_layers)
model.summary()

In [None]:
# Train for 50 epochs, validating on the held-out split.  ReduceLROnPlateau uses
# a patience of 5 epochs; ModelCheckpoint writes to 'Project.h5', the file the
# prediction models load their weights from further down.
history = model.fit(x_train, y_train,validation_data=(x_test, y_test),
                    batch_size=64,epochs=50,verbose=1,
                    callbacks=[
                            tf.keras.callbacks.ReduceLROnPlateau(patience=5),
                            tf.keras.callbacks.ModelCheckpoint('Project.h5')
                    ]
)

In [None]:
def plot_learning_curves(history, label, epochs, min_value, max_value):
    """Plot the training and validation curves of one metric.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch values (as returned by ``model.fit``).
        label: metric to plot; ``'val_' + label`` is plotted next to it.
        epochs: upper limit of the x axis.
        min_value, max_value: limits of the y axis.

    Raises:
        KeyError: if ``label`` or ``'val_' + label`` was not recorded.
    """
    data = {
        label: history.history[label],
        'val_' + label: history.history['val_' + label],
    }
    # Plot only the requested pair.  The previous version built this dict and
    # then plotted the whole history, so ``label`` had no effect and every
    # other recorded series ended up on the same axes.
    pd.DataFrame(data).plot(figsize=(8, 5))
    plt.grid(True)
    plt.axis([0, epochs, min_value, max_value])
    plt.show()

plot_learning_curves(history, 'loss', 100, 0, 1)

# **Predictions**

In [None]:
# Rebuild the same architecture once per test sequence length (107 and 130),
# this time predicting every position, then load the weights saved during
# training into both models.
public_model = Single_GRU(embed_size=len(token2int), 
                           sequence_length=107, 
                           predicted_length=107,
                           embed_dim=embed_dim,
                           hidden_dim=hidden_dim,
                           n_layers=n_layers)
private_model = Single_GRU(embed_size=len(token2int), 
                           sequence_length=130, 
                           predicted_length=130,
                           embed_dim=embed_dim,
                           hidden_dim=hidden_dim,
                           n_layers=n_layers)

public_model.load_weights('Project.h5')
private_model.load_weights('Project.h5')

In [None]:
# Predict each test subset with the model built for its sequence length.
public_preds = public_model.predict(public_inputs)
private_preds = private_model.predict(private_inputs)

# Post_process

In [None]:
# Combine both prediction sets into 'submission.csv', laid out like the sample submission.
Post_process('submission.csv', sample_df, public_df, public_preds, private_df, private_preds)

# **LightGBM AND XGBOOST**

In [None]:
# Gradient-boosting baselines: each call fits one model per target and writes
# '<name>.csv' (lightlgb1.csv and xgboost.csv).  The last argument selects the
# feature encoding and fit configuration: 0 for LightGBM, 1 for XGBoost.
model1 = light_lgb(1000, 0.005)
model2 = xgboost(1000, 0.001)
model_process(model1, 'lightlgb1', predict_columns, fit_fuc, 0)
model_process(model2, 'xgboost', predict_columns, fit_fuc, 1)