This notebook is based on https://www.kaggle.com/masashisode/pytorch-implementation-of-mcrmseloss
Special thanks to https://www.kaggle.com/masashisode

In [None]:
# Debug switch: when True, later cells keep only the first 200 rows, train for
# 2 epochs, use 50 instead of 200 KMeans clusters and skip the final webhook call.
debug = False

## MCRMSELoss

In [None]:
import gc
import itertools
import warnings
warnings.filterwarnings('ignore')
import os
import random

#the basics
import pandas as pd, numpy as np, seaborn as sns
import math, json
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.cluster import KMeans

#for model evaluation
from sklearn.model_selection import train_test_split, KFold, GroupKFold

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR, CosineAnnealingLR
from torch.utils.data import DataLoader, TensorDataset

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed handed to seed_everything() below.
SEED = 2020


def seed_everything(seed=2020):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy and PyTorch (CPU and all CUDA devices), and switches cuDNN to
    deterministic kernels so that repeated runs with the same seed match.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    # Without these two flags cuDNN may pick different (non-deterministic)
    # kernels from run to run, which defeats the seeding above.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed all RNGs once, before any data loader or model is created.
seed_everything(SEED)

In [None]:
class RMSELoss(nn.Module):
    """Root-mean-squared error, computed as ``sqrt(MSE(yhat, y) + eps)``.

    ``eps`` keeps the argument of the square root strictly positive.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, yhat, y):
        """Return the scalar RMSE between predictions ``yhat`` and targets ``y``."""
        mean_squared_error = self.mse(yhat, y)
        return (mean_squared_error + self.eps).sqrt()


class MCRMSELoss(nn.Module):
    """Mean column-wise RMSE over the first ``num_scored`` target columns.

    Each column's RMSE is divided by ``num_scored`` and the quotients are
    summed, which yields the mean of the per-column RMSE values.
    """

    def __init__(self, num_scored=3):
        super().__init__()
        self.rmse = RMSELoss()
        self.num_scored = num_scored

    def forward(self, yhat, y):
        """Return the averaged RMSE; inputs are indexed as ``[:, :, column]``."""
        return sum(
            self.rmse(yhat[:, :, column], y[:, :, column]) / self.num_scored
            for column in range(self.num_scored)
        )

## usage

```python
criterion = MCRMSELoss()
predictions = model(data)
loss = criterion(predictions, targets)
```

If you use `SN_filter`, you can get an LB-like score.
Let's see how it works.

### Dataload

In [None]:
import pandas as pd


def load_json(path):
    """Read a JSON-lines file (one record per line) at ``path`` into a DataFrame."""
    frame = pd.read_json(path, lines=True)
    return frame

# Competition files: JSON-lines train/test sets and the submission template.
df = load_json('/kaggle/input/stanford-covid-vaccine/train.json')
df_test = load_json('/kaggle/input/stanford-covid-vaccine/test.json')
sample_sub = pd.read_csv('/kaggle/input/stanford-covid-vaccine/sample_submission.csv')

# Debug runs keep only the first 200 rows of each set.
if debug:
    df, df_test = df.iloc[:200], df_test.iloc[:200]

# Train only on rows whose SN_filter flag equals 1.
df = df.loc[df["SN_filter"] == 1]

In [None]:
# Print the distinct characters found in each string column. "".join builds
# the combined string in a single pass and yields an empty set for an empty
# frame, whereas Series.sum() concatenates pairwise and returns the int 0
# (which set() cannot iterate) when there are no rows.
for column in ("sequence", "structure", "predicted_loop_type"):
    print(set("".join(df[column])))
# Alphabets recorded in the notebook:
# {'G', 'A', 'C', 'U'}
# {')', '(', '.'}
# {'X', 'E', 'M', 'B', 'H', 'S', 'I'}

# Two-character tokens: every combination of one symbol from each alphabet.
sequence_and_structure = [i + j for i in "GACU" for j in "()."]
sequence_and_predicted_loop_type = [i + j for i in "GACU" for j in "XEMBHSI"]
structure_and_predicted_loop_type = [i + j for i in "()." for j in "XEMBHSI"]


### preprocess

In [None]:
def merge_seq_seq(seq):
    """Pair element ``i`` of the first half of ``seq`` with element ``i`` of the second half.

    ``seq`` is split at ``len(seq) // 2``; the result is a list of the
    concatenations ``seq[i] + seq[i + half]``. With an odd length the final
    element is left unpaired and dropped.
    """
    half = len(seq) // 2
    return [left + right for left, right in zip(seq[:half], seq[half:])]


In [None]:
# Master switch for the engineered features. The spelling is kept as-is because
# other cells read this exact name.
create_feture = True
preprocess_cols = ["sequence", "structure", "predicted_loop_type"]
if create_feture:
    # Per-position two-character token made of the base and its structure symbol.
    df["sequence_and_structure"] = (df["sequence"] + df["structure"]).apply(merge_seq_seq)
    #df["sequence_and_predicted_loop_type"] = (df["sequence"] + df["predicted_loop_type"]).apply(merge_seq_seq)
    #df["structure_and_predicted_loop_type"] = (df["structure"] + df["predicted_loop_type"]).apply(merge_seq_seq)
    df_test["sequence_and_structure"] = (df_test["sequence"] + df_test["structure"]).apply(merge_seq_seq)
    #df_test["sequence_and_predicted_loop_type"] = (df_test["sequence"] + df_test["predicted_loop_type"]).apply(merge_seq_seq)
    #df_test["structure_and_predicted_loop_type"] = (df_test["structure"] + df_test["predicted_loop_type"]).apply(merge_seq_seq)
    preprocess_cols = ["sequence", "structure", "predicted_loop_type", "sequence_and_structure"]#, "sequence_and_predicted_loop_type", "structure_and_predicted_loop_type"]

    def read_bpps_sum(df):
        """Return, per row of ``df``, the axis-1 sum of that id's bpps matrix."""
        return [
            np.load(f"../input/stanford-covid-vaccine/bpps/{mol_id}.npy").sum(axis=1)
            for mol_id in df.id.to_list()
        ]

    def read_bpps_max(df):
        """Return, per row of ``df``, the axis-1 maximum of that id's bpps matrix."""
        return [
            np.load(f"../input/stanford-covid-vaccine/bpps/{mol_id}.npy").max(axis=1)
            for mol_id in df.id.to_list()
        ]

    def read_bpps_nb(df):
        """Return, per row of ``df``, the standardised fraction of non-zero bpps entries.

        The fraction is taken along axis 0 and standardised with fixed
        mean/std constants.
        """
        #mean and std from https://www.kaggle.com/symyksr/openvaccine-deepergcn 
        bpps_nb_mean = 0.077522
        bpps_nb_std = 0.08914
        standardised = []
        for mol_id in df.id.to_list():
            bpps = np.load(f"../input/stanford-covid-vaccine/bpps/{mol_id}.npy")
            nonzero_fraction = (bpps > 0).sum(axis=0) / bpps.shape[0]
            standardised.append((nonzero_fraction - bpps_nb_mean) / bpps_nb_std)
        return standardised

    # Attach each numeric feature to the train frame first, then the test frame.
    for column, reader in (
        ("bpps_sum", read_bpps_sum),
        ("bpps_max", read_bpps_max),
        ("bpps_nb", read_bpps_nb),
    ):
        df[column] = reader(df)
        df_test[column] = reader(df_test)


In [None]:
# Target columns the model predicts and the loss scores.
target_cols = ["reactivity", "deg_Mg_pH10", "deg_Mg_50C"]

# Vocabulary: the single-character symbols first, then the two-character
# combination tokens; each token maps to its position in this list.
tokens = (
    list("().ACGUBEHIMSX")
    + sequence_and_structure
    + sequence_and_predicted_loop_type
    + structure_and_predicted_loop_type
)
token2int = dict(zip(tokens, range(len(tokens))))


def preprocess_inputs(df, cols):
    """Encode the token columns of ``df`` and, optionally, append the bpps features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``cols``; when the module-level
        ``create_feture`` flag is set it must also hold ``bpps_sum``,
        ``bpps_max`` and ``bpps_nb``.
    cols : list of str
        Columns whose cells are sequences of tokens present in ``token2int``.

    Returns
    -------
    numpy.ndarray
        Array indexed as ``[row, position, feature]``: one integer-encoded
        feature per entry of ``cols``, followed by the three bpps features
        when ``create_feture`` is set.
    """
    # Plain comprehension instead of DataFrame.applymap, which is deprecated
    # in recent pandas releases; the resulting nesting is identical.
    encoded = [
        [[token2int[token] for token in seq] for seq in row]
        for row in df[cols].values.tolist()
    ]
    # (row, column, position) -> (row, position, column)
    base_fea = np.transpose(np.array(encoded), (0, 2, 1))
    if not create_feture:
        return base_fea

    numeric_feas = [
        np.array(df[name].to_list())[:, :, np.newaxis]
        for name in ("bpps_sum", "bpps_max", "bpps_nb")
    ]
    return np.concatenate([base_fea, *numeric_feas], 2)

# Encoded model inputs for the training rows, moved to the selected device.
train_features = preprocess_inputs(df, preprocess_cols)
train_inputs = torch.tensor(train_features).to(device)
print("input shape: ", train_inputs.shape)

# Targets: (row, target, position) -> (row, position, target), cast to float.
train_targets = np.transpose(np.array(df[target_cols].values.tolist()), (0, 2, 1))
train_labels = torch.tensor(train_targets).float().to(device)

In [None]:
# Peek at the first rows of the training frame after feature creation.
df.head(3)

## Model

In [None]:
# One configuration dict per model to train; str(dict) is used as the model id
# by the training and submission cells.
# NOTE(review): the training cell forwards "model", "dropout", "embed_dim" and
# "hidden_layers" to MyModel but never "hidden_dim", so both entries are built
# with MyModel's default hidden_dim — confirm whether that is intended.
models = [
    {"model": "LSTM", "dropout": 0.4, "embed_dim": 100, "hidden_dim": 128, "hidden_layers": 3},
    {"model": "LSTM", "dropout": 0.4, "embed_dim": 100, "hidden_dim": 256, "hidden_layers": 3},
]


In [None]:
class Wave_Block(nn.Module):
    """Stack of gated, dilated 1-D convolutions whose outputs are summed residually.

    Parameters
    ----------
    in_channels : int
        Channels of the input; mapped to ``out_channels`` by a 1x1 convolution.
    out_channels : int
        Channels used by every convolution after the first.
    dilation_rates : int
        Number of gated layers; layer ``i`` uses dilation ``2 ** i``.
    kernel_size : int
        Kernel size of the filter and gate convolutions (even or odd).
    """

    def __init__(self, in_channels, out_channels, dilation_rates, kernel_size):
        super(Wave_Block, self).__init__()
        self.num_rates = dilation_rates
        self.convs = nn.ModuleList()
        self.filter_convs = nn.ModuleList()
        self.gate_convs = nn.ModuleList()

        self.convs.append(nn.Conv1d(in_channels, out_channels, kernel_size=1))
        for dilation_rate in (2 ** i for i in range(dilation_rates)):
            # Symmetric padding, rounded down; when dilation * (kernel_size - 1)
            # is odd the convolution output is one step shorter than its input.
            padding = (dilation_rate * (kernel_size - 1)) // 2
            self.filter_convs.append(
                nn.Conv1d(out_channels, out_channels, kernel_size=kernel_size, padding=padding, dilation=dilation_rate))
            self.gate_convs.append(
                nn.Conv1d(out_channels, out_channels, kernel_size=kernel_size, padding=padding, dilation=dilation_rate))
            self.convs.append(nn.Conv1d(out_channels, out_channels, kernel_size=1))

    def forward(self, x):
        """Apply the block to ``x`` (channels on dim 1, length on the last dim).

        Returns a tensor with ``out_channels`` channels and the input length.
        """
        x = self.convs[0](x)
        res = x
        for i in range(self.num_rates):
            x = torch.tanh(self.filter_convs[i](x)) * torch.sigmoid(self.gate_convs[i](x))
            x = self.convs[i + 1](x)
            # Right-pad by exactly the length lost so far (1 for even kernels,
            # 0 for odd ones) so the residual sum always lines up.
            missing = res.shape[-1] - x.shape[-1]
            res = res + torch.nn.functional.pad(x, (0, missing))
        return res


class MyModel(nn.Module):
    """Token embedding + bidirectional recurrent encoder + per-position linear head.

    Reads the module-level names ``token2int``, ``preprocess_cols``,
    ``target_cols`` and ``create_feture``.

    Parameters
    ----------
    seq_len : int
        Accepted for call-site compatibility; not used by the model.
    pred_len : int
        Number of leading positions for which predictions are returned.
    dropout : float
        Dropout between the stacked layers of the main LSTM/GRU.
    embed_dim : int
        Embedding size of each categorical token.
    hidden_dim : int
        Hidden size per direction; integer-divided by 3 when ``main_layer``
        is ``"GRU"``.
    hidden_layers : int
        Number of stacked layers in the main LSTM/GRU.
    main_layer : str
        One of ``"LSTM"``, ``"GRU"``, ``"blend1"``..``"blend4"``, ``"WaveNet"``.
    """

    def __init__(
        self, seq_len=107, pred_len=68, dropout=0.4, embed_dim=100, hidden_dim=64, hidden_layers=3, main_layer="LSTM"
        ):
        super(MyModel, self).__init__()
        self.pred_len = pred_len
        self.main_layer = main_layer
        if main_layer == "GRU":
            hidden_dim //= 3

        # The three bpps features are only appended to the input when
        # create_feture is set, so only then do they widen the RNN input.
        num_numeric = 3 if create_feture else 0
        rnn_input_size = embed_dim * len(preprocess_cols) + num_numeric

        self.embeding = nn.Embedding(num_embeddings=len(token2int), embedding_dim=embed_dim)
        self.lstm_layer = nn.LSTM(
            input_size=rnn_input_size,
            hidden_size=hidden_dim,
            num_layers=hidden_layers,
            dropout=dropout,
            bidirectional=True,
            batch_first=True
        )
        self.gru_layer = nn.GRU(
            input_size=rnn_input_size,
            hidden_size=hidden_dim,
            num_layers=hidden_layers,
            dropout=dropout,
            bidirectional=True,
            batch_first=True
        )
        # Recurrent dropout acts only between stacked layers, so it has no
        # effect on these single-layer modules; 0.0 avoids PyTorch's warning.
        self.hidden_lstm_layer = nn.LSTM(
            input_size=hidden_dim*2,
            hidden_size=hidden_dim,
            num_layers=1,
            dropout=0.0,
            bidirectional=True,
            batch_first=True
        )
        self.hidden_gru_layer = nn.GRU(
            input_size=hidden_dim*2,
            hidden_size=hidden_dim,
            num_layers=1,
            dropout=0.0,
            bidirectional=True,
            batch_first=True
        )
        # NOTE(review): this block is built for every main_layer and uses
        # hidden_dim as its kernel size, so its parameter count grows quickly
        # with hidden_dim even when "WaveNet" is not selected.
        self.wave_block = Wave_Block(hidden_dim*2, hidden_dim*2, 12, hidden_dim)
        self.linear1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, len(target_cols))

    def forward(self, seqs):
        """Predict ``len(target_cols)`` values for the first ``pred_len`` positions.

        ``seqs`` is indexed as ``[batch, position, feature]``. With
        ``create_feture`` set, the first ``len(preprocess_cols)`` features are
        token ids and the remaining ones are numeric; otherwise every feature
        is a token id.

        Raises
        ------
        ValueError
            If ``main_layer`` is not one of the supported names.
        """
        num_categorical = len(preprocess_cols)
        if create_feture:
            categorical_feats = seqs[:, :, :num_categorical].long()
            numerical_feats = seqs[:, :, num_categorical:].float()
        else:
            categorical_feats = seqs.long()
            numerical_feats = None

        embed = self.embeding(categorical_feats)
        # Flatten the per-column embeddings into one vector per position.
        reshaped = torch.reshape(embed, (-1, embed.shape[1], embed.shape[2] * embed.shape[3]))
        if numerical_feats is not None:
            reshaped = torch.cat([reshaped, numerical_feats], dim=2)

        if self.main_layer == "LSTM":
            output, hidden = self.lstm_layer(reshaped)
        elif self.main_layer == "GRU":
            output, hidden = self.gru_layer(reshaped)
        elif self.main_layer in ("blend1", "blend4"):
            output, hidden = self.lstm_layer(reshaped)
            output, hidden = self.hidden_lstm_layer(output)
        elif self.main_layer in ("blend2", "blend3"):
            output, hidden = self.gru_layer(reshaped)
            output, hidden = self.hidden_gru_layer(output)
        elif self.main_layer == "WaveNet":
            output, hidden = self.lstm_layer(reshaped)
            # Conv1d expects channels on dim 1, so swap in and back out.
            output = output.permute(0, 2, 1)
            output = self.wave_block(output)
            output = output.permute(0, 2, 1)
        else:
            raise ValueError(f"unsupported main_layer: {self.main_layer!r}")

        truncated = output[:, : self.pred_len, :]
        truncated = self.linear1(truncated)
        # NOTE(review): linear2 is applied twice with shared weights and no
        # activation in between; kept as-is — confirm this is intentional.
        truncated = self.linear2(truncated)
        truncated = self.linear2(truncated)
        out = self.linear3(truncated)
        return out

# Loss read by compute_loss(): mean column-wise RMSE over every target column.
criterion = MCRMSELoss(len(target_cols))

def compute_loss(batch_X, batch_Y, model, optimizer=None, is_train=True, scheduler=None):
    """Run one batch through ``model`` and, when training, take one optimisation step.

    Parameters
    ----------
    batch_X, batch_Y : torch.Tensor
        Inputs and targets of the batch.
    model : nn.Module
        Model to evaluate; switched to train/eval mode according to ``is_train``.
    optimizer : torch.optim.Optimizer, optional
        Required when ``is_train`` is True.
    is_train : bool
        When True, back-propagate and step the optimizer (and scheduler).
    scheduler : optional
        Learning-rate scheduler stepped once per call when training.

    Returns
    -------
    tuple
        ``(loss_value, predictions)`` with the loss as a Python float.

    Raises
    ------
    ValueError
        If ``is_train`` is True and no optimizer was given.
    """
    if is_train and optimizer is None:
        raise ValueError("optimizer is required when is_train=True")

    model.train(is_train)
    # Track gradients only when they are going to be used; evaluation then
    # skips building the autograd graph.
    with torch.set_grad_enabled(is_train):
        pred_Y = model(batch_X)
        loss = criterion(pred_Y, batch_Y)

    if is_train:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if scheduler:
            scheduler.step()

    return loss.item(), pred_Y


In [None]:
# Training hyper-parameters.
FOLDS = 4
EPOCHS = 2 if debug else 100  # debug runs use 2 epochs
BATCH_SIZE = 64
VERBOSE = 2
LR = 0.016

In [None]:
# Split the test set by sequence length (107 vs. 130) and build, for each part,
# the encoded input tensor on the selected device and an unshuffled loader.
public_df, private_df = (
    df_test.query(length_filter).copy()
    for length_filter in ("seq_length == 107", "seq_length == 130")
)

public_inputs, private_inputs = (
    torch.tensor(preprocess_inputs(part_df, preprocess_cols)).to(device)
    for part_df in (public_df, private_df)
)

public_loader, private_loader = (
    DataLoader(TensorDataset(part_inputs), shuffle=False, batch_size=BATCH_SIZE)
    for part_inputs in (public_inputs, private_inputs)
)


### KFold Training and Inference

In [None]:
# Cluster the training rows on their first encoded feature (the "sequence"
# tokens); the cluster labels are used as groups for GroupKFold later on.
n_clusters = 50 if debug else 200
kmeans_features = preprocess_inputs(df, preprocess_cols)[:, :, 0]
kmeans_model = KMeans(n_clusters=n_clusters, random_state=110).fit(kmeans_features)
kmeans_labels = kmeans_model.labels_

In [None]:
# Per-model accumulators, keyed by str(model_dict).
model_histories = {str(model_id): [] for model_id in models}
model_oof_preds = {str(model_id): np.zeros((df.shape[0], 68, len(target_cols))) for model_id in models}
model_private_preds = {str(model_id): np.zeros((private_df.shape[0], 130, len(target_cols))) for model_id in models}
model_public_preds = {str(model_id): np.zeros((public_df.shape[0], 107, len(target_cols))) for model_id in models}

criterion = MCRMSELoss()
gkf = GroupKFold(FOLDS)


def _predict(model, loader, pred_len):
    """Return ``model``'s predictions for every batch of ``loader`` as one array.

    ``model.pred_len`` is set to ``pred_len`` first, so the same trained model
    can emit 68, 107 or 130 positions. The first element of each batch is
    taken as the input tensor.
    """
    model.pred_len = pred_len
    model.eval()
    # Starting from an empty array keeps the result well-formed (and float64)
    # even when the loader yields no batches.
    chunks = [np.empty((0, pred_len, len(target_cols)))]
    with torch.no_grad():
        for batch in loader:
            chunks.append(model(batch[0]).cpu().numpy())
    return np.concatenate(chunks, axis=0)


for model_dict in models:
    model_id = str(model_dict)
    model_name = model_dict["model"]
    dropout = model_dict["dropout"]
    embed_dim = model_dict["embed_dim"]
    hidden_layers = model_dict["hidden_layers"]
    # NOTE(review): model_dict["hidden_dim"] is not forwarded, so MyModel uses
    # its default hidden_dim for every entry. Forwarding it also multiplies the
    # size of MyModel's always-built Wave_Block — decide on both together.

    model_oof_pred = []
    oof_idxes = []
    for k, (train_index, val_index) in enumerate(gkf.split(train_inputs, df["reactivity"], kmeans_labels)):
        oof_idxes.append(val_index)
        train_dataset = TensorDataset(train_inputs[train_index], train_labels[train_index])
        val_dataset = TensorDataset(train_inputs[val_index], train_labels[val_index])

        train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
        # Unshuffled: keeps the per-epoch validation loss comparable and lets
        # the same loader produce OOF predictions aligned with val_index.
        val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)

        model = MyModel(dropout=dropout, embed_dim=embed_dim, hidden_layers=hidden_layers, main_layer=model_name).to(device)
        optimizer = optim.Adam(model.parameters(), lr=LR)
        # compute_loss steps the scheduler once per training batch.
        scheduler = CosineAnnealingLR(optimizer, T_max=20, eta_min=0.008)
        # scheduler = ExponentialLR(optimizer, gamma=0.95)

        train_losses = []
        val_losses = []
        for epoch in tqdm(range(EPOCHS)):
            train_losses_batch = []
            val_losses_batch = []
            for (batch_X, batch_Y) in train_loader:
                train_loss, _ = compute_loss(batch_X, batch_Y, model, optimizer=optimizer, is_train=True, scheduler=scheduler)
                train_losses_batch.append(train_loss)
            for (batch_X, batch_Y) in val_loader:
                # compute_loss switches the model to eval mode for is_train=False.
                val_loss, _ = compute_loss(batch_X, batch_Y, model, optimizer=optimizer, is_train=False)
                val_losses_batch.append(val_loss)
            train_losses.append(np.mean(train_losses_batch))
            val_losses.append(np.mean(val_losses_batch))

        torch.save(model.state_dict(), f"{model_id}_fold{k}.pth")
        model_histories[model_id].append({"train_loss": train_losses, "val_loss": val_losses})

        # Test predictions are averaged over folds; OOF predictions are
        # collected per fold and re-ordered after the fold loop.
        model_public_preds[model_id] += _predict(model, public_loader, 107) / FOLDS
        model_private_preds[model_id] += _predict(model, private_loader, 130) / FOLDS
        model_oof_pred.append(_predict(model, val_loader, 68))

        del model, optimizer, scheduler
        gc.collect()

    # Folds visit rows out of order; argsort restores the original row order.
    oof_idxes = np.concatenate(oof_idxes)
    order = np.argsort(oof_idxes)
    model_oof_preds[model_id] = np.concatenate(model_oof_pred)[order]


In [None]:
# Grid-search the blend weight between the two models on their out-of-fold
# predictions: blended = alpha * first + (1 - alpha) * second.
oof_first = torch.tensor(model_oof_preds[str(models[0])])
oof_second = torch.tensor(model_oof_preds[str(models[1])])
# Converted once, outside the loop, and cast to the predictions' dtype so the
# loss never mixes float32 and float64 operands.
oof_labels = train_labels.detach().cpu().to(oof_first.dtype)

alphas = np.linspace(0, 1, 100).tolist()
ensemble_losses = []
with torch.no_grad():
    for alpha in alphas:
        blended = oof_first * alpha + oof_second * (1 - alpha)
        ensemble_losses.append(float(criterion(blended, oof_labels)))

best_index = int(np.argmin(ensemble_losses))
best_alpha = alphas[best_index]
best_loss = ensemble_losses[best_index]

fig, ax = plt.subplots()
ax.plot(alphas, ensemble_losses)
ax.set_xlabel('alpha (weight of the first model)')
ax.set_ylabel('OOF loss')
print(f"best_alpha is {best_alpha}, best loss is {best_loss} ,lr :{LR}")

In [None]:

fig, ax = plt.subplots(1, 1, figsize = (20, 10))

legend = []

for model_dict in models:
    model_id = str(model_dict)
    # Fold-averaged curves: every fold contributes 1 / FOLDS of its per-epoch
    # loss, so the contributions must be accumulated, not assigned.
    train_loss = np.zeros(EPOCHS)
    val_loss = np.zeros(EPOCHS)
    for history in model_histories[model_id]:
        train_loss += np.array(history['train_loss']) / FOLDS
        val_loss += np.array(history['val_loss']) / FOLDS
    ax.plot(train_loss)
    ax.plot(val_loss)

    legend.append(f'{model_id}_train')
    legend.append(f'{model_id}_validation')

    # Final-epoch validation loss, averaged over folds like the plotted curve.
    print(f"{val_loss[-1]} : valid last {model_id}  ,lr :{LR}")

ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(legend)

### Submission

In [None]:
# Rebuild the public (length 107) and private (length 130) test frames and
# their encoded inputs; here the inputs stay NumPy arrays.
public_df, private_df = (
    df_test.query(length_filter).copy()
    for length_filter in ("seq_length == 107", "seq_length == 130")
)

public_inputs, private_inputs = (
    preprocess_inputs(part_df, preprocess_cols)
    for part_df in (public_df, private_df)
)

In [None]:
# Per-model list of per-sequence prediction frames, and the merged submissions.
preds_model = {str(model_id): [] for model_id in models}
submissions = {}

for model_dict in models:
    model_id = str(model_dict)
    # The loop variable is named `test_df` so the global training frame `df`
    # is not overwritten by this cell.
    for test_df, preds in [(public_df, model_public_preds[model_id]), (private_df, model_private_preds[model_id])]:
        for i, uid in enumerate(test_df.id):
            single_pred = preds[i]

            # One row per sequence position, keyed as "<id>_<position>".
            single_df = pd.DataFrame(single_pred, columns=target_cols)
            single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

            preds_model[model_id].append(single_df)

    preds_df = pd.concat(preds_model[model_id])
    print(preds_df.head())
    # Keep only the rows listed in the sample submission.
    submission = sample_sub[['id_seqpos']].merge(preds_df, on=['id_seqpos'])
    # These two columns are not predicted by the model; they are filled with 0.
    submission['deg_pH10'] = 0
    submission['deg_50C'] = 0
    print(submission.head())
    submission.to_csv(f'submission_{model_id}.csv', index=False)
    submissions[model_id] = submission
    print('Submission saved')


## ensemble submission with two LSTM

In [None]:
# Blend the two per-model submissions with the weight found on the
# out-of-fold predictions, then re-attach the id column.
first_submission = submissions[str(models[0])]
second_submission = submissions[str(models[1])]
id_seqpos = first_submission["id_seqpos"]
ensemble_submission = (
    first_submission.drop("id_seqpos", axis=1) * best_alpha
    + second_submission.drop("id_seqpos", axis=1) * (1 - best_alpha)
)
ensemble_submission["id_seqpos"] = id_seqpos
ensemble_submission.to_csv(f'submission_lstm_gru_ensemble.csv', index=False)

In [None]:
if not debug:
    # POST a JSON message to a webhook once the full (non-debug) run is done.
    # `<your_webhook_url>` is a placeholder and must be replaced with a real URL.
    !curl -X POST -H 'Content-type: application/json' --data '{"text":"commit done! "}' <your_webhook_url>