## Imports

In [None]:
!pip install torchcontrib

In [None]:
import os
import math
import time
import torch
import random
import shutil
import datetime
import functools
import numpy as np
import pandas as pd
import torch.nn.functional as F


from torch import nn
from pathlib import Path
from torchcontrib.optim import SWA
from sklearn.cluster import KMeans
from collections import OrderedDict
from torch.utils.data import Dataset
from sklearn.model_selection import *
from transformers import get_linear_schedule_with_warmup
from torch.nn import TransformerEncoder, TransformerEncoderLayer

## Initialization

In [None]:
# Names of the five target columns; also used as the submission column names.
TARGETS = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
# Indices into TARGETS. Not referenced elsewhere in this file: `mcrmse`
# hard-codes the same list as its default `scored_targets`.
SCORED_TARGETS = [0, 1, 3]

NUM_TARGETS = len(TARGETS)

# Number of leading positions per sequence that receive pseudo-labels below.
SEQ_SCORED_PUBLIC = 68
SEQ_SCORED_PRIVATE = 91

# `seq_length` values used to split test.json into a public and a private part.
SEQ_LEN_PUBLIC = 107
SEQ_LEN_PRIVATE = 130

In [None]:
# Root folder holding train.json, test.json and the bpps/ matrices.
BASE_PATH = "../input/stanford-covid-vaccine/"
# Prefix/folder for checkpoints written by this notebook ("" = working dir).
CP_PATH = ""
# Autoencoder checkpoint loaded before fine-tuning when LOAD is True.
PRETRAINED_PATH = "../input/covid-pretrained/pretrained_model.pt"

# NOTE: CUDA is hard-coded; there is no CPU fallback in this file.
DEVICE = torch.device('cuda')
# Date string used in output file names.
TODAY = str(datetime.date.today())

In [None]:
# Both files are read as JSON-lines (one record per line).
train_df = pd.read_json(str(Path(BASE_PATH) / 'train.json'), lines=True)

test_df = pd.read_json(str(Path(BASE_PATH) / 'test.json'), lines=True)
# Split the test set by sequence length; indices are reset so that positional
# and label-based row access coincide downstream.
public_df = test_df[test_df["seq_length"] == SEQ_LEN_PUBLIC].reset_index(drop=True)
private_df = test_df[test_df["seq_length"] == SEQ_LEN_PRIVATE].reset_index(drop=True)

In [None]:
# Label columns, in the order used for the model output channels.
target_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
# String-valued input columns that get one-hot encoded.
input_cols = ['sequence', 'structure', 'predicted_loop_type']
# NOTE: error_cols is not referenced anywhere else in this file.
error_cols = ['reactivity_error', 'deg_error_Mg_pH10', 'deg_error_Mg_50C', 'deg_error_pH10', 'deg_error_50C']

# Per-column vocabularies mapping each character to its one-hot index.
# 4 + 3 + 7 = 14 channels, which is what `preprocess` asserts.
token_dicts = {
    "sequence": {x: i for i, x in enumerate("ACGU")},
    "structure": {x: i for i, x in enumerate('().')},
    "predicted_loop_type": {x: i for i, x in enumerate("BEHIMSX")}
}

In [None]:
def set_seed(seed=42, benchmark=True):
    """
    Seeds every random number generator used by the notebook.

    Arguments:
        seed {int} -- Seed applied to `random`, numpy, torch and PYTHONHASHSEED (default: {42})

    Keyword Arguments:
        benchmark {bool} -- Value assigned to `torch.backends.cudnn.benchmark`.
            The default keeps the previous hard-coded behaviour (True); pass
            False when bit-for-bit reproducible runs matter more than speed,
            since cuDNN autotuning may select different kernels between runs
            (default: {True})
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not only the current one.
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = benchmark


SEED = 1234
set_seed(SEED)

## Groups

In [None]:
# Single shared vocabulary over structure, sequence and loop-type characters.
token2int = {x: i for i, x in enumerate('().ACGUBEHIMSX')}


def preprocess_inputs(df, cols=None):
    """
    Integer-encodes string columns of a dataframe with `token2int`.

    NOTE: a later cell redefines `preprocess_inputs` with a one-hot version;
    this integer version is only the one in effect for the grouping cell
    that directly follows it.

    Arguments:
        df {pandas dataframe} -- Frame whose `cols` hold equal-length strings

    Keyword Arguments:
        cols {list of str or None} -- Columns to encode; None selects
            ['sequence', 'structure', 'predicted_loop_type'] (default: {None})

    Returns:
        numpy array -- Token ids of shape (n_rows, seq_length, n_cols)
    """
    if cols is None:
        cols = ['sequence', 'structure', 'predicted_loop_type']

    # A plain comprehension avoids the deprecated DataFrame.applymap.
    encoded = [
        [[token2int[ch] for ch in seq] for seq in row]
        for row in df[list(cols)].values.tolist()
    ]
    # (n_rows, n_cols, seq_length) -> (n_rows, seq_length, n_cols)
    return np.transpose(np.array(encoded), (0, 2, 1))

In [None]:
# Integer-encoded `sequence` column, one row per training sample.
X = preprocess_inputs(train_df, cols=['sequence'])[:, :, 0]
# Cluster the encoded sequences; the cluster id of each sample is later
# passed as `groups` to GroupKFold so a cluster never spans train and validation.
kmeans = KMeans(n_clusters=200, random_state=110).fit(X)
groups = kmeans.labels_

## Augmented data

In [None]:
# Table of augmented rows consumed by `augment_data`, which looks rows up by
# `id` and merges on ('id', 'sequence').
aug_df = pd.read_csv('../input/covid-data/aug_data.csv')

In [None]:
def augment_data(df, concat=True):
    """
    Builds the augmented counterpart of `df` from the global `aug_df`.

    Rows of `aug_df` whose `id` occurs in `df` are merged with the
    non-structural columns of `df` on ('id', 'sequence'), so the augmented
    rows carry `aug_df`'s own `structure` / `predicted_loop_type`.
    The input frame is not modified.

    Arguments:
        df {pandas dataframe} -- Samples to augment. Must hold `id`, `sequence`,
            `structure`, `predicted_loop_type` and must not already hold
            `cnt`, `log_gamma` or `score` (presumably those come from
            `aug_df` -- confirm against the csv)

    Keyword Arguments:
        concat {bool} -- Return originals followed by augmented rows when True,
            augmented rows only otherwise (default: {True})

    Returns:
        pandas dataframe -- Frame with a fresh 0..n-1 index
    """
    df = df.copy()
    target_df = df.copy()
    new_df = aug_df[aug_df['id'].isin(target_df['id'])]

    del target_df['structure']
    del target_df['predicted_loop_type']
    new_df = new_df.merge(target_df, on=['id', 'sequence'], how='left').sort_values('index')

    df['cnt'] = df['id'].map(new_df[['id', 'cnt']].set_index('id').to_dict()['cnt'])
    # Original (non-augmented) rows get fixed values for these two columns.
    df['log_gamma'] = 100
    df['score'] = 1.0

    augmented = new_df[df.columns]
    if concat:
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works on both old and new versions.
        return pd.concat([df, augmented]).reset_index(drop=True)
    return augmented.reset_index(drop=True)

## Pseudo-labels (PL)

In [None]:
# Snapshot of the public test ids, taken before `public_df` is augmented.
PUBLIC_IDS = public_df['id'].values


def is_public(id_seqpos):
    """
    Tells whether a submission key belongs to the public test set.

    Arguments:
        id_seqpos {str} -- Key whose first two '_'-separated fields form the sample id

    Returns:
        bool -- Result of the membership test against PUBLIC_IDS
    """
    leading_fields = id_seqpos.split('_')[:2]
    sample_id = '_'.join(leading_fields)
    return sample_id in PUBLIC_IDS

In [None]:
# Folder holding the pseudo-label arrays loaded in the next cell.
PL_PATH = "../input/covid-pl/"

In [None]:
# Pseudo-label arrays, indexed below as [row, position, target].
# Presumably rows follow the order of public_df / private_df -- confirm.
PL_PUBLIC = np.load(PL_PATH + 'pl_public.npy')
PL_PRIVATE = np.load(PL_PATH + 'pl_private.npy')

In [None]:
# Store the pseudo-labels as target columns of the test frames: one list per
# row, truncated to the number of scored positions of each set.
for t, target in enumerate(TARGETS):
    public_df[target] = [
        list(PL_PUBLIC[row, :SEQ_SCORED_PUBLIC, t])
        for row in range(len(public_df))
    ]
    private_df[target] = [
        list(PL_PRIVATE[row, :SEQ_SCORED_PRIVATE, t])
        for row in range(len(private_df))
    ]

# Constant signal-to-noise for every pseudo-labelled sample.
private_df['signal_to_noise'] = 1
public_df['signal_to_noise'] = 1

# Append the augmented variants (default concat=True).
public_df = augment_data(public_df)
private_df = augment_data(private_df)

### Checkpointing

In [None]:
def save_model_weights(model, filename, verbose=1, cp_folder=""):
    """
    Writes the state dict of a PyTorch model to disk.

    Arguments:
        model {torch module} -- Model whose weights are saved
        filename {str} -- Name of the checkpoint file

    Keyword Arguments:
        verbose {int} -- Whether to print the destination (default: {1})
        cp_folder {str} -- Folder to save to (default: {''})
    """
    destination = os.path.join(cp_folder, filename)
    if verbose:
        print(f"\n -> Saving weights to {destination}\n")
    torch.save(model.state_dict(), destination)


def load_model_weights(model, filename, verbose=1, cp_folder="", strict=True):
    """
    Loads the weights of a PyTorch model from a checkpoint file.

    The checkpoint is always deserialised onto the CPU; `load_state_dict`
    then copies the tensors into the model's parameters wherever they live,
    so the same checkpoint loads on CPU-only and GPU machines alike.

    Arguments:
        model {torch module} -- Model to load the weights to
        filename {str} -- Name of the checkpoint

    Keyword Arguments:
        verbose {int} -- Whether to display infos (default: {1})
        cp_folder {str} -- Folder to load from (default: {''})
        strict {bool} -- Forwarded to `load_state_dict`; when True, missing or
            unexpected keys raise (default: {True})

    Returns:
        torch module -- Model with loaded weights
    """
    path = os.path.join(cp_folder, filename)
    if verbose:
        print(f"\n -> Loading weights from {path}\n")

    # The previous version first called load_state_dict with a path and an
    # undefined `strict`, which always raised and was swallowed by a
    # catch-all handler; only this path ever did the work.
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict, strict=strict)
    return model

## Data


In [None]:
def preprocess_inputs(df, cols):
    """
    One-hot encodes several string columns and stacks them channel-wise.

    This definition replaces the integer-encoding `preprocess_inputs`
    defined in the grouping section earlier in the notebook.

    Arguments:
        df {pandas dataframe} -- Frame holding the columns to encode
        cols {list of str} -- Columns to encode, each a key of `token_dicts`

    Returns:
        numpy array -- Per-column one-hot blocks concatenated along the last axis
    """
    per_column = [preprocess_feature_col(df, name) for name in cols]
    return np.concatenate(per_column, axis=2)


def preprocess_feature_col(df, col):
    """
    One-hot encodes a single string column.

    Arguments:
        df {pandas dataframe} -- Non-empty frame; every string in `col` must
            have the same length
        col {str} -- Column name, also the key of its vocabulary in `token_dicts`

    Returns:
        numpy array -- Float array of shape (len(df), seq_length, vocabulary size)

    Raises:
        KeyError -- If a character is missing from the column's vocabulary
        AssertionError -- If the encoded array does not have the expected shape
    """
    vocab = token_dicts[col]
    vocab_size = len(vocab)
    # Positional access: works whatever the frame's index labels are.
    seq_length = len(df[col].iloc[0])
    one_hot = np.identity(vocab_size)
    # Row k of the identity matrix is the one-hot vector of token k, so a
    # fancy-index by the token ids encodes a whole sequence at once. This
    # also avoids the deprecated DataFrame.applymap.
    arr = np.array([one_hot[[vocab[ch] for ch in seq]] for seq in df[col]])
    # shape: data_size x seq_length x dic_length
    assert arr.shape == (len(df), seq_length, vocab_size)
    return arr


def preprocess(base_data, is_test=False):
    """
    Turns a dataframe into model inputs and, unless testing, labels.

    Arguments:
        base_data {pandas dataframe} -- Frame holding `input_cols` (and
            `target_cols` when `is_test` is False)

    Keyword Arguments:
        is_test {bool} -- Skip label extraction when True (default: {False})

    Returns:
        tuple -- (inputs, labels): one-hot inputs with 14 channels, and labels
            of shape (n_rows, positions, n_targets) or None for test data
    """
    inputs = preprocess_inputs(base_data, input_cols)

    labels = None
    if not is_test:
        raw_targets = base_data[target_cols].values.tolist()
        # (n_rows, n_targets, positions) -> (n_rows, positions, n_targets)
        labels = np.array(raw_targets).transpose((0, 2, 1))
        assert labels.shape[2] == len(target_cols)

    assert inputs.shape[2] == 14
    return inputs, labels


def get_bpp_feature(bpp):
    """
    Derives three per-position summary features from a batch of square matrices.

    Arguments:
        bpp {torch tensor} -- Tensor indexed as (batch, row, column)

    Returns:
        list of torch tensors -- [max over the last axis, sum over the last axis,
            standardised fraction of entries > 0 counted along axis 1],
            each with a trailing singleton axis
    """
    nonzero_mean = 0.077522  # mean of bpps_nb across all training data
    nonzero_std = 0.08914  # std of bpps_nb across all training data

    row_max = bpp.max(dim=-1).values
    row_sum = bpp.sum(dim=-1)
    # The count runs along axis 1 while max/sum run along the last axis.
    nonzero_frac = torch.true_divide((bpp > 0).sum(dim=1), bpp.shape[1])
    nonzero_std_score = torch.true_divide(nonzero_frac - nonzero_mean, nonzero_std)

    return [feat.unsqueeze(2) for feat in (row_max, row_sum, nonzero_std_score)]


@functools.lru_cache(5000)
def load_from_id(id_):
    """
    Loads the matrix stored for a sample id, memoising up to 5000 results.

    The cached array object is returned as-is, so callers that modify it
    must copy it first.

    Arguments:
        id_ {str} -- Sample id; the file read is BASE_PATH/bpps/<id_>.npy

    Returns:
        numpy array -- Content of the .npy file
    """
    return np.load(str(Path(BASE_PATH) / f"bpps/{id_}.npy"))


def get_distance_matrix(leng, powers=(1, 2, 4)):
    """
    Builds inverse-distance features between all pairs of sequence positions.

    For positions i and j the base value is 1 / (|i - j| + 1); one channel is
    produced per exponent in `powers`.

    Arguments:
        leng {int} -- Sequence length (0 yields an empty array)

    Keyword Arguments:
        powers {sequence of int} -- Exponents applied to the base value, one
            output channel each (default: {(1, 2, 4)})

    Returns:
        numpy array -- Float array of shape (1, leng, leng, len(powers))
    """
    idx = np.arange(leng)
    # Broadcasting gives the full |i - j| table without a Python loop.
    inv_dist = 1 / (np.abs(idx[:, None] - idx[None, :]) + 1)
    inv_dist = inv_dist[None, :, :]
    return np.stack([inv_dist ** p for p in powers], axis=3)


def get_structure_adj(df):
    """
    Builds the base-pairing adjacency matrix of every row from its
    dot-bracket structure string.

    Matching '(' / ')' positions are linked symmetrically. Pairs are first
    recorded per ordered base-pair type and then summed into one channel;
    a pair whose bases are not one of the six listed types raises KeyError.

    Arguments:
        df {pandas dataframe} -- Frame with `seq_length`, `structure` and `sequence`

    Returns:
        numpy array -- One (seq_length, seq_length, 1) matrix per row, stacked
            along a new leading axis
    """
    pair_types = (
        ("A", "U"), ("C", "G"), ("U", "G"),
        ("U", "A"), ("G", "C"), ("G", "U"),
    )

    adjacency = []
    for n_pos, structure, sequence in zip(df["seq_length"], df["structure"], df["sequence"]):
        per_pair = OrderedDict((pair, np.zeros([n_pos, n_pos])) for pair in pair_types)

        open_positions = []  # stack of unmatched '(' positions
        for pos in range(n_pos):
            symbol = structure[pos]
            if symbol == "(":
                open_positions.append(pos)
            elif symbol == ")":
                partner = open_positions.pop()
                per_pair[(sequence[partner], sequence[pos])][partner, pos] = 1
                per_pair[(sequence[pos], sequence[partner])][pos, partner] = 1

        stacked = np.stack(list(per_pair.values()), axis=2)
        # Collapse the six pair-type channels into a single one.
        adjacency.append(stacked.sum(axis=2, keepdims=True))

    return np.array(adjacency)

## Loaders


In [None]:
def create_loader(df, batch_size=64, is_test=False, shuffle=True):
    """
    Builds a DataLoader over a VacDataset created from a dataframe.

    Arguments:
        df {pandas dataframe} -- Samples to load

    Keyword Arguments:
        batch_size {int} -- Batch size (default: {64})
        is_test {bool} -- Build an unlabeled, unshuffled loader (default: {False})
        shuffle {bool} -- Shuffle labeled data; ignored when there are no labels (default: {True})

    Returns:
        torch DataLoader -- Loader yielding the dicts produced by VacDataset
    """
    if is_test:
        shuffle = False

    features, labels = preprocess(df, is_test)
    features_tensor = torch.from_numpy(features)

    has_labels = labels is not None
    labels_tensor = torch.from_numpy(labels) if has_labels else None
    dataset = VacDataset(features_tensor, df, labels_tensor)

    # Unlabeled data is never shuffled, whatever `shuffle` says.
    return torch.utils.data.DataLoader(
        dataset, batch_size, shuffle=shuffle if has_labels else False
    )


class VacDataset(Dataset):
    """
    Dataset pairing precomputed sequence features with pairwise matrices.

    Each item is a dict with `sequence`, `bpp` and `ids`, plus `label`,
    `signal_to_noise` and `score` when labels were given.
    """

    def __init__(self, features, df, labels=None):
        """
        Arguments:
            features {torch tensor} -- Per-sample input features, row-aligned with `df`
            df {pandas dataframe} -- Source frame; needs `id`, `seq_length`,
                `structure`, `sequence` (and `signal_to_noise` when labeled).
                NOTE: a `score` column filled with 1.0 is added IN PLACE when
                missing; later cells of this notebook rely on that side effect.

        Keyword Arguments:
            labels {torch tensor or None} -- Targets; None marks a test dataset (default: {None})
        """
        self.features = features
        self.labels = labels
        self.test = labels is None
        self.ids = df["id"]
        self.structure_adj = get_structure_adj(df)
        # A single distance block is shared by all samples, sized from the
        # adjacency matrices (so every row must have the same length).
        self.distance_matrix = get_distance_matrix(self.structure_adj.shape[1])
        if "score" not in df.columns:
            df["score"] = 1.0
        self.score = df["score"]
        self.signal_to_noise = None
        if not self.test:
            self.signal_to_noise = df["signal_to_noise"]
            assert self.features.shape[0] == self.labels.shape[0]

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        # `.iloc` keeps the Series lookups positional, like `features`,
        # `labels` and `structure_adj`; plain `[]` is label-based and only
        # agrees with them when the frame has a default 0..n-1 index.
        sample_id = self.ids.iloc[index]
        bpp = torch.from_numpy(load_from_id(sample_id).copy()).float()
        adj = self.structure_adj[index]
        distance = self.distance_matrix[0]
        # Channels: 1 (loaded matrix) + 1 (adjacency) + 3 (distance powers).
        # np.concatenate converts the tensor back to numpy.
        bpp = np.concatenate([bpp[:, :, None], adj, distance], axis=2)
        if self.test:
            return dict(sequence=self.features[index].float(), bpp=bpp, ids=sample_id)
        return dict(sequence=self.features[index].float(), bpp=bpp,
                    label=self.labels[index], ids=sample_id,
                    signal_to_noise=self.signal_to_noise.iloc[index],
                    score=self.score.iloc[index])

## Model

In [None]:
# When True, BaseAttnModel appends the three get_bpp_feature channels to its input.
USE_FT = True

# Dropout rates read by the model classes below.
CNN_DROP = 0.1  # Conv1dStack / Conv2dStack
ENC_DROP = 0.1  # PositionalEncoding (a falsy value disables it)
RNN_DROP = 0.3  # RnnLayers
LOGIT_DROP = 0.25  # only referenced by the commented-out head in FromAeModel

# Base width `d` of BaseAttnModel; the heads take D * 2 input features.
D = 256

In [None]:
class Conv1dStack(nn.Module):
    """
    Two Conv1d -> BatchNorm -> Dropout -> LeakyReLU stages; the output of the
    second stage is added to the output of the first.

    The addition requires `padding`/`dilation`/`kernel_size` to preserve the
    sequence length. Attribute names (`conv`, `res`) are part of the
    checkpoint format.
    """

    def __init__(self, in_dim, out_dim, kernel_size=3, padding=1, dilation=1):
        super().__init__()
        conv_kwargs = dict(kernel_size=kernel_size, padding=padding, dilation=dilation, bias=False)
        self.conv = nn.Sequential(
            nn.Conv1d(in_dim, out_dim, **conv_kwargs),
            nn.BatchNorm1d(out_dim),
            nn.Dropout(CNN_DROP),
            nn.LeakyReLU(),
        )
        self.res = nn.Sequential(
            nn.Conv1d(out_dim, out_dim, **conv_kwargs),
            nn.BatchNorm1d(out_dim),
            nn.Dropout(CNN_DROP),
            nn.LeakyReLU(),
        )

    def forward(self, x):
        projected = self.conv(x)
        return projected + self.res(projected)


class Conv2dStack(nn.Module):
    """
    2D counterpart of Conv1dStack: two Conv2d -> BatchNorm -> Dropout ->
    LeakyReLU stages, with the second stage's output added to the first's.

    Attribute names (`conv`, `res`) are part of the checkpoint format.
    """

    def __init__(self, in_dim, out_dim, kernel_size=3, padding=1, dilation=1):
        super().__init__()
        conv_kwargs = dict(kernel_size=kernel_size, padding=padding, dilation=dilation, bias=False)
        self.conv = nn.Sequential(
            nn.Conv2d(in_dim, out_dim, **conv_kwargs),
            nn.BatchNorm2d(out_dim),
            nn.Dropout(CNN_DROP),
            nn.LeakyReLU(),
        )
        self.res = nn.Sequential(
            nn.Conv2d(out_dim, out_dim, **conv_kwargs),
            nn.BatchNorm2d(out_dim),
            nn.Dropout(CNN_DROP),
            nn.LeakyReLU(),
        )

    def forward(self, x):
        projected = self.conv(x)
        return projected + self.res(projected)


class SeqEncoder(nn.Module):
    """
    Chain of four Conv1dStack blocks whose outputs are all concatenated
    along the channel axis.

    The block widths are out_dim // 2, // 4, // 8 and // 8, i.e. `out_dim`
    channels in total when `out_dim` is divisible by 8.
    """

    def __init__(self, in_dim: int, out_dim=256):
        super().__init__()
        half, quarter, eighth = out_dim // 2, out_dim // 4, out_dim // 8
        self.conv0 = Conv1dStack(in_dim, half, 3, padding=1)
        self.conv1 = Conv1dStack(half, quarter, 6, padding=5, dilation=2)
        self.conv2 = Conv1dStack(quarter, eighth, 15, padding=7, dilation=1)
        self.conv3 = Conv1dStack(eighth, eighth, 30, padding=29, dilation=2)

    def forward(self, x):
        # Each block feeds the next one; every intermediate output is kept.
        stage_outputs = []
        hidden = x
        for block in (self.conv0, self.conv1, self.conv2, self.conv3):
            hidden = block(hidden)
            stage_outputs.append(hidden)
        # BATCH x out_dim x seq_length
        return torch.cat(stage_outputs, dim=1)


class BppAttn(nn.Module):
    """
    Mixes sequence features across positions using weights computed from the
    5-channel pairwise tensor.
    """

    def __init__(self, in_channel: int, out_channel: int):
        super().__init__()
        self.conv0 = Conv1dStack(in_channel, out_channel, 3, padding=1)
        # 5 input channels: the width of the `bpp` tensor built by VacDataset.
        self.bpp_conv = Conv2dStack(5, out_channel)

    def forward(self, x, bpp):
        seq_feat = self.conv0(x)        # BATCH x C x SEQ
        pair_weights = self.bpp_conv(bpp)  # BATCH x C x SEQ x SEQ
        # Per channel: (SEQ x SEQ) @ (SEQ x 1) -> SEQ x 1
        mixed = torch.matmul(pair_weights, seq_feat.unsqueeze(-1))
        return mixed.squeeze(-1)


class PositionalEncoding(nn.Module):
    """
    Adds a fixed sine/cosine position table to a sequence-first input and
    applies dropout when ENC_DROP is truthy.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        if ENC_DROP:
            self.dropout = nn.Dropout(p=ENC_DROP)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(positions * inv_freq)  # even channels
        table[:, 1::2] = torch.cos(positions * inv_freq)  # odd channels
        # max_len x 1 x d_model, broadcast over the batch axis in forward().
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        # x is indexed as (position, batch, channel).
        out = x + self.pe[:x.size(0), :]
        if ENC_DROP:
            out = self.dropout(out)
        return out


class TransformerWrapper(nn.Module):
    """
    Transformer encoder exposing the (output, state) return convention and the
    `flatten_parameters` method of the recurrent layers, so RnnLayers can
    treat it like one of them.
    """

    def __init__(self, dmodel=256, nhead=8, num_layers=2):
        super().__init__()
        # NOTE(review): `pos_encoder` is never used in forward() and is built
        # with a hard-coded width of 256. It is kept because its `pe` buffer
        # is part of the state dict of existing checkpoints.
        self.pos_encoder = PositionalEncoding(256)
        layer = TransformerEncoderLayer(d_model=dmodel, nhead=nhead)
        self.transformer_encoder = TransformerEncoder(layer, num_layers)
        self.pos_emb = PositionalEncoding(dmodel)

    def flatten_parameters(self):
        """No-op kept for interface parity with nn.LSTM / nn.GRU."""
        return None

    def forward(self, x):
        # batch-first -> sequence-first, as the encoder expects
        seq_first = x.transpose(0, 1).contiguous()
        encoded = self.transformer_encoder(self.pos_emb(seq_first))
        # back to batch-first; None stands in for the recurrent state
        return encoded.transpose(0, 1).contiguous(), None


class RnnLayers(nn.Module):
    """
    Sequence head: transformer encoder, then a bidirectional LSTM and a
    bidirectional GRU, each preceded by dropout. Width is preserved
    (dmodel in, 2 * (dmodel // 2) out).
    """

    def __init__(self, dmodel, transformer_layers: int = 2):
        super().__init__()
        self.dropout = nn.Dropout(RNN_DROP)
        self.rnn0 = TransformerWrapper(dmodel, nhead=8, num_layers=transformer_layers)
        recurrent_kwargs = dict(batch_first=True, num_layers=1, bidirectional=True)
        self.rnn1 = nn.LSTM(dmodel, dmodel // 2, **recurrent_kwargs)
        self.rnn2 = nn.GRU(dmodel, dmodel // 2, **recurrent_kwargs)

    def forward(self, x):
        self.rnn0.flatten_parameters()
        x, _ = self.rnn0(x)
        # A recurrent layer set to None is skipped.
        for recurrent in (self.rnn1, self.rnn2):
            if recurrent is None:
                continue
            recurrent.flatten_parameters()
            x, _ = recurrent(self.dropout(x))
        return x

    
class BaseAttnModel(nn.Module):
    """
    Shared backbone: encodes the sequence features, mixes them through the
    pairwise tensor, and runs the concatenation through RnnLayers.

    Output is BATCH x seq_len x 2d.
    """

    def __init__(self, transformer_layers=2, d=256):
        super().__init__()

        # 14 one-hot channels, plus 3 bpp summary channels when USE_FT is on.
        n_inputs = 14 + 3 if USE_FT else 14
        self.linear0 = nn.Linear(n_inputs, 1)
        # +1 for the channel produced by linear0.
        self.seq_encoder_x = SeqEncoder(in_dim=n_inputs + 1, out_dim=d)

        self.attn = BppAttn(d, d // 2)
        self.seq_encoder_bpp = SeqEncoder(in_dim=d // 2, out_dim=d)
        self.seq = RnnLayers(d * 2, transformer_layers=transformer_layers)

    def forward(self, x, bpp):
        # Summary features come from channel 0 of the pairwise tensor only.
        summary = get_bpp_feature(bpp[..., 0].float())
        if USE_FT:
            x = torch.cat([x] + summary, dim=-1)

        x = torch.cat([x, self.linear0(x)], dim=-1)

        # Channels-first layouts for the convolutions.
        x = x.permute(0, 2, 1).contiguous().float()           # BATCH x 18 x seq_len
        bpp = bpp.permute([0, 3, 1, 2]).contiguous().float()  # BATCH x 5 x seq_len x seq_len

        seq_feat = self.seq_encoder_x(x)                           # BATCH x d x seq_len
        pair_feat = self.seq_encoder_bpp(self.attn(seq_feat, bpp))  # BATCH x d x seq_len

        # Back to BATCH x seq_len x d, then join both branches on the feature axis.
        seq_feat = seq_feat.permute(0, 2, 1).contiguous()
        pair_feat = pair_feat.permute(0, 2, 1).contiguous()
        joined = torch.cat([seq_feat, pair_feat], dim=2)           # BATCH x seq_len x 2d

        return self.seq(joined)


class AEModel(nn.Module):
    """
    Denoising autoencoder used for pretraining: the backbone followed by a
    sigmoid head that predicts the 14 one-hot input channels per position.

    The backbone is stored as `self.seq` and reused by FromAeModel.
    """

    def __init__(self, transformer_layers=2):
        super(AEModel, self).__init__()
        self.seq = BaseAttnModel(transformer_layers=transformer_layers, d=D)
        self.linear = nn.Sequential(
            nn.Linear(D * 2, 14),
            nn.Sigmoid(),
        )

    def forward(self, x, bpp):
        x = self.seq(x, bpp)
        # Functional dropout defaults to training=True; tie it to the module
        # mode so the model is deterministic after .eval().
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.linear(x)
        return x


class FromAeModel(nn.Module):
    """
    Regression model built on an existing backbone: a linear head maps the
    D * 2 backbone features to one value per entry of `target_cols`.
    """

    def __init__(self, seq):
        super().__init__()
        self.seq = seq

        # Kept inside nn.Sequential so checkpoint keys stay `linear.0.*`.
        # (A wider Linear -> ReLU -> Dropout(LOGIT_DROP) -> Linear head was
        # tried here and left disabled.)
        self.linear = nn.Sequential(
            nn.Linear(D * 2, len(target_cols)),
        )

    def forward(self, x, bpp, pred_len=68):
        features = self.seq(x, bpp)
        per_position = self.linear(features)
        # Only the first `pred_len` positions are returned.
        return per_position[:, :pred_len]


## Pretrain

In [None]:
# Set to True to run the autoencoder pretraining cell below.
PRETRAIN = False

### Create loaders

In [None]:
# Unlabeled datasets (is_test=True, labels=None) for autoencoder pretraining.
# NOTE: VacDataset adds a `score` column to the frame it is given when it is
# missing, so train_df gains that column here.
features, _ = preprocess(train_df, True)
features_tensor = torch.from_numpy(features)
dataset0 = VacDataset(features_tensor, train_df, None)

features, _ = preprocess(public_df, True)
features_tensor = torch.from_numpy(features)
dataset1 = VacDataset(features_tensor, public_df, None)

features, _ = preprocess(private_df, True)
features_tensor = torch.from_numpy(features)
dataset2 = VacDataset(features_tensor, private_df, None)

In [None]:
# Batch size for pretraining; BATCH_SIZE is reassigned in later sections.
BATCH_SIZE = 64

# One shuffled loader per data source (train / public test / private test).
loader0 = torch.utils.data.DataLoader(dataset0, BATCH_SIZE, shuffle=True)
loader1 = torch.utils.data.DataLoader(dataset1, BATCH_SIZE, shuffle=True)
loader2 = torch.utils.data.DataLoader(dataset2, BATCH_SIZE, shuffle=True)

### Pretrain

In [None]:
def learn_from_batch_ae(model, data):
    """
    Computes the denoising-autoencoder loss for one batch.

    The first 14 feature channels are corrupted with `F.dropout2d` (p=0.3)
    and the model is trained to reproduce their clean values.

    Arguments:
        model {torch module} -- Autoencoder called as model(sequence, bpp)
        data {dict} -- Batch with `sequence` and `bpp` entries

    Returns:
        torch tensor -- Binary cross-entropy between reconstruction and clean input
    """
    clean = data["sequence"]
    corrupted = clean.clone()
    corrupted[:, :, :14] = F.dropout2d(corrupted[:, :, :14], p=0.3)

    reconstruction = model(corrupted.to(DEVICE), data["bpp"].to(DEVICE))
    return F.binary_cross_entropy(reconstruction, clean[:, :, :14].to(DEVICE))


def train_ae(model, train_data, optimizer, lr_scheduler, epochs=10, start_epoch=0, start_it=0,
             log_path="./logs", model_save_path=None):
    """
    Trains the autoencoder for `epochs` epochs and checkpoints after each one.

    Arguments:
        model {torch module} -- Autoencoder to train
        train_data {iterable} -- Loader yielding batches for `learn_from_batch_ae`
        optimizer {torch optimizer} -- Optimizer
        lr_scheduler {scheduler or None} -- Stepped after every batch when given

    Keyword Arguments:
        epochs {int} -- Number of epochs to run (default: {10})
        start_epoch {int} -- Number of the first epoch, for resuming (default: {0})
        start_it {int} -- Iteration counter to resume from (default: {0})
        log_path {str} -- Unused; kept for interface compatibility (default: {'./logs'})
        model_save_path {str or None} -- Checkpoint folder. None uses the global
            MODEL_SAVE_PATH when one is defined and CP_PATH otherwise (default: {None})

    Returns:
        dict -- `end_epoch`, `it` and `min_loss_epoch`, to be fed back into the next call
    """
    if model_save_path is None:
        # MODEL_SAVE_PATH is not defined anywhere in this notebook; fall back
        # to CP_PATH instead of failing with a NameError.
        model_save_path = globals().get("MODEL_SAVE_PATH", CP_PATH)
    model_save_path = Path(model_save_path)
    model_save_path.mkdir(parents=True, exist_ok=True)

    it = start_it
    end_epoch = start_epoch + epochs
    min_loss = 10.0
    min_loss_epoch = 0

    for epoch in range(start_epoch, end_epoch):
        model.train()
        losses = []
        for data in train_data:
            optimizer.zero_grad()

            loss = learn_from_batch_ae(model, data)
            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            if lr_scheduler:
                lr_scheduler.step()

            losses.append(loss.item())
            it += 1

        loss_m = np.mean(losses)
        if loss_m < min_loss:
            min_loss_epoch = epoch
            min_loss = loss_m

        print(f'Epoch {epoch} \t loss={loss_m:.4f}')

        # The optimizer file is overwritten; model files are kept per epoch.
        torch.save(optimizer.state_dict(), str(model_save_path / "optimizer.pt"))
        torch.save(model.state_dict(), str(model_save_path / f"model-{epoch}.pt"))

    return dict(end_epoch=end_epoch, it=it, min_loss_epoch=min_loss_epoch)


In [None]:
# Re-seed so pretraining starts from the same RNG state on every run.
set_seed(SEED)

In [None]:
# Autoencoder pretraining; skipped unless PRETRAIN is True.
if PRETRAIN:
    model = AEModel()
    model = model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Constant learning rate: no scheduler is used during pretraining.
    lr_scheduler = None

    # `res` carries the epoch / iteration counters from one train_ae call to the next.
    res = dict(end_epoch=0, it=0, min_loss_epoch=0)
    epochs = [5, 5, 5, 5]


    # Four rounds; each round trains `e` epochs on each of the three sources in turn.
    for e in epochs:
        print(' -> Training with train data')
        res = train_ae(model, loader0, optimizer, lr_scheduler, e, start_epoch=res["end_epoch"], start_it=res["it"])

        print('\n -> Training with public data')
        res = train_ae(model, loader1, optimizer, lr_scheduler, e, start_epoch=res["end_epoch"], start_it=res["it"])

        print('\n -> Training with private data')
        res = train_ae(model, loader2, optimizer, lr_scheduler, e, start_epoch=res["end_epoch"], start_it=res["it"])


    # Saves the full AEModel state dict (backbone and reconstruction head).
    save_model_weights(model, CP_PATH + f'pretrained_model_{TODAY}.pt')

    print('\n Done.')

## Training

In [None]:
# Per-target loss weights of shape (1, 5), in TARGETS order, created on the GPU.
# CLASS_WEIGHT_5 weighs all targets; CLASS_WEIGHT_3 zeroes targets 2 and 4.
CLASS_WEIGHT_5 = torch.from_numpy(np.array([1, 1, 1, 1, 1])).unsqueeze(0).cuda()
CLASS_WEIGHT_3 = torch.from_numpy(np.array([1, 1, 0, 1, 0])).unsqueeze(0).cuda()

In [None]:
def mcrmse(truth, pred, verbose=0, scored_targets=(0, 1, 3), filtered=None, reduce=True):
    """
    Metric for the competition: RMSE over positions, averaged over targets.

    Arguments:
        truth {numpy array} -- Ground truth, shape (n_samples, n_positions, n_targets)
        pred {numpy array} -- Predictions, same shape as `truth`

    Keyword Arguments:
        verbose {int} -- Whether to print the score of each target (default: {0})
        scored_targets {sequence of int} -- Target indices to score (default: {(0, 1, 3)})
        filtered {boolean sequence or None} -- Per-sample mask; only samples with
            a true value are scored. None keeps every sample (default: {None})
        reduce {bool} -- Return one scalar when True, one score per kept
            sample otherwise (default: {True})

    Returns:
        float or numpy array -- Mean score, or per-sample scores. When the mask
            keeps no sample the per-sample result is empty and the reduced
            result is nan.
    """
    error = (truth - pred) ** 2
    error = error[:, :, list(scored_targets)]

    if filtered is not None:
        # Boolean-mask indexing keeps the (samples, positions, targets) layout
        # even when nothing is kept, so the reductions below stay valid.
        error = error[np.asarray(filtered, dtype=bool)]

    rmse = np.sqrt(error.mean(1))

    if verbose:
        for t, score in zip(scored_targets, rmse.mean(0)):
            print(f'Score for target "{TARGETS[t]}":\t {score:.4f}')

    if reduce:
        return rmse.mean()
    return rmse.mean(-1)

def MCRMSE(y_true, y_pred, class_weight):
    """
    Torch version of the metric, returning one weighted value per sample.

    Arguments:
        y_true {torch tensor} -- Targets, indexed as (sample, position, target)
        y_pred {torch tensor} -- Predictions, same shape
        class_weight {torch tensor} -- Weight per target, broadcast over samples

    Returns:
        torch tensor -- Weighted mean over targets of the per-target RMSE, per sample
    """
    squared_error = torch.square(y_true - y_pred)
    per_target_rmse = torch.sqrt(squared_error.mean(dim=1))
    return (per_target_rmse * class_weight).mean(dim=1)


def sn_mcrmse_loss(predict, target, signal_to_noise, class_weight):
    """
    MCRMSE loss with each sample weighted by its signal-to-noise value.

    Arguments:
        predict {torch tensor} -- Model output
        target {torch tensor} -- Labels, same shape as `predict`
        signal_to_noise {torch tensor} -- One value per sample
        class_weight {torch tensor} -- Weight per target, forwarded to MCRMSE

    Returns:
        torch tensor -- Scalar loss
    """
    per_sample = MCRMSE(target, predict, class_weight)
    # Weight = 0.5 * log(sn + 1.01), clamped to [0.01, 1000].
    # (sqrt(sn + 1.01) was an alternative weighting, left disabled.)
    sample_weight = torch.clamp(0.5 * torch.log(signal_to_noise + 1.01), 0.01, 1000)
    return (per_sample * sample_weight).mean()

In [None]:
def learn_from_batch(model, data, optimizer, lr_scheduler, class_weight, pred_len=68):
    """
    Runs one optimisation step on a batch.

    Arguments:
        model {torch module} -- Model called as model(sequence, bpp, pred_len=...)
        data {dict} -- Batch with `sequence`, `bpp`, `label`, `signal_to_noise`, `score`
        optimizer {torch optimizer} -- Optimizer
        lr_scheduler {scheduler or None} -- Stepped after the optimizer when given
        class_weight {torch tensor} -- Weight per target for the loss

    Keyword Arguments:
        pred_len {int} -- Number of leading positions to predict (default: {68})

    Returns:
        tuple -- (model output, loss tensor)
    """
    optimizer.zero_grad()

    sequence = data["sequence"].to(DEVICE)
    bpp = data["bpp"].to(DEVICE)
    out = model(sequence, bpp, pred_len=pred_len)

    # The per-sample loss weight combines signal-to-noise and `score`.
    sample_weight = data["signal_to_noise"] * data["score"]
    loss = sn_mcrmse_loss(out, data["label"].to(DEVICE), sample_weight.to(DEVICE), class_weight)
    loss.backward()

    # Clip the gradient norm at 0.5 before the update.
    nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    if lr_scheduler:
        lr_scheduler.step()

    return out, loss


def evaluate(model, valid_data, class_weight, pred_len=68):
    """
    Computes the validation loss and metric, then puts the model back in train mode.

    The metric only counts samples with signal_to_noise > 1; the loss uses all
    samples, weighted by signal_to_noise.

    Arguments:
        model {torch module} -- Model called as model(sequence, bpp, pred_len=...)
        valid_data {iterable} -- Loader yielding labeled batches
        class_weight {torch tensor} -- Weight per target for the loss

    Keyword Arguments:
        pred_len {int} -- Number of leading positions to predict; must match
            the label length (default: {68})

    Returns:
        dict -- `loss` (mean of the batch losses) and `mcrmse` (mean over kept samples)
    """
    model.eval()
    loss_list = []
    mcrmses = []
    with torch.no_grad():
        for data in valid_data:
            y = model(
                data["sequence"].to(DEVICE),
                data["bpp"].to(DEVICE),
                pred_len=pred_len,
            )

            mcrmse_ = mcrmse(data["label"].numpy(), y.cpu().numpy(), filtered=data["signal_to_noise"] > 1, reduce=False)
            mcrmses += list(mcrmse_)

            loss = sn_mcrmse_loss(
                y,
                data["label"].to(DEVICE),
                data["signal_to_noise"].to(DEVICE),
                class_weight
            )
            loss_list.append(loss.item())
    # Training mode is restored unconditionally, as callers are mid-training.
    model.train()
    return dict(loss=np.mean(loss_list), mcrmse=np.mean(mcrmses))

In [None]:
def predict(model, loader, pred_len=68):
    """
    Usual torch predict function

    Arguments:
        model {torch model} -- Model to predict with, called as
            model(sequence, bpp, pred_len=...)
        loader {torch DataLoader} -- Loader yielding batches with `sequence` and `bpp`

    Keyword Arguments:
        pred_len {int} -- Number of elements to keep for scoring (default: {68})

    Returns:
        numpy array -- Predictions of shape (n_samples, pred_len, NUM_TARGETS)
    """
    model.eval()

    # Run on whatever device the model lives on; a model without parameters
    # falls back to CUDA, which was the previous hard-coded behaviour.
    first_param = next(model.parameters(), None)
    device = torch.device("cuda") if first_param is None else first_param.device

    # The empty float64 seed fixes the result's dtype and shape even when the
    # loader yields nothing; batches are concatenated once at the end.
    chunks = [np.empty((0, pred_len, NUM_TARGETS))]

    with torch.no_grad():
        for batch in loader:
            y_pred = model(
                batch["sequence"].to(device),
                batch["bpp"].to(device),
                pred_len=pred_len,
            )
            chunks.append(y_pred.cpu().numpy())

    return np.concatenate(chunks)

In [None]:
def train(model, train_data, valid_data, optimizer, lr_scheduler, epochs=10, swa_first_epoch=40, class_weight=None, pred_len=68):
    """
    Trains for `epochs` epochs, validating and printing a log line after each.

    From epoch number `swa_first_epoch` (1-based) on, the optimizer must
    provide `update_swa` / `swap_swa_sgd`: the running average is updated and
    swapped into the model for validation, then swapped back out, except
    after the final epoch, where the averaged weights are left in the model.

    Arguments:
        model {torch module} -- Model to train
        train_data {iterable} -- Loader yielding training batches
        valid_data {iterable} -- Loader yielding validation batches
        optimizer {torch optimizer} -- Optimizer (an SWA wrapper when averaging is reached)
        lr_scheduler {scheduler or None} -- Stepped per batch by `learn_from_batch`

    Keyword Arguments:
        epochs {int} -- Number of epochs (default: {10})
        swa_first_epoch {int} -- First 1-based epoch with weight averaging;
            a value above `epochs` disables it (default: {40})
        class_weight {torch tensor} -- Weight per target for the loss (default: {None})
        pred_len {int} -- Positions predicted during training; validation
            uses `evaluate`'s own setting (default: {68})
    """
    # `it` counts batches but is not used beyond that.
    it = 0

    for epoch in range(epochs):
        t0 = time.time()
        print(f"Epoch {epoch+1}/{epochs}", end='\t')
        model.train()
        
        # Training
        losses = []
        for i, data in enumerate(train_data):
            _, loss = learn_from_batch(model, data, optimizer, lr_scheduler, class_weight, pred_len=pred_len)
            loss_v = loss.item()
            losses.append(loss_v)
            it += 1
        
        # Evaluating
        
        if epoch + 1 >= swa_first_epoch:
            # Fold the current weights into the average, then load the
            # average into the model so validation scores it.
            optimizer.update_swa()
            optimizer.swap_swa_sgd()
            
        eval_result = evaluate(model, valid_data, class_weight)
        eval_loss = eval_result["loss"]
        
        # Restore the raw weights to continue training; skipped on the last
        # epoch so the model ends up holding the averaged weights.
        if epoch + 1 >= swa_first_epoch and epoch < epochs - 1:
#             print(epoch, "swap")
            optimizer.swap_swa_sgd()
    
        dt = time.time() - t0

        # 1e-3 is only a placeholder for the log line when no scheduler is given.
        lr = lr_scheduler.get_last_lr()[0] if lr_scheduler else 1e-3
        print(f't={dt:.1f}s\t lr={lr:.1e}\tloss={np.mean(losses):.4f}', end='\t')
        print(f"val_loss={eval_loss:.4f} \t val_mcrmse={eval_result['mcrmse']:.4f}")

In [None]:
# Epochs per training stage: train data, public pseudo-labels,
# private pseudo-labels, then train data again.
EPOCHS_1 = 30
EPOCHS_2 = 10
EPOCHS_3 = 10
EPOCHS_4 = 5

# First (1-based) epoch of the final stage with weight averaging; 0 means every epoch.
SWA_FIRST_EPOCH = 0
# Fraction of the stage-1 steps used for learning-rate warmup.
WARMUP_PROP = 0.05
# Number of cross-validation folds.
K = 5 
BATCH_SIZE = 32
LR = 5e-4

# Whether to initialise each fold from the pretrained autoencoder checkpoint.
LOAD = True

In [None]:
# `score` is dropped so augment_data can set it afresh.
# NOTE(review): train_df presumably only has this column because VacDataset
# added it in place when dataset0 was built; the drop raises KeyError if that
# cell was skipped -- confirm.
samples = train_df.copy().drop('score', axis=1)
# NOTE: `ids` is not referenced again in this file.
ids = samples.reset_index()["id"]
set_seed(SEED)

In [None]:
# Group-aware folds: samples sharing a KMeans cluster id (`groups`) always
# land in the same fold. Materialised as a list so it can be iterated later.
gkf = GroupKFold(n_splits=K)
splits = list(gkf.split(X=samples, groups=groups))

In [None]:
# K-fold training. Per fold: four training stages, then out-of-fold
# prediction with test-time augmentation and a checkpoint save.
scores = []
# Out-of-fold predictions, filled fold by fold at the validation indices.
pred_oof = np.zeros((len(samples) , 68, NUM_TARGETS))

for fold, (train_index, test_index) in enumerate(splits):
    print(f"\n-------------  Fold {fold + 1}/{K}  -------------\n")
    set_seed(SEED)
    
    # Training rows are extended with their augmented variants.
    df_train = samples.loc[train_index].reset_index()
    df_train = augment_data(df_train)
    
    # Validation keeps the originals and the augmented variants separate so
    # their predictions can be averaged below.
    df_val = samples.loc[test_index].reset_index()
    df_val_aug = augment_data(df_val.copy(), concat=False)
    
    train_loader = create_loader(df_train, BATCH_SIZE)
    # NOTE: these two loaders do not depend on the fold but are rebuilt every iteration.
    public_loader = create_loader(public_df, BATCH_SIZE)
    private_loader = create_loader(private_df, BATCH_SIZE)
    
    valid_loader = create_loader(df_val, BATCH_SIZE, shuffle=False)
    valid_loader_aug = create_loader(df_val_aug, BATCH_SIZE, shuffle=False)

    ae_model = AEModel()
    
    if LOAD:
        load_model_weights(ae_model, PRETRAINED_PATH)
    
    # Only the backbone of the autoencoder is reused; the regression head is new.
    model = FromAeModel(ae_model.seq)
    model = model.to(DEVICE)
    model.zero_grad()
    model.train()
    
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    # Wrap in SWA only when averaging can actually start within the last stage.
    if SWA_FIRST_EPOCH < EPOCHS_4:
        optimizer = SWA(optimizer)
    
    # One scheduler spans all four stages; warmup is a share of stage 1 only.
    s1 = EPOCHS_1 * len(train_loader)
    s2 = EPOCHS_2 * len(public_loader)
    s3 = EPOCHS_3 * len(private_loader)
    s4 = EPOCHS_4 * len(train_loader)
    num_training_steps = int(s1 + s2 + s3 + s4)
    num_warmup_steps = int(EPOCHS_1 * len(train_loader) * WARMUP_PROP)
    
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps
    )
    
    # Stages 1-3 pass swa_first_epoch=100, above their epoch counts, so no
    # weight averaging happens there.
    print('\n -> Training with train data \n')
    train(
        model, train_loader, valid_loader, optimizer, lr_scheduler, pred_len=68,
        epochs=EPOCHS_1, swa_first_epoch=100, class_weight=CLASS_WEIGHT_5)
    
    print('\n -> Training with public data \n')
    train(
        model, public_loader, valid_loader, optimizer, lr_scheduler, pred_len=SEQ_SCORED_PUBLIC,
        epochs=EPOCHS_2, swa_first_epoch=100, class_weight=CLASS_WEIGHT_3
    )
    
    print('\n -> Training with private data \n')
    train(
        model, private_loader, valid_loader, optimizer, lr_scheduler, pred_len=SEQ_SCORED_PRIVATE,
        epochs=EPOCHS_3, swa_first_epoch=100, class_weight=CLASS_WEIGHT_3
    )
    
    # Final stage: weight averaging starts at SWA_FIRST_EPOCH, and `train`
    # leaves the averaged weights in the model after its last epoch.
    print('\n -> Retraining with train data \n')
    train(
        model, train_loader, valid_loader, optimizer, lr_scheduler, pred_len=68,
        epochs=EPOCHS_4, swa_first_epoch=SWA_FIRST_EPOCH, class_weight=CLASS_WEIGHT_3
    )
    
    # Ground truth as (n_samples, positions, n_targets).
    y = np.array(df_val[TARGETS].values.tolist()).transpose((0, 2, 1))
    
    # Test-time augmentation: average predictions on original and augmented
    # inputs. Assumes df_val_aug rows line up with df_val rows -- confirm.
    pred_val_ = predict(model, valid_loader)
    pred_val_aug = predict(model, valid_loader_aug)
    pred_val = (0.5 * pred_val_ + 0.5 * pred_val_aug)
    
    pred_oof[test_index] = pred_val
    
    # The score only counts validation samples with signal_to_noise > 1.
    score = mcrmse(y, pred_val, verbose=0, scored_targets=[0, 1, 3], filtered=df_val["signal_to_noise"] > 1)
    scores.append(score)
    
    print(f"\n   -> Scored {score:.4f} with TTA")

    save_model_weights(model, f'model_{fold}.pt', verbose=0, cp_folder=CP_PATH)
    
    del model

## Prediction

In [None]:
# Inference batch size (overrides the training value).
BATCH_SIZE = 64
# When True, predictions on augmented inputs are averaged with the plain ones.
TTA = True

In [None]:
# Reload the test set: the earlier public_df / private_df were overwritten
# with pseudo-labelled, augmented versions.
test_df = pd.read_json(str(Path(BASE_PATH) / 'test.json'), lines=True)

public_df = test_df[test_df["seq_length"] == SEQ_LEN_PUBLIC].reset_index(drop=True)
private_df = test_df[test_df["seq_length"] == SEQ_LEN_PRIVATE].reset_index(drop=True)

# Building these loaders adds a `score` column to both frames (VacDataset side effect).
pub_loader = create_loader(public_df, BATCH_SIZE, is_test=True)
pri_loader = create_loader(private_df, BATCH_SIZE, is_test=True)

# That `score` column is dropped again before augmenting, since augment_data sets its own.
public_df_aug = augment_data(public_df.copy().drop('score', axis=1), concat=False)
pub_loader_aug = create_loader(public_df_aug, BATCH_SIZE, is_test=True)

private_df_aug = augment_data(private_df.copy().drop('score', axis=1), concat=False)
pri_loader_aug = create_loader(private_df_aug, BATCH_SIZE, is_test=True)

In [None]:
# NOTE: pred_df_list is not used afterwards.
pred_df_list = []
# Accumulators for the fold-averaged predictions over the full sequence length.
pred_public = np.zeros((len(public_df), 107, NUM_TARGETS))
pred_private = np.zeros((len(private_df), SEQ_LEN_PRIVATE, NUM_TARGETS))

for fold in range(K):
    print(f"\n-------------  Fold {fold + 1}/{K}  -------------\n")

    model_load_path = CP_PATH + f"model_{fold}.pt"
    
    print(f' -> Loading weights from {model_load_path}\n')
    
    # Two separate model instances are built, both loaded with the same fold weights.
    ae_model0 = AEModel()
    model_pub = FromAeModel(seq=ae_model0.seq)
    model_pub = model_pub.to(DEVICE)
    
    ae_model1 = AEModel()
    model_pri = FromAeModel(seq=ae_model1.seq)
    model_pri = model_pri.to(DEVICE)
    
    state_dict = torch.load(model_load_path, map_location=DEVICE)
    model_pub.load_state_dict(state_dict)
    model_pri.load_state_dict(state_dict)
    del state_dict

    # Each fold contributes 1/K of the final prediction.
    pred_public += predict(model_pub, pub_loader, pred_len=107) / K
    pred_private += predict(model_pri, pri_loader, pred_len=130) / K

    if TTA:
        # Assumes the augmented frames are row-aligned with the originals -- confirm.
        pred_public += predict(model_pub, pub_loader_aug, pred_len=107) / K
        pred_private += predict(model_pri, pri_loader_aug, pred_len=130) / K
              
# With TTA two predictions were summed per fold, so halve to get their mean.
if TTA:
    pred_public *= 0.5
    pred_private *= 0.5

In [None]:
def pred_to_sub(df_test, pred_public, pred_private):
    """
    Flattens per-sample predictions into one row per (sample, position).

    Samples are selected by their `seq_scored` value; predictions are matched
    to them by position, so `pred_public` / `pred_private` must follow the
    row order of `df_test`.

    Arguments:
        df_test {pandas dataframe} -- Test frame with `id` and `seq_scored`
        pred_public {numpy array} -- Predictions for the public samples
        pred_private {numpy array} -- Predictions for the private samples

    Returns:
        pandas dataframe -- Columns `id_seqpos` followed by TARGETS
    """
    def ids_scored_on(n_scored):
        selected = df_test[df_test['seq_scored'] == n_scored]
        return selected[['id']].reset_index(drop=True)

    parts = (
        (ids_scored_on(SEQ_SCORED_PUBLIC), pred_public),
        (ids_scored_on(SEQ_SCORED_PRIVATE), pred_private),
    )

    frames = []
    for id_frame, preds in parts:
        for row, sample_id in enumerate(id_frame.id):
            sample_df = pd.DataFrame(preds[row], columns=TARGETS)
            # Key format: <sample id>_<position>
            sample_df['id_seqpos'] = [f'{sample_id}_{x}' for x in range(sample_df.shape[0])]
            frames.append(sample_df)

    return pd.concat(frames)[['id_seqpos'] + TARGETS]

In [None]:
# Long-format submission: one row per (sample, position).
sub = pred_to_sub(test_df, pred_public, pred_private)

In [None]:
# Cross-validation score: mean of the per-fold scores (displayed by the notebook).
score = np.mean(scores)
score

In [None]:
# Output names embed the date and the cross-validation score.
print(f'Saving submission to "{TODAY}_{score:.4f}_pl.csv"')

sub.to_csv(f"{TODAY}_{score:.4f}_pl.csv", index=False)
# Out-of-fold predictions are stored alongside the submission.
np.save(f"oof_{TODAY}_{score:.4f}_pl.npy", pred_oof)

sub.head()