In [None]:
%%writefile models.py

import torch
import torch.nn as nn
import torch.nn.functional as F

class SpatialDropout(nn.Dropout2d):
    """Dropout that zeroes whole slices of a 3-D tensor instead of single elements.

    For an input of shape ``(d0, d1, d2)`` one keep/drop decision is drawn per
    ``(d0, d2)`` pair and shared by every index along ``d1``. Surviving values
    are rescaled by ``1 / (1 - p)`` exactly as in ordinary dropout, and the
    layer is the identity in eval mode.
    """

    def forward(self, x):
        """Apply the shared-mask dropout and return a tensor shaped like ``x``.

        Args:
            x: 3-D tensor.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Dropout2d is only well defined for 4-D (N, C, H, W) input: feeding it a
        # 3-D tensor goes through a legacy code path that warns and is documented
        # to change meaning (unbatched (C, H, W)) in a future PyTorch release.
        # Move the last axis into the channel slot and append a dummy width axis
        # so the channel-wise mask is applied unambiguously.
        x = x.permute(0, 2, 1).unsqueeze(-1)
        x = super(SpatialDropout, self).forward(x)
        # Undo the reshaping so callers get the original layout back.
        x = x.squeeze(-1).permute(0, 2, 1)
        return x
    

class GRU_model(nn.Module):
    """1-D convolution followed by a bidirectional GRU and a 5-way linear head.

    Args:
        args: configuration object providing ``num_embeddings``,
            ``embedding_dim``, ``hidden_size``, ``hidden_layers`` and ``dropout``.
        pred_len: number of leading sequence positions kept in the output.
    """

    def __init__(
        self,
        args,
        pred_len=68
    ):
        super().__init__()
        self.pred_len = pred_len

        feature_dim = args.embedding_dim
        kernel = 5

        # Registered but never called in forward(); kept so the parameter set and
        # state_dict keys stay the same.
        self.embedding = nn.Embedding(num_embeddings=args.num_embeddings, embedding_dim=feature_dim)
        # "Same" padding: with stride 1 the sequence length is preserved.
        self.cnn_layer = nn.Conv1d(in_channels=16, out_channels=feature_dim, kernel_size=kernel, padding=kernel // 2)

        self.embedding_dropout = SpatialDropout(0.3)

        self.gru = nn.GRU(
            input_size=feature_dim,
            hidden_size=args.hidden_size,
            num_layers=args.hidden_layers,
            dropout=args.dropout,
            bidirectional=True,
            batch_first=True
        )

        # Bidirectional GRU concatenates both directions, hence 2 * hidden_size.
        self.linear = nn.Linear(2 * args.hidden_size, 5)

    def forward(self, seqs):
        """Map ``(batch, length, 16)`` features to ``(batch, <=pred_len, 5)`` outputs."""
        # Conv1d expects channels first: (batch, 16, length).
        channels_first = seqs.permute(0, 2, 1)
        conv_out = self.cnn_layer(channels_first)
        # NOTE(review): the dropout layer is fed the channels-first conv output, so
        # after its internal permute the mask is shared across channels, i.e. whole
        # sequence positions are dropped - confirm this is intended rather than
        # per-channel dropout.
        conv_out = self.embedding_dropout(conv_out)
        # Back to (batch, length, embedding_dim) for the batch_first GRU.
        gru_in = conv_out.permute(0, 2, 1)
        gru_out, _ = self.gru(gru_in)
        # Keep only the first pred_len positions before the linear head.
        return self.linear(gru_out[:, :self.pred_len, :])

In [None]:
%%writefile dataset.py

import numpy as np, pandas as pd
import torch, torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset

# Dataframe columns read when building the training targets.
target_col = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]

# Character -> integer index tables used to encode each input string.
rna_dict    = dict(zip('ACGU', range(4)))     # 4 symbols
struct_dict = dict(zip('().', range(3)))      # 3 symbols
loop_dict   = dict(zip('BEHIMSX', range(7)))  # 7 symbols

class RNADataset(Dataset):
    """Torch dataset producing per-position feature matrices for RNA samples.

    Each item is a dict with:
      * ``'seq'``: float tensor ``(length, 16)`` - one-hot sequence (4),
        one-hot structure (3), one-hot loop type (7), plus two columns derived
        from the sample's ``.npy`` matrix (row-wise max and ``1 - row sum``).
      * ``'target'``: float tensor of targets, or a ``(1, 1)`` zero placeholder
        when the frame has no ``'reactivity'`` column.
      * ``'ids'``: the sample's ``id`` value.

    Args:
        df: dataframe with ``id``, ``sequence``, ``structure`` and
            ``predicted_loop_type`` columns (plus the target columns for
            training data). Any index is accepted; rows are addressed by position.
        augment: optional callable applied to each record dict before it is returned.
        bpps_dir: directory holding one ``<id>.npy`` matrix per sample.
    """

    def __init__(self, df, augment=None, bpps_dir='../input/stanford-covid-vaccine/bpps'):

        # Encode every character of the three input strings as an integer index.
        self.rna    = df['sequence'].map(lambda seq: [rna_dict[x] for x in seq])
        self.struct = df['structure'].map(lambda seq: [struct_dict[x] for x in seq])
        self.loop   = df['predicted_loop_type'].map(lambda seq: [loop_dict[x] for x in seq])

        # Pre-compute the two per-position summaries of each sample's matrix once,
        # so __getitem__ does no file I/O.
        bbp0 = []
        bbp1 = []
        for sample_id in df['id'].values:
            probability = np.load(f'{bpps_dir}/{sample_id}.npy')
            bbp0.append(probability.max(-1).tolist())
            bbp1.append((1 - probability.sum(-1)).tolist())
        self.bbp0 = bbp0
        self.bbp1 = bbp1

        if 'reactivity' in df.columns:
            # (rows, targets, positions) -> (rows, positions, targets).
            # Assumes every target cell holds an equal-length list of values.
            target = np.transpose(
                df[target_col]
                .values
                .tolist(),
            (0, 2, 1))
            target = np.ascontiguousarray(target)
        else:
            target = np.zeros((len(df), 1, 1))  # dummy placeholder for unlabeled data

        self.df = df
        self.len = len(self.df)
        self.augment = augment
        self.target = target

    def __str__(self):
        string  = ''
        string += '\tlen  = %d\n'%len(self)
        return string

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        """Return the record dict for the row at *position* ``index``."""
        target = self.target[index]

        # Positional (.iloc) access keeps the pandas-backed features aligned with
        # the positionally indexed targets and lists even when the frame's index
        # is not 0..n-1 (e.g. after filtering without reset_index).
        rna = np.array(self.rna.iloc[index])
        struct = np.array(self.struct.iloc[index])
        loop = np.array(self.loop.iloc[index])
        bbp0 = np.array(self.bbp0[index]).reshape(-1, 1)
        bbp1 = np.array(self.bbp1[index]).reshape(-1, 1)

        # Column layout: 4 + 3 + 7 one-hot columns, then the two matrix summaries.
        seq = np.concatenate([
            np_onehot(rna, 4),
            np_onehot(struct, 3),
            np_onehot(loop, 7),
            bbp0,
            bbp1,
        ], 1)

        record = {
            'target': torch.tensor(target, dtype=torch.float),
            'seq' : torch.tensor(seq, dtype=torch.float),
            'ids' : self.df['id'].iloc[index]
        }
        if self.augment is not None: record = self.augment(record)
        return record

def np_onehot(x, max=54):
    """Return one-hot encoded rows for the integer indices in ``x``.

    Args:
        x: integer index or integer array; each value selects one row of a
            ``max`` x ``max`` identity matrix.
        max: number of classes, i.e. the width of every one-hot row.

    Returns:
        float64 ndarray; for an integer array ``x`` its shape is ``x.shape + (max,)``.
    """
    identity = np.identity(max)
    return identity[x]


In [None]:
%%writefile losses.py

import torch
import torch.nn as nn
import torch.nn.functional as F

class MCRMSELoss(nn.Module):
    """Mean column-wise RMSE: the average of per-target root-mean-squared errors.

    Inputs are indexed as ``[:, :, i]``, so both tensors must be 3-D with the
    target columns on the last axis.
    """

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def rmse(self, y_actual, y_pred):
        """Return the square root of the mean squared error over all elements."""
        return torch.sqrt(self.mse(y_actual, y_pred))

    def forward(self, y_actual, y_pred, num_scored=None):
        """Average the RMSE of the first ``num_scored`` target columns.

        Args:
            y_actual: 3-D tensor of reference values, targets on the last axis.
            y_pred: tensor of predictions with the same shape as ``y_actual``.
            num_scored: how many leading target columns to score; defaults to
                all columns of ``y_actual``.

        Returns:
            Scalar tensor holding the mean of the per-column RMSE values.
        """
        # Identity comparison is the correct None check; `==` would dispatch to
        # the argument's __eq__.
        if num_scored is None:
            num_scored = y_actual.shape[-1]
        score = 0
        for i in range(num_scored):
            score += self.rmse(y_actual[:, :, i], y_pred[:, :, i]) / num_scored
        return score

In [None]:
%%writefile utils.py

class AverageMeter(object):
    """Keeps the latest value together with a running weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

In [None]:
%%writefile train.py

import math, json, gc, random, os, sys, time
import numpy as np, pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from tqdm import tqdm

import models
import losses
from dataset import RNADataset
from utils import AverageMeter
from config import args

# --- Competition data -------------------------------------------------------
# Read once at import time; main() uses `train`, `test` and `target_cols`.
train = pd.read_json('../input/stanford-covid-vaccine/train.json', lines=True)
test = pd.read_json('../input/stanford-covid-vaccine/test.json', lines=True)
sample_sub = pd.read_csv("../input/stanford-covid-vaccine/sample_submission.csv")

# Column order used for the prediction dataframe written by main().
target_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]

# Keep only training rows whose signal_to_noise is at least 1.
train = train.loc[train['signal_to_noise'] >= 1]


def train_epcoh(model, loader, optimizer, criterion, device, epoch):
    """Run one optimisation pass over ``loader``.

    Args:
        model: network to train; switched to train mode here.
        loader: iterable of batches, each a dict with ``'seq'`` and ``'target'`` tensors.
        optimizer: optimizer stepped once per batch.
        criterion: callable invoked as ``criterion(target, prediction)``.
        device: device the batch tensors are moved to.
        epoch: epoch number, used only in the progress-bar text.

    Returns:
        The batch-size-weighted mean of the per-batch losses.
    """
    meter = AverageMeter()
    model.train()

    progress = tqdm(loader)
    for batch in progress:
        inputs = batch['seq'].to(device)
        targets = batch['target'].to(device)

        loss = criterion(targets, model(inputs))

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        meter.update(loss.item(), inputs.size(0))
        progress.set_description(f"Train E:{epoch} - Loss:{meter.avg:0.5f}")

    progress.close()
    return meter.avg

def valid_epoch(model, loader, criterion, device, epoch):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: network to evaluate; switched to eval mode here.
        loader: iterable of batches, each a dict with ``'seq'`` and ``'target'`` tensors.
        criterion: callable invoked as ``criterion(target, prediction)``.
        device: device the batch tensors are moved to.
        epoch: epoch number, used only in the progress-bar text.

    Returns:
        The batch-size-weighted mean of the per-batch losses.
    """
    meter = AverageMeter()
    model.eval()

    with torch.no_grad():
        progress = tqdm(loader)
        for batch in progress:
            inputs = batch['seq'].to(device)
            targets = batch['target'].to(device)

            loss = criterion(targets, model(inputs))

            meter.update(loss.item(), inputs.size(0))
            progress.set_description(f"Valid E:{epoch} - Loss:{meter.avg:0.5f}")

    progress.close()
    return meter.avg

def test_predic(model, loader, device):
    """Collect model predictions and sample ids for every batch in ``loader``.

    Args:
        model: network to run; switched to eval mode here.
        loader: iterable of batches, each a dict with ``'seq'`` and ``'ids'``.
        device: device the input tensors are moved to.

    Returns:
        Dict with ``"ids"`` (flat list of batch ids) and ``"predicts"`` (flat
        list of per-sample predictions as nested Python lists), in loader order.
    """
    collected_ids = []
    collected_preds = []

    model.eval()

    with torch.no_grad():
        for batch in tqdm(loader):
            inputs = batch['seq'].float().to(device)
            batch_ids = batch['ids']

            # Bring the predictions back to the CPU as plain nested lists.
            batch_preds = model(inputs).cpu().detach().numpy().tolist()

            collected_preds.extend(batch_preds)
            collected_ids.extend(batch_ids)

    return {
        "ids" : collected_ids,
        "predicts" : collected_preds
    }

def main():
    """Run K-fold training and write a fold-averaged test-set submission.

    For every fold: train for ``args.epochs`` epochs, checkpoint the weights with
    the lowest validation loss (``fold-<k>.bin``) and the latest weights
    (``last-fold-<k>.bin``), then predict both test subsets with the best
    checkpoint. After all folds, predictions are averaged across folds and
    written to ``<save_path>/<sub_name>_submission.csv``.

    Side effects: sets ``args.save_path`` and ``args.fold``, creates the output
    directory, appends to a text log and writes checkpoint and CSV files.
    """
    train_pred_len = 68    # leading positions the model keeps during training/validation
    public_seq_len = 107   # test rows with this seq_length form the "public" subset
    private_seq_len = 130  # test rows with this seq_length form the "private" subset
    n_targets = len(target_cols)

    # Setting seed
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    args.save_path = os.path.join(args.output_dir, args.exp_name)
    os.makedirs(args.save_path, exist_ok=True)

    public_predictions, public_ids = [], []
    private_predictions, private_ids = [], []

    # The two test subsets have different sequence lengths, so they are batched
    # and predicted separately.
    public_df = test.query(f"seq_length == {public_seq_len}").reset_index(drop=True)
    private_df = test.query(f"seq_length == {private_seq_len}").reset_index(drop=True)

    public_dataset = RNADataset(public_df)
    private_dataset = RNADataset(private_df)

    public_loader = DataLoader(public_dataset, batch_size=args.batch_size, shuffle=False)
    private_loader = DataLoader(private_dataset, batch_size=args.batch_size, shuffle=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # NOTE(review): KFold ignores the labels passed to split(), so the folds are
    # not stratified by SN_filter - confirm whether StratifiedKFold was intended.
    kfold = KFold(args.n_folds, shuffle=True, random_state=seed)

    for fold, (train_index, valid_index) in enumerate(kfold.split(train, train['SN_filter'])):
        print("#"*20)
        print(f"##### Fold : {fold}")

        args.fold = fold

        train_df = train.iloc[train_index].reset_index(drop=True)
        valid_df = train.iloc[valid_index].reset_index(drop=True)

        train_dataset = RNADataset(train_df)
        valid_dataset = RNADataset(valid_df)

        train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False)

        # Model and loss classes are looked up by name from the config.
        network_cls = getattr(models, args.network)
        model = network_cls(args, pred_len=train_pred_len)
        model = model.to(device)

        criterion = getattr(losses, args.losses)()

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        best_path = os.path.join(args.save_path, f"fold-{args.fold}.bin")
        last_path = os.path.join(args.save_path, f"last-fold-{args.fold}.bin")

        # inf guarantees the first finite validation loss produces a checkpoint.
        best_loss = float("inf")

        for epoch in range(args.epochs):

            train_loss = train_epcoh(model, train_loader, optimizer, criterion, device, epoch)
            valid_loss = valid_epoch(model, valid_loader, criterion, device, epoch)

            content = f"""
                {time.ctime()} \n
                Fold:{args.fold}, Epoch:{epoch}, lr:{optimizer.param_groups[0]['lr']:.7}, \n
                Train Loss:{train_loss:0.4f} - Valid Loss:{valid_loss:0.4f} \n
            """
            print(content)

            with open(f'{args.save_path}/log_{args.exp_name}.txt', 'a') as appender:
                appender.write(content + '\n')

            if valid_loss < best_loss:
                print(f"######### >>>>>>> Model Improved from {best_loss} -----> {valid_loss}")
                torch.save(model.state_dict(), best_path)
                best_loss = valid_loss

            torch.save(model.state_dict(), last_path)

        # pred_len only controls output truncation, so the same weights can be
        # reloaded into models that emit the full test sequence lengths.
        # map_location keeps loading valid if the checkpoint device differs.
        public_model = network_cls(args, pred_len=public_seq_len).to(device)
        public_model.load_state_dict(torch.load(best_path, map_location=device))

        private_model = network_cls(args, pred_len=private_seq_len).to(device)
        private_model.load_state_dict(torch.load(best_path, map_location=device))

        public_pred_dict = test_predic(public_model, public_loader, device)
        private_pred_dict = test_predic(private_model, private_loader, device)

        # Flatten (samples, positions, targets) -> (samples * positions, targets)
        # without hard-coding the number of test samples.
        public_predictions.append(np.array(public_pred_dict["predicts"]).reshape(-1, n_targets))
        private_predictions.append(np.array(private_pred_dict["predicts"]).reshape(-1, n_targets))

        public_ids.append(public_pred_dict["ids"])
        private_ids.append(private_pred_dict["ids"])

    # The test loaders are not shuffled, so every fold yields ids in the same
    # order and the first fold's list can label the averaged predictions.
    public_ids1 = [f"{seq_id}_{pos}" for seq_id in public_ids[0] for pos in range(public_seq_len)]
    private_ids1 = [f"{seq_id}_{pos}" for seq_id in private_ids[0] for pos in range(private_seq_len)]

    public_preds = np.mean(public_predictions, axis=0)
    private_preds = np.mean(private_predictions, axis=0)

    public_pred_df = pd.DataFrame(public_preds, columns=target_cols)
    public_pred_df["id_seqpos"] = public_ids1

    private_pred_df = pd.DataFrame(private_preds, columns=target_cols)
    private_pred_df["id_seqpos"] = private_ids1

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows identically.
    pred_sub_df = pd.concat([public_pred_df, private_pred_df])

    pred_sub_df.to_csv(os.path.join(args.save_path, f"{args.sub_name}_submission.csv"), index=False)

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

In [None]:
%%writefile config.py


class args:
    """Static configuration namespace; values are read as class attributes."""

    # --- Experiment bookkeeping ---
    exp_name = "base_model"    # sub-directory of output_dir and log-file suffix
    sub_name = ""              # prefix of the submission CSV file name
    output_dir = "weights"

    # --- Component selection (names looked up in models.py / losses.py) ---
    network = "GRU_model"
    losses = "MCRMSELoss"

    # --- Model parameters ---
    num_embeddings = 14
    embedding_dim = 128
    hidden_layers = 3
    hidden_size = 128
    dropout = 0.5

    # --- Training parameters ---
    lr = 1e-4
    seed = 42
    epochs = 50
    n_folds = 5
    batch_size = 32

In [None]:
!python3 train.py