In [None]:
import os
import re
import time
import string
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import seaborn as sns

from spacy.lang.en import English
from collections import defaultdict
from nltk.stem import WordNetLemmatizer

from spacy.lang.en.stop_words import STOP_WORDS

from torch.nn.functional import softmax
from torch.optim.swa_utils import AveragedModel, SWALR
from torch.optim.lr_scheduler import CosineAnnealingLR

from scipy.stats import skew

In [None]:
def seed_everything(s):
    """Seed every random number generator this notebook has in scope.

    Args:
        s: Integer seed applied to Python's ``random`` module, NumPy's global
            RNG and PyTorch (CPU plus the current and all CUDA devices).
    """
    # `random` is imported at the top of the notebook, so seed it as well;
    # otherwise any use of it stays non-reproducible.
    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)
    torch.cuda.manual_seed(s)
    torch.cuda.manual_seed_all(s)

seed_everything(10)

# Config

In [None]:
class CONFIG:
    """Notebook-wide settings, read everywhere as plain class attributes."""

    # Pre-trained word vectors: file location and vector width.
    glove_path = '../input/glove6b100dtxt/glove.6B.100d.txt'
    glove_dim = 100

    # Network widths: GRU hidden size (per direction) and the feed-forward width.
    gru_hdim = 128
    dmodel = 512

    # Data handling and evaluation cadence (in training iterations).
    folds = 5
    batch_size = 128
    max_seq_len = 150
    eval_every = 10

    # Optimisation.
    learning_rate = 1e-4
    weight_decay = 1e-4
    optimizer = 'AdamW'
    epochs = 30
    clip_gradient_norm = 1.0

    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Target Normalization

In [None]:
# Load the fold-annotated training set and standardise the target to zero mean and
# unit standard deviation. `target_mean` / `target_std` are kept because `infer`
# uses them to map predictions back to the original scale.
train_df = pd.read_csv('../input/commonlit-kfold-dataset/fold_train.csv')

target_mean = train_df.target.mean()
target_std = train_df.target.std()

print("Target Mean:", target_mean)
print("Target Std:", target_std)

train_df['normalized_target'] = (train_df.target - target_mean) / target_std
sns.histplot(data=train_df, x='normalized_target')

In [None]:
# Upper edges of the ten quantile bins (10%, 20%, ..., 100%) of the normalised target.
qs = [train_df.normalized_target.quantile(frac) for frac in np.arange(0.1, 1.1, 0.1)]

def get_quantile(target):
    """Return the index of the first edge in ``qs`` that is >= ``target``.

    Returns ``None`` when ``target`` is not <= any edge.
    """
    return next((idx for idx, edge in enumerate(qs) if target <= edge), None)

# `q` is the bin label that the balanced sampler groups rows by.
train_df['q'] = train_df['normalized_target'].apply(get_quantile)
train_df.head()

# Load GloVe 100-d vectors

In [None]:
%%time
# Parse the GloVe text file into {word: vector}. Each line is read as
# "<word> <v1> <v2> ... <vN>" separated by whitespace.
glove_embeddings = {}
with open(CONFIG.glove_path, encoding='utf-8') as file:
    for line in file:
        parts = line.split()
        if not parts:
            # Tolerate blank lines instead of failing on parts[0].
            continue
        word = parts[0]
        # `np.float` was removed in NumPy 1.24. float32 is used because every
        # consumer copies these vectors into float32 tensors anyway.
        glove_embeddings[word] = np.array(parts[1:]).astype(np.float32)
print(len(glove_embeddings))

# Tokenizer

In [None]:
class Tokenizer:
    """Callable that turns a raw text into a flat list of lower-cased word pieces."""

    def __init__(self):
        # The lemmatizer is created here but not used by __call__.
        self.lemmatizer = WordNetLemmatizer()
        self.nlp = English()

    def __call__(self, doc):
        """Tokenise ``doc`` and return the cleaned pieces as a list of strings.

        Tokens that look like numbers, are empty, or contain non-ASCII
        characters are dropped. The remaining tokens are lower-cased and
        stripped, every punctuation character is replaced by a space, and the
        result is split on spaces with empty pieces discarded.
        """
        # One translation table replaces each punctuation character with a space.
        punct_to_space = str.maketrans({ch: ' ' for ch in string.punctuation})

        pieces = []
        for tok in self.nlp(doc):
            rejected = tok.like_num or tok.text == '' or (not tok.is_ascii)
            if rejected:
                continue
            cleaned = tok.lower_.strip().translate(punct_to_space)
            pieces.extend(part for part in cleaned.split(' ') if part != '')
        return pieces

In [None]:
# Tokenise every excerpt once up front; the token lists are stored in the `doc` column.
tokenizer = Tokenizer()
train_df['doc'] = train_df['excerpt'].apply(tokenizer)
train_df.head()

# Dataset and DataSamplers

In [None]:
class TrainDataSampler:
    """Iterate over quantile-balanced mini-batches of GloVe-embedded documents.

    Row ids are grouped by the ``q`` column and shuffled once per bin. Each
    batch then takes the same number of rows from every bin that still has
    rows left, so targets are spread evenly across bins.

    Args:
        batch_size: Nominal batch size. Each bin contributes
            ``batch_size // num_bins`` rows per batch (at least one).
        df: Frame with ``id``, ``q``, ``normalized_target`` and ``doc`` columns;
            a private copy is kept.
        num_bins: Number of bin labels (``0 .. num_bins - 1``) in ``q``.
    """

    def __init__(self, batch_size, df, num_bins=10):
        self.qmap = {}
        self.batch_size = batch_size
        self.batch_fraction = 1.0
        self.df = df.copy()
        self.num_bins = num_bins
        # Derived from batch_size rather than hard-coded, so the argument is
        # actually honoured (120 // 10 gives the previous value of 12).
        self.samples_per_bin = max(1, batch_size // num_bins)

        for i in range(num_bins):
            ids = self.df[self.df.q == i].id.values
            # permutation() shuffles a copy, so it also works when `.values`
            # hands back a read-only array.
            self.qmap[i] = np.random.permutation(ids)

    def convert_sentences(self, num_samples, sentences):
        """Embed token lists into a float32 tensor of shape
        ``(num_samples, CONFIG.max_seq_len, CONFIG.glove_dim)``.

        Documents are truncated to ``CONFIG.max_seq_len`` tokens; positions
        past the end of a document and words missing from ``glove_embeddings``
        stay all-zero.
        """
        # Fill a NumPy buffer and convert once instead of creating one tensor per word.
        X = np.zeros((num_samples, CONFIG.max_seq_len, CONFIG.glove_dim), dtype=np.float32)
        for i, doc in enumerate(sentences):
            for j, word in enumerate(doc[:CONFIG.max_seq_len]):
                vector = glove_embeddings.get(word)
                if vector is not None:
                    X[i, j] = vector
        return torch.from_numpy(X)

    def get_mbs(self):
        """Draw the next balanced batch and return ``(X, y)`` on ``CONFIG.device``.

        Consumes up to ``samples_per_bin`` ids from the tail of every remaining
        bin, drops bins that become empty, and updates ``batch_fraction`` to
        the ratio of drawn samples to ``batch_size``.
        """
        k = self.samples_per_bin
        y = []
        sentences = []
        for i in range(self.num_bins):
            if i not in self.qmap:
                continue
            remaining = self.qmap[i]
            # Filter once and reuse the rows for both targets and documents.
            rows = self.df[self.df.id.isin(remaining[-k:])]
            y.extend(rows.normalized_target.values)
            sentences.extend(rows.doc.values)

            remaining = remaining[:-k]
            if len(remaining) == 0:
                self.qmap.pop(i)
            else:
                self.qmap[i] = remaining

        num_samples = len(y)
        self.batch_fraction = num_samples / self.batch_size

        X = self.convert_sentences(num_samples, sentences)
        y = torch.tensor(y, dtype=torch.float32)

        X = X.to(CONFIG.device)
        y = y.to(CONFIG.device)
        return X, y

    def __iter__(self):
        """Yield batches until the bins are exhausted.

        Iteration stops at the first batch smaller than half of ``batch_size``;
        that batch is discarded.
        """
        while len(self.qmap) > 0:
            X, y = self.get_mbs()
            if self.batch_fraction < 0.5:
                break
            yield X, y

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that embeds one row's ``doc`` tokens with GloVe vectors.

    Args:
        df: Frame with a ``doc`` column (and ``normalized_target`` for the
            'train' / 'val' phases).
        phase: 'train' or 'val' to get ``(features, target)`` pairs; any other
            value yields features only.
    """

    def __init__(self, df, phase):
        self.df = df
        self.phase = phase

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the item at position ``idx`` with tensors placed on ``CONFIG.device``."""
        record = self.df.iloc[idx]
        tokens = record.doc[:CONFIG.max_seq_len]

        # Rows for unknown words and for padding positions remain zero.
        features = torch.zeros((CONFIG.max_seq_len, CONFIG.glove_dim), dtype=torch.float32)
        for pos, word in enumerate(tokens):
            if word in glove_embeddings:
                features[pos] = torch.tensor(glove_embeddings[word])
        features = features.to(CONFIG.device)

        if self.phase not in ('train', 'val'):
            return features

        label = torch.tensor(record.normalized_target, dtype=torch.float32)
        return (features, label.to(CONFIG.device))

# Model

In [None]:
class ProjectionHead(nn.Module):
    """Regression head: BatchNorm -> Dropout -> ReLU -> Linear to a single output."""

    def __init__(self):
        super().__init__()
        self.bn = nn.BatchNorm1d(CONFIG.dmodel)
        self.dropout = nn.Dropout()
        self.relu = nn.ReLU()
        self.out_layer = nn.Linear(CONFIG.dmodel, 1)

    def forward(self, x):
        """Apply the head to ``x`` and return one value per input row."""
        hidden = self.relu(self.dropout(self.bn(x)))
        return self.out_layer(hidden)

class AttentionAggregation(nn.Module):
    """Collapse dimension 1 of a 3-D input with learned softmax weights."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(CONFIG.dmodel, 1)

    def forward(self, x):
        """Return the weighted sum of ``x`` over dimension 1.

        A linear layer scores every position, the scores are softmax-normalised
        along dimension 1, and the positions are combined with those weights.
        """
        scores = self.linear(x)
        weights = softmax(scores, dim=1)
        # (B, D, S) @ (B, S, 1) -> (B, D, 1); drop the trailing singleton.
        pooled = x.transpose(1, 2) @ weights
        return pooled.squeeze(-1)

class FFN(nn.Module):
    """Position-wise block: Linear -> LayerNorm -> Dropout(0.2) -> ReLU.

    The LayerNorm is built over ``(CONFIG.max_seq_len, out_dim)``, i.e. over
    the last two dimensions of its input.

    Args:
        in_dim: Size of the input's last dimension.
        out_dim: Size of the output's last dimension.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.linear = nn.Linear(in_dim, out_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.layer_norm = nn.LayerNorm((CONFIG.max_seq_len, out_dim))

    def forward(self, x):
        """Project ``x``, normalise it, then apply dropout and ReLU."""
        normed = self.layer_norm(self.linear(x))
        return self.relu(self.dropout(normed))
    
class Model(nn.Module):
    """Bidirectional GRU encoder with attention pooling and a scalar regression head.

    Pipeline: 2-layer bidirectional GRU -> LayerNorm -> ReLU -> FFN ->
    AttentionAggregation -> ProjectionHead. The LayerNorm is built over
    ``(CONFIG.max_seq_len, 2 * CONFIG.gru_hdim)``, so inputs must have exactly
    ``CONFIG.max_seq_len`` positions.
    """

    def __init__(self):
        super().__init__()
        self.gru = nn.GRU(CONFIG.glove_dim, CONFIG.gru_hdim, num_layers=2,
                          bidirectional=True, batch_first=True, dropout=0.25)
        self.layernorm = nn.LayerNorm((CONFIG.max_seq_len, 2 * CONFIG.gru_hdim))
        self.relu = nn.ReLU()
        self.ffn = FFN(2 * CONFIG.gru_hdim, CONFIG.dmodel)
        self.attn_agg = AttentionAggregation()
        self.projection_head = ProjectionHead()

    def forward(self, x):
        """Run the pipeline on a batch-first input and return the head's output."""
        # Only the per-position GRU outputs are used; the final hidden state is discarded.
        x, _ = self.gru(x)
        x = self.layernorm(x)
        x = self.relu(x)
        x = self.ffn(x)
        x = self.attn_agg(x)
        x = self.projection_head(x)
        return x

# Train

In [None]:
class Trainer:
    """Train one model on quantile-balanced batches and checkpoint the best validation loss.

    Args:
        model: Module to optimise (already on ``CONFIG.device``).
        fold_train_df: Training rows handed to ``TrainDataSampler``.
        train_dataloader: Stored on the instance; the training loop itself
            draws its batches from ``TrainDataSampler``.
        val_dataloader: Iterable of ``(X, y)`` batches used by ``evaluate``.
    """

    def __init__(self, model, fold_train_df, train_dataloader, val_dataloader):
        self.model = model
        # Hard cap on optimisation steps; also the length of the OneCycleLR schedule.
        self.max_iter_count = 500

        self.criterion = nn.MSELoss(reduction='mean')
        self.fold_train_df = fold_train_df

        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.optimizer = torch.optim.AdamW(model.parameters(),
                                           lr=CONFIG.learning_rate,
                                           weight_decay=CONFIG.weight_decay)
        self.schedular = torch.optim.lr_scheduler.OneCycleLR(self.optimizer,
                                                             max_lr=CONFIG.learning_rate,
                                                             total_steps=self.max_iter_count)
        self.iter_count = 0
        self.best_loss = None
        self.best_iteration = None
        # Per-iteration training history.
        self.train_loss_ = []
        self.train_batch_stdv_ = []
        self.batch_skew = []
        # One entry per evaluation (every CONFIG.eval_every iterations).
        self.val_loss_ = []

    def evaluate(self):
        """Return the MSE of the model over the whole validation loader."""
        self.model.eval()
        ytrue = []
        ypred = []
        for X, y in self.val_dataloader:
            X = X.to(CONFIG.device)
            ytrue += y.view(-1).tolist()
            with torch.no_grad():
                yhat = self.model(X).view(-1).detach().cpu()
                ypred += yhat.tolist()
        # Score all predictions at once rather than averaging per-batch losses.
        ytrue = torch.tensor(ytrue, dtype=torch.float32)
        ypred = torch.tensor(ypred, dtype=torch.float32)
        return self.criterion(ypred, ytrue).item()

    def checkpoint(self, val_loss):
        """Save the whole model to 'best_model.pt' when ``val_loss`` improves on the best so far."""
        if self.best_loss is None or self.best_loss > val_loss:
            torch.save(self.model, 'best_model.pt')
            self.best_loss = val_loss
            self.best_iteration = self.iter_count

    def train_ops(self, X, y):
        """Run one optimisation step on ``(X, y)`` and return the batch loss as a float."""
        self.model.train()

        self.optimizer.zero_grad()
        y_hat = self.model(X).view(-1)

        loss = self.criterion(y_hat, y)
        loss.backward()
        # NOTE(review): CONFIG.clip_gradient_norm is defined but no gradient
        # clipping is applied here — confirm whether that is intended.

        self.optimizer.step()
        self.schedular.step()

        return loss.item()

    def train(self):
        """Train for up to ``CONFIG.epochs`` sampler passes or ``max_iter_count`` steps.

        Every ``CONFIG.eval_every`` iterations the model is evaluated,
        checkpointed if it improved, and a progress report is printed.
        """
        t1 = time.time()
        for _ in range(CONFIG.epochs):
            if self.iter_count > self.max_iter_count:
                break
            for mbs in TrainDataSampler(120, self.fold_train_df):
                X, y = mbs
                self.iter_count += 1
                if self.iter_count > self.max_iter_count:
                    break
                X = X.to(CONFIG.device)
                y = y.to(CONFIG.device)

                # Batch target statistics, kept for inspecting the sampler's balance.
                self.train_batch_stdv_.append(np.std(y.view(-1).tolist()))
                self.batch_skew.append(skew(y.view(-1).tolist()))

                train_loss = self.train_ops(X, y)
                self.train_loss_.append(train_loss)

                if self.iter_count % CONFIG.eval_every == 0:
                    val_loss = self.evaluate()
                    self.val_loss_.append(val_loss)
                    self.checkpoint(val_loss)
                    t2 = time.time()
                    print("======"*10)
                    print()
                    print("Iteration:{} | Time Taken: {:.2f} | Train Loss:{:.3f}".format(self.iter_count, (t2-t1)/60, self.train_loss_[-1]))
                    print("Val Loss:{:.4f} | Best Loss:{:.4f} | Best Iteration:{}".format(val_loss,self.best_loss,self.best_iteration))
                    # Restart the timer so the next report covers only its own interval.
                    t1 = time.time()

    def lr_range_test(self):
        """Sweep the learning rate upwards and record the loss at each step.

        Starts at 5e-5 and multiplies the rate by 1.05 after every batch,
        stopping once it exceeds 1e-3 or after 10 sampler passes. Uses its own
        optimiser, but updates ``self.model`` in place.

        Returns:
            ``(lrs, losses)``: the learning rate after each step and the
            corresponding batch loss.
        """
        self.model.train()
        min_lr = 5e-5
        max_lr = 1e-3
        # Optimise this trainer's own model, not whatever `model` is bound to globally.
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=min_lr, weight_decay=CONFIG.weight_decay)
        mse_loss = nn.MSELoss(reduction='mean')
        schedular = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=1.05)

        lrs = []
        losses = []

        for i in range(10):
            print('Epoch:', i+1, schedular.get_last_lr())
            for X, y in TrainDataSampler(120, self.fold_train_df):
                y_hat = self.model(X).view(-1)
                loss = mse_loss(y_hat, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                schedular.step()

                lrs += schedular.get_last_lr()
                losses.append(loss.item())
                if lrs[-1] > max_lr:
                    break
            # `lrs` stays empty if the sampler produced no batch at all.
            if lrs and lrs[-1] > max_lr:
                break
        return lrs, losses

In [None]:
#model=Model()
#model=model.to(CONFIG.device)
#trainer=Trainer(model, fold_train_df, train_dataloader, val_dataloader)

#lrs, losses=trainer.lr_range_test()
#plt.plot(lrs, losses)

In [None]:
# Train one model per fold and collect each fold's best checkpoint in `models`.
models = []
for i in range(CONFIG.folds):
    print()
    print()
    print("==="*10)
    print("Fold:{}".format(i))
    print()
    fold_train_df = train_df[train_df.kfold != i].copy()
    fold_val_df = train_df[train_df.kfold == i].copy()

    # Trainer.train draws its batches from TrainDataSampler; this loader is only
    # passed through to the Trainer.
    train_dataset = Dataset(fold_train_df, 'train')
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=120,
                                                   shuffle=True)

    val_dataset = Dataset(fold_val_df, 'val')
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=CONFIG.batch_size, shuffle=False)

    # Model instance
    model = Model()
    model = model.to(CONFIG.device)

    # Trainer instance
    trainer = Trainer(model, fold_train_df, train_dataloader, val_dataloader)
    trainer.train()

    # The checkpoint is a fully pickled module, so it needs a non weights-only
    # load. PyTorch releases that predate the `weights_only` keyword reject it
    # with a TypeError, hence the fallback.
    try:
        model = torch.load('./best_model.pt', map_location=CONFIG.device, weights_only=False)
    except TypeError:
        model = torch.load('./best_model.pt', map_location=CONFIG.device)
    models.append(model)

    plt.title("Train Loss- Fold:{}".format(i))
    plt.plot(trainer.train_loss_)
    plt.show()

    plt.title("Val Loss- Fold:{}".format(i))
    plt.plot(trainer.val_loss_)
    plt.show()

In [None]:
# Persist each fold's best model under its fold index.
for fold_index in range(len(models)):
    torch.save(models[fold_index], "model_{}.pt".format(fold_index))

In [None]:
# MSE of the averaged ensemble on `val_dataloader`, which at this point is the
# loader left over from the last fold of the training loop.
ytrue, ypred = [], []

for X, y in val_dataloader:
    X = X.to(CONFIG.device)
    ytrue.extend(y.view(-1).tolist())

    # Sum every model's prediction for the batch, then divide by the ensemble size.
    yhat = np.zeros(y.size(0))
    for model in models:
        model.eval()
        with torch.no_grad():
            batch_pred = model(X).view(-1).detach().cpu().numpy()
        yhat += batch_pred
    yhat /= len(models)
    ypred.extend(yhat)

ytrue = torch.tensor(ytrue, dtype=torch.float32)
ypred = torch.tensor(ypred, dtype=torch.float32)
ensemble_mse = nn.MSELoss(reduction='mean')
print(ensemble_mse(ypred, ytrue).item())

# Inference

In [None]:
def infer(models, dataloader):
    """Return ensemble predictions on the original target scale.

    Args:
        models: Sequence of trained modules; each is switched to eval mode.
        dataloader: Iterable yielding feature batches only (no targets).

    Returns:
        A flat list with one averaged prediction per input row. Each model's
        output is de-normalised with ``target_std`` / ``target_mean`` before
        averaging.
    """
    n_models = len(models)
    preds = []
    for X in dataloader:
        X = X.to(CONFIG.device)
        running_sum = torch.zeros(X.shape[0])
        for model in models:
            model.eval()
            with torch.no_grad():
                normalized = model(X).view(-1).detach().cpu()
                running_sum += (target_std*normalized) + target_mean
        preds.extend(running_sum.numpy() / n_models)
    return preds

In [None]:
# The 'test' phase makes Dataset return features only, which is what infer() iterates over.
infer_train_dataset = Dataset(df=train_df, phase='test')
infer_train_dataloader = torch.utils.data.DataLoader(dataset=infer_train_dataset,
                                                     batch_size=200,
                                                     shuffle=False)
train_df['preds'] = infer(models, infer_train_dataloader)
train_df[['id', 'target', 'normalized_target', 'preds']].head()

In [None]:
# RMSE of the ensemble on the training set. The square root must be taken after
# averaging; applying it per element reduces to |error| and yields the MAE.
np.sqrt(((train_df.preds - train_df.target) ** 2).mean())

In [None]:
# Spread of the true targets (top) against the ensemble predictions (bottom).
_, ax = plt.subplots(nrows=2, ncols=1)
sns.boxplot(x='target', data=train_df, ax=ax[0])
sns.boxplot(x='preds', data=train_df, ax=ax[1])

In [None]:
# Overlay the target distribution (red) with the prediction distribution.
sns.histplot(data=train_df, x='target', color='red', bins=100)
sns.histplot(data=train_df, x='preds', bins=100)


In [None]:
# Keep the training frame together with its predictions for later inspection.
train_df.to_csv(path_or_buf='train_with_preds.csv', index=False)

# Submission

In [None]:
# Load the competition test set and tokenise it with the same tokenizer as the training data.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_df['doc'] = test_df['excerpt'].apply(tokenizer)
test_df.head()

In [None]:
# Predict the test set with the fold ensemble; infer() returns values on the original target scale.
infer_test_dataset = Dataset(df=test_df, phase='test')
infer_test_dataloader = torch.utils.data.DataLoader(dataset=infer_test_dataset,
                                                    batch_size=200,
                                                    shuffle=False)
test_df['target'] = infer(models, infer_test_dataloader)


In [None]:
# The submission holds only the row id and the predicted target.
submission_df = test_df.loc[:, ['id', 'target']].copy()
submission_df.head()

In [None]:
# Look at the last rows as well.
submission_df.tail(n=5)

In [None]:
# Write the submission file without the index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)