In [None]:
import os
import re
import time
import string
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import seaborn as sns

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from torch.nn.functional import softmax

from collections import defaultdict

# References
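- [The Annotated Transformer (encoder section)](http://nlp.seas.harvard.edu/2018/04/03/attention.html#encoder)
- [Hierarchical Attention Networks for Document Classification (Yang et al., 2016)](https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf)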

In [None]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42, the
            value this notebook has always used.
    """
    # The stdlib generator is imported by this notebook, so seed it as well.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Explicit CUDA seeding, kept from the original implementation.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything()

In [None]:
class CONFIG:
    """Hyper-parameters and paths shared by every cell of the notebook."""

    # --- Pre-trained embeddings ---
    glove_path = '../input/glove6b100dtxt/glove.6B.100d.txt'
    glove_dim = 100

    # --- Word-level encoder ---
    word_hdim = 256      # GRU hidden size per direction
    word_ff_dim = 512    # NOTE(review): not referenced in the visible notebook

    # --- Sentence-level encoder ---
    sent_hdim = 512      # GRU hidden size per direction
    sent_ff_dim = 512    # NOTE(review): not referenced in the visible notebook

    # --- Optimisation ---
    learning_rate = 1e-4
    weight_decay = 1e-5
    optimizer = 'AdamW'  # informational; Trainer constructs AdamW directly

    # --- Batching and truncation limits ---
    batch_size = 128
    max_passage_len = 250       # tokens kept per passage
    max_sentence_len = 100      # tokens kept per sentence
    overlap_sentence_len = 10
    max_num_sents = 30          # sentences kept per passage

    # --- Training schedule ---
    folds = 5
    epochs = 25
    eval_every = 10             # validate every N training iterations
    clip_gradient_norm = 1.0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load GloVe 100-d vectors

In [None]:
%%time
# Map every GloVe token to its embedding vector.
# Each line of the file is "<token> <v1> <v2> ... <vN>", whitespace separated.
glove_embeddings={}
# Explicit encoding so the read does not depend on the platform default.
with open(CONFIG.glove_path, encoding='utf-8') as file:
    for line in file:
        line=line.split()
        word=line[0]
        # `np.float` was removed in NumPy 1.24; float32 matches the dtype of the
        # tensors these vectors are later copied into by CLRPDataset.
        v=np.array(line[1:]).astype(np.float32)
        glove_embeddings[word]=v
print(len(glove_embeddings))

# Tokenizer

In [None]:
class Tokenizer:
    """Turn a passage into a flat list of lemmatised tokens plus sentence ids.

    Calling the instance returns ``{'tokens': [...], 'sent_ids': [...]}`` where
    ``sent_ids[k]`` is the index of the sentence that ``tokens[k]`` came from.
    """
    def __init__(self):
        self.lemmatizer=WordNetLemmatizer()

    def is_valid(self, word):
        """Return True unless the word is empty, non-ASCII, numeric or a single punctuation mark."""
        if len(word)==0 or word.isnumeric() or not word.isascii():
            return False
        # Compare against each punctuation character individually (whole-word match).
        return word not in tuple(string.punctuation)

    def __call__(self, doc):
        """Tokenise ``doc`` sentence by sentence.

        Sentences that end up with no valid words are skipped and do not
        consume a sentence id.
        """
        tokens, sent_ids=[], []
        next_sentence_id=0
        for sentence in sent_tokenize(doc):
            lowered=(raw.strip().lower() for raw in word_tokenize(sentence))
            words=[self.lemmatizer.lemmatize(w) for w in lowered if self.is_valid(w)]
            if not words:
                continue
            # Only the first CONFIG.max_sentence_len tokens of a sentence are kept;
            # CONFIG.overlap_sentence_len is not consulted.
            # NOTE(review): confirm that dropping the remainder of long sentences is intended.
            kept=words[:CONFIG.max_sentence_len]
            tokens.extend(kept)
            sent_ids.extend([next_sentence_id]*len(kept))
            next_sentence_id+=1
        return {
            'tokens': tokens,
            'sent_ids': sent_ids
        }

In [None]:
# Tokenise every training excerpt once and keep the result in an `inputs` column.
tokenizer = Tokenizer()

train_df = pd.read_csv('../input/commonlit-kfold-dataset/fold_train.csv')
train_df['inputs'] = train_df['excerpt'].apply(tokenizer)
train_df.head()

In [None]:
def get_sentence_counts(inputs):
    """Return the number of sentences in a tokenised passage.

    Args:
        inputs: Mapping with a ``'sent_ids'`` list of zero-based sentence ids,
            one per token.

    Returns:
        ``max(sent_ids) + 1`` as an int, or 0 when there are no tokens.
    """
    sent_ids=inputs['sent_ids']
    if len(sent_ids)==0:
        return 0
    # Ids are zero-based, so the count is one more than the largest id.
    return int(np.max(sent_ids))+1

def get_max_tokens_count(inputs):
    """Return the length of the longest sentence in a tokenised passage.

    Args:
        inputs: Mapping with a ``'sent_ids'`` list holding one sentence id per
            token; tokens of the same sentence are expected to be contiguous.

    Returns:
        The largest number of consecutive tokens sharing a sentence id, or 0
        when there are no tokens.
    """
    longest=0
    run=0
    previous=object()  # sentinel that never equals a real sentence id
    for sid in inputs['sent_ids']:
        if sid!=previous:
            # A new sentence starts here; its first token is counted below.
            previous=sid
            run=0
        run+=1
        longest=max(longest, run)
    return longest

def get_num_tokens(inputs):
    """Return how many tokens the tokenised passage contains."""
    return len(inputs['tokens'])

In [None]:
# Per-passage summary statistics derived from the tokenised inputs.
tokenised_inputs = train_df['inputs']
train_df['num_sents'] = tokenised_inputs.apply(get_sentence_counts)
train_df['max_sent_tokens'] = tokenised_inputs.apply(get_max_tokens_count)
train_df['num_tokens'] = tokenised_inputs.apply(get_num_tokens)

train_df.head()

In [None]:
# Fold 0 is held out for validation; every other fold is used for training.
is_val_fold = train_df['kfold'] == 0
fold_train_df = train_df.loc[~is_val_fold].copy()
fold_val_df = train_df.loc[is_val_fold].copy()

print("Train Fold:", len(fold_train_df))
print("Val Fold: ", len(fold_val_df))

# Dataset

In [None]:
class CLRPDataset(torch.utils.data.Dataset):
    """Serve fixed-length GloVe-embedded passages from a tokenised DataFrame.

    Each item is ``(num_sents, sids, X)`` plus the target ``y`` when ``phase``
    is ``'train'`` or ``'val'``:
      * ``X``: ``(CONFIG.max_passage_len, CONFIG.glove_dim)`` float32 embeddings;
        rows for padding and for tokens missing from GloVe stay zero.
      * ``sids``: ``(CONFIG.max_passage_len,)`` sentence id per token, -1 on padding.
      * ``num_sents``: largest sentence id + 1 (0 for an empty passage).
    """
    def __init__(self, df, phase):
        self.df=df
        self.phase=phase

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row=self.df.iloc[idx]
        passage=row.inputs
        limit=CONFIG.max_passage_len

        # Truncate both parallel lists to the passage budget.
        tokens=passage['tokens'][:limit]
        kept_ids=torch.tensor(passage['sent_ids'][:limit], dtype=torch.long)

        # -1 marks padding so it can never match a real (non-negative) sentence id.
        sids=torch.zeros(limit, dtype=torch.long).fill_(-1)
        sids[:len(kept_ids)]=kept_ids
        num_sents=sids.max()+1

        X=torch.zeros((limit, CONFIG.glove_dim), dtype=torch.float32)
        for pos, token in enumerate(tokens):
            vector=glove_embeddings.get(token)
            if vector is not None:
                X[pos]=torch.tensor(vector)

        if self.phase not in ('train', 'val'):
            return (num_sents, sids, X)
        y=torch.tensor(row.target, dtype=torch.float32)
        return (num_sents, sids, X, y)

# Model

In [None]:
class Wrappers:
    """Convert between passage-level batches and sentence-level batches."""
    def __init__(self):
        pass

    def unwrap_sentences(self, nsents, sids, X):
        """Split a batch of passages into one padded row per sentence.

        Args:
            nsents: 1-D integer tensor, number of sentences in each passage.
            sids: ``(batch, passage_len)`` sentence id per token (-1 on padding).
            X: ``(batch, passage_len, dim)`` token embeddings.

        Returns:
            ``(unwrapX, slens, mask)`` where ``unwrapX`` is
            ``(total_sentences, longest_sentence, dim)`` on ``X``'s device,
            ``slens`` is a CPU long tensor of sentence lengths and ``mask`` is
            1.0 on real tokens and 0.0 on padding.
        """
        batch_size=len(nsents)
        batch_sents_count=int(nsents.sum().item())
        # Allocate next to X (same device and dtype) instead of always on CPU, so
        # nothing is bounced between devices when X already lives on the GPU.
        unwrapX=X.new_zeros((batch_sents_count, CONFIG.max_sentence_len, X.size(-1)))
        mask=torch.zeros((batch_sents_count, CONFIG.max_sentence_len), device=X.device)
        sids=sids.to(X.device)
        unwrap_sentid=0
        batch_max_seq_len=0
        slens=[]

        for i in range(batch_size):
            for sent_id in range(int(nsents[i])):
                # Rows of passage i whose token belongs to this sentence.
                curX=X[i][sids[i]==sent_id]
                curX=curX[:CONFIG.max_sentence_len, :]
                cur_len=len(curX)
                unwrapX[unwrap_sentid, :cur_len, :]=curX
                mask[unwrap_sentid, :cur_len]=1
                batch_max_seq_len=max(batch_max_seq_len, cur_len)
                slens.append(cur_len)
                unwrap_sentid+=1
        # Trim the padding down to the longest sentence actually present.
        unwrapX=unwrapX[:, :batch_max_seq_len, :]
        mask=mask[:, :batch_max_seq_len]
        slens=torch.tensor(slens, dtype=torch.long)
        return unwrapX, slens, mask

    def wrap_sentences(self, nsents, unwrapX):
        """Regroup per-sentence vectors into one padded row per passage.

        Args:
            nsents: 1-D integer tensor, number of sentences in each passage, in
                the same order that was used to produce ``unwrapX``.
            unwrapX: ``(total_sentences, dim)`` sentence vectors.

        Returns:
            ``(wrapX, mask)``: ``wrapX`` is ``(batch, max_sents_in_batch, dim)``
            on ``unwrapX``'s device and ``mask`` is 1.0 on real sentences. At
            most ``CONFIG.max_num_sents`` sentences are kept per passage.
        """
        batch_size=len(nsents)
        wrapX=unwrapX.new_zeros((batch_size, CONFIG.max_num_sents, unwrapX.size(-1)))
        mask=torch.zeros((batch_size, CONFIG.max_num_sents), device=unwrapX.device)
        sent_start=0
        batch_max_sents=0

        for i in range(batch_size):
            num_sents=int(nsents[i].item())
            kept=min(num_sents, CONFIG.max_num_sents)
            if kept>0:
                wrapX[i, :kept]=unwrapX[sent_start:sent_start+kept]
                mask[i, :kept]=1
            # Track the COUNT of kept sentences (not the last index); otherwise the
            # trim below would drop the final sentence of the longest passage.
            batch_max_sents=max(batch_max_sents, kept)
            # Always advance by the true sentence count so later passages stay aligned.
            sent_start+=num_sents
        wrapX=wrapX[:, :batch_max_sents, :]
        mask=mask[:, :batch_max_sents]
        return wrapX, mask

In [None]:
def attention(q, k, v, mask, dropout):
    """Scaled dot-product attention with optional key-padding mask.

    Args:
        q, k, v: ``(batch, seq, d)`` query, key and value tensors.
        mask: ``(batch, seq)`` tensor that is 0 on padded positions, or None.
        dropout: Callable applied to the attention probabilities, or None.

    Returns:
        ``(output, p_attn)``: the attended values ``(batch, seq, d)`` and the
        attention probabilities ``(batch, seq_q, seq_k)``.
    """
    d=q.size(-1)
    scores=torch.matmul(q, k.transpose(-2, -1))/(d**0.5)
    if mask is not None:
        # Softmax runs over the key axis (last dim), so the mask must broadcast
        # over queries: (batch, seq_k) -> (batch, 1, seq_k). A large NEGATIVE
        # fill drives the weight of padded keys to zero after the softmax.
        scores=scores.masked_fill(mask.unsqueeze(-2)==0, -1e9)
    p_attn=softmax(scores, dim=-1)
    if dropout is not None:
        p_attn=dropout(p_attn)
    return torch.matmul(p_attn, v), p_attn

In [None]:
class WordPooling(nn.Module):
    """Attention pooling that collapses the tokens of a sentence into one vector."""
    def __init__(self):
        super().__init__()
        self.linear=nn.Linear(2*CONFIG.word_hdim, 1)
    def forward(self, x, unwrap_mask):
        """Pool ``x`` of shape (n_sents, seq, dim) using ``unwrap_mask`` (n_sents, seq).

        Returns a (n_sents, dim) tensor: the softmax-weighted sum of the token
        vectors, with padded positions (mask == 0) receiving zero weight.
        """
        unwrap_mask=unwrap_mask.unsqueeze(-1)
        w=self.linear(x)
        # Large NEGATIVE score so padded tokens vanish after the softmax.
        w=w.masked_fill(unwrap_mask==0, -1e9)
        w=softmax(w, dim=1)  # weights over the sequence axis sum to 1
        x=x.permute(0, 2, 1)
        x=torch.matmul(x, w).squeeze(-1)
        return x
    
class SentencePooling(nn.Module):
    """Attention pooling that collapses the sentences of a passage into one vector."""
    def __init__(self):
        super().__init__()
        self.linear=nn.Linear(2*CONFIG.sent_hdim, 1)
    def forward(self, x, wrap_mask):
        """Pool ``x`` of shape (batch, n_sents, dim) using ``wrap_mask`` (batch, n_sents).

        Returns a (batch, dim) tensor: the softmax-weighted sum of the sentence
        vectors, with padded sentences (mask == 0) receiving zero weight.
        """
        wrap_mask=wrap_mask.unsqueeze(-1)
        w=self.linear(x)
        # Large NEGATIVE score so padded sentences vanish after the softmax.
        w=w.masked_fill(wrap_mask==0, -1e9)
        w=softmax(w, dim=1)  # weights over the sentence axis sum to 1
        x=x.permute(0, 2, 1)
        x=torch.matmul(x, w).squeeze(-1)
        return x

class ProjectionHead(nn.Module):
    """Regression head: BatchNorm -> Dropout -> ReLU -> Linear(2*sent_hdim, 1)."""
    def __init__(self):
        super().__init__()
        width=2*CONFIG.sent_hdim
        self.bn=nn.BatchNorm1d(width)
        self.dropout=nn.Dropout(0.2)
        self.relu=nn.ReLU()
        self.linear=nn.Linear(width, 1)

    def forward(self, x):
        """Map pooled passage vectors (batch, 2*sent_hdim) to scores (batch, 1)."""
        normalised=self.bn(x)
        activated=self.relu(self.dropout(normalised))
        return self.linear(activated)

In [None]:
class PointWiseFeedforward(nn.Module):
    """Position-wise feed-forward sub-layer with a residual connection and LayerNorm.

    The hidden width is ``2 * dmodel``.
    """
    def __init__(self, dmodel):
        super().__init__()
        hidden_width=2*dmodel
        self.linear1=nn.Linear(dmodel, hidden_width)
        self.relu=nn.ReLU()
        self.linear2=nn.Linear(hidden_width, dmodel)
        self.dropout=nn.Dropout(0.2)
        self.layer_norm=nn.LayerNorm(dmodel)

    def forward(self, x):
        """Return ``LayerNorm(x + W2 · dropout(relu(W1 · x)))``."""
        hidden=self.relu(self.linear1(x))
        residual=self.linear2(self.dropout(hidden))
        return self.layer_norm(x+residual)
    
class TransformerLayer(nn.Module):
    """Single-head self-attention block followed by a point-wise feed-forward block."""
    def __init__(self, dmodel):
        super().__init__()
        self.linearQ=nn.Linear(dmodel, dmodel)
        self.linearK=nn.Linear(dmodel, dmodel)
        self.linearV=nn.Linear(dmodel, dmodel)
        self.layer_norm=nn.LayerNorm(dmodel)
        self.dropout=nn.Dropout(0.2)
        self.point_wise_ff=PointWiseFeedforward(dmodel)

    def forward(self, x, mask):
        """Apply self-attention (residual + LayerNorm), then the feed-forward block."""
        queries=self.linearQ(x)
        keys=self.linearK(x)
        values=self.linearV(x)

        # The attention probabilities returned alongside the output are not used here.
        attended, _=attention(queries, keys, values, mask, self.dropout)
        return self.point_wise_ff(self.layer_norm(x+attended))

In [None]:
class WordEncoder(nn.Module):
    """Token-level encoder: bidirectional GRU followed by two transformer layers."""
    def __init__(self):
        super().__init__()
        self.word_encoder=nn.GRU(
            CONFIG.glove_dim,
            CONFIG.word_hdim,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        # The GRU is bidirectional, so downstream layers see 2*word_hdim features.
        self.transformer_layer1=TransformerLayer(2*CONFIG.word_hdim)
        self.transformer_layer2=TransformerLayer(2*CONFIG.word_hdim)
        self.dropout=nn.Dropout(0.2)
        self.relu=nn.ReLU()

    def forward(self, x, mask):
        """Encode ``x`` (n_sents, seq, glove_dim) into (n_sents, seq, 2*word_hdim)."""
        hidden, _=self.word_encoder(x)
        hidden=self.transformer_layer1(hidden, mask)
        hidden=self.dropout(self.relu(hidden))
        return self.transformer_layer2(hidden, mask)
    
class SentenceEncoder(nn.Module):
    """Sentence-level encoder: bidirectional GRU followed by two transformer layers."""
    def __init__(self):
        super().__init__()
        self.sent_encoder=nn.GRU(
            2*CONFIG.word_hdim,
            CONFIG.sent_hdim,
            batch_first=True,
            bidirectional=True,
        )
        # The GRU is bidirectional, so downstream layers see 2*sent_hdim features.
        self.transformer_layer1=TransformerLayer(2*CONFIG.sent_hdim)
        self.transformer_layer2=TransformerLayer(2*CONFIG.sent_hdim)
        self.dropout=nn.Dropout(0.2)
        self.relu=nn.ReLU()

    def forward(self, x, mask):
        """Encode ``x`` (batch, n_sents, 2*word_hdim) into (batch, n_sents, 2*sent_hdim)."""
        hidden, _=self.sent_encoder(x)
        hidden=self.transformer_layer1(hidden, mask)
        hidden=self.dropout(self.relu(hidden))
        return self.transformer_layer2(hidden, mask)

In [None]:
class CLRPModel(nn.Module):
    """Hierarchical regressor: tokens -> sentence vectors -> passage vector -> score."""
    def __init__(self):
        super().__init__()
        self.wrapper=Wrappers()

        self.word_encoder=WordEncoder()
        self.word_pooling=WordPooling()

        self.sent_encoder=SentenceEncoder()
        self.sent_pooling=SentencePooling()

        self.proj_head=ProjectionHead()

    def forward(self, inputs):
        """Score a batch.

        Args:
            inputs: dict with keys ``'num_sents'``, ``'sids'`` and ``'X'`` as
                produced by ``CLRPDataset`` and batched by a DataLoader.

        Returns:
            The output of the projection head, one row per passage.
        """
        num_sents, sids, X=inputs['num_sents'], inputs['sids'], inputs['X']
        device=CONFIG.device

        # Word level: one row per sentence. The sentence lengths returned by
        # unwrap_sentences are not needed here.
        unwrapX, _, unwrap_mask=self.wrapper.unwrap_sentences(num_sents, sids, X)
        unwrapX=unwrapX.to(device)
        unwrap_mask=unwrap_mask.to(device)
        word_pool=self.word_pooling(self.word_encoder(unwrapX, unwrap_mask), unwrap_mask)

        # Sentence level: regroup the sentence vectors into one row per passage.
        wrapX, wrap_mask=self.wrapper.wrap_sentences(num_sents, word_pool)
        wrapX=wrapX.to(device)
        wrap_mask=wrap_mask.to(device)
        sent_pool=self.sent_pooling(self.sent_encoder(wrapX, wrap_mask), wrap_mask)

        return self.proj_head(sent_pool)

# Trainer

In [None]:
class Trainer:
    """Train one model on one train/validation split.

    Tracks per-iteration training loss, periodic validation loss, and saves the
    model with the lowest validation loss to ``best_model.pt``.
    """
    def __init__(self, model, train_dataloader, val_dataloader):
        self.model=model
        self.criterion=nn.MSELoss(reduction='mean')
        self.train_dataloader=train_dataloader
        self.val_dataloader=val_dataloader
        self.optimizer=torch.optim.AdamW(model.parameters(),
                                         lr=CONFIG.learning_rate,
                                         weight_decay=CONFIG.weight_decay
                                        )
        # Attribute name kept as-is (sic) so existing references keep working.
        self.schedular=torch.optim.lr_scheduler.OneCycleLR(self.optimizer,
                                                           max_lr=CONFIG.learning_rate,
                                                           epochs=CONFIG.epochs,
                                                           steps_per_epoch=len(self.train_dataloader))

        self.iter_count=0
        self.best_loss=None
        self.best_iteration=None
        self.train_loss_=[]
        self.val_loss_=[]

    def evaluate(self):
        """Return the MSE over the whole validation set (model in eval mode)."""
        self.model.eval()
        ytrue=[]; ypred=[]
        for (num_sents, sids, X, y) in self.val_dataloader:
            X=X.to(CONFIG.device)
            ytrue+=y.view(-1).tolist()

            inputs={
                    'num_sents': num_sents,
                    'sids': sids,
                    'X': X
            }
            with torch.no_grad():
                yhat=self.model(inputs)
                yhat=yhat.view(-1).detach().cpu()
                ypred+=yhat.tolist()
        ytrue=torch.tensor(ytrue, dtype=torch.float32)
        ypred=torch.tensor(ypred, dtype=torch.float32)
        return self.criterion(ypred, ytrue).item()

    def checkpoint(self, val_loss):
        """Save the model and record the iteration when ``val_loss`` is a new best."""
        if self.best_loss is None or self.best_loss > val_loss:
            # Save the model this trainer owns, not whatever global `model` happens to exist.
            torch.save(self.model, 'best_model.pt')
            self.best_loss=val_loss
            self.best_iteration=self.iter_count

    def train_ops(self, inputs, y):
        """Run one optimisation step on a batch and return its training loss."""
        self.model.train()
        self.optimizer.zero_grad()
        yhat=self.model(inputs)
        yhat=yhat.view(-1)
        loss=self.criterion(yhat, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), CONFIG.clip_gradient_norm)
        self.optimizer.step()
        self.schedular.step()
        return loss.item()

    def train(self):
        """Train for ``CONFIG.epochs`` epochs, validating every ``CONFIG.eval_every`` iterations."""
        t1=time.time()
        for _ in range(CONFIG.epochs):
            for (num_sents, sids, X, y) in self.train_dataloader:
                self.iter_count+=1
                X=X.to(CONFIG.device)
                y=y.to(CONFIG.device)
                inputs={
                    'num_sents': num_sents,
                    'sids': sids,
                    'X': X
                }

                train_loss=self.train_ops(inputs, y)
                self.train_loss_.append(train_loss)
                if self.iter_count%CONFIG.eval_every==0:
                    val_loss=self.evaluate()
                    self.val_loss_.append(val_loss)
                    self.checkpoint(val_loss)
                    t2=time.time()
                    print("======"*10)
                    print()
                    # Elapsed time since the previous report, in minutes.
                    print("Iteration:{} | Time Taken: {:.2f} | Train Loss:{:.3f}".format(self.iter_count, (t2-t1)/60, self.train_loss_[-1]))
                    print("Val Loss:{:.4f} | Best Loss:{:.4f} | Best Iteration:{}".format(val_loss,self.best_loss,self.best_iteration))
                    t1=time.time()

    def lr_range_test(self):
        """Sweep the learning rate upwards and record the loss at each step.

        Uses a fresh AdamW optimiser on ``self.model`` starting at 6e-5 and
        multiplies the rate by 1.05 after every batch until it exceeds 1e-3
        (or 20 epochs elapse). Note that this updates the model's weights.

        Returns:
            ``(lrs, losses)``: the learning rate after each step and the batch
            loss measured at that step.
        """
        self.model.train()
        min_lr=6e-5;max_lr=1e-3
        optimizer=torch.optim.AdamW(self.model.parameters(),
                                    lr=min_lr,
                                    weight_decay=CONFIG.weight_decay)
        schedular=torch.optim.lr_scheduler.StepLR(optimizer, 1, 1.05)

        lrs=[]
        losses=[]

        for i in range(20):
            print('Epoch:', i+1, schedular.get_last_lr())
            for j, (num_sents, sids, X, y) in enumerate(self.train_dataloader):
                X=X.to(CONFIG.device)
                y=y.to(CONFIG.device)
                inputs={
                    'num_sents': num_sents,
                    'sids': sids,
                    'X': X
                }
                optimizer.zero_grad()
                yhat=self.model(inputs)
                yhat=yhat.view(-1)

                loss=self.criterion(yhat, y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), CONFIG.clip_gradient_norm)
                optimizer.step()
                schedular.step()

                lrs+=schedular.get_last_lr()
                losses.append(loss.item())
                if lrs[-1]>max_lr:
                    break
            # `lrs` is empty only if the dataloader yielded no batches.
            if lrs and lrs[-1]>max_lr:
                break
        return lrs, losses

In [None]:
# Training batches are reshuffled every epoch; validation order is kept fixed.
train_dataset = CLRPDataset(fold_train_df, 'train')
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=CONFIG.batch_size,
    shuffle=True,
)

val_dataset = CLRPDataset(fold_val_df, 'val')
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=CONFIG.batch_size,
    shuffle=False,
)

In [None]:
#model=CLRPModel()
#model=model.to(CONFIG.device)
#trainer=Trainer(model, train_dataloader, val_dataloader)
#lrs, losses=trainer.lr_range_test()
#lmt=45
#plt.plot(lrs[:lmt], losses[:lmt])

In [None]:
# Train several independently initialised models on the same split and keep
# the checkpoint with the lowest validation loss as model1.pt.
all_best_loss=None
for i in range(5):
    print("Run: ... ", i)
    print('--'*10)
    model=CLRPModel()
    model=model.to(CONFIG.device)

    trainer=Trainer(model, train_dataloader, val_dataloader)
    trainer.train()

    plt.title("Run:{} - TrainLoss".format(i))
    plt.plot(trainer.train_loss_)
    plt.show()

    plt.title("Run:{} - ValLoss".format(i))
    plt.plot(trainer.val_loss_)
    plt.show()
    # best_loss stays None if the run never reached an evaluation step.
    if trainer.best_loss is not None and (all_best_loss is None or all_best_loss > trainer.best_loss):
        all_best_loss=trainer.best_loss
        # Runs are ranked by their best validation loss, so keep the weights that
        # produced it: Trainer.checkpoint wrote them to best_model.pt. Saving
        # `model` here would store the end-of-training weights instead.
        os.replace('best_model.pt', 'model1.pt')

# Inference

In [None]:
# Load the selected model onto the active device so a checkpoint saved on GPU
# can also be used on a CPU-only machine.
# NOTE(review): the file holds a fully pickled module; recent PyTorch releases may
# additionally require `weights_only=False` here — confirm against the installed version.
model=torch.load('./model1.pt', map_location=CONFIG.device)
models=[model]

In [None]:
def infer(models, dataloader):
    """Return the ensemble-averaged prediction for every row served by ``dataloader``.

    Args:
        models: Sequence of models; each is switched to eval mode and called
            with a dict holding ``'num_sents'``, ``'sids'`` and ``'X'``.
        dataloader: Iterable yielding ``(num_sents, sids, X)`` batches.

    Returns:
        A flat list with one averaged prediction per input row.
    """
    preds=[]
    n_models=len(models)
    for num_sents, sids, X in dataloader:
        X=X.to(CONFIG.device)
        inputs={
            'num_sents': num_sents,
            'sids': sids,
            'X': X
        }

        # Running sum of the members' predictions for this batch (kept on CPU).
        y_hat=torch.zeros(X.shape[0])
        for member in models:
            member.eval()
            with torch.no_grad():
                y_hat+=member(inputs).view(-1).detach().cpu()
        preds.extend(list(y_hat.numpy()/n_models))
    return preds

In [None]:
# Score every row of train_df (phase 'test' makes the dataset return no target).
infer_train_dataset = CLRPDataset(df=train_df, phase='test')
infer_train_dataloader = torch.utils.data.DataLoader(
    infer_train_dataset,
    batch_size=200,
    shuffle=False,
)
train_df['preds'] = infer(models, infer_train_dataloader)

In [None]:
# Side-by-side look at targets and predictions for the first rows.
train_df.loc[:, ['id', 'target', 'preds']].head()

In [None]:
# Root-mean-squared error: take the mean of the squared errors BEFORE the square
# root. Applying sqrt element-wise first would yield the mean absolute error.
np.sqrt(((train_df.preds-train_df.target)**2).mean())

In [None]:
# Compare the spread of targets (top) and predictions (bottom).
_, ax = plt.subplots(nrows=2, ncols=1)
sns.boxplot(x='target', data=train_df, ax=ax[0])
sns.boxplot(x='preds', data=train_df, ax=ax[1])

In [None]:
# Overlay the target distribution (red) with the prediction distribution.
sns.histplot(data=train_df, x='target', bins=100, color='red')
sns.histplot(data=train_df, x='preds', bins=100)

# Submission

In [None]:
# Load the competition test set and tokenise it with the same tokenizer as training.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_df['inputs'] = test_df['excerpt'].apply(tokenizer)
test_df.head()

In [None]:
# Predict the target for every test excerpt.
infer_test_dataset = CLRPDataset(df=test_df, phase='test')
infer_test_dataloader = torch.utils.data.DataLoader(
    infer_test_dataset,
    batch_size=200,
    shuffle=False,
)
test_df['target'] = infer(models, infer_test_dataloader)

In [None]:
# The submission keeps only the id and the predicted target.
submission_df = test_df.loc[:, ['id', 'target']].copy()
submission_df.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)