In [None]:
import os
import re
import time
import string
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn

import seaborn as sns

from nltk.stem import WordNetLemmatizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS


from collections import defaultdict

In [None]:
# Read the competition's training CSV into a DataFrame and preview the first rows.
train_df=pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train_df.head()

In [None]:
def get_cv_ids(df=None, val_frac=0.1, seed=0):
    """Pick a stratified, reproducible hold-out set of row ids.

    Rows are bucketed by ``target`` into unit-wide bins covering [-4, 2).
    Inside every bin the ids are shuffled with a dedicated seeded generator
    and the first ``val_frac`` share (rounded down) is held out. Rows whose
    target falls outside [-4, 2) are never selected.

    Args:
        df: frame with ``id`` and ``target`` columns. Defaults to the
            module-level ``train_df``.
        val_frac: share of each bin to hold out.
        seed: seed of the private shuffle generator, so the split does not
            depend on (or disturb) the global NumPy random state.

    Returns:
        list of held-out ids, ordered bin by bin.
    """
    if df is None:
        df = train_df

    rng = np.random.RandomState(seed)
    ranges = [(-4, -3.0), (-3.0, -2.0), (-2.0, -1.0), (-1.0, 0.0), (0.0, 1.0), (1.0, 2)]

    cv_ids = []
    for low, high in ranges:
        bin_ids = df.loc[(df.target >= low) & (df.target < high), 'id'].values
        # Previously np.random.choice(...) was called and its result thrown
        # away, so the sample was always the first rows of the bin in file
        # order (and an empty bin raised). Shuffle for real before slicing.
        bin_ids = rng.permutation(bin_ids)
        n_holdout = int(len(bin_ids) * val_frac)
        cv_ids.extend(bin_ids[:n_holdout].tolist())
    return cv_ids
# Draw the hold-out ids, then split the frame on membership in that set.
cv_ids = get_cv_ids()

is_holdout = train_df['id'].isin(cv_ids)
val_df = train_df.loc[is_holdout].copy()
train_df = train_df.loc[~is_holdout].copy()

print('Number Of Validation Records:', len(val_df))
print('Number Of Train Records:', len(train_df))

# Target distribution of the hold-out rows.
val_df['target'].hist(bins=100)
plt.show()

In [None]:
# Histogram of the target column of train_df, using 100 bins.
train_df.target.hist(bins=100)
plt.show()

# Load GloVe 100-d vectors

In [None]:
# Parse the GloVe text file into a {word: vector} dictionary.
# Each line holds a word followed by its space-separated vector components.
glove_path='../input/glove6b100dtxt/glove.6B.100d.txt'
glove_embeddings={}
# Open with an explicit encoding so decoding does not depend on the
# platform's default locale.
with open(glove_path, encoding='utf-8') as file:
    for line in file:
        parts=line.split()
        if not parts:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word=parts[0]
        # `np.float` was removed in NumPy 1.24 and raises AttributeError there;
        # use a concrete dtype. float32 matches the tensors built by Dataset.
        v=np.array(parts[1:]).astype(np.float32)
        glove_embeddings[word]=v
print(len(glove_embeddings))

In [None]:
def seed_all(seed=0):
    """Seed the global RNGs of Python's ``random``, NumPy and PyTorch.

    Args:
        seed: value handed to all three generators. The default of 0 keeps
            the behaviour of the previous hard-coded version.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_all()

In [None]:
class Tokenizer:
    """Callable that turns raw text into a flat list of lowercase word pieces."""

    def __init__(self):
        # The lemmatizer is instantiated here but not used by __call__.
        self.lemmatizer = WordNetLemmatizer()
        self.nlp = English()

    def __call__(self, doc):
        """Tokenize ``doc`` and return its cleaned pieces in reading order.

        Tokens flagged ``like_num`` and empty tokens are skipped. Each kept
        token is lowercased and stripped, every ``string.punctuation``
        character is replaced by a space, and the result is split on spaces
        with empty fragments discarded.
        """
        punct_to_space = str.maketrans({mark: ' ' for mark in string.punctuation})

        pieces = []
        for tok in self.nlp(doc):
            if tok.like_num or tok.text == '':
                continue
            cleaned = tok.lower_.strip().translate(punct_to_space)
            pieces.extend(part for part in cleaned.split(' ') if part != '')
        return pieces

In [None]:
# Tokenize every excerpt once up front and keep the token lists in a 'doc' column.
tokenizer=Tokenizer()
train_df['doc']=train_df.excerpt.apply(tokenizer)
train_df.head()

Let us consider all the words that appear in at least 5 documents.

# Configuration

In [None]:
# Number of token positions in each excerpt tensor; Dataset truncates longer
# token lists to this length and leaves unused positions as zeros.
MAX_SEQ_LEN=150
# Mini-batch size used by the training and validation DataLoaders.
BATCH_SIZE=128

In [None]:
# Mean and standard deviation of the training targets. They are used to
# standardize the regression target and to map model outputs back to the
# original scale at inference time.
target_mean=train_df.target.mean()
target_std=train_df.target.std()

print("Taget Mean:", target_mean)
print("Taget Std:", target_std)

In [None]:
# Standardize the target with the training mean/std and plot its distribution.
train_df['normalized_target']=(train_df.target - target_mean)/target_std
sns.histplot(data=train_df, x='normalized_target')

In [None]:
# Summary statistics of the standardized target.
train_df['normalized_target'].describe()

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Serves one excerpt per index as a (MAX_SEQ_LEN, 100) embedding tensor.

    ``df`` must carry a ``doc`` column of token lists. When ``phase`` is
    ``'train'`` it must also carry ``normalized_target`` and items are
    ``(X, y)`` pairs; for any other phase only ``X`` is returned.
    """

    def __init__(self, df, phase):
        self.df = df
        self.phase = phase

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]

        # Rows stay zero for padding positions and for words without a vector.
        features = torch.zeros((MAX_SEQ_LEN, 100), dtype=torch.float32)
        for position, word in enumerate(record.doc[:MAX_SEQ_LEN]):
            vector = glove_embeddings.get(word)
            if vector is not None:
                features[position] = torch.tensor(vector)

        if self.phase != 'train':
            return features

        label = torch.tensor(record.normalized_target, dtype=torch.float32)
        return (features, label)

# Model

In [None]:
class ProjectionHead(nn.Module):
    """Two-layer head: Linear -> BatchNorm1d -> Dropout -> ReLU -> Linear.

    Args:
        in_features: width of the incoming feature vector.
        out_feat: width of the output.
    """

    def __init__(self, in_features,out_feat):
        super().__init__()
        hidden_width = 512
        self.linear1 = nn.Linear(in_features, hidden_width)
        self.bn = nn.BatchNorm1d(hidden_width)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_width, out_feat)

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, out_feat)."""
        hidden = self.relu(self.dropout(self.bn(self.linear1(x))))
        return self.linear2(hidden)

class Model(nn.Module):
    """Two-layer bidirectional GRU encoder followed by a regression head.

    Args:
        embedd_size: size of each input embedding vector.
        hidden_size: GRU hidden size per direction.
    """

    def __init__(self, embedd_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            input_size=embedd_size,
            hidden_size=hidden_size,
            num_layers=2,
            dropout=0.2,
            bidirectional=True,
            batch_first=True,
        )
        self.bn = nn.BatchNorm1d(2*hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.proj_head = ProjectionHead(2*hidden_size, 1)

    def forward(self, x):
        """Encode a (batch, seq, embedd_size) tensor and return the head output."""
        _, final_states = self.gru(x)

        # nn.GRU stacks final hidden states as (num_layers * num_directions,
        # batch, hidden); the last two entries belong to the top layer
        # (forward direction, then backward direction).
        summary = torch.cat([final_states[-2], final_states[-1]], dim=1)

        summary = self.dropout(self.relu(self.bn(summary)))
        return self.proj_head(summary)

In [None]:
def lr_test(min_lr, max_lr, train_dataloader):
    """Run a learning-rate range test on a fresh ``Model(100, 128)``.

    The learning rate starts at ``min_lr`` and is multiplied by 1.05 after
    every optimizer step. Training stops as soon as the next learning rate
    would exceed ``max_lr``, or after 10 passes over ``train_dataloader``.

    Args:
        min_lr: starting learning rate.
        max_lr: learning rate above which the sweep stops.
        train_dataloader: iterable of ``(X, y)`` batches.

    Returns:
        ``(lrs, losses)``: two equally long lists where ``losses[k]`` is the
        batch loss obtained while training with learning rate ``lrs[k]``.
    """
    model=Model(100, 128)
    model.train()
    optimizer=torch.optim.AdamW(model.parameters(), lr=min_lr)
    mse_loss=nn.MSELoss(reduction='mean')
    scheduler=torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=1.05)

    lrs=[]
    losses=[]

    for epoch in range(10):
        print('Epoch:', epoch+1)
        for j, (X, y) in enumerate(train_dataloader):
            # Capture the rate this step actually trains with. Reading it after
            # scheduler.step() (as before) paired every loss with the *next*
            # step's learning rate and shifted the resulting curve.
            current_lr=optimizer.param_groups[0]['lr']

            y_hat=model(X).view(-1)
            loss=mse_loss(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            lrs.append(current_lr)
            losses.append(loss.item())

            if j%10==0:
                print(current_lr)

            # Same stopping rule as before: stop once the upcoming rate passes
            # max_lr. Checking the scheduler directly also avoids indexing an
            # empty list when the dataloader yields no batches.
            if scheduler.get_last_lr()[0]>max_lr:
                return lrs, losses
    return lrs, losses

In [None]:
def train_epoch(model, optimizer, criterion, train_dataloader, scheduler):
    """Train ``model`` for one pass over ``train_dataloader``.

    The scheduler is stepped after every batch.

    Returns:
        Sum of the per-batch losses divided by ``len(train_dataloader)``.
    """
    model.train()
    batch_losses = []
    for features, targets in train_dataloader:
        predictions = model(features).view(-1)
        batch_loss = criterion(predictions, targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(train_dataloader)


def get_val_loss(model, val_dataloader):
    """Root-mean-squared error of ``model`` over every sample in ``val_dataloader``.

    Args:
        model: module mapping a batch ``X`` to one prediction per sample.
        val_dataloader: iterable of ``(X, y)`` batches.

    Returns:
        The RMSE as a float, or ``nan`` when the dataloader yields no samples.
    """
    model.eval()
    squared_error_sum = 0.0
    n_samples = 0
    with torch.no_grad():
        for (X, y) in val_dataloader:
            y_hat = model(X).view(-1)
            # Accumulate across batches. The previous version re-assigned the
            # error list on every iteration, so only the final batch counted.
            squared_error_sum += torch.sum((y_hat - y) ** 2).item()
            n_samples += y.numel()

    if n_samples == 0:
        return float('nan')
    return float(np.sqrt(squared_error_sum / n_samples))

In [None]:
# The hold-out rows need the same preprocessing as the training rows before
# they can be served by Dataset: token lists in 'doc' and a target that is
# standardized with the *training* mean/std.
val_df['doc']=val_df.excerpt.apply(tokenizer)
val_df['normalized_target']=(val_df.target - target_mean)/target_std

train_dataset=Dataset(train_df, 'train')
# phase='train' makes the dataset return (X, y) pairs, which the validation
# loop needs. Previously this wrapped train_df, so "validation" loss and early
# stopping were measured on the training data and val_df was never used.
val_dataset=Dataset(val_df, 'train')


train_dataloader=torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader=torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Train 5 independently initialised models with early stopping on the
# validation loss. For each run the weights of the best epoch are restored and
# the model is appended to `models`; `train_losses` / `val_losses` receive one
# list of per-epoch losses per run.
train_losses=[]
val_losses=[]
models=[]
for i in range(5):
    best_loss=None
    best_state=None
    max_patience=5
    # Start from max_patience (the old initial value of 7 was overwritten after
    # the first epoch anyway, because the first epoch always sets a new best).
    patience=max_patience
    epochs=30

    run_train_losses=[]
    run_val_losses=[]

    model=Model(100, 128)
    optimizer=torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-6)
    scheduler=torch.optim.lr_scheduler.OneCycleLR(optimizer, 
                                                  max_lr=2e-4,
                                                  epochs=epochs,
                                                  steps_per_epoch=len(train_dataloader))
    mse_loss=nn.MSELoss(reduction='mean')

    for e in range(epochs):
        if patience==0:
            print("Stoping at the epoch:{}".format(e+1))
            break
        t1=time.time()
        epoch_loss=train_epoch(model, optimizer, mse_loss, train_dataloader, scheduler)
        val_loss=get_val_loss(model, val_dataloader)
        t2=time.time()

        run_train_losses.append(epoch_loss)
        run_val_losses.append(val_loss)

        if (best_loss is None) or (val_loss<=best_loss):
            best_loss=val_loss
            patience=max_patience
            # Snapshot the best weights in memory (cloned, so later optimizer
            # steps cannot modify them) and keep writing the checkpoint file.
            best_state={name: tensor.detach().clone()
                        for name, tensor in model.state_dict().items()}
            torch.save(model, 'model_{}.pt'.format(i+1))
        if val_loss > best_loss:
            patience-=1
        # Time is reported in minutes.
        print('Epoch:{} | Time Taken: {:.2f} | Train LOSS:{:.4f} | ValLoss:{:.4f}'.format(e+1, (t2-t1)/60, epoch_loss, val_loss))

    # Restore the best epoch from the in-memory snapshot instead of
    # torch.load()-ing the pickled module: recent PyTorch releases default
    # torch.load to weights_only=True, which refuses fully pickled models.
    if best_state is not None:
        model.load_state_dict(best_state)
    models.append(model)
    train_losses.append(run_train_losses)
    val_losses.append(run_val_losses)

In [None]:
def infer(models, dataloader):
    """Average the de-normalized predictions of ``models`` over ``dataloader``.

    Each model output is mapped back to the original target scale with the
    module-level ``target_std`` / ``target_mean`` before averaging.

    Args:
        models: sequence of trained modules.
        dataloader: iterable yielding feature batches ``X`` (no labels).

    Returns:
        Flat list with one averaged prediction per sample, in loader order.
    """
    preds = []
    for batch in dataloader:
        total = torch.zeros(batch.shape[0])
        for member in models:
            member.eval()
            with torch.no_grad():
                scaled = member(batch).view(-1)
                total += (target_std*scaled) + target_mean

        averaged = total.numpy() / len(models)
        preds.extend(averaged)
    return preds

In [None]:
# Predict on the training rows. phase='test' makes the dataset yield features
# only, which is the batch format infer() expects; shuffle=False keeps the
# predictions aligned with the row order of train_df.
infer_train_dataset=Dataset(train_df, 'test')
infer_train_dataloader=torch.utils.data.DataLoader(infer_train_dataset, batch_size=200, shuffle=False)
train_df['preds'] = infer(models, infer_train_dataloader)
train_df[['id', 'target', 'normalized_target', 'preds']].head()

In [None]:
# Root-mean-squared error of the ensemble on the training rows. The square
# root must be taken after averaging; applying it per row first (as before)
# yields |error| per row and therefore the mean absolute error instead.
np.sqrt(((train_df.preds-train_df.target)**2).mean())

In [None]:
# Stacked box plots: true targets on top, ensemble predictions below.
_, ax=plt.subplots(2, 1)
sns.boxplot(data=train_df, x='target', ax=ax[0])
sns.boxplot(data=train_df, x='preds', ax=ax[1])


In [None]:
# Histograms of true targets (red) and predictions (default colour), 100 bins each.
sns.histplot(train_df, x='target', bins=100, color='red')
sns.histplot(train_df, x='preds', bins=100,)


In [None]:
# Persist the training frame, including the 'preds' column, without the index.
train_df.to_csv('train_with_preds.csv', index=False)

# Submission

In [None]:
# Load the test CSV and tokenize its excerpts with the same tokenizer instance
# that was applied to the training excerpts.
test_df=pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_df['doc']=test_df.excerpt.apply(tokenizer)
test_df.head()

In [None]:
# Ensemble predictions for the test rows; shuffle=False keeps them aligned
# with the row order of test_df.
infer_test_dataset=Dataset(test_df, 'test')
infer_test_dataloader=torch.utils.data.DataLoader(infer_test_dataset, batch_size=200, shuffle=False)
test_df['target'] = infer(models, infer_test_dataloader)


In [None]:
# Keep only the two columns written to the submission file.
submission_df=test_df[['id', 'target']].copy()
submission_df.head()

In [None]:
# Write the submission file without the DataFrame index.
submission_df.to_csv('submission.csv', index=False)