## Imports 📗

In [None]:
# Download the PyTorch/XLA environment setup script from the pytorch/xla repo
# and run it for the build selected by `--version 20210331`, installing the
# listed apt packages alongside it.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version 20210331 --apt-packages libomp5 libopenblas-dev
# Clean the working directory: remove any .whl files and every .py file
# (the second command also deletes the setup script downloaded above).
!rm -rf /kaggle/working/*.whl
!rm -rf /kaggle/working/*.py

In [None]:
import os
# Both variables are set before torch_xla is imported further down in this
# cell; presumably the XLA runtime reads them at initialisation -- TODO confirm.
# XLA_USE_BF16=1: per the torch_xla docs, float tensors are sent to the TPU as
# bfloat16.
os.environ['XLA_USE_BF16'] = "1"
# NOTE(review): assumed to cap the XLA tensor allocator size (value in bytes)
# -- confirm against the torch_xla documentation.
os.environ['XLA_TENSOR_ALLOCATOR_MAXSIZE'] = '100000000'

import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold, GroupKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,
                          get_constant_schedule_with_warmup,get_cosine_schedule_with_warmup)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
# Short aliases for the colorama escape codes used in the print statements
# throughout the notebook; `sr_` resets all styling.
y_, r_, g_, b_, m_, c_ = (
    Fore.YELLOW,
    Fore.RED,
    Fore.GREEN,
    Fore.BLUE,
    Fore.MAGENTA,
    Fore.CYAN,
)
sr_ = Style.RESET_ALL

## Getting Data 💾

In [None]:
# Load the competition tables: training set, test set and the sample
# submission, in that order.
train_data, test_data, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

# train_data = train_data.loc[:,['excerpt','target']]
    
# df1 = train_data.sample(frac=1).reset_index(drop=True).rename(columns={'excerpt':'excerpt1','target':'target1'})
# df2 = train_data.sample(frac=1).reset_index(drop=True).rename(columns={'excerpt':'excerpt2','target':'target2'})

# df1['excerpt'] = df1['excerpt'].apply(lambda x:x[:int(len(x)/2)])
# df2['excerpt'] = df2['excerpt'].apply(lambda x:x[:int(len(x)/2)])

# df = pd.concat([df1,df2],axis=1)

# df['excerpt'] = df['excerpt1'] + df['excerpt2']
# df['target'] = (df['target1'] + df['target2'])/2

# df = df.loc[:,train_data.columns.tolist()]

# train_data = pd.concat([train_data,df],axis=0).reset_index(drop=True)

# Stratification labels for the k-fold split: cut the continuous target into
# floor(1 + log2(n_rows)) bins and keep the integer bin index of every row.
num_bins = int(np.floor(np.log2(len(train_data)) + 1))
train_data['bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)
bins = train_data['bins'].to_numpy()

# Helper columns for the EDA plots: sign of the target and the excerpt length
# in whitespace-separated words.
train_data['is_positive'] = train_data['target'] >= 0
train_data['text_len'] = train_data['excerpt'].map(lambda text: len(text.split()))

def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Report the (rows, columns) shape of each loaded table; the first colour
# argument differs per table, the counts are always printed in red.
print("{0}Number of rows in train data: {1}{2}\n{0}Number of columns in train data: {1}{3}".format(y_, r_, *train_data.shape))
print("{0}Number of rows in test data: {1}{2}\n{0}Number of columns in test data: {1}{3}".format(m_, r_, *test_data.shape))
print("{0}Number of rows in sample : {1}{2}\n{0}Number of columns in sample : {1}{3}".format(c_, r_, *sample.shape))

In [None]:
# Preview the first rows of the training table (including the derived columns
# `bins`, `is_positive` and `text_len` added above).
train_data.head()

## EDA

In [None]:
plt.style.use('fivethirtyeight')


def distribution1(feature, color1, color2, df=train_data):
    """Plot a histogram/KDE and a box plot of one column and print its summary stats.

    The left panel shows the distribution of ``df[feature]`` and labels every
    histogram bar that is strictly taller than both of its neighbours with the
    x position of its centre. The right panel shows a horizontal box plot
    annotated with the 25%, 50% and 75% quantiles. Max, min, mean and standard
    deviation are printed in colour.

    Args:
        feature: Name of the column in ``df`` to visualise.
        color1: Colour of the distribution plot.
        color2: Colour of the box plot.
        df: Data frame to read from; defaults to the ``train_data`` object
            that exists when this function is defined.
    """
    plt.figure(figsize=(15, 7))

    plt.subplot(121)
    # NOTE(review): `distplot` is deprecated in recent seaborn releases; it is
    # kept because the peak labelling below relies on the bar patches it draws.
    dist = sns.distplot(df[feature], color=color1)
    bars = list(dist.patches)
    # Local maxima of the histogram: bars taller than both direct neighbours
    # (the first and last bar can never qualify).
    peaks = [
        (bar.get_x() + bar.get_width() / 2, bar.get_height())
        for left, bar, right in zip(bars, bars[1:], bars[2:])
        if bar.get_height() > left.get_height() and bar.get_height() > right.get_height()
    ]

    for center, height in peaks:
        # The label is passed positionally: Matplotlib renamed the `s` keyword
        # to `text`, so the positional form works across versions.
        dist.annotate(
            f"{center:.3f}",
            xy=(center, height),
            xycoords='data',
            ha='center',
            va='center',
            fontsize=11,
            color='black',
            xytext=(0, 7),
            textcoords='offset points',
        )

    quartiles = df[feature].quantile([.25, .5, .75]).to_numpy()
    plt.subplot(122)
    # `x=` is explicit so the series is always treated as the x variable
    # rather than as a positional `data` argument.
    box = sns.boxplot(x=df[feature], color=color2)
    for value in quartiles:
        box.annotate(str(value)[:4], xy=(value - .05, -0.01), horizontalalignment='center')

    print("{}Max value of {} is: {} {:.2f} \n{}Min value of {} is: {} {:.2f}\n{}Mean of {} is: {}{:.2f}\n{}Standard Deviation of {} is:{}{:.2f}"\
      .format(y_,feature,r_,df[feature].max(),g_,feature,r_,df[feature].min(),b_,feature,r_,df[feature].mean(),m_,feature,r_,df[feature].std()))

### Distribution of target in train

In [None]:
# Distribution (yellow) and box plot (red) of the `target` column.
distribution1('target','yellow','red')

### Distribution of length of text

In [None]:
# Distribution (red) and box plot (blue) of the excerpt word counts.
distribution1('text_len','red','blue')

### Distribution of standard error

In [None]:
# Distribution (blue) and box plot (green) of the `standard_error` column.
distribution1('standard_error','blue','green')

### Scatter plot standard error vs target

In [None]:
def scatterplot1(feature1, feature2, category, df=train_data):
    """Show a Plotly scatter plot of two columns, coloured by a third.

    A box plot is drawn on the x margin, a violin plot on the y margin, and an
    OLS trendline is fitted by Plotly.

    Args:
        feature1: Column plotted on the x axis.
        feature2: Column plotted on the y axis.
        category: Column used to colour the points.
        df: Data frame to plot; defaults to the ``train_data`` object that
            exists when this function is defined.
    """
    # Plot the frame that was passed in; previously the global `train_data`
    # was always used and the `df` argument was silently ignored.
    fig = px.scatter(
        df,
        x=feature1,
        y=feature2,
        color=category,
        marginal_y="violin",
        marginal_x="box",
        trendline="ols",
        template="simple_white",
    )
    fig.show()

In [None]:
# `standard_error` (x) against `target` (y), coloured by the sign of the target.
scatterplot1('standard_error','target','is_positive')

# RoBERTa Model

In [None]:
# Hyper-parameters read by the training code below.
config = dict(
    lr=5e-5,        # optimizer learning rate
    wd=1e-1,        # optimizer weight decay
    batch_size=64,  # DataLoader batch size (train and validation)
    max_len=256,    # tokenizer max_length per excerpt
    epochs=15,      # epochs per fold
    nfolds=5,       # number of stratified CV splits
    seed=42,        # seed for RNGs and the fold split
)

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    # Fixed the variable name (was 'PYTHONASSEED'). Setting it here only
    # affects interpreters started afterwards, e.g. worker subprocesses.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every CUDA device; does nothing harmful when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different algorithms from
    # run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once with the configured seed before any model or data code runs.
seed_everything(seed=config['seed'])

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset yielding ``(tokenized excerpt, target)`` pairs.

    This is a ``torch.utils.data.Dataset`` rather than an ``nn.Module``: it
    holds data, not parameters, and ``DataLoader`` only needs ``__getitem__``
    and ``__len__``.

    Args:
        df: Data frame with an ``excerpt`` (text) and a ``target`` column.
        tokenizer: Callable tokenizer invoked with the excerpt and the
            ``return_tensors``/``max_length``/``padding``/``truncation``
            keyword arguments.
        max_len: Length every excerpt is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        super().__init__()
        self.excerpt = df['excerpt'].to_numpy()
        self.targets = df['target'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tokenizer output and the float target for row ``idx``."""
        # The tokenizer output is returned as-is; consumers reshape each
        # tensor to (batch, -1) after collation.
        encode = self.tokenizer(
            self.excerpt[idx],
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        target = torch.tensor(self.targets[idx], dtype=torch.float)
        return encode, target

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

In [None]:
def run(plot_losses=True, verbose=True):
    """Fine-tune ``roberta-large`` as a single-output regressor with stratified CV.

    Reads the module-level ``config``, ``train_data`` and ``bins``. For every
    fold of a stratified split on ``bins`` a fresh model is trained for
    ``config['epochs']`` epochs on an XLA device. Whenever the validation loss
    is at least as good as the best seen so far, the model and tokenizer are
    saved to ``./model{k}``. The fold loop currently stops after fold 0 (see
    the ``break`` at the end of the loop body).

    Args:
        plot_losses: If true, plot the per-fold loss curves and the
            distribution of the best validation predictions against the
            validation targets.
        verbose: If true, print per-epoch losses, epoch time and improvements.

    Returns:
        None.
    """

    def loss_fn(outputs, targets):
        """Return the RMSE between the model's logits and ``targets``."""
        predictions = outputs.logits.squeeze(-1)
        return torch.sqrt(nn.MSELoss()(predictions, targets))

    def move_batch(inputs, device):
        """Reshape every tokenizer tensor to (batch, -1) and move it to ``device``."""
        return {key: val.reshape(val.shape[0], -1).to(device) for key, val in inputs.items()}

    def train_loop(train_loader, model, loss_fn, device, optimizer, lr_scheduler=None):
        """Train for one epoch and return the mean of the per-batch losses."""
        model.train()
        total_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            inputs = move_batch(inputs, device)
            targets = targets.to(device)
            outputs = model(**inputs)
            loss = loss_fn(outputs, targets)
            loss.backward()
            # A plain DataLoader is used (no XLA ParallelLoader), so the
            # barrier has to be requested explicitly on the optimizer step.
            xm.optimizer_step(optimizer, barrier=True)
            if lr_scheduler is not None:
                lr_scheduler.step()

            total_loss += loss.item()
        return total_loss / len(train_loader)

    def valid_loop(valid_loader, model, loss_fn, device):
        """Evaluate the model; return (mean per-batch loss, list of predictions).

        NOTE(review): the returned loss is the average of per-batch RMSE
        values, which is not identical to the RMSE over the whole fold.
        """
        model.eval()
        total_loss = 0
        valid_predictions = []
        with torch.no_grad():
            for inputs, targets in valid_loader:
                inputs = move_batch(inputs, device)
                targets = targets.to(device)

                outputs = model(**inputs)
                total_loss += loss_fn(outputs, targets).item()
                valid_predictions.extend(
                    outputs.logits.squeeze(-1).cpu().detach().numpy().tolist()
                )
        return total_loss / len(valid_loader), valid_predictions

    fold_train_losses = []
    fold_valid_losses = []
    fold_valid_predictions = []
    fold_valid_targets = []

    # NOTE(review): the device ordinal is derived from the fold count
    # (nfolds + 1); kept as in the original -- confirm this is intentional.
    device = xm.xla_device(config['nfolds'] + 1)
    print(f"{device} is used")

    MODEL_PATH = 'roberta-large'
    # The tokenizer does not change between folds, so it is loaded once.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    train = train_data
    kfold = StratifiedKFold(n_splits=config['nfolds'], shuffle=True, random_state=config['seed'])
    for k, (train_idx, valid_idx) in enumerate(kfold.split(X=train, y=bins)):
        # split() yields positional indices, so select rows by position; the
        # label-based .loc only worked while the index was a default RangeIndex.
        x_train, x_valid = train.iloc[train_idx], train.iloc[valid_idx]

        model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=1)
        model.to(device)

        train_ds = CLRPDataset(x_train, tokenizer, config['max_len'])
        train_dl = DataLoader(train_ds,
                              batch_size=config["batch_size"],
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True,
                              drop_last=False,
                              )

        valid_ds = CLRPDataset(x_valid, tokenizer, config['max_len'])
        valid_dl = DataLoader(valid_ds,
                              batch_size=config["batch_size"],
                              shuffle=False,
                              num_workers=4,
                              pin_memory=True,
                              drop_last=False,
                              )

        optimizer = optim.AdamW(model.parameters(), lr=config['lr'], weight_decay=config['wd'])
#         lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer,max_lr=1e-4,
#                                                     steps_per_epoch=len(train_dl), epochs=config['epochs'])

        lr_scheduler = None

        print(f"Fold {k}")
        best_loss = float('inf')

        train_losses = []
        valid_losses = []
        best_valid_predictions = []
        start = time.time()
        for i in range(config["epochs"]):
            train_loss = train_loop(train_dl, model, loss_fn, device, optimizer, lr_scheduler=lr_scheduler)
            valid_loss, valid_predictions = valid_loop(valid_dl, model, loss_fn, device)

            train_losses.append(train_loss)
            valid_losses.append(valid_loss)

            # Epoch time covers both the training and the validation pass.
            end = time.time()
            epoch_time = end - start
            start = end

            if verbose:
                print(f"epoch:{i} Training loss:{train_loss} | Validation loss:{valid_loss} |epoch time {epoch_time:.2f}s ")

            if valid_loss <= best_loss:
                if verbose:
                    print(f"{g_}Validation loss Decreased from {best_loss} to {valid_loss}{sr_}")

                best_loss = valid_loss
                best_valid_predictions = valid_predictions
#                 xm.save(model.state_dict(),f'./model{k}/model{k}.bin')
                # NOTE(review): the model lives on an XLA device when it is
                # saved; confirm the checkpoint loads on a non-XLA host.
                model.save_pretrained(f'./model{k}')
                tokenizer.save_pretrained(f'./model{k}')

        fold_train_losses.append(train_losses)
        fold_valid_losses.append(valid_losses)
        fold_valid_predictions.append(best_valid_predictions)
        fold_valid_targets.append(x_valid['target'].tolist())

        # Only the first fold is trained; remove this to run all folds.
        if k == 0:
            break

    if plot_losses:
        # Loss curves, one subplot per trained fold (2x5 grid).
        plt.figure(figsize=(20, 14))
        for i, (t, v) in enumerate(zip(fold_train_losses, fold_valid_losses)):
            plt.subplot(2, 5, i + 1)
            plt.title(f"Fold {i}")
            plt.plot(t, label="train_loss")
            plt.plot(v, label="valid_loss")
            plt.legend()
        plt.show()

        # Best validation predictions against the true targets, per fold.
        plt.figure(figsize=(20, 14))
        for i, (p, t) in enumerate(zip(fold_valid_predictions, fold_valid_targets)):
            plt.subplot(2, 5, i + 1)
            plt.title(f"Fold {i}")
            sns.distplot(p, label="predictions")
            sns.distplot(t, label="targets")
            plt.legend()
        plt.show()

In [None]:
# Start training with the default options (loss plots and verbose logging on).
run()