In [None]:
import os
import re
import gc
import sys
import time
import string
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from joblib import Parallel, delayed

import warnings
warnings.filterwarnings('ignore')

import transformers
from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,
                          get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup)

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold, GroupKFold

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

In [None]:
from colorama import Fore, Back, Style
# Short aliases for colorama foreground colours used in console messages;
# `sr_` resets the terminal style back to its default.
y_, r_, g_ = Fore.YELLOW, Fore.RED, Fore.GREEN
b_, m_, c_ = Fore.BLUE, Fore.MAGENTA, Fore.CYAN
sr_ = Style.RESET_ALL

In [None]:
class Config:
    """Hyperparameters and shared objects for one training run."""

    def __init__(self):
        # token budget per excerpt handed to the tokenizer
        self.max_len = 256

        # number of samples per batch
        self.batch_size = 8

        # passes over the training split for each fold
        self.num_epochs = 10

        # cross-validation splits
        self.num_folds = 5

        # tokenizer loaded from the pretrained 'roberta-large' checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained('roberta-large')

        # random seed
        self.seed = 23

        # optimizer settings
        self.learning_rate = 5e-5
        self.weight_decay = 1e-1

In [None]:
# Competition data directory and the three CSV files read from it.
data_dir = '../input/commonlitreadabilityprize/'
train_df, test_df, sample_submission = (
    pd.read_csv(f'{data_dir}{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic algorithm selection.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED; it only affects Python processes started
    # after this point, not the hash randomisation of the running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per run and can pick different
    # algorithms each time, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset yielding (tokenized excerpt, target) pairs.

    Args:
        df: DataFrame with an ``excerpt`` (text) and a ``target`` (numeric) column.
        config: object exposing ``tokenizer`` (callable) and ``max_len`` (int).
    """

    def __init__(self, df, config):
        super().__init__()
        # plain arrays so indexing is positional, independent of df's index
        self.excerpt = df['excerpt'].to_numpy()
        self.target = df['target'].to_numpy()
        self.tokenizer = config.tokenizer
        self.max_len = config.max_len

    def __len__(self)->int:
        return len(self.excerpt)

    def __getitem__(self, idx):
        """Return ``(encoding, target)`` for the sample at position ``idx``."""
        # Every excerpt is padded/truncated to exactly max_len tokens.
        # NOTE(review): with return_tensors='pt' on a single string the tensors
        # presumably carry a leading axis of size 1; the train/eval loops
        # flatten it with reshape(batch, -1) — confirm before changing either side.
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)

        target = torch.tensor(self.target[idx], dtype=torch.float)

        return encode, target

In [None]:
def rmse_score(y_true,y_pred):
    """Return the root of sklearn's mean squared error between y_true and y_pred."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def loss_fn(outputs, targets):
    """RMSE between ``outputs.logits`` (last axis squeezed) and ``targets``."""
    predictions = outputs.logits.squeeze(-1)
    # F.mse_loss defaults to mean reduction, the same as nn.MSELoss()
    mse = F.mse_loss(predictions, targets)
    return mse.sqrt()

In [None]:
def train_loop(dataloader, model, loss_fn, device, optimizer, lr_scheduler = None):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: iterable of ``(inputs, targets)`` where ``inputs`` is a
            mapping of tensors; it must support ``len()``.
        model: module called as ``model(**inputs)``.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: device that inputs and targets are moved to.
        optimizer: optimizer stepped once per batch.
        lr_scheduler: optional scheduler, stepped once per batch when given.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for batch, labels in dataloader:
        # start every step from zeroed gradients
        optimizer.zero_grad()

        # flatten each encoded field to (batch, -1) and move it to the device
        model_inputs = {}
        for name, tensor in batch.items():
            model_inputs[name] = tensor.reshape(tensor.shape[0], -1).to(device)
        labels = labels.to(device)

        # forward pass and loss
        batch_loss = loss_fn(model(**model_inputs), labels)

        # backward pass, then parameter update
        batch_loss.backward()
        optimizer.step()

        if lr_scheduler:
            lr_scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

In [None]:
def eval_loop(dataloader, model, loss_fn, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Returns:
        ``(mean_loss, predictions)`` where ``mean_loss`` is the mean of the
        per-batch losses and ``predictions`` is a flat list of the model's
        squeezed logits in dataloader order.
    """
    model.eval()
    running_loss = 0
    predictions = []
    with torch.no_grad():
        for batch, labels in dataloader:
            # flatten each encoded field to (batch, -1) on the target device
            model_inputs = {name: tensor.reshape(tensor.shape[0], -1).to(device)
                            for name, tensor in batch.items()}
            labels = labels.to(device)

            outputs = model(**model_inputs)
            running_loss += loss_fn(outputs, labels).item()

            batch_preds = outputs.logits.squeeze(-1).cpu().detach().numpy().tolist()
            predictions.extend(batch_preds)
        mean_loss = running_loss / len(dataloader)
    return mean_loss, predictions

In [None]:
# Stratification labels for K-fold: the bin count follows 1 + log2(n) and the
# continuous target is cut into that many equal-width intervals.
num_bins = int(np.floor(np.log2(len(train_df)) + 1))
train_df['bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
bins = train_df['bins'].to_numpy()

# Extra columns: sign of the target and whitespace-separated word count.
train_df['is_positive'] = train_df['target'].ge(0)
train_df['text_len'] = train_df['excerpt'].map(lambda text: len(text.split()))

train_df.head()

In [None]:
def _plot_fold_diagnostics(fold_train_losses, fold_valid_losses,
                           fold_valid_predictions, fold_valid_targets):
    """Draw per-fold loss curves, then per-fold prediction/target distributions."""
    n_cols = 5
    # 2x5 grid as before; rows are only added when more than 10 folds were trained
    n_rows = max(2, -(-len(fold_train_losses) // n_cols))

    plt.figure(figsize=(20,14))
    for i, (t,v) in enumerate(zip(fold_train_losses,fold_valid_losses)):
        plt.subplot(n_rows,n_cols,i+1)
        plt.title(f"Fold {i}")
        plt.plot(t,label="train_loss")
        plt.plot(v,label="valid_loss")
        plt.legend()
    plt.show()

    plt.figure(figsize=(20,14))
    for i, (p,t) in enumerate(zip(fold_valid_predictions,fold_valid_targets)):
        plt.subplot(n_rows,n_cols,i+1)
        plt.title(f"Fold {i}")
        sns.distplot(p,label="predictions")
        sns.distplot(t,label="targets")
        plt.legend()
    plt.show()


def run(train_df, bins, config, plot_losses=True, verbose=True, max_folds=1):
    """Fine-tune 'roberta-large' with stratified K-fold cross-validation.

    For every trained fold the checkpoint with the lowest validation loss (and
    the tokenizer) is saved to ``./model{k}``.

    Args:
        train_df: DataFrame with ``excerpt`` and ``target`` columns.
        bins: stratification labels, aligned positionally with ``train_df`` rows.
        config: object providing ``num_folds``, ``seed``, ``tokenizer``,
            ``batch_size``, ``learning_rate``, ``weight_decay``, ``num_epochs``
            and whatever ``CLRPDataset`` reads.
        plot_losses: when truthy, plot loss curves and prediction/target
            distributions for the trained folds.
        verbose: when truthy, print per-epoch losses and improvements.
        max_folds: number of folds to actually train; the default of 1 keeps the
            previous behaviour of stopping after the first fold. ``None`` trains
            all ``config.num_folds`` folds.

    Returns:
        None.
    """
    fold_train_losses = list()
    fold_valid_losses = list()
    fold_valid_predictions = list()
    fold_valid_targets = list()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    tokenizer = config.tokenizer

    kfold = StratifiedKFold(n_splits=config.num_folds, shuffle=True, random_state=config.seed)
    for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train_df,y=bins)):
        if max_folds is not None and k >= max_folds:
            break

        # split() yields positional indices, so select rows by position; .loc
        # would only be correct while train_df carries a default RangeIndex
        x_train, x_valid = train_df.iloc[train_idx],train_df.iloc[valid_idx]

        # num_labels=1 gives a single-output head, used here for regression
        model = AutoModelForSequenceClassification.from_pretrained('roberta-large',num_labels=1)
        model.to(device)

        train_ds = CLRPDataset(x_train, config)
        train_dl = DataLoader(train_ds,
                              batch_size = config.batch_size,
                              shuffle=True,
                              num_workers = 4,
                              pin_memory=True,
                              drop_last=False
                             )

        valid_ds = CLRPDataset(x_valid,config)
        valid_dl = DataLoader(valid_ds,
                              batch_size = config.batch_size,
                              shuffle=False,
                              num_workers = 4,
                              pin_memory=True,
                              drop_last=False,
                             )

        optimizer = optim.AdamW(model.parameters(),lr=config.learning_rate, weight_decay=config.weight_decay)

        # No learning-rate schedule is used; train_loop steps a scheduler once
        # per batch if one is supplied here.
        lr_scheduler = None

        print(f"Fold {k}")
        best_loss = 99999

        train_losses = list()
        valid_losses = list()
        best_valid_predictions = list()
        start = time.time()
        for i in range(config.num_epochs):
            train_loss = train_loop(train_dl, model, loss_fn, device, optimizer, lr_scheduler=lr_scheduler)
            valid_loss, valid_predictions = eval_loop(valid_dl, model, loss_fn, device)

            train_losses.append(train_loss)
            valid_losses.append(valid_loss)

            end = time.time()
            epoch_time = end - start
            start = end

            if verbose:
                print(f"epoch:{i} Training loss:{train_loss} | Validation loss:{valid_loss} |epoch time {epoch_time:.2f}s ")

            if valid_loss <= best_loss:
                if verbose:
                    print(f"{g_}Validation loss Decreased from {best_loss} to {valid_loss}{sr_}")

                # keep the checkpoint and predictions of the best epoch so far
                best_loss = valid_loss
                best_valid_predictions = valid_predictions
                model.save_pretrained(f'./model{k}')
                tokenizer.save_pretrained(f'./model{k}')

        fold_train_losses.append(train_losses)
        fold_valid_losses.append(valid_losses)
        fold_valid_predictions.append(best_valid_predictions)
        fold_valid_targets.append(x_valid['target'].tolist())

        # release this fold's model before the next one is loaded
        del model, optimizer, train_dl, valid_dl
        gc.collect()
        torch.cuda.empty_cache()

    if plot_losses:
        _plot_fold_diagnostics(fold_train_losses, fold_valid_losses,
                               fold_valid_predictions, fold_valid_targets)

In [None]:
config = Config()

# Apply the configured seed before any model or DataLoader is created; without
# this call config.seed only influences the fold split, not the training itself.
seed_everything(config.seed)

gc.collect()
torch.cuda.empty_cache()
run(train_df, bins, config)