In [None]:
import torch
import torchvision
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import ToTensor
from torchvision.utils import make_grid
from torch.utils.data import random_split

import pandas as pd
import seaborn as sns
import gc
import time
from tqdm import tqdm
import datatable as dt
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.model_selection import StratifiedKFold,KFold
warnings.filterwarnings("ignore")
%matplotlib inline

import os
import random

from colorama import Fore, Back, Style

# Short aliases for ANSI foreground colour codes, used inside f-strings.
red = Fore.RED
grn = Fore.GREEN
blu = Fore.BLUE
ylw = Fore.YELLOW
wht = Fore.WHITE
# Short aliases for ANSI background colour codes.
bred = Back.RED
bgrn = Back.GREEN
bblu = Back.BLUE
bylw = Back.YELLOW
bwht = Back.WHITE
# colorama's Style exposes RESET_ALL (Fore/Back have RESET, Style does not);
# accessing Style.RESET raises AttributeError and aborts the cell.
rst = Style.RESET_ALL

import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff

from xgboost import XGBRegressor
import xgboost as xgb
from catboost import CatBoostRegressor, Pool, CatBoost

In [None]:
!pip install transformers

In [None]:
# Directory holding the competition CSV files.
path = '../input/commonlitreadabilityprize/'
# Read the three CSV files in a fixed order: train, test, sample submission.
train, test, sample = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Discretise the continuous target into `nbins` integer-coded intervals so the
# fold splitter can stratify on it (see the `y=bins` argument in `run`).
nbins = 12
train['bins'] = pd.cut(train['target'], bins=nbins, labels=False)
bins = train['bins'].to_numpy()

In [None]:
# Preview the first rows of the training frame as a quick sanity check.
train.head()

In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification,BertModel
from transformers import InputExample, InputFeatures
# Single registry of hyper-parameters and paths read by the dataset, the
# seeding helper and the training loop.
config = {
    'MAX_LEN' : 256,                      # tokenizer max_length (pad/truncate target)
    'TRAIN_BATCH_SIZE' : 8,
    'VALID_BATCH_SIZE' : 4,
    'EPOCHS' : 10,
    'SEED': 43,
    'FOLDS': 6,                           # number of stratified CV splits
    'BERT_PATH' : "roberta-base",         # checkpoint name for model and tokenizer
    'CSV_PATH' : 'lgbmtrain.csv',         # NOTE(review): not referenced in the visible cells
    'AUGMENTED_CSV' : 'lgbmtrainAUG.csv', # NOTE(review): not referenced in the visible cells
    'MODEL_PATH' : './CLRPmodel',         # prefix of the per-fold/epoch checkpoint directories
}
# Load the tokenizer from the same checkpoint name as the model so that
# changing 'BERT_PATH' can never leave the two out of sync.
config['TOKENIZER'] = AutoTokenizer.from_pretrained(config['BERT_PATH'])

In [None]:
# Raw regression targets as a NumPy array.
# NOTE(review): `y_train` is not referenced by any other visible cell.
y_train = train['target'].to_numpy()

In [None]:
class CommonLitDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(tokenized_excerpt, target)`` pairs.

    This is a data container, not a network layer, so it derives from
    ``torch.utils.data.Dataset`` rather than ``nn.Module``.

    Args:
        data: DataFrame with an ``excerpt`` (text) and a ``target`` column.
        tokenizer: callable tokenizer invoked as
            ``tokenizer(text, max_length=..., padding=..., truncation=..., return_tensors='pt')``.
        max_len: length every excerpt is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_len = config['MAX_LEN']):
        self.excerpt = data['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer
        # Keep targets as a positional array: indexing a pandas Series with an
        # integer is a *label* lookup and breaks when the frame's index is not
        # a clean 0..n-1 range.
        self.targets = data['target'].to_numpy()

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)

    def __getitem__(self, item):
        """Tokenize excerpt ``item`` and return it with its float target.

        Returns:
            tuple: ``(inputs, target)`` where ``inputs`` is the tokenizer output
            (requested as PyTorch tensors) and ``target`` is a scalar float
            tensor. Consumers in this notebook flatten each input tensor to
            ``(batch, -1)`` after collation.
        """
        inputs = self.tokenizer(
            self.excerpt[item],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        target = torch.tensor(self.targets[item], dtype=torch.float)

        return inputs, target
        

In [None]:
def loss_fn(output, target):
    """Root-mean-squared error between ``output`` and ``target``.

    Args:
        output: tensor of predictions.
        target: tensor of ground-truth values, same shape as ``output``.

    Returns:
        Scalar tensor: square root of the mean squared difference.
    """
    mean_squared_error = F.mse_loss(output, target, reduction='mean')
    return mean_squared_error.sqrt()

In [None]:
def seed_everything(seed=43):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    switches cuDNN to deterministic mode, and records the seed in
    ``config['SEED']`` so the fold splitter uses the same value.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Correct variable name is PYTHONHASHSEED. Setting it here only affects
    # interpreters started afterwards (e.g. spawned subprocesses), not the
    # hash randomisation of the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick possibly different kernels
    # from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    config['SEED'] = seed
seed_everything(config['SEED'])

In [None]:
def train_fn(data_loader, model, optimizer, scheduler, device):
    """Run one training epoch and report the mean batch loss.

    Args:
        data_loader: iterable of ``(data, targets)`` batches, where ``data`` is
            a mapping of tensors passed to the model as keyword arguments.
        model: network called as ``model(**data)``; its output must expose
            ``["logits"]``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses for the epoch (also printed).
    """
    model.train()
    losses = []

    for data, targets in tqdm(data_loader, total = len(data_loader)):
        # Flatten each input to (batch, -1); presumably the collated tokenizer
        # output carries an extra singleton dimension -- confirm against the dataset.
        data = {key: val.reshape(val.shape[0], -1).to(device) for key, val in data.items()}
        targets = targets.to(device)

        optimizer.zero_grad()
        # Exactly one forward pass per batch: a second, discarded forward pass
        # only doubles compute and activation memory.
        outputs = model(**data)["logits"].squeeze(-1)

        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    mean_loss = np.mean(losses)
    print(f"Train Loss:- {mean_loss}")
    return mean_loss

In [None]:
def eval(data_loader, model, device):
    """Score ``model`` on a validation loader without tracking gradients.

    NOTE(review): the name shadows Python's builtin ``eval``; it is kept
    because ``run`` calls it under this name.

    Args:
        data_loader: iterable of ``(data, targets)`` batches, where ``data`` is
            a mapping of tensors passed to the model as keyword arguments.
        model: network called as ``model(**data)``; its output must expose
            ``["logits"]``.
        device: device the input tensors are moved to.

    Returns:
        tuple: ``(loss, fin_outputs)`` -- the ``loss_fn`` value computed over
        all collected predictions at once, and the CPU tensor of predictions
        in loader order.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data, targets in tqdm(data_loader, total = len(data_loader)):
            data = {key: val.reshape(val.shape[0], -1).to(device) for key, val in data.items()}

            outputs = model(**data)["logits"].squeeze(-1)

            # Targets are only consumed on the CPU, so they are never copied to
            # the device and back.
            fin_targets.extend(targets.tolist())
            fin_outputs.extend(outputs.detach().cpu().tolist())

    fin_targets = torch.tensor(fin_targets)
    fin_outputs = torch.tensor(fin_outputs)
    loss = loss_fn(fin_outputs, fin_targets)
    return loss, fin_outputs

In [None]:
from sklearn import model_selection,metrics
import numpy as np
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

def run():
    """Fine-tune one regression model per stratified fold, checkpointing improvements.

    For each of ``config['FOLDS']`` splits (stratified on the binned target) a
    fresh ``config['BERT_PATH']`` sequence-classification model with a single
    output is trained for ``config['EPOCHS']`` epochs. After every epoch the
    model is scored on the held-out fold; whenever the validation loss
    improves, model and tokenizer are saved to
    ``{MODEL_PATH}_{fold}_{epoch}``.

    Returns:
        list[float]: best validation loss reached in each fold, in fold order.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    tokenizer = config['TOKENIZER']
    kfold = StratifiedKFold(n_splits=config['FOLDS'],shuffle=True,random_state=config['SEED'])
    fold_best_losses = []

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(X=train, y=bins)):
        start_time = time.time()
        # split() yields positional indices, so select rows by position.
        train_x = train.iloc[train_idx].reset_index(drop=True)
        valid_x = train.iloc[valid_idx].reset_index(drop=True)

        train_ds = CommonLitDataset(train_x, tokenizer)
        train_loader = torch.utils.data.DataLoader(
            train_ds,
            pin_memory = True,
            batch_size = config['TRAIN_BATCH_SIZE'],
            # Reshuffle every epoch; otherwise each epoch replays the rows in
            # the same fixed order.
            shuffle = True,
            num_workers = 3
        )

        valid_ds = CommonLitDataset(valid_x, tokenizer)
        valid_loader = torch.utils.data.DataLoader(
            valid_ds,
            pin_memory = True,
            batch_size = config['VALID_BATCH_SIZE'],
            num_workers = 1
        )

        print(f"========== USING {device} ==========")
        print(f'========== Fold: {fold} ==========')
        model = AutoModelForSequenceClassification.from_pretrained(config['BERT_PATH'],num_labels=1)
        model.to(device)

        # The scheduler is stepped once per batch, so size it from the actual
        # number of batches (the last partial batch counts as a step too).
        num_train_steps = len(train_loader) * config['EPOCHS']

        optimizer = AdamW(model.parameters(), lr = 3e-5, betas=(0.9, 0.999), weight_decay=1e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps = 0,
            num_training_steps = num_train_steps
        )

        best_loss = 99999

        for epoch in range(config['EPOCHS']):
            start = time.time()

            print(f'========== epoch : {epoch+1} / {config["EPOCHS"]} ==========')
            train_fn(train_loader, model, optimizer,scheduler,device)
            loss, _ = eval(valid_loader,model,device)
            print(f'Loss : {loss}')

            elapsed_time = time.time() - start
            print(f'===== epoch time {elapsed_time} =====')

            if loss < best_loss:
                # Reset the colour afterwards so later output is not tinted blue.
                print(f'{blu} Loss decreased from {best_loss} -> {loss}{Style.RESET_ALL}')
                checkpoint_dir = f'{config["MODEL_PATH"]}_{fold}_{epoch}'
                model.save_pretrained(checkpoint_dir)
                tokenizer.save_pretrained(checkpoint_dir)
                best_loss = loss

        fold_best_losses.append(float(best_loss))

        elp_fold = time.time() - start_time
        print(f'===== Fold Time: {elp_fold} =====')

        # Drop this fold's model/optimizer state before the next fold builds
        # its own, so two full models never coexist in (GPU) memory.
        del model, optimizer, scheduler
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return fold_best_losses

In [None]:
# Launch the cross-validated fine-tuning loop defined in `run`.
run()