# Acknowledgement: 
This is just a cleaner version of https://www.kaggle.com/maunish/clrp-pytorch-train-tpu that fine-tunes RoBERTa with 5-fold CV. The output models can be found at https://www.kaggle.com/maunish/clrp-roberta-svm/data. I did this only to clear up some confusion about the code. If you like it, please give credit to [Maunish](https://www.kaggle.com/maunish). 

The confusing part was this line of code: `inputs = {key:val.reshape(val.shape[0], -1).to(device) for key, val in inputs.items()}`. I eventually figured out that the PyTorch dataset outputs an extra dimension because the Hugging Face tokenizer is called with `return_tensors='pt'`.

## Imports 📗

In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold, GroupKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,
                          get_constant_schedule_with_warmup,get_cosine_schedule_with_warmup)



## Getting Data 💾

In [None]:
# Load the competition CSVs (train split, test split, sample submission)
# from the Kaggle input directory, in that order.
train_data, test_data, sample = map(
    pd.read_csv,
    (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    ),
)

class CLRPDataset(Dataset):
    """Map-style dataset yielding ``(encoding, target)`` pairs for one excerpt.

    This is a dataset, not a network layer, so it derives from
    ``torch.utils.data.Dataset`` rather than ``nn.Module``.

    Args:
        df: DataFrame with an ``'excerpt'`` text column and a ``'target'``
            numeric column.
        tokenizer: Callable applied to a single excerpt string; it is invoked
            with ``return_tensors='pt'``, ``max_length``, ``padding`` and
            ``truncation`` keyword arguments.
        max_len: Length every excerpt is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        # Plain arrays give positional access regardless of the frame's index.
        self.excerpt = df['excerpt'].to_numpy()
        self.targets = df['target'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tokenizer output and float target for row ``idx``."""
        encode = self.tokenizer(
            self.excerpt[idx],
            # 'pt' makes the tokenizer return tensors with an extra leading
            # dimension; consumers flatten it again after batching.
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

        target = torch.tensor(self.targets[idx], dtype=torch.float)
        return encode, target

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

In [None]:
# Fine-tuning hyperparameters, looked up by key throughout the notebook.
config = dict(
    lr=1e-5,         # optimizer learning rate
    wd=1e-1,         # optimizer weight decay
    batch_size=16,   # DataLoader batch size (train and validation)
    max_len=256,     # token length excerpts are padded/truncated to
    epochs=4,        # epochs per fold
    nfolds=5,        # number of cross-validation folds
    seed=42,         # seed for the RNGs and the fold split
)

def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED (the original spelling was missing an
    # 'H'). It is exported for child processes; the running interpreter's
    # hash seed was already fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-select algorithms per run, which works
    # against the deterministic setting above, so keep it off.
    torch.backends.cudnn.benchmark = False

# Seed every RNG once, up front, with the configured seed.
seed_everything(seed=config['seed'])

def loss_fn(outputs, targets):
    """Return the RMSE between the model's logits and ``targets``.

    Args:
        outputs: Model output exposing a ``logits`` tensor; its trailing
            dimension is squeezed away before comparison.
        targets: Tensor of regression targets.

    Returns:
        Scalar tensor holding the square root of the mean squared error.
    """
    predictions = outputs.logits.squeeze(-1)
    mse = nn.MSELoss()
    return mse(predictions, targets).sqrt()

def train_loop(train_loader, model, loss_fn, device, optimizer, lr_scheduler=None):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        train_loader: Sized iterable of ``(inputs, targets)`` batches, where
            ``inputs`` is a mapping of tensors.
        model: Module called as ``model(**inputs)``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar
            tensor.
        device: Device the batch tensors are moved to.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Accepted for interface compatibility; it is not used
            inside this function.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch, labels in train_loader:
        optimizer.zero_grad()

        # Collapse every tensor to (batch, -1) before moving it to the device.
        model_inputs = {}
        for name, tensor in batch.items():
            model_inputs[name] = tensor.reshape(tensor.shape[0], -1).to(device)
        labels = labels.to(device)

        batch_loss = loss_fn(model(**model_inputs), labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_loader)

def valid_loop(valid_loader, model, loss_fn, device):
    """Evaluate the model without gradients and collect its predictions.

    Args:
        valid_loader: Sized iterable of ``(inputs, targets)`` batches, where
            ``inputs`` is a mapping of tensors.
        model: Module called as ``model(**inputs)``; its output must expose
            a ``logits`` tensor.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar
            tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, predictions)``: the sum of batch losses divided
        by the number of batches, and a flat list of the squeezed logits in
        loader order.
    """
    model.eval()
    loss_sum = 0
    predictions = []
    with torch.no_grad():
        for batch, labels in valid_loader:
            # Collapse every tensor to (batch, -1) before moving it over.
            model_inputs = {
                name: tensor.reshape(tensor.shape[0], -1).to(device)
                for name, tensor in batch.items()
            }
            labels = labels.to(device)

            outputs = model(**model_inputs)
            loss_sum += loss_fn(outputs, labels).item()

            batch_preds = outputs.logits.squeeze(-1).cpu().detach().numpy()
            predictions.extend(batch_preds.tolist())
        mean_loss = loss_sum / len(valid_loader)
    return mean_loss, predictions

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# training
# K-fold fine-tuning: each fold starts from a fresh pretrained model and the
# epoch with the lowest validation loss is saved to ./model{k}.
train = train_data
kfold = KFold(n_splits=config['nfolds'],shuffle=True,random_state=config['seed'])

MODEL_PATH = 'roberta-large'
# The tokenizer is identical for every fold, so load it once.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train)):
    # KFold yields positional indices; .iloc selects by position, so the
    # split stays correct even if `train` does not have a default RangeIndex.
    x_train,x_valid = train.iloc[train_idx],train.iloc[valid_idx]

    # Release the previous fold's model and optimizer state before loading a
    # new model, so two large models never occupy device memory at once.
    model = None
    optimizer = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH,num_labels=1)
    model.to(device)

    train_ds = CLRPDataset(x_train,tokenizer,config['max_len'])
    train_dl = DataLoader(train_ds,
                          batch_size = config["batch_size"],
                          shuffle=True,
                          num_workers = 4,
                          pin_memory=True,
                          drop_last=False
                         )

    valid_ds = CLRPDataset(x_valid,tokenizer,config['max_len'])
    valid_dl = DataLoader(valid_ds,
                          batch_size = config["batch_size"],
                          shuffle=False,
                          num_workers = 4,
                          pin_memory=True,
                          drop_last=False,
                         )

    optimizer = optim.AdamW(model.parameters(),lr=config['lr'],weight_decay=config['wd'])
    # No learning-rate schedule is configured; the hook below is kept so one
    # can be plugged in here.
    lr_scheduler = None

    best_loss = 99999
    best_valid_predictions = list()
    # Ground-truth targets of this fold's validation rows (fixed per fold).
    valid_targets = x_valid['target'].to_list()

    for i in range(config["epochs"]):
        train_loss = train_loop(train_dl,model,loss_fn,device,optimizer,lr_scheduler=lr_scheduler)
        valid_loss,valid_predictions = valid_loop(valid_dl,model,loss_fn,device)

        if lr_scheduler is not None:
            lr_scheduler.step()

        # Checkpoint whenever the validation loss matches or beats the best.
        if valid_loss <= best_loss:
            best_loss = valid_loss
            best_valid_predictions = valid_predictions
            model.save_pretrained(f'./model{k}')
            tokenizer.save_pretrained(f'./model{k}')


