In [None]:
import numpy as np 
import pandas as pd 
import os, random, sys, time, re, copy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as D
from torch.nn.utils.rnn import pad_sequence

# K-Fold splitters
from sklearn.model_selection import StratifiedKFold, KFold

import warnings
warnings.filterwarnings('ignore')

# Transformers library
from transformers import *

In [None]:
# Directory holding train.csv, test.csv and sample_submission.csv
DATA_PATH = "../input/commonlitreadabilityprize/"

# Path to the pretrained model weights and vocab.
# Uncomment exactly one MODEL_PATH line to choose the backbone:

# MODEL_PATH = '../input/distilbertbaseuncased'
# MODEL_PATH = '../input/pretrained-albert-pytorch/albert-base-v1'
# MODEL_PATH = '../input/pretrained-albert-pytorch/albert-xlarge-v1'
# MODEL_PATH = '../input/camembertbasesquadfrfquadpiaf/camembert-base-squadFR-fquad-piaf'
# MODEL_PATH = '../input/roberta-transformers-pytorch/distilroberta-base'
MODEL_PATH = '../input/roberta-transformers-pytorch/roberta-base'
# MODEL_PATH = '../input/roberta-transformers-pytorch/roberta-large'
# MODEL_PATH = '../input/bart-models-hugging-face-model-repository/bart-base'
# MODEL_PATH = '../input/electra-base'
# MODEL_PATH = '../input/deberta/base'

# Prefix of the per-fold checkpoint file names
MODEL_NAME = 'optimus_prime'
# The tokenizer is loaded from the model directory unless overridden here:
# VOCAB_PATH = '../input/roberta-transformers-pytorch/roberta-base'
VOCAB_PATH = MODEL_PATH

N_FOLDS = 5  # number of cross-validation folds
EPOCHES = 5  # training epochs per fold
BATCH_SIZE = 24  # used for the train, validation and test loaders
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MAX_SEQUENCE_LENGTH = 320  # passed to the tokenizer as model_max_length
LR = 2e-5  # AdamW learning rate

# Error log: from this point on, everything written to sys.stderr goes to
# err.txt instead of the notebook output.
# NOTE(review): the file handle is never closed and the original stream is not
# restored, so tracebacks printed to stderr will only be visible in err.txt.
sys.stderr = open('err.txt', 'w')

In [None]:
SEED = 7117


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's `random`, the PYTHONHASHSEED environment variable, NumPy,
    and PyTorch (CPU plus current/all CUDA devices), and switches cuDNN to
    deterministic kernels.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    # CPU generator first, then the current CUDA device, then all CUDA devices
    for torch_seeder in (torch.manual_seed,
                         torch.cuda.manual_seed,
                         torch.cuda.manual_seed_all):
        torch_seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(SEED)

In [None]:
def _read_indexed(name):
    """Read a competition csv from DATA_PATH, using its 'id' column as the index."""
    return pd.read_csv(os.path.join(DATA_PATH, name), index_col='id')


train_csv = _read_indexed('train.csv')
test_csv = _read_indexed('test.csv')

subm = _read_indexed('sample_submission.csv')

# Stratification labels for the continuous target: cut it into equal-width
# bins, with the bin count given by floor(1 + log2(n_rows)).
# https://www.kaggle.com/abhishek/step-1-create-folds
df_size = len(train_csv)
num_bins = int(np.floor(np.log2(df_size) + 1))
y = pd.cut(train_csv["target"], bins=num_bins, labels=False)

cv = StratifiedKFold(shuffle=True, n_splits=N_FOLDS, random_state=SEED)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(VOCAB_PATH,
                   model_max_length=MAX_SEQUENCE_LENGTH)


def tokenize_excerpt(text):
    """Tokenize one excerpt, cutting it to at most MAX_SEQUENCE_LENGTH tokens.

    Setting `model_max_length` on the tokenizer does not truncate by itself;
    truncation has to be requested per call, otherwise over-long excerpts are
    returned at full length.

    Returns the tokenizer output, whose `input_ids` and `attention_mask` are
    read later by the dataset.
    """
    return tokenizer(text, truncation=True, max_length=MAX_SEQUENCE_LENGTH)


# Each row gets its own (unpadded) encoding; padding happens per batch in the
# collate functions.
train_csv['token'] = train_csv.excerpt.apply(tokenize_excerpt)
test_csv['token'] = test_csv.excerpt.apply(tokenize_excerpt)

In [None]:
class LitDataset(D.Dataset):
    """Dataset pairing tokenized excerpts with a per-row payload.

    Args:
        token: sequence of tokenizer outputs, each exposing `input_ids` and
            `attention_mask` attributes (e.g. a pandas Series of encodings).
        target: sequence of the same length; regression targets for training,
            or row ids for the test set.

    Items are always addressed by position (0 .. len-1), whatever index labels
    a pandas container carries.
    """

    def __init__(self, token, target):
        self.token = token
        self.target = target

    def __len__(self):
        return len(self.token)

    @staticmethod
    def _at(seq, idx):
        """Return the element at position `idx` of `seq`.

        A pandas Series is read through `.iloc`: plain `series[idx]` is a
        label lookup, which only falls back to positions on non-integer
        indexes (deprecated) and fails on a non-default integer index.
        Containers without `.iloc` (pandas Index, list, array) are already
        positional.
        """
        positional = getattr(seq, 'iloc', None)
        return seq[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        encoding = self._at(self.token, idx)
        return (torch.tensor(encoding.input_ids),
                torch.tensor(encoding.attention_mask),
                self._at(self.target, idx))
    
def collate_fn(batch):
    """Collate (input_ids, attention_mask, target) items into padded batch tensors.

    Args:
        batch: list of 3-tuples as produced by `LitDataset.__getitem__`.

    Returns:
        (ids, attns, targets) on DEVICE: `ids` and `attns` are right-padded to
        the longest sequence in the batch, `targets` is a float tensor.
    """
    ids, attns, targets = zip(*batch)
    ids = pad_sequence(ids, batch_first=True, padding_value=tokenizer.pad_token_id).to(DEVICE)
    # The attention mask must be 0 on padded positions. Padding it with
    # pad_token_id is only correct for vocabularies whose pad id happens to be 0;
    # with a non-zero pad id the model would attend to the padding.
    attns = pad_sequence(attns, batch_first=True, padding_value=0).to(DEVICE)
    targets = torch.tensor(targets).float().to(DEVICE)
    return ids, attns, targets

def collate_fn_test(batch):
    """Collate (input_ids, attention_mask, row_id) items for inference.

    Args:
        batch: list of 3-tuples as produced by `LitDataset.__getitem__`, where
            the third element is passed through untouched.

    Returns:
        (idxs, ids, attns): `idxs` is the tuple of third elements; `ids` and
        `attns` are right-padded tensors on DEVICE.
    """
    ids, attns, idxs = zip(*batch)
    ids = pad_sequence(ids, batch_first=True, padding_value=tokenizer.pad_token_id).to(DEVICE)
    # The attention mask must be 0 on padded positions; pad_token_id is not 0
    # for every vocabulary, so it cannot be used as the mask filler.
    attns = pad_sequence(attns, batch_first=True, padding_value=0).to(DEVICE)
    return idxs, ids, attns

In [None]:
# Full training set; the cross-validation subsets are taken from it later.
ds = LitDataset(token=train_csv.token, target=train_csv.target)
# For the test set the "target" slot carries the row ids instead of labels.
test_ds = LitDataset(token=test_csv.token, target=test_csv.index)

tloader = D.DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_fn_test,
)

In [None]:
### Table for results
# Two-line column header, wrapped in a leading and a trailing newline.
header = '\n'.join([
    '',
    '            Train         Validation',
    'Epoch |  MSE  |  RMSE |  MSE  |  RMSE | Time, m',
    '',
])
# One row per epoch: epoch number, four metric columns, elapsed minutes,
# separated by a box-drawing vertical bar.
raw_line = '\u2502'.join(['{:6d}'] + ['{:7.3f}'] * 4 + ['{:6.2f}'])

## K-fold Cross-Validation

In [None]:
@torch.no_grad()
def validation_fn(model, loader, loss_fn):
    """Evaluate `model` on `loader` and return the mean loss per sample.

    Args:
        model: callable taking `(input_ids, attention_mask=...)` and returning
            an object with a `.logits` tensor; it is switched to eval mode here.
        loader: iterable of `(input_ids, attention_mask, target)` batches.
        loss_fn: loss returning the mean over the batch (e.g. `nn.MSELoss()`).

    Returns:
        float: the loss averaged over all samples, or NaN for an empty loader.

    Each batch loss is weighted by its batch size, so a short final batch does
    not count as much as a full one.
    """
    model.eval()
    loss_sum, n_samples = 0.0, 0
    for texts, attns, target in loader:
        outputs = model(texts, attention_mask=attns)
        loss = loss_fn(outputs.logits.squeeze(-1), target)
        batch_size = len(target)
        loss_sum += loss.item() * batch_size
        n_samples += batch_size
    if n_samples == 0:
        return float('nan')
    return loss_sum / n_samples

def _cpu_state_dict(model):
    """Return an independent copy of the model's state dict held in CPU memory."""
    # copy.copy keeps the container type and its attributes; the tensors are
    # then replaced one by one with detached CPU clones.
    snapshot = copy.copy(model.state_dict())
    for name in list(snapshot):
        snapshot[name] = snapshot[name].detach().cpu().clone()
    return snapshot


def oof_train(ds, cv, y, epochs = EPOCHES):
    """Fine-tune one model per cross-validation fold and checkpoint each of them.

    Args:
        ds: dataset yielding `(input_ids, attention_mask, target)` items.
        cv: splitter exposing `split(X, y)`, yielding (train_idx, valid_idx).
        y: labels handed to `cv.split` for stratification.
        epochs: number of training epochs per fold.

    For every fold a fresh model is loaded from MODEL_PATH, trained with AdamW
    and an MSE loss, and validated after each epoch. A table of train and
    validation MSE/RMSE is printed. The weights with the lowest validation
    RMSE are kept, and at the end of the fold a checkpoint
    `{MODEL_NAME}_fold_XX.pt` is written with keys 'model' (final weights),
    'best_model' (best weights, CPU tensors) and 'best_metric' (float).

    Returns:
        None.
    """
    loss_fn = torch.nn.MSELoss()

    for fold, (train_idx, valid_idx) in enumerate(cv.split(range(len(ds)), y)):

        train_ds = D.Subset(ds, train_idx)
        loader = D.DataLoader(train_ds, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_fn, num_workers=0)

        valid_ds = D.Subset(ds, valid_idx)
        vloader = D.DataLoader(valid_ds, batch_size=BATCH_SIZE,
                               shuffle=False, collate_fn=collate_fn, num_workers=0)

        model = AutoModelForSequenceClassification.from_pretrained(
                          MODEL_PATH, num_labels=1).to(DEVICE)

        optimizer = optim.AdamW(model.parameters(), LR,
                                betas=(0.9, 0.999), weight_decay=1e-1)
        # Constant learning rate after 35 warm-up steps; stepped once per batch.
        scheduler = get_constant_schedule_with_warmup(optimizer, 35)
        print(header)

        # Initial state: an independent snapshot, not an alias of the live weights.
        best_metric = np.inf
        best_model = _cpu_state_dict(model)

        for epoch in range(1, epochs+1):
            start_time = time.time()
            tloss = []
            model.train()
            # Pre-bound so the `del` below is valid even if the loader is empty.
            loss = outputs = None

            for texts, attns, target in loader:
                optimizer.zero_grad()
                outputs = model(texts, attention_mask=attns)
                loss = loss_fn(outputs.logits.squeeze(-1), target)
                tloss.append(loss.item())
                loss.backward()
                optimizer.step()
                scheduler.step()
            # Release the last batch's graph before validation and snapshotting.
            del loss, outputs

            tloss = np.array(tloss).mean()
            vloss = validation_fn(model, vloader, loss_fn)
            tmetric = tloss**.5
            vmetric = vloss**.5
            print(raw_line.format(epoch,tloss,tmetric,vloss,vmetric,(time.time()-start_time)/60))

            if best_metric > vmetric:
                best_metric = vmetric
                # Kept on the CPU so the snapshot does not occupy GPU memory.
                best_model = _cpu_state_dict(model)

        # Save final state to the checkpoint
        filename = f'{MODEL_NAME}_fold_{fold:02}.pt'
        checkpoint = {
            'model' : model.state_dict(),
            'best_model' : best_model,
            # plain float, so the file holds no numpy objects besides tensors
            'best_metric' : float(best_metric),
        }
        torch.save(checkpoint,  filename)

        # The optimizer and scheduler reference the parameters too; drop them
        # with the model so the memory is free before the next fold's model loads.
        del checkpoint, best_model, optimizer, scheduler
        del model, vloader, loader, train_ds, valid_ds
        torch.cuda.empty_cache()

In [None]:
# Train every fold; one checkpoint file per fold is written to the working directory.
oof_train(ds, cv, y, epochs=EPOCHES)

## Inference

In [None]:
# Predict the test set with every fold's model and average the predictions.
model = AutoModelForSequenceClassification.from_pretrained(
                  MODEL_PATH, num_labels=1).to(DEVICE)

fold_preds = []
for fold in range(N_FOLDS):

    filename = f'{MODEL_NAME}_fold_{fold:02}.pt'
    # Load to the CPU first; load_state_dict copies into the model's device.
    checkpoint = torch.load(filename, map_location='cpu')
    # Use the weights with the best validation score, not the last-epoch ones;
    # fall back to 'model' for checkpoints that lack a best snapshot.
    weights = checkpoint['best_model'] if 'best_model' in checkpoint else checkpoint['model']
    model.load_state_dict(weights)
    model.eval()
    del checkpoint, weights

    # Get prediction for test set
    ids, preds = [], []
    with torch.no_grad():
        for batch_ids, texts, attn in tloader:
            outputs = model(texts, attention_mask=attn)
            ids.extend(batch_ids)
            preds.append(outputs.logits.squeeze(-1).cpu().numpy())
    fold_preds.append(np.concatenate(preds))

# tloader is not shuffled, so `ids` is in the same order for every fold.
# The column is overwritten (not incremented), so re-running this cell gives
# the same result regardless of what subm['target'] held before.
subm.loc[ids, 'target'] = np.mean(fold_preds, axis=0)

# Save to the file
subm.to_csv('submission.csv')

In [None]:
# Clean up the working directory: delete the per-fold checkpoint files (*.pt)
# and the err.txt file that sys.stderr was redirected to.
!rm -r *.pt
!rm err.txt