In [None]:
import numpy as np 
import pandas as pd 
import os, random, sys, time, re
import math

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as D
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import StratifiedKFold, KFold

import warnings
warnings.filterwarnings('ignore')

In [None]:
from transformers import *
# from transformers import (DistilBertTokenizer, DistilBertForSequenceClassification,
#                           AlbertTokenizer, AlbertForSequenceClassification,
#                           RobertaTokenizer, RobertaForSequenceClassification,
#                           ElectraTokenizer, ElectraForSequenceClassification,
#                          CamembertTokenizer, CamembertForSequenceClassification)

In [None]:
# Directory that holds train.csv, test.csv and sample_submission.csv.
DATA_PATH = "../input/commonlitreadabilityprize/"

# Checkpoint directory passed to `get_model.from_pretrained`. The commented-out
# assignments below are alternative checkpoints that this configuration does
# not use; only the uncommented MODEL_PATH is active.
# MODEL_PATH = '../input/distilbertbaseuncased'
# MODEL_PATH = '../input/pretrained-albert-pytorch/albert-base-v2'
# MODEL_PATH = '../input/camembertbasesquadfrfquadpiaf/camembert-base-squadFR-fquad-piaf'
# MODEL_PATH = '../input/roberta-transformers-pytorch/distilroberta-base'
# MODEL_PATH = '../input/roberta-transformers-pytorch/roberta-base'
MODEL_PATH = '../input/bart-models-hugging-face-model-repository/bart-base'
# MODEL_PATH = '../input/electra-base'
# Directory the tokenizer is loaded from. It is a roberta-base directory, not
# MODEL_PATH.
# NOTE(review): presumably the BART checkpoint shares this vocabulary — confirm.
VOCAB_PATH = '../input/roberta-transformers-pytorch/roberta-base' 
# MODEL_PATH

N_FOLDS = 4                 # number of CV splits; also the divisor used when averaging test predictions
EPOCHES = 2                 # training epochs per fold (default of oof_preds' `epochs`)
BATCH_SIZE = 12             # batch size of the train, validation and test DataLoaders
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # where batches and models are placed
MAX_SEQUENCE_LENGTH = 256   # passed to the tokenizer as model_max_length
LR = 2.5e-5                 # learning rate handed to AdamW
get_tokenizer = RobertaTokenizer                # class whose from_pretrained builds the tokenizer
get_model = BartForSequenceClassification       # class whose from_pretrained builds each fold's model


# Error log: from here on, everything Python writes to sys.stderr goes to
# err.txt in the working directory instead of the notebook output.
# NOTE(review): the file handle is never closed in this notebook, and re-running
# this cell opens (and truncates) the file again.
sys.stderr = open('err.txt', 'w')

In [None]:
# Seed every random number source used by the notebook with one value:
# Python's `random`, the PYTHONHASHSEED environment variable, NumPy, and
# PyTorch (CPU generator, current CUDA device, all CUDA devices).
SEED = 7117
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Read the train and test tables, using the 'id' column as the row index.
train_csv = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'), index_col='id')
test_csv = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'), index_col='id')
# Disabled experiment: drop training rows whose |standard_error| is >= 0.6.
# print("before filter")
# print(len(train_csv))
# train_csv['standard_error'].hist(bins=12)
# train_csv = train_csv[abs(train_csv['standard_error'])<0.6]
# print("after filter")
# print(len(train_csv))

# Submission frame indexed by id; oof_preds adds each fold's share of the
# test prediction to its 'target' column.
subm = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'), index_col='id')

# Binary label (target > 0) used only to stratify the folds; the model itself
# is trained on the continuous target.
y = (train_csv.target.values > 0).astype(int)
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

In [None]:
# Load the tokenizer from VOCAB_PATH; MAX_SEQUENCE_LENGTH is passed as the
# tokenizer's model_max_length.
tokenizer = get_tokenizer.from_pretrained(VOCAB_PATH,
                   model_max_length=MAX_SEQUENCE_LENGTH)
# Tokenize every excerpt once up front and store the tokenizer output in a new
# 'token' column. The tokenizer is called with its default arguments, i.e. no
# truncation or padding option is passed here.
# NOTE(review): presumably model_max_length alone does not shorten long
# excerpts unless truncation=True is given — confirm whether truncation to
# MAX_SEQUENCE_LENGTH was intended.
train_csv['token'] = train_csv.excerpt.apply(tokenizer)
test_csv['token'] = test_csv.excerpt.apply(tokenizer)

In [None]:
class LitDataset(D.Dataset):
    """Map-style dataset over pre-tokenized excerpts.

    Parameters
    ----------
    token : sequence
        One tokenizer output per item; each must expose ``input_ids`` and
        ``attention_mask``.
    target : sequence
        One value per item, returned unchanged. The notebook passes the
        regression target for training data and the row id for test data.
    standard_error : sequence, optional
        One value per item; only read when ``training`` is true.
    training : bool
        When true, items are 4-tuples that include the standard error;
        otherwise they are 3-tuples.

    All sequences are indexed **by position**. pandas Series are read through
    ``.iloc`` so the result does not depend on the Series' index labels
    (label-based ``series[int]`` raises or selects the wrong row on an integer
    index, and its positional fallback on other indexes is deprecated).
    """

    def __init__(self, token, target,standard_error = None ,training = False):
        self.token = token
        self.target = target
        self.training = training
        # Always defined so the attribute exists in both modes.
        self.standard_error = standard_error

    @staticmethod
    def _at(values, idx):
        """Return the element of ``values`` at integer position ``idx``."""
        positional = getattr(values, 'iloc', None)
        return values[idx] if positional is None else positional[idx]

    def __len__(self):
        return len(self.token)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, target[, standard_error])`` for position ``idx``."""
        encoded = self._at(self.token, idx)
        input_ids = torch.tensor(encoded.input_ids)
        attention_mask = torch.tensor(encoded.attention_mask)
        target = self._at(self.target, idx)
        if self.training:
            return input_ids, attention_mask, target, self._at(self.standard_error, idx)
        return input_ids, attention_mask, target
    
def collate_fn(batch):
    """Collate training/validation items into padded tensors on DEVICE.

    ``batch`` is a list of ``(input_ids, attention_mask, target, standard_error)``
    items. Token ids and attention masks are right-padded to the longest
    sequence in the batch with ``pad_sequence``'s default padding value;
    targets and standard errors become float tensors.

    Returns ``(ids, attns, targets, errors)``, all moved to DEVICE.
    """
    token_seqs, mask_seqs, raw_targets, raw_errors = zip(*batch)
    padded_ids, padded_masks = (
        pad_sequence(seqs, batch_first=True).to(DEVICE)
        for seqs in (token_seqs, mask_seqs)
    )
    target_tensor, error_tensor = (
        torch.tensor(values).float().to(DEVICE)
        for values in (raw_targets, raw_errors)
    )
    return padded_ids, padded_masks, target_tensor, error_tensor

def collate_fn_test(batch):
    """Collate test items into padded tensors on DEVICE.

    ``batch`` is a list of ``(input_ids, attention_mask, row_id)`` items.
    Returns ``(row_ids, ids, attns)``: the row ids as a tuple in batch order,
    followed by the padded token ids and attention masks moved to DEVICE.
    """
    token_seqs, mask_seqs, row_ids = zip(*batch)
    padded_ids, padded_masks = (
        pad_sequence(seqs, batch_first=True).to(DEVICE)
        for seqs in (token_seqs, mask_seqs)
    )
    return row_ids, padded_ids, padded_masks

In [None]:
# Training dataset: items are (input_ids, attention_mask, target, standard_error).
ds = LitDataset(train_csv.token, train_csv.target,
                standard_error = train_csv.standard_error,training = True)
# Test dataset: the `target` slot carries the row id (test_csv.index) so that
# predictions can be written back to the submission frame by id.
test_ds = LitDataset(test_csv.token, test_csv.index)

# One test loader shared by all folds; shuffle=False keeps the row order.
tloader = D.DataLoader(test_ds, batch_size=BATCH_SIZE,
                       shuffle=False, collate_fn = collate_fn_test, num_workers=0)

In [None]:
### Table for results
# Header text printed by oof_preds before the per-epoch rows of each fold.
header = r'''
            Train         Validation
Epoch |  MSE  |  RMSE |  MSE  |  RMSE | Time, m
'''
#          Epoch         metrics            time
# Row template: a 6-wide integer epoch, four `7.3f` metric columns
# (train MSE, train RMSE, validation MSE, validation RMSE) and a `6.2f` time
# column, each preceded by U+2502 (box-drawing vertical bar).
raw_line = '{:6d}' + '\u2502{:7.3f}'*4 + '\u2502{:6.2f}'

In [None]:
# Negative slope of the leaky ReLU used by loss_with_error; also the weight
# applied to the margin term that is added back.
l_params = 0.1
leaky_func = nn.LeakyReLU(l_params)
def loss_with_error(output,target,error):
    """Squared-error loss that is down-weighted inside an error-dependent margin.

    With ``margin = error**2 / 4`` the per-item loss is
    ``LeakyReLU(sq_err - margin) + l_params * margin``, which equals
    ``l_params * sq_err`` while ``sq_err < margin`` and
    ``sq_err - (1 - l_params) * margin`` otherwise.

    Returns half of the mean per-item loss as a scalar tensor.
    """
    margin = error**2/4
    squared_error = (output-target)**2
    per_item = leaky_func(squared_error-margin)+l_params*margin
    return torch.mean(per_item)/2


In [None]:

@torch.no_grad()
def validation_fn(model, loader, loss_fn):
    """Return the mean per-batch loss of ``model`` over ``loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(texts, attention_mask=attns)``; the result must
        expose ``.logits``.
    loader : iterable
        Yields ``(texts, attns, target, error)`` batches; ``error`` is ignored.
    loss_fn : callable
        Two-argument loss ``loss_fn(prediction, target)`` returning a scalar
        tensor.

    Returns
    -------
    The unweighted mean of the per-batch loss values (NaN for an empty loader).

    The model is switched to eval mode for the duration of the call so that
    dropout and similar train-only layers do not perturb the metric, and its
    previous train/eval mode is restored afterwards. Gradients are disabled
    by the decorator.
    """
    was_training = model.training
    model.eval()
    try:
        batch_losses = []
        for texts, attns, target, _error in loader:
            outputs = model(texts, attention_mask=attns)
            loss = loss_fn(outputs.logits.squeeze(-1), target)
            batch_losses.append(loss.item())
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    return np.array(batch_losses).mean()

def _train_one_epoch(model, loader, optimizer, scheduler, loss_fn, display_loss):
    """Run one optimisation pass over ``loader``; return the mean per-batch ``display_loss``."""
    model.train()
    batch_losses = []
    for texts, attns, target, error in loader:
        optimizer.zero_grad()
        logits = model(texts, attention_mask=attns).logits.squeeze(-1)
        # Optimise the error-aware loss, but report plain MSE for the table.
        gloss = loss_fn(logits, target, error)
        batch_losses.append(display_loss(logits.detach(), target).item())
        gloss.backward()
        optimizer.step()
        scheduler.step()
    return np.array(batch_losses).mean()


def _predict_test(model, tloader):
    """Return ``(ids, predictions)`` for all batches of ``tloader`` with the model in eval mode."""
    model.eval()
    ids, preds = [], []
    with torch.no_grad():
        for batch_ids, texts, attn in tloader:
            outputs = model(texts, attention_mask=attn)
            ids += batch_ids
            preds.append(outputs.logits.detach().squeeze(-1).cpu().numpy())
    return ids, np.concatenate(preds)


def oof_preds(ds, tloader, cv, y, epochs = EPOCHES):
    """Train one model per CV fold and accumulate fold-averaged test predictions.

    For every ``(train, valid)`` split produced by ``cv.split`` a fresh model
    is loaded from MODEL_PATH, fine-tuned for ``epochs`` epochs with
    ``loss_with_error``, scored with plain MSE on both splits after each
    epoch, used to predict ``tloader``, and saved together with the tokenizer
    to ``./pretrained_bart_<fold>/``.

    Parameters
    ----------
    ds : dataset
        Training dataset; indexed through ``D.Subset`` with the fold indices.
    tloader : DataLoader
        Yields ``(ids, texts, attn)`` test batches.
    cv : splitter
        Object with ``split(X, y)`` and ``get_n_splits(X, y)``.
    y : array-like
        Labels passed to ``cv`` for stratification.
    epochs : int
        Number of epochs per fold.

    Side effects
    ------------
    Prints one metrics table per fold, adds each fold's test prediction
    divided by the number of folds to the module-level ``subm['target']``, and
    writes one checkpoint directory per fold. Returns ``None``.
    """
    loss_fn = loss_with_error
    display_loss = torch.nn.MSELoss()
    sample_positions = range(len(ds))
    # Average over the number of folds ``cv`` really produces, so the result
    # stays a mean even if ``cv`` was built with a split count other than N_FOLDS.
    n_splits = cv.get_n_splits(sample_positions, y)

    for model_id, (train_idx, valid_idx) in enumerate(cv.split(sample_positions, y)):

        train_ds = D.Subset(ds, train_idx)
        loader = D.DataLoader(train_ds, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn = collate_fn,num_workers=0)

        valid_ds = D.Subset(ds, valid_idx)
        vloader = D.DataLoader(valid_ds, batch_size=BATCH_SIZE,
                      shuffle=False, collate_fn = collate_fn,num_workers=0)

        model = get_model.from_pretrained(
                          MODEL_PATH, num_labels=1).to(DEVICE)

        optimizer = optim.AdamW(model.parameters(), LR,
                                betas=(0.9, 0.999), weight_decay=1e-1)
        # Constant learning rate after 35 warm-up steps.
        scheduler = get_constant_schedule_with_warmup(optimizer, 35)
        print(header)
        for epoch in range(1, epochs+1):
            start_time = time.time()
            tloss = _train_one_epoch(model, loader, optimizer, scheduler,
                                     loss_fn, display_loss)
            vloss = validation_fn(model, vloader, display_loss)
            tmetric = tloss**.5
            vmetric = vloss**.5
            print(raw_line.format(epoch,tloss,tmetric,vloss,vmetric,(time.time()-start_time)/60))

        # Get prediction for test set
        ids, preds = _predict_test(model, tloader)

        # Save prediction of test set
        subm.loc[ids, 'target']  =  subm.loc[ids, 'target'].values + preds / n_splits

        output_dir = "./pretrained_bart_{}/".format(model_id)
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

        # Drop every reference to this fold's parameters (the optimizer and
        # scheduler hold them too) before the next fold loads a new model;
        # otherwise empty_cache() cannot return the memory.
        del model, model_to_save, optimizer, scheduler
        del vloader, loader, train_ds, valid_ds
        torch.cuda.empty_cache()

In [None]:
# Run the cross-validated training: fills subm['target'] with the fold-averaged
# test predictions and writes one checkpoint directory per fold.
oof_preds(ds, tloader, cv, y, epochs = EPOCHES)

In [None]:
# Write the accumulated test predictions to submission.csv in the working directory.
subm.to_csv('submission.csv')

In [None]:
# Display the final submission frame as the cell output.
subm
