In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import torch
import copy
import math
import gc
from tqdm import tqdm
import torch.utils.data as D
import random
import os
from transformers import AutoModelWithLMHead, AutoTokenizer,RobertaConfig, RobertaModel,AutoModelForSequenceClassification,AutoModelForMaskedLM
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet,Lasso,RidgeCV,LassoCV
from torch.nn.utils.rnn import pad_sequence
from torch import nn
from torch import optim
import time
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    SchedulerType,
    get_scheduler,
    set_seed,
)

# Version changes:
* RoBERTa-base ITPT pretraining

# Parameters for this notebook

In [None]:
class CFG:
    """Hyper-parameters and runtime settings shared by the cells of this notebook."""
    # Seed applied to the Python, NumPy and PyTorch random generators.
    seed=777
    # Token length every excerpt is truncated/padded to by the tokenizer.
    max_len=256
    # Number of samples per batch in both data loaders.
    batch_size=24
    # Initial learning rate handed to the optimizer.
    lr=2*10**(-5)
    # Weight-decay coefficient handed to the optimizer.
    weight_decay=0.01
    # When True the recorder snapshots the model at every validation, not only on improvement.
    keep_last_model=True
    # Dropout probability used for both hidden and attention dropout in the model config.
    dropout_p=0.1
    # Masking probability used by the masked-LM training collator.
    mlm_probability=0.15
    # Number of passes over the training loader.
    epochs=5
    # Not referenced by the cells visible in this notebook.
    folds=5
    # Not referenced by the cells visible in this notebook.
    cv_shuffle=False
    # Number of validations without improvement tolerated before the recorder flags a stop.
    early_stop_epoch=1000
    # Not referenced by the cells visible in this notebook.
    sub_task_weight=1
    # Prefer the first GPU, but fall back to the CPU so the notebook still runs
    # on a kernel without CUDA instead of failing at the first `.to(CFG.device)`.
    device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # Default floating-point dtype installed via torch.set_default_dtype.
    dtype=torch.float32

In [None]:
# Install CFG.dtype as the default floating-point dtype for newly created tensors.
# The former torch.set_default_tensor_type(...) branch is dropped: that API is
# deprecated in recent PyTorch releases, and torch.set_default_dtype already
# leaves the default dtype in the same final state for float32 and float64.
torch.set_default_dtype(CFG.dtype)

In [None]:
# Seed every random number generator the notebook relies on with CFG.seed.
os.environ['PYTHONHASHSEED'] = str(CFG.seed)
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(CFG.seed)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Load Dataset

In [None]:
# Read the training set, the test set and the sample submission, in that order.
train_df, test_df, res_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/commonlitreadabilityprize/train.csv',
        '/kaggle/input/commonlitreadabilityprize/test.csv',
        '/kaggle/input/commonlitreadabilityprize/sample_submission.csv',
    )
)

# Helpers

In [None]:
class RobertaDataset(D.Dataset):
    """Dataset pairing tokenizer encodings with regression targets.

    Args:
        token: indexable collection of encodings; each element must expose
            ``input_ids`` and ``attention_mask`` attributes.
        target: indexable collection of targets, indexed with the same keys
            as ``token``.
    """
    def __init__(self, token, target):
        self.token = token
        self.target = target

    def __len__(self):
        # len() instead of .shape[0] so plain lists work as well as
        # array-like containers; the result is the same for 1-D arrays/Series.
        return len(self.token)

    def __getitem__(self, idx):
        """Return ``(input_ids tensor, attention_mask tensor, target)`` for sample ``idx``."""
        encoding = self.token[idx]
        return (
            torch.tensor(encoding.input_ids),
            torch.tensor(encoding.attention_mask),
            self.target[idx],
        )
    
def collate(batch):
    """Turn a list of ``(ids, attention_mask, target)`` samples into batch tensors.

    Id and mask sequences are padded (batch-first) to the longest sequence in
    the batch, targets are stacked as float, and all three tensors are moved
    to ``CFG.device``.

    Returns:
        Tuple ``(ids, attention_masks, targets)``.
    """
    id_seqs, mask_seqs, labels = zip(*batch)
    padded_ids, padded_masks = (
        pad_sequence(seqs, batch_first=True).to(CFG.device)
        for seqs in (id_seqs, mask_seqs)
    )
    label_tensor = torch.tensor(labels).float().to(CFG.device)
    return padded_ids, padded_masks, label_tensor

# Load pretrained model

In [None]:
name="roberta-base"
# Config: return every layer's hidden states and apply CFG.dropout_p to both
# the attention probabilities and the hidden layers.
config = AutoConfig.from_pretrained(
    name,
    output_hidden_states=True,
    attention_probs_dropout_prob=CFG.dropout_p,
    hidden_dropout_prob=CFG.dropout_p,
)
# Tokenizer capped at CFG.max_len tokens.
tokenizer = AutoTokenizer.from_pretrained(
    name,
    model_max_length=CFG.max_len,
)
# Masked-language-model head on top of the pretrained checkpoint.
model = AutoModelForMaskedLM.from_pretrained(
    name,
    config=config,
)

In [None]:
# Display the loaded masked-LM model for inspection.
model

In [None]:
# Display the configuration object the model was loaded with.
config

In [None]:
def tokenize(tokenizer,texts):
    """Encode each text separately and return the encodings as a list.

    Every text is truncated and padded to exactly ``CFG.max_len`` tokens, with
    special tokens added.

    Args:
        tokenizer: callable tokenizer accepting the keyword arguments used below.
        texts: iterable of strings.

    Returns:
        List with one tokenizer output per input text, in input order.
    """
    return [
        tokenizer(
            text,
            max_length=CFG.max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
        )
        for text in texts
    ]

In [None]:
# Tokenize every excerpt and keep the encodings alongside the data as a new
# 'token' column (this mutates train_df in place).
train_df['token'] = tokenize(tokenizer,train_df.excerpt)

In [None]:
# Training collator: masks tokens with probability CFG.mlm_probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=CFG.mlm_probability,
)
# Validation collator: masked-LM masking switched off.
data_collator_val = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
# Both loaders iterate over the same tokenized excerpts: the training loader
# shuffles and applies MLM masking, the validation loader keeps the original
# order so its outputs line up with train_df.target.
train_dataloader = DataLoader(
    dataset=train_df['token'],
    batch_size=CFG.batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
val_dataloader = DataLoader(
    dataset=train_df['token'],
    batch_size=CFG.batch_size,
    shuffle=False,
    collate_fn=data_collator_val,
)

In [None]:
# Move the model's parameters to the configured device before training.
model=model.to(CFG.device)

# Cross Validation

In [None]:

def val_rmse(model,target,loader,mode='train'):
    """Score the encoder with a cross-validated ridge regression on pooled embeddings.

    Each batch from ``loader`` is passed through ``model`` without gradient
    tracking. The last entry of the returned hidden states is averaged over
    the sequence axis, giving one vector per sample. A ``RidgeCV`` (5-fold,
    20 alphas log-spaced from 1e-3 to 1e2) is fitted on these vectors against
    ``target`` and its best cross-validation score is converted into an RMSE.
    The base-10 log of the selected alpha is printed.

    Args:
        model: network called as ``model(**batch)`` whose output exposes
            ``'hidden_states'``.
        target: regression targets, in the same order as the samples yielded
            by ``loader``.
        loader: iterable of dict batches whose values have a ``.to`` method.
        mode: ``'eval'`` calls ``model.eval()``, ``'train'`` calls
            ``model.train()``; any other value leaves the mode untouched.
            The model is left in the selected mode on return.

    Returns:
        Square root of the negated best cross-validation score.
    """
    if mode=='eval':
        model.eval()
    elif mode=='train':
        model.train()
    embs=[]
    for batch in tqdm(loader):
        batch_={key:val.to(CFG.device) for key,val in batch.items()}
        with torch.no_grad():
            outputs = model(**batch_)
            # NOTE(review): the mean runs over every sequence position; the
            # attention mask is not applied, so padded positions contribute.
            pooled = outputs['hidden_states'][-1].mean(axis=1)
        # Move each batch to the CPU right away so the pooled embeddings of
        # the whole dataset never accumulate in device memory.
        embs.append(pooled.to('cpu'))
    embs=torch.cat(embs,axis=0).numpy()
    # NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
    # and removed in 1.2 -- confirm the installed version still accepts it.
    simple_model=RidgeCV(alphas=np.logspace(-3,2,20),cv=5,normalize=True,scoring='neg_mean_squared_error').fit(embs,target)
    print(np.log10(simple_model.alpha_))
    rmse=np.sqrt(-simple_model.best_score_)
    return rmse

In [None]:
class StateRecorder:
    """Keeps the score, epoch and weights of the model snapshot worth retaining.

    Attributes are ``best_score`` (starts at +inf), ``best_state_dict``
    (starts as None), ``best_epoch`` (starts at 0) and ``stop`` (early-stop
    flag, starts as False).
    """

    def __init__(self):
        self.best_score = float('inf')
        self.best_state_dict = None
        self.best_epoch = 0
        self.stop = False

    def record(self,score,epoch,model,keep_last_model=False):
        """Register a validation result.

        A deep copy of ``model.state_dict()`` is stored when ``score`` beats
        ``best_score`` or when ``keep_last_model`` is true. Otherwise ``stop``
        is raised once more than ``CFG.early_stop_epoch`` epochs separate
        ``epoch`` from ``best_epoch``.
        """
        improved = score < self.best_score
        if not (improved or keep_last_model):
            # No snapshot taken: only check the early-stopping patience.
            if epoch - self.best_epoch > CFG.early_stop_epoch:
                self.stop = True
            return
        self.best_score = score
        self.best_state_dict = copy.deepcopy(model.state_dict())
        self.best_epoch = epoch

In [None]:
def train(model,train_ldr,target,val_ldr):
    """Run masked-LM training and periodically score the encoder with ``val_rmse``.

    The model is optimised with AdamW and a per-step cosine-annealed learning
    rate for ``CFG.epochs`` passes over ``train_ldr``. After every ``val_gap``
    completed batches (once per epoch with ``val_freq=1``) the embeddings are
    scored by ``val_rmse`` and the result is handed to a ``StateRecorder``.
    Training ends early if the recorder raises its ``stop`` flag.

    Args:
        model: network called as ``model(**batch)`` whose output exposes ``.loss``.
        train_ldr: sized iterable of dict batches used for optimisation.
        target: regression targets forwarded to ``val_rmse``.
        val_ldr: loader forwarded to ``val_rmse``.

    Returns:
        The ``StateRecorder`` holding the retained score, epoch and state dict.
    """
    val_epoch=1
    epoch_L=len(train_ldr)
    val_freq=1
    # max(1, ...) avoids a zero gap (and a modulo-by-zero below) when an epoch
    # has fewer batches than val_freq.
    val_gap=max(1,epoch_L//val_freq)
    print(f'validation gap iteration is:{val_gap}')

    recorder=StateRecorder()
    optimizer = AdamW(model.parameters(), CFG.lr,
                            betas=(0.9, 0.999), weight_decay=CFG.weight_decay,correct_bias=True)
    # The scheduler is stepped once per batch, hence T_max in optimisation steps.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=CFG.epochs*len(train_ldr),eta_min=0)
    for i in range(CFG.epochs):
        loss_seq = []
        for itr,batch in enumerate(train_ldr):
            # val_rmse may have switched the mode, so re-enter training mode every step.
            model.train()
            batch_={key:val.to(CFG.device) for key,val in batch.items()}
            outputs = model(**batch_)
            loss = outputs.loss
            loss_seq.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            # Validate once val_gap batches have been completed. Testing
            # (itr+1) rather than itr keeps validation (and the recorded
            # snapshot) at the END of each gap; the former `itr%val_gap==0`
            # fired right after the first batch of every epoch, so the saved
            # weights missed almost the whole final epoch.
            if (itr+1)%val_gap==0:
                val_loss=val_rmse(model, target,val_ldr,mode='train')
                recorder.record(val_loss,val_epoch,model,keep_last_model=CFG.keep_last_model)
                print(f'################  epoch {i+1}   val epoch {val_epoch} #################################')
                print(f'training loss:{np.mean(loss_seq).round(6)}')
                print(f'validation rmse (train):{val_loss.round(6)}')
                val_epoch+=1
                # Honour the recorder's early-stopping flag.
                if recorder.stop:
                    break
        if recorder.stop:
            break
    torch.cuda.empty_cache()
    return recorder

In [None]:
# Run the pre-training loop, then pull the state dict retained by the recorder.
recorder=train(model,train_dataloader,train_df.target,val_dataloader)
state_dict=recorder.best_state_dict

# Save model state dict

In [None]:
# Write the retained weights to the file 'ITPT_state_dict', stored under the
# 'model_state_dict' key.
torch.save({'model_state_dict':state_dict},'ITPT_state_dict')