In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import torch
import copy
import math
import gc
from tqdm import tqdm
import torch.utils.data as D
import random
import os
from transformers import AutoModelWithLMHead, AutoTokenizer,RobertaConfig, RobertaModel,AutoModelForSequenceClassification,AutoModelForMaskedLM
from torch.nn.utils.rnn import pad_sequence
from torch import nn
from torch import optim
import time
import torch.nn.functional as F
from transformers import (
    AutoModel,
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    SchedulerType,
    get_scheduler,
    set_seed,
)

# Version changes:
* deberta large
* seed 12222
* no re init
* bs 4
* grad accum 2, lr 2e-5
* 1 linear layer

# parameters for this notebook

In [None]:
class CFG:
    """Hyper-parameters and run settings shared by the cells of this notebook."""
    seed=12222                      # seed applied to random / numpy / torch
    path='microsoft/deberta-large'  # Hugging Face id of the pretrained backbone
    checkpoint=None                 # optional folder under ../input holding 'ITPT_state_dict'
    max_len=256                     # tokenizer max_length (truncation and padding target)
    batch_size=4                    # DataLoader batch size
    grad_avg_n=2                    # mini-batches accumulated per optimizer step
    lr=2e-5                         # learning rate of the top-most parameter group
    betas=(0.9, 0.999)              # NOTE(review): not read by any visible cell
    lr_diff_rate=0.95               # per-encoder-layer lr ratio used by get_group_parameters
    weight_decay=0.01               # weight decay applied to every parameter group
    dropout_p=0.1                   # hidden and attention dropout passed to the backbone config
    initializer=None                # weight initializer for re_init (None -> kaiming normal)
    re_init_n=0                     # number of top encoder layers to re-initialise (0 = none)
    epochs=5                        # training epochs per fold
    folds=5                         # number of CV folds
    cv_shuffle=False                # shuffle row order before cutting folds
    val_freq=10                     # target number of validation passes per epoch
    patience=1                      # NOTE(review): not read by any visible cell
    lr_factor=0.1                   # NOTE(review): not read by any visible cell
    score_avg_n=1                   # window of recent validation scores averaged by StateRecorder
    pad_token_id=1                  # placeholder, overwritten with tokenizer.pad_token_id once loaded
    early_stop_epoch=1000           # validation rounds without improvement before StateRecorder flags stop
    # Fall back to the CPU when no GPU is present so the notebook still runs
    # (slowly) instead of failing at the first .to(CFG.device).
    device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    dtype=torch.float32             # default floating-point dtype for the run

In [None]:
# Make CFG.dtype the default floating-point dtype for newly created tensors.
# torch.set_default_tensor_type is deprecated in recent PyTorch releases; the
# set_default_dtype call alone covers both the float32 and the float64 case
# that the old if/else handled.
torch.set_default_dtype(CFG.dtype)

In [None]:
# Seed every random source used by the notebook with the same value.
os.environ['PYTHONHASHSEED'] = str(CFG.seed)
for seed_fn in (random.seed, np.random.seed, torch.manual_seed,
                torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(CFG.seed)
# Ask cuDNN for deterministic kernels.
torch.backends.cudnn.deterministic = True

# Load Dataset

In [None]:
# Read the competition's train / test / sample-submission tables in one go.
train_df, test_df, res_df = map(pd.read_csv, (
    '/kaggle/input/commonlitreadabilityprize/train.csv',
    '/kaggle/input/commonlitreadabilityprize/test.csv',
    '/kaggle/input/commonlitreadabilityprize/sample_submission.csv',
))

# Helpers 

In [None]:
def CV_split(m,k=5,shuffle=False,seed=7):
    """Cut ``range(m)`` into ``k`` contiguous validation folds.

    Args:
        m: number of samples.
        k: number of folds; each fold holds ``ceil(m / k)`` positions, the
            trailing fold(s) may be shorter.
        shuffle: when True the index order is permuted first (this reseeds
            NumPy's global RNG with ``seed``).
        seed: seed used for the permutation.

    Returns:
        A list of ``k`` ``(train_index, val_index)`` tuples of NumPy arrays.
    """
    order=np.arange(m)
    if shuffle:
        np.random.seed(seed)
        np.random.shuffle(order)
    fold_len=math.ceil(m/k)
    folds=[]
    for fold_no in range(k):
        # Boolean mask over positions: True marks this fold's validation slice.
        in_val=np.zeros(m,dtype=bool)
        in_val[fold_len*fold_no:fold_len*(fold_no+1)]=True
        folds.append((order[~in_val],order[in_val]))
    return folds

def score_test(model,test_ldr):
    """Run ``model`` in eval mode over ``test_ldr`` and return flat predictions.

    Each batch is expected to unpack into ``(texts, attns, <ignored>)``.
    The per-batch outputs are concatenated along dim 0, moved to the CPU and
    returned as a 1-D NumPy array.
    """
    print('start eval')
    model.eval()
    batch_outputs=[]
    with torch.no_grad():
        for texts, attns, _ in tqdm(test_ldr):
            batch_outputs.append(model(texts,attns))
    stacked=torch.cat(batch_outputs,axis=0)
    return stacked.to('cpu').numpy().reshape(-1)
    
def rmse(y1,y2):
    """Root-mean-squared error between two array-likes that support ``-``, ``**`` and ``.mean()``."""
    squared_error=(y1-y2)**2
    return np.sqrt(squared_error.mean())

In [None]:
class RobertaDataset(D.Dataset):
    """Pairs pre-tokenised encodings with a per-row target.

    ``token`` must expose ``.shape[0]`` and be indexable, with every item
    providing ``input_ids`` and ``attention_mask`` attributes; ``target`` is
    indexed with the same key.
    """

    def __init__(self, token, target):
        self.token, self.target = token, target

    def __len__(self):
        n_items = self.token.shape[0]
        return n_items

    def __getitem__(self, idx):
        """Return ``(input_ids tensor, attention_mask tensor, target[idx])``."""
        encoding = self.token[idx]
        input_ids = torch.tensor(encoding.input_ids)
        attention_mask = torch.tensor(encoding.attention_mask)
        return input_ids, attention_mask, self.target[idx]
    
def collate(batch):
    """Batch ``(input_ids, attention_mask, target)`` samples for a DataLoader.

    Sequences are right-padded to the longest one in the batch and every
    returned tensor is moved to ``CFG.device``.

    Returns:
        ``(ids, attns, targets)`` where ``ids`` and ``attns`` are
        ``(batch, longest_seq)`` tensors and ``targets`` is a float tensor.
    """
    ids, attns, targets = zip(*batch)
    ids = pad_sequence(ids, batch_first=True,padding_value=CFG.pad_token_id).to(CFG.device)
    # Attention masks use 0 for "do not attend". Padding them with the token
    # pad id (as before) marks padded positions as real tokens whenever that
    # id is non-zero (e.g. 1 for RoBERTa-style vocabularies).
    attns = pad_sequence(attns, batch_first=True,padding_value=0).to(CFG.device)
    targets = torch.tensor(targets).float().to(CFG.device)
    return ids, attns, targets


# Load pretrained model

In [None]:
# Build the config, tokenizer and backbone for the checkpoint named in CFG.path.
path=CFG.path
config = AutoConfig.from_pretrained(
    path,
    output_hidden_states=True,  # MyModel reads x1['hidden_states'][-1]
    attention_probs_dropout_prob=CFG.dropout_p,
    hidden_dropout_prob=CFG.dropout_p,
)
tokenizer = AutoTokenizer.from_pretrained(path,model_max_length=CFG.max_len)
model = AutoModel.from_pretrained(path,config=config)
# Replace the placeholder pad id in CFG with the tokenizer's real one (used by collate).
CFG.pad_token_id=tokenizer.pad_token_id

# Load state dict from the ITPT checkpoint

In [None]:
# Optionally overwrite the backbone weights with those stored at
# ../input/<CFG.checkpoint>/ITPT_state_dict; skipped when CFG.checkpoint is None.
# NOTE(review): torch.load unpickles the file — only point CFG.checkpoint at trusted data.
if CFG.checkpoint is not None:
    checkpoint=torch.load('../input/'+CFG.checkpoint+'/ITPT_state_dict',map_location=CFG.device)
    # The weights are read from the 'model_state_dict' entry of the loaded object.
    model.load_state_dict(checkpoint['model_state_dict'])

# save init model

In [None]:
# Persist config, backbone weights and tokenizer side by side in ./model_init.
for artefact in (config, model, tokenizer):
    artefact.save_pretrained('model_init')

In [None]:
# Display the backbone's module structure as the cell output.
model

In [None]:
# Display the resolved backbone configuration as the cell output.
config

In [None]:
def tokenize(tokenizer,texts):
    """Encode each text separately and return the encodings as a list.

    Every text is truncated and padded to ``CFG.max_len`` tokens, special
    tokens included, so all encodings come back with the same length.
    """
    return [
        tokenizer(
            text,
            max_length=CFG.max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
        )
        for text in texts
    ]

In [None]:
# Tokenise both splits once up front and keep the encodings next to the text.
for frame in (train_df, test_df):
    frame['token'] = tokenize(tokenizer,frame.excerpt)

ds1 = RobertaDataset(train_df.token, train_df.target)
# The test split has no target, so the row index is passed in the target slot.
ds2 = RobertaDataset(test_df.token, test_df.index)

test_ldr = D.DataLoader(
    ds2,
    batch_size=CFG.batch_size,
    shuffle=False,
    collate_fn = collate,
    num_workers=0,
)

# My Model with additional layers

In [None]:
class MyModel(nn.Module):
    """Pretrained encoder followed by a single linear regression head.

    Args:
        model: backbone whose forward output exposes ``['hidden_states']``
            (the config is loaded with ``output_hidden_states=True``).
    """

    def __init__(self, model):
        super(MyModel, self).__init__()
        self.model = model
        # Take the head's input width from the backbone config so that
        # checkpoints other than the 1024-wide "large" models work too;
        # 1024 stays as the fallback when no config is available.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 1024)
        self.linear1=nn.Linear(hidden_size,1)

    def forward(self, text,attention_mask):
        """Return one score per sequence, shape ``(batch, 1)``."""
        # Pass the mask by keyword so it cannot land in a different positional slot.
        x1 = self.model(text,attention_mask=attention_mask)
        last_hidden_state = x1['hidden_states'][-1]
        # NOTE(review): plain mean over the sequence axis — padded positions
        # are averaged in as well; kept as-is to stay compatible with
        # checkpoints trained this way.
        text_emb=last_hidden_state.mean(axis=1)
        x2=self.linear1(text_emb)
        return x2

In [None]:
# Wrap the backbone with the regression head. NOTE: this rebinds `model`, so
# running the cell a second time would wrap the wrapper — run it only once per kernel.
model=MyModel(model)
# Deep copy of the freshly wrapped weights; each CV fold reloads this state before training.
init_state=copy.deepcopy(model.state_dict())

# Cross Validation

In [None]:
@torch.no_grad()
def val_rmse(model,loader, f_loss,mode='train'):
    """Score ``model`` on ``loader`` without tracking gradients.

    Args:
        model: callable as ``model(texts, attention_mask=attns)``.
        loader: iterable of ``(texts, attns, target)`` batches.
        f_loss: loss returning a scalar tensor (an MSE loss in this notebook).
        mode: ``'eval'`` switches the model to eval mode, ``'train'`` to train
            mode (dropout stays active); any other value leaves it untouched.

    Returns:
        ``(loss, pred, target)``: the square root of the mean per-batch loss,
        plus the concatenated predictions and targets as NumPy arrays.
    """
    if mode=='eval':
        model.eval()
    elif mode=='train':
        model.train()

    batch_losses, batch_preds, batch_targets = [], [], []
    for texts, attns, target in loader:
        pred = model(texts, attention_mask=attns).reshape(-1)
        batch_losses.append(f_loss(pred, target).item())
        batch_preds.append(pred.to('cpu').numpy())
        batch_targets.append(target.to('cpu').numpy())

    # Batches are weighted equally, whatever their size.
    loss = np.sqrt(np.mean(batch_losses))
    return loss, np.concatenate(batch_preds), np.concatenate(batch_targets)

In [None]:
class StateRecorder:
    """Tracks validation scores and keeps a CPU copy of the best weights.

    Attributes are plain fields: ``best_score`` (lowest smoothed score so far),
    ``best_state_dict`` (weights at that point, ``None`` until the first
    record), ``best_epoch``, ``stop`` (early-stop flag), ``scores`` (raw
    scores) and ``cv_scores`` (smoothed scores).
    """

    def __init__(self):
        self.best_score=float('inf')
        self.best_state_dict=None
        self.best_epoch=0
        self.stop=False
        self.scores=[]
        self.cv_scores=[]

    def record(self,score,epoch,model):
        """Register one validation ``score`` obtained at validation round ``epoch``.

        The score is averaged with up to ``CFG.score_avg_n - 1`` preceding
        ones. On a new minimum the model weights are snapshotted; otherwise
        ``stop`` is raised once more than ``CFG.early_stop_epoch`` rounds have
        passed since the best one.
        """
        self.scores.append(score)
        avg_score=np.mean(self.scores[-CFG.score_avg_n:])
        self.cv_scores.append(avg_score)
        print(f'average({CFG.score_avg_n}) validation (train) rmse:{avg_score.round(6)}')
        if avg_score<self.best_score:
            self.best_score=avg_score
            # copy=True forces a real copy: plain .cpu() hands back the live
            # parameter tensor when the model already sits on the CPU, and the
            # "best" snapshot would then silently follow later training steps.
            self.best_state_dict={
                k: v.detach().to('cpu', copy=True)
                for k,v in model.state_dict().items()
            }
            self.best_epoch=epoch
        elif (epoch-self.best_epoch)>CFG.early_stop_epoch:
            self.stop=True

In [None]:
def get_group_parameters(model,lr,lr_diff_rate=1,weight_decay=0.01):
    """Build one optimizer parameter group per tensor with layer-wise learning rates.

    Parameters are visited in ``model.named_parameters()`` order. Each time a
    name of the form ``<x>.encoder.layer.<n>...`` starts a new layer number,
    the relative learning rate is divided by ``lr_diff_rate``. All rates are
    finally rescaled so that the last group gets exactly ``lr``. Every group,
    biases included, receives the same ``weight_decay``.

    Returns:
        A list of dicts with keys ``'params'``, ``'weight_decay'`` and ``'lr'``.
    """
    relative_lr=1
    current_layer=-1
    groups=[]
    for param_name,param in model.named_parameters():
        parts=param_name.split('.')
        in_encoder_layer=len(parts)>=4 and parts[1]=='encoder' and parts[2]=='layer'
        if in_encoder_layer:
            layer_no=int(parts[3])
            if layer_no!=current_layer:
                # First parameter of a new encoder layer: step the rate up.
                current_layer=layer_no
                relative_lr/=lr_diff_rate
        groups.append({'params':param,'weight_decay':weight_decay,'lr':relative_lr})
    # Normalise so the highest (last assigned) rate equals the requested lr.
    scale=lr/relative_lr
    for group in groups:
        group['lr']*=scale
    return groups

In [None]:
def re_init(model,n=4,initializer=None):
    """Re-initialise the last ``n`` blocks of ``model.model.encoder.layer`` in place.

    Linear weights are drawn with ``initializer`` (kaiming normal when None)
    and their biases zeroed; LayerNorm modules are reset to weight 1 / bias 0.
    ``n == 0`` is a no-op. The attribute path is model specific — adjust it
    when the backbone names its layers differently.
    """
    if n==0:
        return None
    init_weight=nn.init.kaiming_normal_ if initializer is None else initializer
    for block in model.model.encoder.layer[-n:]:
        for sub in block.modules():
            if isinstance(sub, nn.Linear):
                init_weight(sub.weight)
                if sub.bias is not None:
                    nn.init.zeros_(sub.bias)
            elif isinstance(sub, nn.LayerNorm):
                nn.init.zeros_(sub.bias)
                nn.init.ones_(sub.weight)
    return None

In [None]:
def train(model,train_ldr,val_ldr,fold):
    """Fine-tune ``model`` on one CV fold with periodic validation.

    Gradients are accumulated over ``CFG.grad_avg_n`` mini-batches per
    optimizer step, with a cosine learning-rate schedule over all steps.
    Validation runs right after an optimizer step whenever the iteration
    counter is a multiple of the validation gap; each validation round is
    numbered by ``val_epoch`` and stored in the ``epoch`` column.

    Args:
        model: the model to train (modified in place).
        train_ldr: training DataLoader yielding ``(texts, attns, target)``.
        val_ldr: validation DataLoader with the same batch layout.
        fold: fold number, stored in the result frame.

    Returns:
        ``(fold_res, recorder)``: a DataFrame with columns ``target``,
        ``pred``, ``fold``, ``epoch`` and ``is_best_epoch`` holding every
        validation round's predictions, and the ``StateRecorder`` that holds
        the best score and weights.
    """
    epoch_res=[]
    val_epoch=1
    epoch_L=len(train_ldr)
    # A loader shorter than CFG.val_freq used to give val_gap == 0 and a
    # ZeroDivisionError in the modulo below; validate at least every step.
    val_gap=max(1,epoch_L//CFG.val_freq)
    print(f'validation gap iteration is:{val_gap}')

    mse= torch.nn.MSELoss()

    recorder=StateRecorder()
    re_init(model,n=CFG.re_init_n,initializer=CFG.initializer)
    opt_params=get_group_parameters(model,CFG.lr,CFG.lr_diff_rate,CFG.weight_decay)
    optimizer = AdamW(opt_params,correct_bias=True)
    # The same model object is reused across folds. If the previous fold ended
    # between two optimizer steps, its leftover gradients would otherwise be
    # added to this fold's first step.
    optimizer.zero_grad()
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(CFG.epochs*len(train_ldr))//CFG.grad_avg_n,eta_min=0)
    total_iter=0
    for i in range(CFG.epochs):
        loss_seq = []
        for texts, attns, target in train_ldr:
            model.train()
            outputs = model(texts, attention_mask=attns)
            # Scale so the accumulated gradient is the mean over the accumulation window.
            loss = mse(outputs.reshape(-1), target)/CFG.grad_avg_n
            loss_seq.append(loss.item()*CFG.grad_avg_n)
            loss.backward()
            total_iter+=1
            if total_iter%CFG.grad_avg_n==0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                if total_iter%val_gap==0:
                    # Validation deliberately runs in train mode (dropout active).
                    val_loss,val_pred,val_target = val_rmse(model, val_ldr,mse,mode='train')

                    res=pd.DataFrame()
                    res['target']=val_target
                    res['pred']=val_pred
                    res['fold']=fold
                    res['epoch']=val_epoch
                    epoch_res.append(res)
                    print(f'################  epoch {i+1}   val epoch {val_epoch} #################################')
                    print(f'training rmse:{np.sqrt(np.mean(loss_seq)).round(6)}')
                    print(f'validation rmse (train):{val_loss.round(6)}')
                    recorder.record(val_loss,val_epoch,model)

                    val_epoch+=1
            # Honour the recorder's early-stop flag (it was set but never read).
            if recorder.stop:
                break
        if recorder.stop:
            print(f'early stop: no improvement since val epoch {recorder.best_epoch}')
            break
    torch.cuda.empty_cache()
    fold_res=pd.concat(epoch_res,axis=0).reset_index(drop=True)
    # Flag the rows that belong to the best validation round.
    fold_res['is_best_epoch']=fold_res.epoch==recorder.best_epoch
    return fold_res,recorder

In [None]:
# Run K-fold cross-validation: every fold starts from the same initial weights,
# trains, predicts the test set and saves its best weights to disk.
split_indices=CV_split(len(train_df),k=CFG.folds,shuffle=CFG.cv_shuffle,seed=7)
models=[]
cv_res=[]
for fold in range(1,CFG.folds+1):
    print(f'fold {fold}')
    train_index,val_index=split_indices[fold-1]
    train_ds=D.Subset(ds1, train_index)
    valid_ds = D.Subset(ds1, val_index)
    ################
    # Reseed before every fold so each one sees the same random stream.
    random.seed(CFG.seed)
    os.environ['PYTHONHASHSEED'] = str(CFG.seed)
    np.random.seed(CFG.seed)
    torch.manual_seed(CFG.seed)
    torch.cuda.manual_seed(CFG.seed)
    torch.cuda.manual_seed_all(CFG.seed)
    torch.backends.cudnn.deterministic = True
    ##################
    train_ldr = D.DataLoader(train_ds, batch_size=CFG.batch_size,
                             shuffle=True, collate_fn = collate,num_workers=0)
    val_ldr = D.DataLoader(valid_ds, batch_size=CFG.batch_size,
                           shuffle=False, collate_fn = collate,num_workers=0)
    
    # Reset to the initial weights so folds do not leak into each other.
    model.load_state_dict(init_state)
    model.to(CFG.device)
    fold_res,recorder=train(model,train_ldr,val_ldr,fold)
    print(f'best epoch: {recorder.best_epoch} with best rmse:{recorder.best_score}')
    cv_res.append(fold_res)
    # Predict the test set with the best validated weights — the same ones that
    # are saved below — rather than whatever the last training step left.
    if recorder.best_state_dict is not None:
        model.load_state_dict(recorder.best_state_dict)
    preds=score_test(model,test_ldr)
    test_df[f'fold {fold} preds']=preds
    torch.save({'model_state_dict':recorder.best_state_dict},f'fold_{fold}_model')
    # Drop the recorder (it holds a full copy of the weights) before the next fold.
    del recorder
    gc.collect()
cv_res=pd.concat(cv_res,axis=0).reset_index(drop=True)

# Deeper look into out-of-fold (OOF) predictions

In [None]:
# Absolute error of every out-of-fold prediction.
cv_res['ae']=(cv_res.target-cv_res.pred).abs()

* oof score for each epoch

In [None]:
# Pooled out-of-fold RMSE for each validation round, across all folds.
rmse_curve=[]
last_epoch=cv_res.epoch.max()
for epoch in range(1,last_epoch+1):
    epoch_data=cv_res.loc[cv_res.epoch==epoch].reset_index(drop=True)
    score=rmse(epoch_data.pred,epoch_data.target)
    rmse_curve.append(score)
    ae_autocorr=epoch_data.ae.autocorr()
    # Standard deviation of the absolute error divided by sqrt(number of rows).
    ae_spread=np.std(epoch_data.ae)/np.sqrt(len(epoch_data))
    print(f'abs error auto correlation: {ae_autocorr}')
    print(f'epoch {epoch} rmse: {score} oof pred error std:{ae_spread}')
plt.plot(rmse_curve)

* best oof score

In [None]:
# RMSE over the rows taken from each fold's best validation round.
best_rows=cv_res.is_best_epoch==True
epoch_data=cv_res[best_rows].reset_index(drop=True)
rmse(epoch_data.pred,epoch_data.target)

In [None]:
# Average the per-fold test predictions with equal weights.
final_preds=sum(
    test_df[f'fold {i+1} preds']/CFG.folds
    for i in range(CFG.folds)
)
res_df['target']=final_preds

In [None]:
# Display the submission frame as the cell output.
res_df