In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file below the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install accelerate
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from accelerate import Accelerator
from transformers import (AutoModel,AutoConfig,AdamW,
                          AutoTokenizer,get_cosine_schedule_with_warmup)

from colorama import Fore, Back, Style
# Short aliases for the colorama colour codes used in the training log messages.
r_, b_, c_ = Fore.RED, Fore.BLUE, Fore.CYAN
g_, y_, m_ = Fore.GREEN, Fore.YELLOW, Fore.MAGENTA
# Reset code appended after coloured text in the log messages.
sr_ = Style.RESET_ALL
# Competition tables: training excerpts with targets, test excerpts and the
# sample submission file.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Drop newline characters from every excerpt (they are removed outright, not
# replaced by a space).
train_data['excerpt'] = train_data['excerpt'].map(lambda text: text.replace('\n',''))

# Bin count of floor(1 + log2(n)) (a Sturges-style rule); the equal-width bin
# index of each target is stored so the folds can later be stratified on it.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data['bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

bins = train_data['bins'].to_numpy()
target = train_data['target'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true,y_pred)
    return np.sqrt(mse)

In [None]:
# Hyper-parameters and paths shared by the rest of the notebook.
config = dict(
    lr=2e-5,
    wd=0.01,
    batch_size=8,
    valid_step=10,   # validation is run every `valid_step` training batches
    max_len=256,
    epochs=4,
    nfolds=5,
    seed=1000,
    model_path='../input/notebookb33113bcde/epoch2roberta_large',
)

# One output directory per fold; the fold's checkpoint and tokenizer files are
# written there during training.
for i in range(config['nfolds']):
    os.makedirs(f'model{i}',exist_ok=True)

def seed_everything(seed=1000):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Was misspelled 'PYTHONASSEED', which Python never reads. Setting it at
    # runtime only influences interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different algorithms from run to run, so it
    # has to be off for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False

# Fix all random seeds before any randomised step runs.
seed_everything(config['seed'])

# Label every row with the index of the fold in which it is validation data.
# Splits are stratified on the binned target; -1 marks "not assigned yet".
train_data['Fold'] = -1
kfold = StratifiedKFold(n_splits=config['nfolds'],shuffle=True,random_state=config['seed'])
for k, (_, valid_idx) in enumerate(kfold.split(train_data,bins)):
    train_data.loc[valid_idx,'Fold'] = k
class CLRPDataset(Dataset):
    """Dataset pairing the tokenizer output of each excerpt with its target.

    Args:
        df: Frame providing the ``excerpt`` and ``target`` columns.
        tokenizer: Callable applied to one excerpt at a time.
        max_len: Value passed to the tokenizer as ``max_length``; excerpts are
            padded/truncated to it.
    """

    def __init__(self,df,tokenizer,max_len=248):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.excerpt = df['excerpt'].to_numpy()
        self.targets = df['target'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        """Return ``(encoding, target)`` for the row at position ``idx``."""
        tokenizer_kwargs = dict(
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        encoding = self.tokenizer(self.excerpt[idx], **tokenizer_kwargs)
        label = torch.tensor(self.targets[idx],dtype=torch.float)
        return encoding, label
class Model(nn.Module):
    """Pretrained transformer encoder followed by a small regression head.

    The encoder's last hidden states are averaged over the sequence axis,
    layer-normalised and passed through two linear layers that produce one
    scalar per input sequence.

    Args:
        model_name: Name or path handed to ``AutoConfig.from_pretrained`` and
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()

        # Named `encoder_config` so it does not shadow the notebook-level
        # `config` dictionary.
        encoder_config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config=encoder_config)

        # Size the head from the checkpoint instead of hard-coding 1024, so an
        # encoder with a different width works without edits.
        hidden_size = encoder_config.hidden_size

        self.drop_out1 = nn.Dropout(0)  # p=0, i.e. currently a no-op
        self.drop_out2 = nn.Dropout(0.1)

        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.l1 = nn.Linear(hidden_size, 512)
        self.l2 = nn.Linear(512, 1)

        # Only the freshly created head modules are re-initialised; the
        # pretrained encoder weights are left untouched.
        for head_module in (self.layer_norm1, self.l1, self.l2):
            self._init_weights(head_module)

    def _init_weights(self, module):
        """Initialise ``module`` in place according to its type.

        Linear/Embedding weights are drawn from N(0, 0.02); Linear biases and
        the Embedding padding row are zeroed; LayerNorm is reset to weight 1,
        bias 0. Other module types are left unchanged.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask):
        """Return one prediction per sequence.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The head output with its trailing singleton axis squeezed away.
        """
        outputs = self.model(input_ids, attention_mask)
        last_hidden_state = outputs[0]
        # NOTE(review): this is a plain mean over axis 1, so positions that are
        # masked out by `attention_mask` (padding) are averaged in as well.
        # Kept as-is to stay compatible with existing checkpoints/inference.
        out = torch.mean(last_hidden_state, 1)
        out = self.layer_norm1(out)
        out = self.drop_out1(out)
        out = self.l1(out)

        out = self.drop_out2(out)
        out = self.l2(out)

        preds = out.squeeze(-1)
        return preds
def run(fold):
    """Fine-tune the model with ``fold`` held out and validate on that fold.

    Rows of ``train_data`` whose ``Fold`` differs from ``fold`` are used for
    training, the remaining rows for validation. Whenever the validation loss
    does not get worse, the model weights and the tokenizer files are written
    to ``./model{fold}``.

    Args:
        fold: Value of the ``Fold`` column identifying the validation split.

    Returns:
        The lowest validation loss observed (mean of the per-batch RMSE
        values, see ``evaluate``).
    """

    def loss_fn(outputs,targets):
        """Return the RMSE between flattened predictions and targets."""
        return torch.sqrt(nn.MSELoss()(outputs.view(-1),targets.view(-1)))

    def evaluate(model,valid_loader):
        """Run ``model`` over ``valid_loader`` without gradients.

        Returns:
            ``(valid_loss, valid_rmse)``: the mean of the per-batch RMSE values
            and the RMSE computed once over all collected predictions. The two
            generally differ; checkpoint selection uses ``valid_loss``.
        """
        model.eval()
        valid_loss = 0
        all_targets, all_outputs = [], []
        with torch.no_grad():
            for inputs,targets in valid_loader:
                # Collapse everything after the batch axis into one axis.
                inputs = {key:val.reshape(val.shape[0],-1) for key,val in inputs.items()}
                outputs = model(**inputs)
                valid_loss += loss_fn(outputs,targets).item()
                all_outputs.extend(outputs.cpu().detach().numpy().tolist())
                all_targets.extend(targets.cpu().detach().numpy().tolist())

        valid_loss /= len(valid_loader)
        # Arguments in (y_true, y_pred) order, matching rmse_score's signature.
        valid_rmse = rmse_score(all_targets,all_outputs)
        return valid_loss,valid_rmse

    def train_and_evaluate_loop(train_loader,valid_loader,model,loss_fn,optimizer,
                                epoch,fold,best_score,valid_step,lr_scheduler=None):
        """Train for one epoch, validating every ``valid_step`` batches.

        A checkpoint is saved each time the validation loss is less than or
        equal to ``best_score``.

        Returns:
            The updated best validation loss.
        """
        train_loss = 0
        for i, (inputs,targets) in enumerate(train_loader):
            optimizer.zero_grad()
            # evaluate() switches to eval mode, so training mode is restored
            # at the start of every step.
            model.train()
            inputs = {key:val.reshape(val.shape[0],-1) for key,val in inputs.items()}
            outputs = model(**inputs)
            loss = loss_fn(outputs,targets)
            # Accelerate requires its own backward so that mixed precision and
            # distributed setups are handled correctly.
            accelerator.backward(loss)
            optimizer.step()

            train_loss += loss.item()

            if lr_scheduler is not None:
                lr_scheduler.step()

            # Validate on every `valid_step`-th batch and on the last batch.
            if (i % valid_step == 0) or ((i+1) == len(train_loader)):
                valid_loss,valid_rmse = evaluate(model,valid_loader)

                if valid_loss <= best_score:
                    print(f"Epoch:{epoch}|Batch: {i}|Step:{valid_step}|Train Loss:{train_loss/(i+1)}|Valid Loss:{valid_loss}|Rmse Score: {valid_rmse}")
                    print(f"{g_}Validation loss Decreased from {best_score} to {valid_loss}{sr_}")

                    best_score = valid_loss
                    # Save the underlying module so the state-dict keys do not
                    # depend on any wrapper added by accelerator.prepare().
                    torch.save(accelerator.unwrap_model(model).state_dict(),
                               f'./model{fold}/model{fold}.bin')
                    tokenizer.save_pretrained(f'./model{fold}')

        return best_score

    def create_optimizer(model):
        """Build AdamW with layer-wise decayed learning rates.

        The top encoder layer starts at 3e-5 and every layer below it gets the
        previous rate times 0.975. The regression head (every parameter outside
        the ``model.`` encoder prefix) uses ``config['lr']``. Encoder parameters
        that are not part of ``encoder.layer.*`` (e.g. embeddings) are, as
        before, not handed to the optimizer.
        """
        named_params = list(model.named_parameters())
        # Derived from the encoder instead of assuming 24 layers.
        num_layers = model.model.config.num_hidden_layers

        parameters = []
        lr = 3e-5
        for layer in range(num_layers-1,-1,-1):
            parameters.append({
                'params': [p for n,p in named_params if f'encoder.layer.{layer}.' in n],
                'weight_decay': config['wd'],
                'lr': lr,
            })
            lr *= 0.975

        # The previous name filter ('layer_norm'/'linear'/'pooling') never
        # matched `l1`/`l2`, so the linear head was left at its random
        # initialisation. Select the whole head by excluding the encoder.
        parameters.append({
            'params': [p for n,p in named_params if not n.startswith('model.')],
            'weight_decay': config['wd'],
            'lr': config['lr'],
        })
        return AdamW(parameters)

    accelerator = Accelerator()
    print(f"{accelerator.device} is used")

    x_train = train_data.query(f"Fold != {fold}")
    x_valid = train_data.query(f"Fold == {fold}")

    model = Model(config['model_path'])
    tokenizer = AutoTokenizer.from_pretrained('../input/huggingface-roberta/roberta-large')

    train_ds = CLRPDataset(x_train,tokenizer,config['max_len'])
    train_dl = DataLoader(train_ds,
                          batch_size=config['batch_size'],
                          num_workers=4,
                          shuffle=True,
                          pin_memory=True,
                          drop_last=True)

    valid_ds = CLRPDataset(x_valid,tokenizer,config['max_len'])
    valid_dl = DataLoader(valid_ds,
                          batch_size=10,
                          num_workers=4,
                          shuffle=False,
                          pin_memory=True,
                          drop_last=False)

    optimizer = create_optimizer(model)
    # Total number of scheduler steps follows the configured epoch count.
    lr_scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=config['epochs'] * len(train_dl))

    model,train_dl,valid_dl,optimizer,lr_scheduler = accelerator.prepare(model,train_dl,valid_dl,optimizer,lr_scheduler)

    print(f"Fold: {fold}")
    best_score = 9999
    start_time = time.time()
    for epoch in range(config["epochs"]):
        print(f"Epoch Started:{epoch}")
        best_score = train_and_evaluate_loop(train_dl,valid_dl,model,loss_fn,optimizer,epoch,fold,
                                             best_score,config['valid_step'],lr_scheduler)

        end_time = time.time()
        print(f"{m_}Time taken by epoch {epoch} is {end_time-start_time:.2f}s{sr_}")
        start_time = end_time

    return best_score
# Train one model per fold, in fold order, keeping each fold's best validation loss.
best_score_per_fold = list(map(run, range(config['nfolds'])))