In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    paths = [os.path.join(folder, name) for name in names]
    for path in paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import string
import re
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm.notebook import tqdm
import transformers
import torch
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup, AdamW, get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
from transformers import get_polynomial_decay_schedule_with_warmup
from sklearn.metrics import mean_squared_error
import torch.nn.functional as F

In [None]:
# Training table; later cells read its `excerpt` and `target` columns.
train = pd.read_csv(
    filepath_or_buffer="../input/commonlitreadabilityprize/train.csv",
)

In [None]:
# Tokenizer for the same 'distilroberta-base' checkpoint that the model cells load.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='distilroberta-base',
)

In [None]:
# Fixed sequence length (in tokens) used for every tokenizer call in this notebook.
MAX_LEN = 256

# Preview the tokenizer output for the first excerpt, using the same settings
# as train_valid_dataset.__getitem__: pad to MAX_LEN and truncate anything
# longer, so the preview always reflects what the model will actually receive.
tokenizer(train.excerpt[0],
          return_tensors='pt',
          max_length=MAX_LEN,
          padding='max_length',
          truncation=True)

In [None]:
# tokenizer(train.excerpt[0], return_tensors='pt', max_length=256, padding='max_length')
class train_valid_dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Parameters
    ----------
    excerpt : indexable of text
        Indexed with ``excerpt[index]`` for ``index`` in ``range(len(excerpt))``
        (a pandas Series therefore needs a 0..n-1 index).
    target : indexable of numbers
        Regression target aligned with ``excerpt``.

    Each item is a dict with keys:
      * ``'ids'``    -- the tokenizer's ``input_ids`` as a long tensor
      * ``'mask'``   -- the tokenizer's ``attention_mask`` as a long tensor
      * ``'target'`` -- ``target[index]`` as a float tensor

    The tokenizer is called with ``return_tensors='pt'`` and its tensors are
    returned with whatever shape it produces (presumably ``(1, maxlen)`` for
    a single string -- the training/validation loops flatten that extra axis).
    """

    def __init__(self, excerpt, target):
        self.excerpt = excerpt
        self.target = target
        # Both come from notebook-level globals defined in earlier cells.
        self.maxlen = MAX_LEN
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, index):
        text = str(self.excerpt[index])

        # Use the tokenizer stored on the instance (not the global), so the
        # dataset keeps working if the global is later rebound.
        encoded = self.tokenizer(text,
                                 return_tensors='pt',
                                 max_length=self.maxlen,
                                 padding='max_length',
                                 truncation=True)

        # The tokenizer already returns tensors; as_tensor only converts the
        # dtype when needed instead of copy-constructing a tensor from a
        # tensor (which emits a UserWarning on every item).
        return {
            'ids': torch.as_tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.as_tensor(encoded['attention_mask'], dtype=torch.long),
            'target': torch.tensor(self.target[index], dtype=torch.float),
        }

In [None]:
# Sequence-classification model with a single output (num_labels=1); the
# training code below fits that output to the target with an MSE loss.
# NOTE: the cross-validation loop rebinds `model` to a fresh instance for
# every fold, so this particular instance is not the one that gets trained.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    'distilroberta-base',
    num_labels=1,
)

In [None]:
def training_function(dataloader, model, device, optimizer, scheduler):
    """Run one training epoch and return the epoch RMSE.

    Parameters
    ----------
    dataloader : iterable with ``len()``
        Yields dicts with ``'ids'``, ``'mask'`` and ``'target'`` tensors whose
        first dimension is the batch.
    model : callable ``model(ids, mask)`` returning an object with ``.logits``
        Put into train mode here; expected to emit one value per sample.
    device : torch.device
        Device the batch tensors are moved to.
    optimizer, scheduler
        Both are stepped once per batch.

    Returns
    -------
    float
        Square root of the mean squared error over all samples seen during
        the epoch (measured with the weights as they were at each step).
    """
    model.train()
    squared_error_sum = 0.0
    total = 0

    for data in tqdm(dataloader, total=len(dataloader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.float)

        batch_size = targets.shape[0]
        # Collapse any singleton axis between batch and sequence explicitly.
        # A bare .squeeze() would also drop the batch axis whenever a batch
        # holds a single sample, handing the model a 1-D input.
        ids = ids.reshape(batch_size, -1)
        mask = mask.reshape(batch_size, -1)

        optimizer.zero_grad()

        outputs = model(ids, mask)
        # One prediction per sample, always 1-D so it lines up with targets.
        predictions = outputs.logits.reshape(-1)

        loss = F.mse_loss(predictions, targets)

        loss.backward()
        optimizer.step()
        scheduler.step()

        # mse_loss averages over the batch, so loss * batch_size is the sum
        # of squared errors. Accumulating that (rather than per-batch RMSEs)
        # yields a true RMSE that does not depend on the batch size.
        squared_error_sum += loss.item() * batch_size
        total += batch_size

    return float(np.sqrt(squared_error_sum / total))

In [None]:
def validation_function(dataloader, model, device):
    """Evaluate the model on a dataloader and return the RMSE.

    Parameters
    ----------
    dataloader : iterable with ``len()``
        Yields dicts with ``'ids'``, ``'mask'`` and ``'target'`` tensors whose
        first dimension is the batch.
    model : callable ``model(ids, mask)`` returning an object with ``.logits``
        Put into eval mode here; expected to emit one value per sample.
    device : torch.device
        Device the batch tensors are moved to.

    Returns
    -------
    float
        Square root of the mean squared error over all samples.
    """
    model.eval()
    squared_error_sum = 0.0
    total = 0

    # No gradients are needed anywhere in evaluation, including the loss.
    with torch.no_grad():
        for data in tqdm(dataloader, total=len(dataloader)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            targets = data['target'].to(device, dtype=torch.float)

            batch_size = targets.shape[0]
            # Collapse any singleton axis between batch and sequence
            # explicitly. A bare .squeeze() would also drop the batch axis
            # for a single-sample batch.
            ids = ids.reshape(batch_size, -1)
            mask = mask.reshape(batch_size, -1)

            outputs = model(ids, mask)
            # One prediction per sample, always 1-D so it lines up with targets.
            predictions = outputs.logits.reshape(-1)

            loss = F.mse_loss(predictions, targets)

            # mse_loss averages over the batch, so loss * batch_size is the
            # sum of squared errors. Accumulating it gives a true RMSE that
            # is independent of the batch size.
            squared_error_sum += loss.item() * batch_size
            total += batch_size

    return float(np.sqrt(squared_error_sum / total))

In [None]:
# Build stratified folds for a continuous target: cut `target` into
# floor(1 + log2(n)) bins and use the bin index as the stratification label.
num_bins = int(np.floor(1 + np.log2(len(train))))
train.loc[:, 'bins'] = pd.cut(train['target'], bins=num_bins, labels=False)
bins = train.bins.to_numpy()
skfold = StratifiedKFold(n_splits=5)
# split() returns a one-shot generator. Materialise it so the training cell
# can be re-run (or the folds inspected) without silently iterating over
# zero folds the second time.
new = list(skfold.split(train, bins))

In [None]:
epochs = 5
seed = 42
# Fall back to the CPU so the cell still runs (slowly) without a GPU instead
# of failing on the first .to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One fresh model, optimizer and schedule per fold; each fold's final weights
# and the tokenizer are written to ./model_<fold>.
for k, (train_idx, valid_idx) in enumerate(new):
    print("******************** FOLD: %d ********************" % (k+1))
    # Seed per fold so weight initialisation and batch shuffling are repeatable.
    torch.manual_seed(seed + k)

    x_train, y_train = train.iloc[train_idx], train.target.iloc[train_idx]
    x_valid, y_valid = train.iloc[valid_idx], train.target.iloc[valid_idx]

    # train_valid_dataset indexes with positions 0..n-1, so the fold subsets
    # need a fresh RangeIndex.
    x_train = x_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    x_valid = x_valid.reset_index(drop=True)
    y_valid = y_valid.reset_index(drop=True)

    model = transformers.AutoModelForSequenceClassification.from_pretrained('distilroberta-base',
                                                                            num_labels=1)
    model.to(device)

    train_dataset = train_valid_dataset(excerpt=x_train.excerpt,
                                        target=y_train)
    # Shuffle training batches every epoch; without it the model sees the
    # samples in the same fixed order on every pass.
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=16,
                                                   shuffle=True)
    valid_dataset = train_valid_dataset(excerpt=x_valid.excerpt,
                                        target=y_valid)
    valid_dataloader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                                   batch_size=8)

    # Apply weight decay to everything except biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0},]
    optimizer = AdamW(optimizer_parameters, lr=0.00002)

    # The scheduler is stepped once per batch, so the schedule length is
    # batches-per-epoch * epochs.
    training_steps = int(len(train_dataloader)*epochs)
    # No warmup; use int(training_steps*0.1) here for a 10% warmup instead.
    num_warmup_steps = 0
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=num_warmup_steps,
                                                num_training_steps=training_steps)

    for epoch in range(epochs):
        print("#################### EPOCH: %d ####################" % (epoch+1))
        train_loss = training_function(train_dataloader, model, device, optimizer, scheduler)
        valid_loss = validation_function(valid_dataloader, model, device)
        print("Training Loss: %f, Validation Loss: %f" % (train_loss, valid_loss))
    model.save_pretrained('./model_'+str(k+1))
    tokenizer.save_pretrained('./model_'+str(k+1))