In [None]:
import pandas as pd
import numpy as np
import torch

import transformers
import torch.nn as nn
from torch.utils.data import Dataset
import torch.nn as nn
from sklearn import model_selection
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

# Training configuration shared by the cells below.
# MAX_LEN: token length passed to the tokenizer as `max_length` by the dataset class.
MAX_LEN = 128
# Batch sizes used by the training and validation DataLoaders respectively.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# Number of epochs run for each fold.
EPOCHS = 5

# Tokenizer loaded from the same 'xlnet-base-cased' checkpoint name the model uses.
# NOTE(review): `do_lower_case=True` presumably lower-cases the text before
# tokenizing, although the checkpoint name says "cased" — confirm this is intended.
TOKENIZER = transformers.XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

In [None]:
# `copy` ships with every Python interpreter, so import it directly instead of
# changing into an external CPython source checkout and importing `Lib.copy`
# from there (which ties the notebook to that dataset and to whatever Python
# version the checkout targets). The working directory is left untouched.
import copy

In [None]:
from torch.utils.data import DataLoader
import gc

import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides diagnostics emitted by libraries
# (e.g. deprecation notices or shape/broadcasting warnings) — consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

In [None]:
# Load the training split, keeping only the identifier, the text and the
# regression target; the preview below is the cell's displayed output.
df = pd.read_csv(
    "../input/commonlitreadabilityprize/train.csv",
    usecols=["id", "excerpt", "target"],
)
df.head()

In [None]:
# Assign every row to one of five cross-validation folds.
# The shuffle is seeded so that Restart Kernel -> Run All reproduces the same
# split (and therefore comparable per-fold scores) on every run.
FOLD_SEED = 42

df["kfold"] = -1  # placeholder, overwritten for every row by the loop below
df = df.sample(frac=1, random_state=FOLD_SEED).reset_index(drop=True)
y = df.target.values
kf = model_selection.KFold(n_splits=5)  # plain KFold: the target is continuous

# `v_` holds the positional indices of the validation rows of fold `f`; after
# reset_index(drop=True) positions and index labels coincide, so .loc is safe.
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, "kfold"] = f
df.head(10)

In [None]:
class XLNETDataset:
    """Map-style dataset that encodes text excerpts into fixed-length model inputs.

    Args:
        excerpt: indexable sequence of raw texts.
        target: indexable sequence of regression targets aligned with ``excerpt``.
        tokenizer: object exposing ``encode_plus``; when omitted the module-level
            ``TOKENIZER`` is used.
        max_len: length every encoded sample is padded/truncated to; when omitted
            the module-level ``MAX_LEN`` is used.
    """

    def __init__(self, excerpt, target, tokenizer=None, max_len=None):
        self.excerpt = excerpt
        self.target = target
        # Fall back to the notebook-wide defaults only when nothing was passed in.
        self.tokenizer = TOKENIZER if tokenizer is None else tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

    def __getitem__(self, item):
        """Encode sample ``item``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of
            length ``max_len``) and ``targets`` (float scalar tensor).
        """
        # Collapse runs of whitespace (including newlines) into single spaces.
        excerpt = " ".join(str(self.excerpt[item]).split())

        # Ask the tokenizer to both truncate and pad to `max_len`. Letting the
        # tokenizer pad means its own pad id and padding side are used, so no
        # manual zero-padding is needed afterwards.
        inputs = self.tokenizer.encode_plus(
            excerpt,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }

In [None]:
# Materialise the fold-0 split and print one encoded validation sample as a
# quick sanity check of the dataset class.
fold = 0
df_valid = df[df.kfold == fold].reset_index(drop=True)
df_train = df[df.kfold != fold].reset_index(drop=True)

train_data = XLNETDataset(excerpt=df_train.excerpt.values, target=df_train.target.values)
val_data = XLNETDataset(excerpt=df_valid.excerpt.values, target=df_valid.target.values)

idx = 7

# Each lookup re-encodes the sample; the four fields are printed in a fixed order.
for field in ("ids", "mask", "token_type_ids", "targets"):
    print(val_data[idx][field])

In [None]:
# Batch iterators over the fold-0 datasets built above.
# Training: shuffled, incomplete final batch dropped.
training_dataloader = DataLoader(
    train_data,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

# Validation: original order, every sample kept.
val_dataloader = DataLoader(
    val_data,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=4,
)

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
from torch import device as device_

if torch.cuda.is_available():
    device = device_("cuda")
else:
    device = device_("cpu")
print(device)

In [None]:
class XLNETBERTBaseCased(nn.Module):
    """XLNet encoder followed by a pooled single-output linear regression head.

    The encoder's first output is pooled over dim 1 with both a mean and a max,
    restricted to the positions where ``mask`` is non-zero; the two pooled
    vectors are averaged and projected to one value per sample.
    """

    def __init__(self):
        super(XLNETBERTBaseCased, self).__init__()
        self.bert = transformers.XLNetModel.from_pretrained('xlnet-base-cased', return_dict=False)
        # Defined but not applied in forward(); kept so the attribute still exists.
        self.bert_drop = nn.Dropout(0.5)
        # Head width follows the loaded encoder instead of a hard-coded 768.
        self.out = nn.Linear(self.bert.config.d_model, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return a ``(batch, 1)`` tensor of predictions.

        Args:
            ids: token id tensor fed to the encoder.
            mask: attention mask; non-zero marks positions that take part in pooling.
            token_type_ids: segment id tensor fed to the encoder.
        """
        x = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)

        # With return_dict=False the encoder returns a tuple; element 0 is pooled below.
        o1 = x[0]

        # Broadcastable 0/1 weights so padded positions do not leak into the pooling.
        keep = mask.unsqueeze(-1).to(dtype=o1.dtype)
        token_count = keep.sum(dim=1)

        # Mean over kept positions only; clamp avoids 0/0 for an all-padding row.
        mean_pooling = (o1 * keep).sum(dim=1) / token_count.clamp(min=1.0)

        # Max over kept positions only: padded slots are pushed to the dtype minimum.
        floor = torch.finfo(o1.dtype).min
        max_pooling, _ = torch.max(o1.masked_fill(keep == 0, floor), dim=1)
        # An all-padding row would otherwise yield the sentinel value itself.
        max_pooling = torch.where(token_count > 0, max_pooling, torch.zeros_like(max_pooling))

        avg_sum = torch.add(mean_pooling, max_pooling) / 2

        output = self.out(avg_sum)
        return output
    
# Instantiate the network and place its parameters on the selected device.
model = XLNETBERTBaseCased().to(device)

In [None]:
# Optimizer and learning-rate schedule for the model defined above.
param_optimizer = list(model.named_parameters())

# Name fragments of parameters that must not be weight-decayed (biases and
# layer-norm scales). XLNet calls its normalisation modules `layer_norm`, so the
# BERT-style "LayerNorm.*" fragments alone would never match them.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight", "layer_norm.weight"]

optimizer_parameters = [
    {
        # Everything else: regular weight matrices, decayed.
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.01,
    },
    {
        # Biases and layer-norm parameters: no decay.
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

# One scheduler step is taken per batch, so count the batches the loader
# actually yields (it drops the incomplete last batch) rather than dividing
# the dataframe length.
num_train_steps = len(training_dataloader) * EPOCHS

optimizer = AdamW(optimizer_parameters, lr=2e-5)

# Linear decay from the initial learning rate to zero, without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
)

In [None]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSELoss(yhat, y))``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        """Return the scalar RMSE between predictions ``yhat`` and targets ``y``.

        When the two tensors hold the same number of elements but differ in
        shape (e.g. ``(B, 1)`` predictions against ``(B,)`` targets), ``yhat``
        is reshaped to ``y``'s shape first. Without this, the element-wise
        difference broadcasts to ``(B, B)`` and the loss compares every
        prediction with every target.
        """
        if yhat.shape != y.shape and yhat.numel() == y.numel():
            yhat = yhat.reshape(y.shape)
        return torch.sqrt(self.mse(yhat, y))


loss_fn = RMSELoss()

In [None]:
# defining the training loop
def train_loop_fn(data_loader, model, optimizer, device, scheduler=None, criterion=None):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        data_loader: iterable of batches; each batch is a dict with the keys
            ``ids``, ``mask``, ``token_type_ids`` and ``targets``.
        model: module called as ``model(ids=..., mask=..., token_type_ids=...)``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: optional LR scheduler, stepped once per batch when given.
        criterion: loss callable ``(predictions, targets) -> scalar tensor``;
            defaults to the module-level ``loss_fn``.

    Returns:
        float: sum of the batch losses divided by the number of batches seen,
        or ``0.0`` when ``data_loader`` yields nothing.
    """
    if criterion is None:
        criterion = loss_fn

    model.train()

    running_loss = 0.0
    num_batches = 0

    for batch in data_loader:
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()

        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

        # Flatten both sides so predictions and targets line up one-to-one
        # instead of broadcasting against each other inside the loss.
        loss = criterion(outputs.reshape(-1), targets.reshape(-1))

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        return 0.0

    # Each accumulated term is already a per-batch mean, so normalise by the
    # number of batches of *this* loader.
    return running_loss / num_batches



def eval_loop_fn(data_loader, model, device, criterion=None):
    """Evaluate ``model`` on ``data_loader`` and return the loss over all samples.

    Predictions and targets of every batch are collected and the loss is
    computed once over the concatenation, so the result does not depend on how
    the data is split into batches.

    Args:
        data_loader: iterable of batches; each batch is a dict with the keys
            ``ids``, ``mask``, ``token_type_ids`` and ``targets``.
        model: module called as ``model(ids=..., mask=..., token_type_ids=...)``.
        device: device the batch tensors are moved to.
        criterion: loss callable ``(predictions, targets) -> scalar tensor``;
            defaults to the module-level ``loss_fn``.

    Returns:
        float: loss over the whole loader, or ``0.0`` when it yields nothing.
    """
    if criterion is None:
        criterion = loss_fn

    model.eval()

    all_predictions = []
    all_targets = []

    # No gradients are needed for evaluation; skipping them avoids building
    # the autograd graph.
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

            # Flatten so predictions and targets align one-to-one, and move the
            # small result tensors to the CPU for accumulation.
            all_predictions.append(outputs.reshape(-1).float().cpu())
            all_targets.append(targets.reshape(-1).cpu())

        if not all_predictions:
            return 0.0

        valid_loss = criterion(torch.cat(all_predictions), torch.cat(all_targets)).item()

    return valid_loss

In [None]:
def _build_optimizer_and_scheduler(net, num_train_steps):
    """Create the AdamW optimizer and linear-decay scheduler for ``net``."""
    # Biases and layer-norm parameters are excluded from weight decay; both the
    # BERT-style and the XLNet-style layer-norm names are listed.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight", "layer_norm.weight"]
    named = list(net.named_parameters())
    grouped = [
        {
            "params": [p for n, p in named if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in named if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    fold_optimizer = AdamW(grouped, lr=2e-5)
    fold_scheduler = get_linear_schedule_with_warmup(
        fold_optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )
    return fold_optimizer, fold_scheduler


def _run():
    """Cross-validate over the five folds and save the best model.

    For every fold a fresh model, optimizer and scheduler are created and
    trained for ``EPOCHS`` epochs on that fold's own train/validation split.
    The weights of the epoch with the lowest validation loss across all folds
    are restored into a model that is written to ``./bert_model2.bin``.
    """
    no_of_folds = 5

    best_loss = None
    best_state = None
    fold_model = None
    fold_optimizer = None
    fold_scheduler = None

    for i in range(no_of_folds):
        a_string = "*" * 20

        print(a_string, " FOLD NUMBER ", i, a_string)

        df_train = df[df.kfold != i].reset_index(drop=True)
        df_valid = df[df.kfold == i].reset_index(drop=True)

        # Loaders built from THIS fold's split (not the notebook-level fold-0 ones).
        train_loader = DataLoader(
            XLNETDataset(excerpt=df_train.excerpt.values, target=df_train.target.values),
            num_workers=4,
            batch_size=TRAIN_BATCH_SIZE,
            shuffle=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            XLNETDataset(excerpt=df_valid.excerpt.values, target=df_valid.target.values),
            num_workers=4,
            batch_size=VALID_BATCH_SIZE,
            shuffle=False,
            drop_last=False,
        )

        # Release the previous fold's model/optimizer before allocating new ones.
        fold_model = fold_optimizer = fold_scheduler = None
        gc.collect()
        torch.cuda.empty_cache()

        # Every fold starts from the pretrained weights with a full LR schedule.
        fold_model = XLNETBERTBaseCased().to(device)
        fold_optimizer, fold_scheduler = _build_optimizer_and_scheduler(
            fold_model, len(train_loader) * EPOCHS
        )

        for epoch in range(EPOCHS):
            print(f"Epoch --> {epoch+1} / {EPOCHS}")
            print(f"-------------------------------")

            train_loss = train_loop_fn(train_loader, fold_model, fold_optimizer, device, fold_scheduler)
            print('RMSE training Loss: {:.4f}'.format(train_loss))

            valid_loss = eval_loop_fn(valid_loader, fold_model, device)
            print('RMSE validation Loss: {:.4f}\n'.format(valid_loss))

            # Snapshot the weights at the moment they are best, so the saved
            # model really is the one that produced `best_loss`. The copy is
            # kept on the CPU to avoid holding a second model on the device.
            if best_loss is None or valid_loss < best_loss:
                best_loss = valid_loss
                best_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in fold_model.state_dict().items()
                }
        print('\n')

    if best_state is None:
        raise RuntimeError("No epoch was run, so there is no model to save.")

    # Reuse the last fold's model object as the container for the best weights.
    best_model = fold_model
    best_model.load_state_dict(best_state)

    torch.save(best_model,'./bert_model2.bin')
    print()
    print("The least loss we got among all the folds is {:.4f}".format(best_loss))

if __name__ == "__main__":
    _run()