# About this notebook

* PyTorch DistilBERT training code.
* The inference notebook is [here](https://www.kaggle.com/snnclsr/commonlit-pytorch-distilbert-inference/).


If this notebook is helpful, feel free to upvote :)

**Some parts of this notebook are taken from [Y.Nakama](https://www.kaggle.com/yasufuminakama)'s notebooks. Please also check out his other notebooks [here](https://www.kaggle.com/yasufuminakama/code).**

# Imports

In [None]:
import os
import time
import math
import random
from pathlib import Path

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer
from transformers import DistilBertTokenizer, DistilBertModel

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: shows the selected device as the cell output.
device

# Data Loading

In [None]:
# Prefix for generated files; init_logger's default log path is OUTPUT_DIR + 'train.log'.
OUTPUT_DIR = "./"
# Directory containing train.csv, test.csv and sample_submission.csv.
BASE_DATA_PATH = Path("../input/commonlitreadabilityprize/")

!ls {BASE_DATA_PATH}

In [None]:
# Load the training set, the test set and the sample submission from BASE_DATA_PATH.
df_train = pd.read_csv(BASE_DATA_PATH / "train.csv")
df_test = pd.read_csv(BASE_DATA_PATH / "test.csv")
df_sub = pd.read_csv(BASE_DATA_PATH / "sample_submission.csv")

In [None]:
# Display the first training row to inspect the available columns.
df_train.head(1)

In [None]:
# Display the (max, min) of the regression target.
df_train.target.max(), df_train.target.min()

In [None]:
# Display the first test row to inspect the available columns.
df_test.head(1)

# Utils

In [None]:
class AverageMeter:
    """Running weighted average that also remembers the latest value.

    Attributes (all set to 0 by ``reset``):
        val: the value passed to the most recent ``update`` call.
        sum: accumulated ``val * n`` over all updates.
        count: accumulated ``n`` over all updates.
        avg: ``sum / count`` as of the most recent update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, counted ``n`` times in the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    # %d drops any fractional part of the leftover seconds.
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return elapsed and estimated remaining time as ``'<elapsed> (remain <remaining>)'``.

    Args:
        since: start timestamp, in the same units as ``time.time()``.
        percent: fraction of the work completed so far; must be non-zero
            because the elapsed time is divided by it.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the fraction done so far.
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))



def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Return this module's logger, writing bare messages to the console and a file.

    Safe to call more than once: handlers attached by a previous call are
    removed and closed first, so re-running the cell neither duplicates each
    log line nor leaks an open file handle.

    Args:
        log_file: path of the log file to write to.

    Returns:
        The ``logging.Logger`` named after this module, at INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object on every call, so drop the handlers
    # left over from an earlier call before attaching fresh ones.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger


def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``.

    Also stores ``seed`` in the ``PYTHONHASHSEED`` environment variable and
    switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
        
# Shared logger for the training code, created with init_logger's default log file.
LOGGER = init_logger()

# Dataset

In [None]:
class CommonLitDataset(Dataset):
    """Dataset that tokenizes one row of a DataFrame per item.

    Args:
        df: DataFrame whose rows expose ``excerpt`` (the text) and ``target``
            (the regression label).
        tokenizer: callable invoked as ``tokenizer(text, return_tensors="pt",
            max_length=..., padding="max_length", truncation=True)`` and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df, self.tokenizer, self.max_length = df, tokenizer, max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"ids", "masks", "targets"}`` for the row at position ``idx``."""
        record = self.df.iloc[idx]
        encoding = self.tokenizer(
            record.excerpt,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        # Index 0 drops the leading dimension of the tokenizer output
        # (presumably a batch axis of size 1 -- TODO confirm).
        ids = encoding["input_ids"][0]
        masks = encoding["attention_mask"][0]
        target = torch.tensor(record.target).float()
        return {"ids": ids, "masks": masks, "targets": target}

# Model

In [None]:
class TextRegressionModel(nn.Module):
    """Pretrained transformer backbone with a small regression head.

    The head is ``Linear -> ReLU -> Dropout -> Linear`` applied to the hidden
    state at sequence position 0, producing one value per input sequence.

    Args:
        model_name: name or path passed to ``AutoModel.from_pretrained``.
        dropout_p: dropout probability used inside the head.
    """

    def __init__(self, model_name, dropout_p=0.1):
        super().__init__()

        self.model = AutoModel.from_pretrained(model_name)
        # Size the head from the backbone's config instead of assuming 768, so
        # other backbones work too. Fall back to the previous hard-coded 768
        # when the config does not expose ``hidden_size``.
        hidden_size = getattr(self.model.config, "hidden_size", 768)
        self.features = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for each sequence (last dimension has size 1).

        Args:
            input_ids: token ids, forwarded to the backbone as ``input_ids``.
            attention_mask: forwarded to the backbone as ``attention_mask``.
        """
        backbone_output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sequence summary.
        first_token_state = backbone_output.last_hidden_state[:, 0]
        hidden = F.relu(self.features(first_token_state))
        hidden = self.dropout(hidden)
        return self.out(hidden)

# Helper Functions

In [None]:
def train_step(model, criterion, optimizer, data_loader, epoch, device=device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids, attention_masks)``.
        criterion: loss called as ``criterion(predictions, targets)``.
        optimizer: optimizer stepped and zeroed once per batch.
        data_loader: iterable of dict batches with ``"ids"``, ``"masks"`` and
            ``"targets"`` tensors.
        epoch: zero-based epoch number, used only in the progress line.
        device: device the batch tensors are moved to.

    Returns:
        The mean loss over the pass, weighted by batch size.
    """
    batch_time, data_time, train_loss = AverageMeter(), AverageMeter(), AverageMeter()
    model.train()

    start = end = time.time()

    for step, batch in enumerate(data_loader):
        # Time spent waiting for the loader to produce this batch.
        data_time.update(time.time() - end)

        input_ids, attention_masks, targets = (
            batch[key].to(device) for key in ("ids", "masks", "targets")
        )
        bs = input_ids.size(0)

        # squeeze(1) removes dimension 1 of the model output so it lines up
        # with ``targets`` (assumes targets has shape (batch,) -- TODO confirm).
        predictions = model(input_ids, attention_masks).squeeze(1)
        loss = criterion(predictions, targets)
        train_loss.update(loss.item(), bs)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        batch_time.update(time.time() - end)
        end = time.time()

        total_steps = len(data_loader)
        if step % CFG.print_freq == 0 or step == (total_steps - 1):
            progress = timeSince(start, float(step+1)/total_steps)
            message = ('Epoch: [{0}][{1}/{2}] '
                       'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                       'Elapsed {remain:s} '
                       'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                       .format(
                           epoch+1, step, total_steps, batch_time=batch_time,
                           data_time=data_time, loss=train_loss,
                           remain=progress))
            print(message)

    return train_loss.avg
        
    
def eval_step(model, criterion, data_loader, epoch, device=device):
    """Evaluate ``model`` over ``data_loader`` without updating its weights.

    The whole pass runs under ``torch.no_grad()``: nothing here calls
    ``backward()``, so recording the autograd graph would only cost memory
    and time.

    Args:
        model: module called as ``model(input_ids, attention_masks)``.
        criterion: loss called as ``criterion(predictions, targets)``.
        data_loader: iterable of dict batches with ``"ids"``, ``"masks"`` and
            ``"targets"`` tensors.
        epoch: zero-based epoch number, used only in the progress line.
        device: device the batch tensors are moved to.

    Returns:
        The mean loss over the pass, weighted by batch size.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    eval_loss = AverageMeter()
    model.eval()

    start = end = time.time()

    with torch.no_grad():
        for step, batch in enumerate(data_loader):
            # Time spent waiting for the loader to produce this batch.
            data_time.update(time.time() - end)

            input_ids = batch["ids"].to(device)
            attention_masks = batch["masks"].to(device)
            targets = batch["targets"].to(device)
            bs = input_ids.size(0)

            output = model(input_ids, attention_masks)
            # squeeze(1) removes dimension 1 of the model output so it lines
            # up with ``targets`` (assumes targets is (batch,) -- TODO confirm).
            loss = criterion(output.squeeze(1), targets)
            eval_loss.update(loss.item(), bs)
            batch_time.update(time.time() - end)
            end = time.time()

            if step % CFG.print_freq == 0 or step == (len(data_loader) - 1):
                print('Epoch: [{0}][{1}/{2}] '
                      'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                      'Elapsed {remain:s} '
                      'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                      .format(
                          epoch+1, step, len(data_loader), batch_time=batch_time,
                          data_time=data_time, loss=eval_loss,
                          remain=timeSince(start, float(step+1)/len(data_loader)))
                     )

    return eval_loss.avg

# Config

In [None]:
class CFG:
    """Hyperparameters and run settings read by the training code in this file."""

    # Pretrained checkpoint name: used to load the model and the tokenizer,
    # and as the prefix of the saved tokenizer directory and checkpoint files.
    model_name = "distilbert-base-cased"
    
    # Passed to the tokenizer as max_length (with padding="max_length" and
    # truncation=True), so every encoded excerpt has this many tokens.
    max_length = 256
    # Dropout probability of the regression head.
    dropout_p = 0.5
    # Batch size of both the training and the validation DataLoader.
    batch_size = 32
    # Number of epochs run per fold.
    n_epochs = 5
    # Adam weight_decay.
    weight_decay = 1e-6
    # Initial Adam learning rate.
    lr = 3e-4
    # eta_min of the cosine schedule.
    min_lr = 1e-6
    # Scheduler selector; train_loop only handles "CosineAnnealingLR".
    scheduler = "CosineAnnealingLR"
    # T_max of CosineAnnealingLR. The scheduler is stepped once per epoch, so
    # with n_epochs = 5 it is stepped fewer than T_max times.
    # NOTE(review): confirm that stopping part-way through the cosine curve is intended.
    T_max = 10
    # Seed used by seed_torch and as the KFold random_state.
    seed = 42
    # Number of cross-validation folds.
    n_folds = 5    
    # A progress line is printed every print_freq steps (and on the last step).
    print_freq = 50
    # num_workers of both DataLoaders.
    num_workers = 4


# Train Loop

In [None]:
def train_loop(folds, fold):
    """Train and validate one cross-validation fold, saving the best epoch.

    Rows whose ``fold`` column equals ``fold`` form the validation set; all
    other rows form the training set. After each epoch the scheduler is
    stepped, and whenever the validation loss improves the model, optimizer
    and scheduler state dicts are saved to
    ``f"{CFG.model_name}_fold_{fold}_best.pth"``.

    Args:
        folds: DataFrame with a ``fold`` column plus the ``excerpt`` and
            ``target`` columns read by ``CommonLitDataset``.
        fold: fold id to hold out for validation.

    Returns:
        The lowest validation loss observed (``np.inf`` when no epoch ran).

    Raises:
        ValueError: if ``CFG.scheduler`` is not a supported scheduler name.
    """
    # Boolean masks select the rows directly, without an index round-trip.
    is_valid = folds["fold"] == fold
    train_folds = folds[~is_valid].reset_index(drop=True)
    valid_folds = folds[is_valid].reset_index(drop=True)

    tokenizer = DistilBertTokenizer.from_pretrained(CFG.model_name)
    # Saved next to the checkpoints so the tokenizer can be reloaded from disk.
    tokenizer.save_pretrained(f"{CFG.model_name}_tokenizer")

    train_dataset = CommonLitDataset(df=train_folds, tokenizer=tokenizer, max_length=CFG.max_length)
    valid_dataset = CommonLitDataset(df=valid_folds, tokenizer=tokenizer, max_length=CFG.max_length)

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=CFG.batch_size,
                                   shuffle=True,
                                   num_workers=CFG.num_workers,
                                   pin_memory=True)
    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=CFG.batch_size,
                                   shuffle=False,
                                   num_workers=CFG.num_workers,
                                   pin_memory=True)

    def get_scheduler(optimizer):
        """Build the learning-rate scheduler named by ``CFG.scheduler``."""
        if CFG.scheduler == 'CosineAnnealingLR':
            return optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError(f"Unsupported scheduler: {CFG.scheduler!r}")

    model = TextRegressionModel(model_name=CFG.model_name, dropout_p=CFG.dropout_p)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    scheduler = get_scheduler(optimizer)

    criterion = nn.MSELoss().to(device)
    best_loss = np.inf

    for epoch in range(CFG.n_epochs):

        start_time = time.time()
        train_loss = train_step(model, criterion, optimizer, train_data_loader, epoch)
        eval_loss = eval_step(model, criterion, valid_data_loader, epoch)
        scheduler.step()

        elapsed = time.time() - start_time
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - avg_eval_loss: {eval_loss:.4f}')

        # Keep only the checkpoint with the best validation loss so far.
        if eval_loss < best_loss:
            best_loss = eval_loss

            torch.save({
                "model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict()
            }, f"{CFG.model_name}_fold_{fold}_best.pth")

    return best_loss
        

# CV Split

In [None]:
from sklearn.model_selection import KFold

# Give every training row a fold id in [0, CFG.n_folds).
folds = df_train.copy()
Fold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
for n, split in enumerate(Fold.split(folds)):
    # split[1] holds the validation positions of split ``n``. They are used as
    # .loc labels, which relies on ``folds`` having the default 0..N-1 index
    # (presumably true, since df_train comes straight from read_csv).
    folds.loc[split[1], "fold"] = n

# The column is created by partial assignment, so cast it to int explicitly.
folds["fold"] = folds["fold"].astype(int)
fold_sizes = folds.groupby(["fold"]).size()
print(fold_sizes)

# Main

In [None]:
def main():
    """Run ``train_loop`` once per fold, from fold 0 to ``CFG.n_folds - 1``."""
    for fold_id in range(CFG.n_folds):
        train_loop(folds, fold_id)
        

In [None]:
# Seed the random number generators with CFG.seed, then train every fold.
seed_torch(seed=CFG.seed)
main()