## COLAB One-timers

In [None]:
# !nvidia-smi
# from google.colab import drive

# drive.mount('/content/drive', force_remount=True)
# !mkdir dataset

# !cp /content/drive/MyDrive/Research/commonlit/*.csv dataset


# !ls dataset
# !pip3 install -q transformers tensorboard_logger seqeval sentencepiece tokenizers sentence_transformers


In [None]:
# import sys
# DRIVE_DIR="/content/drive/MyDrive/Research/commonlit/"
# sys.path.insert(0, DRIVE_DIR)
# from utils import seed_everything, save_model_weights, count_params

## Imports

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" 
import time, torch, random, glob, re, gc, datetime, tokenizers, pdb

import numpy as np
import transformers
import pandas as pd
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt

from tokenizers import *
from transformers import *
from functools import partial
from pathlib import Path
from tqdm.notebook import tqdm
from torch.nn import functional as F
from itertools import cycle, chain
from torch.utils.data import Dataset, DataLoader, IterableDataset, TensorDataset
from sklearn.model_selection import train_test_split, RepeatedKFold, KFold
from ast import literal_eval as eval

# Configs and Globals

In [None]:
class Config:
    """Run-wide settings, read everywhere as plain class attributes (never instantiated here)."""

    # ---- reproducibility / cross-validation ----
    seed = 2021
    random_state = 2021
    k = 4  # presumably the number of cross-validation folds -- confirm against k_fold
    # selected_folds = [3, 4]
    selected_folds = list(range(k))  # every index 0 .. k-1

    # ---- model / task ----
    device = "cuda"
    task = "Regression"
    model = "bert-base-uncased"
    # checkpoint = "/content/drive/MyDrive/Research/commonlit/2021-06-24/Regression_roberta-base_ fold - 5_bs768.pt"
    # NOTE(review): the checkpoint filename below says "bert-base-cased" while `model` is
    # "bert-base-uncased" -- confirm the two are meant to be used together.
    checkpoint = "/content/drive/MyDrive/Research/commonlit/2021-06-20/Regression_bert-base-cased_fold-1_epoch-10_full_model.pt"
    # checkpoint = None
    pretrained = True  # Transformer() loads pretrained weights when this is truthy
    lowercase = False
    num_labels = 1  # size of the model's output head

    # ---- optimisation ----
    batch_size = 768
    batch_size_val = int(batch_size * 1.5)
    weight_decay = 1.
    epochs = 100
    lr = 1e-3
    warmup_prop = 0.1
    save_every_epoch = 10  # fit() writes a checkpoint every this many epochs

    # ---- layer freezing ----
    # Freeze the transformer model and train only the final dense layer.
    freeze_transformer = True
    model_names = [
        f"transformer.{prefix}"
        for prefix in ("roberta", "bert", "albert", "transformer", "distilbert")
    ]


# Root folder under which get_checkpoint_dir() creates one sub-folder per day.
CP_DIR = Path("/content/drive/MyDrive/Research/commonlit")

# Number of worker processes handed to every DataLoader built in fit().
NUM_WORKERS = 2

# Location of the competition CSV files.
DATA_DIR = Path("../input/commonlitreadabilityprize/")
TRAIN = DATA_DIR.joinpath("train.csv")
TEST = DATA_DIR.joinpath("test.csv")

In [None]:
# Registry of supported checkpoints.
# Each key is a Hugging Face model name; each value holds
#   "model_config": (model class, config class) -- unpacked by Transformer.__init__
#   "tokenizer":    the matching tokenizer class
# Every entry uses a *sequence*-classification head: fit() squeezes the logits to one
# value per sample, which a token-classification head (one output per token) cannot provide.
TRANSFORMERS = {
    "roberta-base": {
        "model_config": (RobertaForSequenceClassification, RobertaConfig),
        "tokenizer": RobertaTokenizer,
    },
    "bert-base-cased": {
        "model_config": (BertForSequenceClassification, BertConfig),
        "tokenizer": BertTokenizer,
    },
    "bert-base-uncased": {
        "model_config": (BertForSequenceClassification, BertConfig),
        "tokenizer": BertTokenizer,
    },
    "albert-base-v2": {
        # Was AlbertForTokenClassification, which emits per-token logits.
        "model_config": (AlbertForSequenceClassification, AlbertConfig),
        "tokenizer": AlbertTokenizer,
    },
    "gpt2": {
        "model_config": (GPT2ForSequenceClassification, GPT2Config),
        "tokenizer": GPT2Tokenizer,
    },
    "distilbert-base-cased": {
        "model_config": (DistilBertForSequenceClassification, DistilBertConfig),
        "tokenizer": DistilBertTokenizer,
    },
}

# Helper functions

In [None]:
def get_checkpoint_dir():
    """Return the folder ``CP_DIR/<today's ISO date>``, creating it if it does not exist yet."""
    dated_dir = CP_DIR / str(datetime.date.today())
    if not dated_dir.exists():
        dated_dir.mkdir(parents=True)
    return dated_dir

def checkpoint_name():
    """Return the ``<task>_<model>`` prefix used for checkpoint file names and log entries."""
    return f"{Config.task}_{Config.model}"

def save_log(list_, logdir):
    """Append the strings in ``list_`` to the file ``logdir``, newline-separated.

    The file is created when missing; a trailing newline is always written.
    """
    # Append mode creates a missing file, so no existence check is needed.
    with open(logdir, "a") as log_file:
        log_file.write("\n".join(list_) + "\n")
    
def load(model, with_checkpoint=None):
    """Build a ``Transformer`` for the model name ``model``.

    When ``with_checkpoint`` is truthy it is passed to ``torch.load`` (mapped to CPU)
    and the result is loaded into the new module as its state dict.
    """
    network = Transformer(model)
    if not with_checkpoint:
        return network

    state_dict = torch.load(with_checkpoint, map_location="cpu")
    network.load_state_dict(state_dict)
    print("Checkpoint loaded!", end="\r")
    return network

def count_params(model):
    """Return the total number of elements across parameters that have ``requires_grad`` set."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable
        

def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs with ``seed`` and switch cuDNN to deterministic mode."""
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Seed each random number generator the notebook relies on.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Disable cuDNN autotuning and require deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
    
def save_model_weights(model, filename, cp_folder=""):
    """Write ``model.state_dict()`` to ``cp_folder/filename`` and print the destination path."""
    destination = os.path.join(cp_folder, filename)
    torch.save(model.state_dict(), destination)
    print(f"Saved weights to {destination}!!!\n")

# Dataset

In [None]:
class ComprehensionDataset(IterableDataset):
    """Stream fixed-length token windows for every row of a dataframe.

    Each row's ``excerpt`` is tokenized, right-padded to a multiple of ``max_len``
    and cut into ``max_len``-sized windows. One sample is yielded per window; all
    windows of a row share that row's ``id`` and ``target``.

    Args:
        df: dataframe with ``id``, ``excerpt`` and ``target`` columns.
        tokenizer: object exposing ``encode(text)``, ``pad_token_id`` and ``eos_token_id``.
        shuffle: shuffle the rows once, at construction time.
        max_len: window length in tokens.
        train: stored on the instance; not otherwise used by this class.
        normalize_label: accepted for interface compatibility; currently unused.
    """

    def __init__(self, df, tokenizer, shuffle=False, max_len=256, train=True, normalize_label=False):
        if shuffle:
            self.df = df.sample(frac=1).reset_index(drop=True)
        else:
            self.df = df

        # Taken from self.df (not df) so targets stay aligned with rows after shuffling,
        # and stored on the instance because getitems() reads self.targets.
        self.targets = self.df['target']

        self.tokenizer = tokenizer
        self.ids = list(range(len(self.df)))
        self.maxlen = max_len
        self.train = train

        # Compare against None rather than truthiness: a pad id of 0 (e.g. BERT's [PAD])
        # is valid and must not fall through to the EOS fallback.
        if tokenizer.pad_token_id is None:
            # GPT2-style tokenizers define no pad token; pad with EOS instead.
            self.pad_token = tokenizer.eos_token_id
        else:
            self.pad_token = tokenizer.pad_token_id

    def __len__(self):
        """Number of dataframe rows (NOT the number of windows yielded while iterating)."""
        return len(self.ids)

    def pad(self, array):
        """Right-pad ``array`` with the pad id up to the next multiple of ``maxlen``.

        Returns:
            (padded, last_len): the padded int64 array (always at least one window long)
            and the count of real tokens in its final window. ``last_len`` equals
            ``maxlen`` when the input fills the last window exactly and 0 for empty input.
        """
        length = len(array)
        # Ceiling division; an empty input still produces one all-padding window.
        n_windows = max(1, -(-length // self.maxlen))
        padded = np.full(n_windows * self.maxlen, self.pad_token, dtype=np.int64)
        padded[:length] = array
        last_len = length - (n_windows - 1) * self.maxlen
        return padded, last_len

    def chunks(self, list_):
        """Yield consecutive ``maxlen``-sized int64 slices of ``list_`` (the last may be shorter)."""
        n = self.maxlen
        for start in range(0, len(list_), n):
            yield np.array(list_[start:start + n], dtype=np.int64)

    def getitems(self, idx):
        """Yield one sample dict per token window of the row at position ``idx``.

        Each dict holds ``id``, ``input_ids`` (long), ``attention_mask`` (long; 1 on
        real tokens, 0 on padding) and ``target`` (float).
        """
        row = self.df.iloc[idx]
        row_id = row['id']
        target = self.targets.iloc[idx]

        token_ids = self.tokenizer.encode(row['excerpt'])
        padded, last_len = self.pad(token_ids)
        input_ids = torch.from_numpy(np.stack(list(self.chunks(padded))))

        # Only the final window can contain padding.
        attention_mask = torch.ones_like(input_ids)
        attention_mask[-1, last_len:] = 0

        for window_ids, window_mask in zip(input_ids, attention_mask):
            yield {
                "id": row_id,
                "input_ids": torch.as_tensor(window_ids, dtype=torch.long),
                "attention_mask": torch.as_tensor(window_mask, dtype=torch.long),
                "target": torch.as_tensor(target, dtype=torch.float),
            }

    def get_stream(self, ids):
        """Chain the per-row window generators for the given row positions."""
        yield from chain.from_iterable(map(self.getitems, ids))

    def __iter__(self):
        # Under a multi-worker DataLoader every worker receives its own copy of this
        # dataset; without sharding each worker would stream all rows and every sample
        # would be duplicated num_workers times.
        worker = torch.utils.data.get_worker_info()
        if worker is None:
            ids = self.ids
        else:
            ids = self.ids[worker.id::worker.num_workers]
        return self.get_stream(ids)
    

# Model

In [None]:
class Transformer(nn.Module):
    """Thin ``nn.Module`` wrapper around a Hugging Face model chosen from ``TRANSFORMERS``.

    Args:
        model: a key of ``TRANSFORMERS`` (also used as the Hugging Face model name).

    With ``Config.pretrained`` set, pretrained weights are loaded via ``from_pretrained``.
    Otherwise the model is built from a configuration only (randomly initialised): from
    the JSON file registered under the optional ``'config'`` key of the registry entry,
    or, when no such key exists, from the configuration published for ``model``.
    """

    def __init__(self, model):
        super().__init__()
        self.name = model
        entry = TRANSFORMERS[model]
        model_type, config_type = entry['model_config']
        if Config.pretrained:
            self.transformer = model_type.from_pretrained(
                model, output_hidden_states=True, num_labels=Config.num_labels
            )
        else:
            # No registry entry defines a 'config' key, so an unconditional lookup
            # raised KeyError; treat the key as optional.
            config_file = entry.get('config')
            if config_file is not None:
                config = config_type.from_json_file(config_file)
            else:
                config = config_type.from_pretrained(model)
            config.num_labels = Config.num_labels
            config.output_hidden_states = True
            self.transformer = model_type(config)

    def forward(self, input_ids, attention_mask=None):
        """Run the wrapped model and return its output object unchanged."""
        output = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return output


# Fitting and K-fold

Much of the code below is borrowed from Theò.

In [None]:
def fit(model, train_dataset, val_dataset, fold, epochs, batch_size, weight_decay=0, warmup_prop=0.0, lr=5e-4):
    """Train ``model`` with MSE loss and evaluate it on ``val_dataset`` after every epoch.

    Args:
        model: module whose forward returns a mapping with a ``'logits'`` entry of shape
            (batch, 1). It is not moved here -- presumably the caller has already placed
            it on ``Config.device``; confirm.
        train_dataset, val_dataset: datasets yielding dicts with ``input_ids``,
            ``attention_mask`` and ``target``.
        fold: zero-based fold index, used only in log lines and checkpoint names.
        epochs: number of passes over ``train_dataset``.
        batch_size: batch size for both loaders.
        weight_decay: decay for all parameters except biases and LayerNorm weights.
        warmup_prop: fraction of all optimisation steps used for linear LR warm-up.
        lr: peak learning rate.

    Returns:
        list of float: validation predictions from the last epoch, one per yielded
        sample, in loader order (empty if no epoch ran).

    Side effects: appends to ``log.txt`` in today's checkpoint folder and, when
    ``epochs != 1``, saves weights every ``Config.save_every_epoch`` epochs.
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=NUM_WORKERS)

    # Iterable datasets stream more samples than they have rows, so len(loader) is not
    # the true batch count; count by iterating once.
    train_loader_len = _count_batches(train_loader)
    val_loader_len = _count_batches(val_loader)

    # One parameter group per tensor so weight decay can be disabled selectively.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    opt_params = [
        {
            "params": [param],
            "weight_decay": 0 if any(tag in name for tag in no_decay) else weight_decay,
            "lr": lr,
        }
        for name, param in model.named_parameters()
    ]
    optimizer = AdamW(opt_params, lr=lr, betas=(0.5, 0.999))

    n_steps = epochs * train_loader_len
    num_warmup_steps = int(warmup_prop * n_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, n_steps)

    loss_function = nn.MSELoss()
    # loss_function=nn.L1Loss()

    # Pre-bound so the cleanup at the end is safe even if a loader yields nothing.
    loss = None
    data = None
    preds = []

    save_log(
        ["\n", str(datetime.datetime.now()).split(".")[0], "\n", checkpoint_name() + f"_fold_{fold+1}"],
        logdir=get_checkpoint_dir() / "log.txt",
    )
    with tqdm(total=epochs, desc="Epoch {}/{}".format(1, epochs), unit="sections", position=0, leave=True) as pbar:
        for epoch in range(epochs):
            pbar.set_description("Epoch {}/{}".format(epoch + 1, epochs))
            start_time = time.time()

            # ---- training pass ----
            model.train()
            avg_loss = 0
            with tqdm(total=train_loader_len, desc="training iterations", unit="batch", position=1, leave=True) as pbar2:
                for data in train_loader:
                    optimizer.zero_grad()
                    loss, _ = _forward_loss(model, data, loss_function)
                    avg_loss += loss.item() / train_loader_len
                    loss.backward()
                    # Clip AFTER backward so the freshly computed gradients are clipped.
                    nn.utils.clip_grad_norm_(model.parameters(), 10.0)
                    optimizer.step()
                    scheduler.step()
                    pbar2.update()

            # ---- validation pass ----
            model.eval()
            avg_val_loss = 0.
            preds = []
            with torch.no_grad():
                with tqdm(total=val_loader_len, desc="validation iterations", unit="batch", position=2, leave=True) as pbar3:
                    for data in val_loader:
                        loss, batch_preds = _forward_loss(model, data, loss_function)
                        avg_val_loss += loss.item() / val_loader_len
                        preds.extend(batch_preds.detach().cpu().tolist())
                        pbar3.update()

            dt = time.time() - start_time
            current_lr = scheduler.get_last_lr()[0]
            if epochs != 1 and (epoch + 1) % Config.save_every_epoch == 0:
                save_model_weights(
                    model,
                    f'{checkpoint_name()}_fold-{fold+1}_epoch-{epoch+1}_{CHECKPOINT_KEYWORD}.pt',
                    cp_folder=get_checkpoint_dir(),
                )

            log_lr = f"Epoch {epoch + 1}/{epochs} \t lr={current_lr:.1e} \t t={dt:.0f}s \t \n"
            print(log_lr)
            log_score = f"loss={avg_loss:.3f}\t val_loss={avg_val_loss:.3f}  \n"
            print(log_score)
            save_log([log_lr, log_score], logdir=get_checkpoint_dir() / "log.txt")
            pbar.update()

    # Drop references to the last batch before asking CUDA to release cached memory.
    del loss, data, train_loader, val_loader
    if Config.device != "cpu":
        torch.cuda.empty_cache()
    gc.collect()

    return preds


def _count_batches(loader):
    """Return how many batches ``loader`` yields, by iterating over it once."""
    return sum(1 for _ in loader)


def _forward_loss(model, data, loss_function):
    """Run one batch through ``model``; return ``(loss, per-sample predictions)``."""
    input_ids = data['input_ids'].to(Config.device)
    attention_mask = data['attention_mask'].to(Config.device)
    labels = data['target'].to(Config.device)
    predictions = model(input_ids=input_ids, attention_mask=attention_mask)['logits'].squeeze(1)
    return loss_function(predictions, labels), predictions


# Train/Tune

In [None]:
# Load the training CSV and keep only the rows whose standard_error is not exactly 0.
df = pd.read_csv("dataset/train.csv")
df = df.loc[df["standard_error"].ne(0.0)]
len(df)

In [None]:
# Debugger import; pdb is already imported in the imports cell at the top of the notebook.
import pdb
# Suffix that fit() embeds in the file name of every checkpoint it saves.
CHECKPOINT_KEYWORD="frozenTransformer"
# NOTE(review): k_fold is not defined in the visible part of this notebook -- confirm it is
# defined in a cell that runs before this one on a fresh kernel (Restart & Run All).
k_fold(df,save=True, config=Config)