In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR
import tqdm
from matplotlib import pyplot as plt
import copy
import gc
import pickle

In [None]:
%%bash
# Copy the two zipped archives from the read-only input dataset into the working
# directory, extract them here, and remove the archives afterwards.
# (Presumably they unpack to ./rob and ./rob_tok, which later cells load -- confirm.)
# Abort on the first failing command instead of continuing with missing files.
set -euo pipefail
cp ../input/roberta-base-save/rob.zip .
cp ../input/roberta-base-save/rob_tok.zip .
# -o: overwrite without prompting so the cell can be re-run; -q: no per-file listing.
unzip -oq rob.zip
unzip -oq rob_tok.zip
rm -f rob.zip rob_tok.zip

In [None]:
import gc

import torch
import tqdm
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from hyperopt.pyll import scope as ho_scope
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVR
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from transformers import AdamW, RobertaModel, RobertaTokenizer, get_linear_schedule_with_warmup

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

In [None]:
# Training table; later cells read its 'excerpt' (text) and 'target' columns.
train_csv_path = '/kaggle/input/commonlitreadabilityprize/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

In [None]:
class ContinuousStratifiedKFold(StratifiedKFold):
    """StratifiedKFold variant that stratifies on a binned continuous target."""

    def split(self, X, y, groups=None):
        """Bin `y` into equal-width intervals and delegate to StratifiedKFold.split.

        The number of bins is floor(1 + log2(len(y))), so it grows
        logarithmically with the sample count. The integer bin index of each
        sample is used as the stratification label.
        """
        n_samples = len(y)
        bin_count = int(np.floor(1 + np.log2(n_samples)))
        # labels=False makes pd.cut return integer bin codes instead of intervals.
        y_binned = pd.cut(y, bins=bin_count, labels=False)
        return super().split(X, y_binned, groups)
    
class BERTRegressor(torch.nn.Module):
    """RoBERTa encoder with a dropout + single-output linear regression head.

    Args:
        pretrained_src: name or path passed to `RobertaModel.from_pretrained`.
        dropout: dropout probability applied to the first-token hidden state
            before the linear layer (defaults to 0.15).
    """

    def __init__(self, pretrained_src = 'rob', dropout = 0.15):
        super().__init__()
        self.bert = RobertaModel.from_pretrained(pretrained_src)
        # Take the head's input width from the loaded encoder's config rather
        # than hard-coding it, so checkpoints with another hidden size work too.
        # (Expected to be 768 for the default 'rob' checkpoint -- confirm.)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 1)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Return one regression value per sequence, shaped (batch, 1).

        `input_ids` and `attention_mask` are the tokenized batch tensors
        forwarded to the encoder.
        """
        # First element of the encoder output is the per-token hidden states;
        # position 0 is the first (CLS-style) token of every sequence.
        hidden = self.bert(input_ids,
                           attention_mask=attention_mask)[0][:, 0, :]
        return self.linear(self.dropout(hidden))

class RMSELoss(torch.nn.Module):
    """Root-mean-squared-error loss: sqrt(MSE(yhat, y) + eps).

    Args:
        eps: small constant added inside the square root. The derivative of
            sqrt at 0 is infinite, so without it a batch with exactly zero
            error yields NaN gradients. Pass 0 for the plain sqrt(MSE).
    """

    def __init__(self, eps=1e-8):
        super().__init__()
        self.mse = torch.nn.MSELoss()
        self.eps = eps

    def forward(self, yhat, y):
        """Return the scalar RMSE between predictions `yhat` and targets `y`."""
        return torch.sqrt(self.mse(yhat, y) + self.eps)

def rmse_metric(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    mean_sq_err = mse(y_true, y_pred)
    return np.sqrt(mean_sq_err)


def evaluate(dataloader_val, model):
    """Run `model` over `dataloader_val` in eval mode and collect its outputs.

    Each batch must hold (input_ids, attention_mask, target) tensors. The loss
    is computed with the module-level `criterion`, and tensors are moved to
    the module-level `device`.

    Returns:
        loss_val_avg: mean of the per-batch losses, as a Python float.
        predictions: numpy array of all model outputs, concatenated on axis 0.
        true_vals: numpy array of all targets, concatenated on axis 0.
    """
    model.eval()

    loss_val_total = 0.0
    predictions, true_vals = [], []

    for batch in tqdm.notebook.tqdm(dataloader_val):

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1]
        }
        target = batch[2]

        # Forward pass and loss need no autograd bookkeeping during evaluation.
        with torch.no_grad():
            output = model(**inputs)
            loss = criterion(output, target.view(-1,1))

        # Accumulate a plain float (as the training loop does) so the running
        # total does not stay on the device and the average prints as a number.
        loss_val_total += loss.item()

        predictions.append(output.detach().cpu().numpy())
        true_vals.append(target.cpu().numpy())

    loss_val_avg = loss_val_total / len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals


def get_bert_embeddings(embedder, dataloader, device = device):
    """Return the first-token hidden state of `embedder` for every sample.

    Batches from `dataloader` must start with (input_ids, attention_mask).
    The per-batch results are stacked row-wise into one numpy array.
    """
    embedder.eval()
    collected = []
    with torch.no_grad():
        for batch in tqdm.notebook.tqdm(dataloader):
            on_device = tuple(t.to(device) for t in batch)
            encoder_out = embedder(input_ids=on_device[0],
                                   attention_mask=on_device[1])
            # [0] -> per-token hidden states; [:, 0, :] -> first (CLS) token.
            cls_vectors = encoder_out[0][:, 0, :]
            collected.append(cls_vectors.detach().cpu().numpy())

    return np.vstack(collected)

In [None]:
# Load the tokenizer from the local 'rob_tok' directory (no download needed).
tokenizer = RobertaTokenizer.from_pretrained('rob_tok')

In [None]:
# Fine-tuning hyperparameters used by the cross-validation loop below.
BATCH_SIZE = 16  # training batch size; evaluation/embedding loaders use 2*BATCH_SIZE

warm_prop = 0.1  # fraction of all optimizer steps spent in linear LR warm-up
epochs = 8  # fine-tuning epochs per fold
clip = 1  # max gradient norm passed to clip_grad_norm_

In [None]:
# Cross-validated training. For every fold:
#   1. fine-tune BERTRegressor on the training split, checkpointing the epoch
#      with the lowest validation loss to roberta_base_{k}.pt;
#   2. reload that checkpoint and extract first-token embeddings for both splits;
#   3. tune an SVR on those embeddings with hyperopt and save it to svr_head_{k}.pkl.
SEED = 42
MAX_LENGTH = 512    # every excerpt is padded/truncated to this many tokens
SVR_MAX_EVALS = 50  # hyperopt trials per fold for the SVR head


def encode_excerpts(texts):
    """Tokenize `texts` into fixed-length tensors (input_ids, attention_mask)."""
    return tokenizer.batch_encode_plus(
        list(texts),
        add_special_tokens=True,
        return_attention_mask=True,
        # Explicit replacements for the deprecated `pad_to_max_length=True`.
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )


kf = ContinuousStratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

print('START')
fold_splits = kf.split(X=data, y=data['target'].values)
for k, (train_ids, val_ids) in enumerate(tqdm.notebook.tqdm(fold_splits, total=kf.get_n_splits())):
    print('**************')
    print('------------')
    print('**************')
    print(f'ITERATION {k} starts')

    gc.collect()
    # Make shuffling, dropout and head initialisation repeatable per fold.
    torch.manual_seed(SEED + k)

    print('------------')
    print('data slicing ...')

    fold_train = data.iloc[train_ids]
    fold_val = data.iloc[val_ids]
    data_train = fold_train['excerpt'].values
    data_val = fold_val['excerpt'].values
    print(f'train/val data shapes: {data_train.shape}, {data_val.shape}')
    target_train = fold_train['target'].values
    target_val = fold_val['target'].values

    print('------------')
    print('data preparation ...')
    encoded_data_train = encode_excerpts(data_train)
    encoded_data_val = encode_excerpts(data_val)

    dataset_train = TensorDataset(
        encoded_data_train['input_ids'],
        encoded_data_train['attention_mask'],
        torch.tensor(target_train, dtype=torch.float),
    )
    dataset_val = TensorDataset(
        encoded_data_val['input_ids'],
        encoded_data_val['attention_mask'],
        torch.tensor(target_val, dtype=torch.float),
    )

    # Shuffled batches for optimisation only.
    dataloader_train = DataLoader(
        dataset_train,
        sampler=RandomSampler(dataset_train),
        batch_size=BATCH_SIZE
    )
    # Sequential loaders: row order must match target_train / target_val when
    # embeddings are extracted, and validation needs no shuffling.
    dataloader_train_frozen = DataLoader(dataset_train, batch_size=2*BATCH_SIZE)
    dataloader_val = DataLoader(dataset_val, batch_size=2*BATCH_SIZE)

    print('------------')
    print('Roberta finetuning ...')

    model = BERTRegressor().to(device)
    criterion = RMSELoss()  # also read as a global by evaluate()
    optimizer = AdamW(
        model.parameters(),
        lr= 3e-5,#the original paper:2e-5 -> 5e-5
        eps=1e-8
    )
    total_steps = len(dataloader_train) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(total_steps * warm_prop),
        num_training_steps=total_steps
    )

    checkpoint_path = f'roberta_base_{k}.pt'
    best_val_loss = float('inf')
    for epoch in tqdm.notebook.tqdm(range(epochs)):
        print(f'Epoch {epoch}')
        model.train()

        epoch_loss = 0
        for batch in tqdm.notebook.tqdm(dataloader_train):

            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1]
                     }
            target = batch[2]

            optimizer.zero_grad()

            output = model(**inputs)
            loss = criterion(output, target.view(-1,1))
            loss.backward()
            epoch_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            scheduler.step()

        val_loss, predictions, true_vals = evaluate(dataloader_val, model)
        # Normalise to a plain float whether evaluate() returns a tensor or a number.
        val_loss = float(val_loss)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # The checkpoint on disk is the single source of truth for the best
            # epoch; it is reloaded after training (no in-memory model copies).
            torch.save(model.state_dict(), checkpoint_path)
        train_loss = epoch_loss / len(dataloader_train)
        rmse_val = rmse_metric(true_vals, predictions)
        print('-------')
        print(f'Training loss: {train_loss}')
        print(f'Validation loss: {val_loss}')
        print(f"RMSE on validation: {rmse_val}")

    # Release the last-epoch model and optimizer state before loading the best
    # checkpoint; the optimizer otherwise keeps the parameters alive on the GPU.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    print('loaded best model with lm head performance...')
    best_model = BERTRegressor()
    # Strict loading (the default) so a checkpoint/architecture mismatch is an error.
    best_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    best_model.to(device)
    val_loss, predictions, true_vals = evaluate(dataloader_val, best_model)
    rmse_val = rmse_metric(true_vals, predictions)
    print(f"RMSE on validation: {rmse_val}")

    print('------------')
    print('Embeddings extraction ...')

    embedder = best_model.bert
    embeddings_train = get_bert_embeddings(embedder, dataloader_train_frozen)
    embeddings_val = get_bert_embeddings(embedder, dataloader_val)

    # Move the encoder off the GPU so the next fold starts with free memory.
    del embedder
    best_model.cpu()
    gc.collect()
    torch.cuda.empty_cache()

    print('------------')
    print('SVR head HP tuning ...')

    def svr_objective(params):
        """Fit an SVR with `params` on this fold and return its validation RMSE."""
        estimator = SVR(**params)
        estimator.fit(embeddings_train, target_train)
        preds = estimator.predict(embeddings_val)
        return {'loss': rmse_metric(target_val, preds), 'status': STATUS_OK}

    space_svr = {'C':  hp.loguniform('C', np.log(0.0001), np.log(1000)) - 0.0001,
                 'gamma':  hp.loguniform('gamma', np.log(0.0001), np.log(1000)) - 0.0001
                }

    trials = Trials()
    best = fmin(svr_objective, space_svr, algo = tpe.suggest, max_evals = SVR_MAX_EVALS, trials=trials)
    # fmin returns the raw sampled values of the hp nodes, not the evaluated
    # space (the `- 0.0001` offsets are missing). space_eval maps them back to
    # the exact parameters the objective was scored with.
    best_params = space_eval(space_svr, best)
    print('best', best_params)

    svr_head = SVR(**best_params)
    svr_head.fit(embeddings_train, target_train)
    filename = f'svr_head_{k}.pkl'
    with open(filename, 'wb') as svr_file:
        pickle.dump(svr_head, svr_file)
    # Round-trip through the file so the reported score is for the saved artefact.
    with open(filename, 'rb') as svr_file:
        svr_head = pickle.load(svr_file)
    preds_svr = svr_head.predict(embeddings_val)
    rmse_val = rmse_metric(target_val, preds_svr)
    print(f"RMSE on validation: {rmse_val}")

    print('------------')
    print(f'Iteration {k} completed.')

print('**************')
print('------------')
print('**************')
print('FINISH')

In [None]:
# Remove the local 'rob' and 'rob_tok' directories from the working directory.
!rm -r rob rob_tok