In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the libraries

In [None]:
import pandas as pd
import numpy as np

%matplotlib inline
from glob import glob
import os
import matplotlib.pyplot as plt
import json
from collections import defaultdict
import gc
gc.enable()

import torch
import torch.nn as nn #nn class to make code more concise
import torch.nn.functional as F #contains all functions of nn class as well as a wide range of activation and loss functions
import torch.optim as optim #optimization algorithms
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)
from transformers import RobertaConfig #For RoBERTa configuration. It is used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture.
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup
)
from transformers import RobertaTokenizer #byte-level BPE (Byte Pair Encoding) for tokenizer
from transformers import RobertaModel #Initializing the model
from IPython.display import clear_output
from tqdm import tqdm, trange

## Reading the datasets

In [None]:
# Load the competition's training and test CSV files into DataFrames.
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv') # training set
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv') # test set (used for the predictions below)

### Basic data exploration

#### Trainset

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Column names, dtypes and non-null counts of the training data.
train_df.info()

In [None]:
# Summary statistics of the training data's numeric columns.
train_df.describe()

#### Testset

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Column names, dtypes and non-null counts of the test data.
test_df.info()

# Feature Engineering

In [None]:
# Excerpts (passages) to features
def excerpt_to_feature(data, tokenizer, max_len, is_test=False):
    """Tokenize one excerpt and right-pad every field to exactly ``max_len``.

    Args:
        data: Raw excerpt text.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``RobertaTokenizer``).
            If it also exposes ``pad_token_id``, that id is used for padding.
        max_len: Length of every returned list; longer inputs are truncated
            by the tokenizer, shorter ones are padded on the right.
        is_test: Unused; kept so existing callers keep working.

    Returns:
        dict with the keys ``'input_ids'``, ``'token_type_ids'`` and
        ``'attention_mask'``, each a list of ``max_len`` ints.
    """
    # Newlines are removed outright (not replaced by a space), so the words on
    # either side of a line break are joined. Kept as-is on purpose.
    data = data.replace('\n', '')

    # Tokenize and truncate to at most max_len tokens.
    tok = tokenizer.encode_plus(
        data,
        max_length=max_len,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True
    )
    padding_length = max_len - len(tok['input_ids'])

    # Pad with the tokenizer's own pad id instead of a hard-coded 0: in RoBERTa
    # vocabularies id 0 is normally the <s> token, not <pad>. Padded positions
    # are masked out below (attention_mask == 0) either way. Fall back to 0
    # when the tokenizer does not define a pad id.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    return {
        'input_ids': tok['input_ids'] + [pad_id] * padding_length,
        # Segment ids; padded positions get segment 0.
        'token_type_ids': tok['token_type_ids'] + [0] * padding_length,
        # 1 for tokens returned by the tokenizer, 0 for padding.
        'attention_mask': tok['attention_mask'] + [0] * padding_length,
    }

## Retrieve the dataset

In [None]:
class DatasetRetriever(Dataset):
    """Map-style dataset that turns excerpts into fixed-length model inputs.

    Each item is a dict of ``torch.long`` tensors (``input_ids``,
    ``token_type_ids``, ``attention_mask``) produced by ``excerpt_to_feature``.
    When ``is_test`` is False the item also carries a ``torch.double``
    ``label`` tensor.
    """

    def __init__(self, data, tokenizer, max_len, is_test=False):
        """Store the excerpts (and labels, when present) as plain lists.

        Args:
            data: DataFrame with an ``excerpt`` column. For labelled data
                (``is_test=False``) a ``target`` column is also read.
                NOTE(review): assumes the label column is named ``target``,
                matching the column written to the submission file - confirm
                against train.csv.
            tokenizer: Tokenizer forwarded to ``excerpt_to_feature``.
            max_len: Fixed sequence length of every returned tensor.
            is_test: True for unlabelled data; no ``label`` is returned.
        """
        self.data = data
        self.excerpts = self.data.excerpt.values.tolist()  # list of excerpt strings
        # Previously never assigned, so every labelled __getitem__ call raised
        # AttributeError on self.targets.
        self.targets = None if is_test else self.data.target.values.tolist()
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the tensors for row ``item`` (plus ``label`` when labelled)."""
        features = excerpt_to_feature(
            self.excerpts[item], self.tokenizer,
            self.max_len, self.is_test
        )
        sample = {
            key: torch.tensor(features[key], dtype=torch.long)
            for key in ('input_ids', 'token_type_ids', 'attention_mask')
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.double)
        return sample

# Base RoBERTa Model

In [None]:
class ReadabilityModel(nn.Module):
    """RoBERTa encoder followed by LayerNorm, dropout and a one-unit linear regressor.

    The submodule attribute names (``roberta``, ``layer_norm``, ``dropouts``,
    ``regressor``) and their creation order define the state-dict keys, so
    they are kept stable for the checkpoints loaded elsewhere in this notebook.
    """

    def __init__(
        self,
        model_name,  # path or identifier handed to RobertaModel.from_pretrained
        config,
        multisample_dropout=False,
        output_hidden_states=False
    ):
        """Build the encoder and the regression head.

        Args:
            model_name: Passed to ``RobertaModel.from_pretrained``.
            config: Object providing ``hidden_size`` and ``initializer_range``.
            multisample_dropout: If True, use five ``Dropout(0.5)`` samples
                whose regressor outputs are averaged; otherwise a single
                ``Dropout(0.3)``.
            output_hidden_states: Forwarded to ``from_pretrained``.
        """
        super().__init__()
        self.config = config
        self.roberta = RobertaModel.from_pretrained(
            model_name,
            output_hidden_states=output_hidden_states
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        # Regularization: each Dropout randomly zeroes inputs during training.
        dropout_rates = [0.5] * 5 if multisample_dropout else [0.3]
        self.dropouts = nn.ModuleList([nn.Dropout(rate) for rate in dropout_rates])
        self.regressor = nn.Linear(config.hidden_size, 1)
        for head_module in (self.layer_norm, self.regressor):
            self._init_weights(head_module)

    def _init_weights(self, module):
        """Re-initialize a Linear, Embedding or LayerNorm module in place.

        Linear/Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the Embedding padding row are zeroed. LayerNorm is
        reset to bias 0 and weight 1. Other module types are left untouched.
        """
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        """Run the encoder and the regression head.

        Returns:
            Without ``labels``: ``(logits,) + outputs[1:]`` where ``outputs``
            is what the encoder returned. With ``labels``: the same tuple
            prefixed by the RMSE loss, with ``logits`` flattened and cast to
            the labels' dtype.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 1 of the encoder output feeds the head (not element 0).
        pooled = self.layer_norm(outputs[1])

        # Multi-sample dropout: sum the regressor output over every dropout
        # sample (in order), then average.
        dropout_iter = iter(self.dropouts)
        logits = self.regressor(next(dropout_iter)(pooled))
        for extra_dropout in dropout_iter:
            logits = logits + self.regressor(extra_dropout(pooled))
        logits = logits / len(self.dropouts)

        if labels is None:
            return (logits,) + outputs[1:]

        # RMSE between the flattened predictions and the flattened labels.
        logits = logits.view(-1).to(labels.dtype)
        mse = torch.nn.MSELoss()(logits, labels.view(-1))
        loss = torch.sqrt(mse)
        return (loss, logits) + outputs[1:]

#### MakeModel and MakeLoader

In [None]:
def make_model(model_name, num_labels=1):
    """Create the tokenizer and the ReadabilityModel for ``model_name``.

    Args:
        model_name: Path or identifier passed to every ``from_pretrained`` call.
        num_labels: Value stored under ``num_labels`` in the RoBERTa config.

    Returns:
        ``(model, tokenizer)``.
    """
    roberta_tokenizer = RobertaTokenizer.from_pretrained(model_name)

    roberta_config = RobertaConfig.from_pretrained(model_name)
    roberta_config.update({'num_labels': num_labels})

    readability_model = ReadabilityModel(model_name, config=roberta_config)
    return readability_model, roberta_tokenizer

def make_loader(
    data,
    tokenizer,
    max_len,
    batch_size,
):
    """Build a sequential, unshuffled DataLoader over unlabelled excerpts.

    Args:
        data: DataFrame handed to ``DatasetRetriever`` (test mode).
        tokenizer: Tokenizer handed to ``DatasetRetriever``.
        max_len: Fixed sequence length of every sample.
        batch_size: Nominal batch size; the loader uses half of it
            (integer division), but never less than 1.

    Returns:
        A ``DataLoader`` that yields the samples in their original order and
        keeps the final partial batch.
    """
    test_dataset = DatasetRetriever(data, tokenizer, max_len, is_test=True)

    # batch_size // 2 is 0 for batch_size == 1, which DataLoader rejects;
    # clamp so a nominal batch size of 1 still works.
    effective_batch_size = max(1, batch_size // 2)

    return DataLoader(
        test_dataset,  # map-style dataset (implements __getitem__)
        batch_size=effective_batch_size,
        sampler=SequentialSampler(test_dataset),  # always the same order
        pin_memory=False,
        drop_last=False,
        num_workers=0
    )

### Evaluation

In [None]:
class Evaluation:
    """Runs a model over a data loader in inference mode and collects predictions."""

    def __init__(self, model, scalar=None):
        """
        Args:
            model: Module called with ``input_ids``, ``attention_mask`` and
                ``token_type_ids`` keyword arguments; element 0 of its output
                holds the predictions.
            scalar: When not None, the forward pass runs under
                ``torch.cuda.amp.autocast()`` (mixed precision).
        """
        self.model = model
        self.scalar = scalar

    def _model_device(self):
        """Device the batches are moved to: wherever the model's parameters live.

        Falls back to CUDA when available (the previous hard-coded behaviour),
        else CPU, if the model exposes no parameters.
        """
        parameters = getattr(self.model, 'parameters', None)
        first_param = next(iter(parameters()), None) if callable(parameters) else None
        if first_param is not None:
            return first_param.device
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def evaluate(self, data_loader, tokenizer):
        """Predict every batch of ``data_loader`` and return a flat list of floats.

        Args:
            data_loader: Iterable of dicts with ``input_ids``,
                ``attention_mask`` and ``token_type_ids`` tensors.
            tokenizer: Unused; kept so existing callers keep working.

        Returns:
            One Python float per input row, in loader order.
        """
        preds = []
        self.model.eval()
        device = self._model_device()
        with torch.no_grad():
            for batch_data in data_loader:
                inputs = {
                    key: batch_data[key].to(device)
                    for key in ('input_ids', 'attention_mask', 'token_type_ids')
                }

                if self.scalar is not None:
                    # autocast runs the forward pass in mixed precision.
                    with torch.cuda.amp.autocast():
                        outputs = self.model(**inputs)
                else:
                    outputs = self.model(**inputs)

                # Flatten with reshape(-1) rather than squeeze(): squeezing a
                # one-row batch gives a 0-d array whose tolist() is a bare
                # float, and `preds += float` raises TypeError.
                preds += outputs[0].detach().cpu().reshape(-1).tolist()
        return preds

### Configurations

In [None]:
def config(fold, model_name, load_model_path):
    """Seed torch, build the model for one fold, load its weights and the test loader.

    Args:
        fold: Fold index; selects ``{load_model_path}/model{fold}.bin``.
        model_name: Path or identifier passed to ``make_model``.
        load_model_path: Directory containing the per-fold checkpoints.

    Returns:
        ``(model, tokenizer, test_loader, scaler)`` with the model on the GPU
        and ``scaler`` set to None.

    Raises:
        ValueError: If no CUDA device is available.
    """
    # Fail fast: check for a GPU before doing the expensive model load.
    if torch.cuda.device_count() < 1:
        raise ValueError('CPU training is not supported')

    torch.manual_seed(2021)
    torch.cuda.manual_seed(2021)
    torch.cuda.manual_seed_all(2021)

    max_len = 250
    batch_size = 8

    model, tokenizer = make_model(
        model_name=model_name,
        num_labels=1
    )
    # map_location='cpu': without it torch.load restores every tensor onto the
    # device it was saved from, which fails if that device does not exist here
    # and otherwise holds a second copy of the weights on the GPU. The model is
    # still on the CPU at this point and is moved to the GPU below.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(
        f'{load_model_path}/model{fold}.bin',
        map_location='cpu'
    )
    model.load_state_dict(state_dict)

    test_loader = make_loader(
        test_df, tokenizer, max_len=max_len,
        batch_size=batch_size
    )

    print('Model pushed to {} GPU(s), type {}.'.format(
        torch.cuda.device_count(),
        torch.cuda.get_device_name(0))
    )
    model = model.cuda()

    # Mixed precision is disabled: with scaler None, Evaluation skips autocast.
    scaler = None
    return (
        model, tokenizer,
        test_loader, scaler
    )

## Running the model for predictions

In [None]:
def run(fold=0, model_name=None, load_model_path=None):
    """Predict the test set with the checkpoint of one fold.

    Args:
        fold: Fold index forwarded to ``config``.
        model_name: Path or identifier forwarded to ``config``.
        load_model_path: Checkpoint directory forwarded to ``config``.

    Returns:
        The list of predictions returned by ``Evaluation.evaluate``.
    """
    model, tokenizer, \
        test_loader, scaler = config(fold, model_name, load_model_path)

    evaluator = Evaluation(model, scaler)
    preds = evaluator.evaluate(test_loader, tokenizer)

    # Drop every reference to the model before collecting. `evaluator` holds
    # one too, and was previously left alive, so the gc/empty_cache calls
    # below could not release the model's memory. The unused timing code
    # (whose results were discarded) has been removed.
    del evaluator, model, tokenizer, test_loader, scaler
    gc.collect()
    torch.cuda.empty_cache()

    return preds

In [None]:
%%time
# One column of test-set predictions per fold checkpoint (fold0 .. fold4).
pred_base = pd.DataFrame()
for fold in tqdm(range(5)):
    pred_base[f'fold{fold}'] = run(
        fold=fold,
        model_name='../input/roberta-base/',
        load_model_path='../input/commonlit-roberta-base-i/',
    )

### Saving the predictions to submission file

In [None]:
# Fill the sample submission's `target` column with the row-wise mean of the
# per-fold predictions, then write it out as the submission file.
sub = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv').assign(
    target=pred_base.mean(axis=1).values.tolist()
)
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table that was written to submission.csv.
sub