In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input mount.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel, AlbertModel, AlbertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
import tqdm
import gc
from sklearn.svm import SVR
import xgboost as xgb
import pickle

In [None]:
# Load the competition test split; the cell displays the resulting frame.
TEST_CSV = '/kaggle/input/commonlitreadabilityprize/test.csv'
df_test = pd.read_csv(TEST_CSV)
df_test

In [None]:
# Keep the excerpt texts and their ids as plain arrays, then show the first pair as a sanity check.
data_test = df_test.loc[:, 'excerpt'].values
ind_test = df_test.loc[:, 'id'].values
(data_test[0], ind_test[0])

RoBERTa base

In [None]:
%%bash
# Unpack the saved model (rob.zip) and tokenizer (rob_tok.zip) archives into the working directory.
# Later cells load from the local `rob` and `rob_tok` paths, presumably created by these archives.
# Abort on the first failing command instead of carrying on with a half-unpacked model.
set -e
# Extract straight from the read-only input directory (no copy/delete of the archives needed);
# -o overwrites existing files without prompting, so the cell can be re-run safely.
unzip -o ../input/roberta-base-save/rob.zip
unzip -o ../input/roberta-base-save/rob_tok.zip

In [None]:
# Load the tokenizer from the local `rob_tok` directory.
tokenizer = RobertaTokenizer.from_pretrained('rob_tok')

In [None]:
# Tokenize every test excerpt to a fixed length of 512 tokens and wrap the tensors in a DataLoader.
batch_size = 32

encoded_data_test = tokenizer.batch_encode_plus(
    data_test,
    add_special_tokens=True,
    return_attention_mask=True,
    # Replaces the deprecated `pad_to_max_length=True`: pad every sequence up to `max_length`.
    padding='max_length',
    # Request truncation explicitly rather than relying on the library's implicit fallback,
    # which emits a warning when only `max_length` is given.
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
# Row positions travel along as a third tensor; the embedding helpers only read the first two.
ids_test_tensor = torch.tensor(df_test.index, dtype=torch.float)

dataset_test = TensorDataset(input_ids_test,
                             attention_masks_test,
                             ids_test_tensor)

# No sampler or shuffling: batches stay in file order so predictions line up with `ind_test`.
dataloader_test = DataLoader(
    dataset_test,
    batch_size=batch_size
)

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
class BERTRegressor(torch.nn.Module):
    """RoBERTa encoder followed by dropout and a single-output linear layer.

    The linear layer is fed the encoder's hidden state at sequence position 0
    (the CLS token output).
    """

    def __init__(self, pretrained_src = 'rob'):
        """Create the encoder and head.

        pretrained_src: name or path passed to `RobertaModel.from_pretrained`.
        """
        super().__init__()
        self.bert = RobertaModel.from_pretrained(pretrained_src)
        self.linear = torch.nn.Linear(768, 1)
        self.dropout = torch.nn.Dropout(0.15)

    def forward(self, input_ids, attention_mask):
        """Return one value per input sequence, shaped (batch, 1)."""
        encoder_outputs = self.bert(input_ids, attention_mask=attention_mask)
        # First encoder output, sliced at position 0 of every sequence.
        first_token_state = encoder_outputs[0][:, 0, :]
        regularized = self.dropout(first_token_state)
        return self.linear(regularized)

    
def get_bert_embeddings(embedder, dataloader, device = device):
    """Embed every batch of `dataloader` with `embedder` and stack the results.

    embedder:   module called with `input_ids` and `attention_mask`; element 0 of
                its output is sliced at sequence position 0 (CLS token output).
    dataloader: yields batches whose first two tensors are the input ids and the
                attention mask.
    device:     device the batch tensors are moved to before the forward pass.

    Returns a NumPy array with one row per input sequence.
    """
    embedder.eval()
    collected = []
    for batch in tqdm.notebook.tqdm(dataloader):
        on_device = [tensor.to(device) for tensor in batch]
        input_ids, attention_mask = on_device[0], on_device[1]
        with torch.no_grad():
            hidden_states = embedder(input_ids=input_ids,
                                     attention_mask=attention_mask)[0]
            cls_state = hidden_states[:, 0, :]
        collected.append(cls_state.detach().cpu().numpy())

    return np.vstack(collected)

class BERTRegressorMP(torch.nn.Module):
    """RoBERTa encoder with attention-masked mean pooling, dropout and a linear head."""

    def __init__(self, pretrained_src = 'rob'):
        """Create the encoder and head.

        pretrained_src: name or path passed to `RobertaModel.from_pretrained`.
        """
        super().__init__()
        self.bert = RobertaModel.from_pretrained(pretrained_src)
        self.linear = torch.nn.Linear(768, 1)
        self.dropout = torch.nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask):
        """Return one value per input sequence, shaped (batch, 1)."""
        token_states = self.bert(input_ids, attention_mask=attention_mask)[0]
        # Broadcast the mask over the hidden dimension so masked positions contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        masked_total = torch.sum(token_states * mask, 1)
        # Clamp the per-sequence token count so an all-zero mask cannot divide by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        pooled = masked_total / token_count
        return self.linear(self.dropout(pooled))
    
class EmbedderMP(torch.nn.Module):
    """Wraps a model's `.bert` encoder and returns attention-masked mean-pooled embeddings."""

    def __init__(self, model):
        """Reuse `model.bert` as the encoder; nothing else is taken from `model`."""
        super().__init__()
        self.bert = model.bert

    def forward(self, input_ids, attention_mask):
        """Return the mean of the token states over positions where the mask is set."""
        token_states = self.bert(input_ids, attention_mask=attention_mask)[0]
        # Broadcast the mask over the hidden dimension so masked positions contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        masked_total = torch.sum(token_states * mask, 1)
        # Clamp the per-sequence token count so an all-zero mask cannot divide by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return masked_total / token_count
    
def get_bert_embeddings_MP(embedder, dataloader, device = device):
    """Embed every batch of `dataloader` with `embedder` and stack the results.

    embedder:   module called with `input_ids` and `attention_mask`; its output is
                used as the embedding as-is.
    dataloader: yields batches whose first two tensors are the input ids and the
                attention mask.
    device:     device the batch tensors are moved to before the forward pass.

    Returns a NumPy array with one row per input sequence.
    """
    embedder.eval()
    collected = []
    for batch in tqdm.notebook.tqdm(dataloader):
        on_device = [tensor.to(device) for tensor in batch]
        input_ids, attention_mask = on_device[0], on_device[1]
        with torch.no_grad():
            pooled = embedder(input_ids=input_ids, attention_mask=attention_mask)
        collected.append(pooled.detach().cpu().numpy())

    return np.vstack(collected)

In [None]:
import gc
# Force a garbage-collection pass before the model-loading cells below.
gc.collect()

In [None]:
# Shared accumulator: every fold loop below appends one (-1, 1)-reshaped column of predictions,
# and the final cell averages across all collected columns.
preds_all = []

roberta-base-svr-5-folds-training

In [None]:
# Fold indices 0..4; they are substituted into the checkpoint and head file names.
model_ids = list(range(5))

In [None]:
# For each fold: load the saved weights, embed the test set with get_bert_embeddings, and
# predict with the fold's pickled head (svr_head_{k}.pkl).
# NOTE: re-running this cell appends the same folds to `preds_all` a second time.
for k in tqdm.notebook.tqdm(model_ids):
    model = BERTRegressor()
    PATH = f'../input/roberta-base-svr-5-folds-training/roberta_base_{k}.pt'
    # map_location='cpu' lets checkpoints saved from a GPU load on CPU-only sessions too;
    # the encoder is moved to `device` explicitly right after.
    # NOTE(review): strict=False ignores missing/unexpected keys silently - confirm the
    # checkpoint keys match this module, otherwise the saved weights are not applied.
    model.load_state_dict(torch.load(PATH, map_location='cpu'), strict=False)
    embedder = model.bert.to(device)
    embeddings_test = get_bert_embeddings(embedder, dataloader_test)
    filename = f'../input/roberta-base-svr-5-folds-training/svr_head_{k}.pkl'
    # Context manager so the file handle is closed as soon as the head is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load heads from trusted datasets.
    with open(filename, 'rb') as head_file:
        new_head = pickle.load(head_file)
    preds = new_head.predict(embeddings_test)
    preds_all.append(preds.reshape((-1, 1)))
len(preds_all)

roberta-base-xgb-5-folds-training

In [None]:
# Fold indices 0..4; they are substituted into the checkpoint and head file names.
model_ids = list(range(5))

In [None]:
# For each fold: load the saved weights, embed the test set with get_bert_embeddings, and
# predict with the fold's pickled head (xgb_head_{k}.pkl).
# NOTE: re-running this cell appends the same folds to `preds_all` a second time.
for k in tqdm.notebook.tqdm(model_ids):
    model = BERTRegressor()
    PATH = f'../input/roberta-base-xgb-5-folds-training/roberta_base_{k}.pt'
    # map_location='cpu' lets checkpoints saved from a GPU load on CPU-only sessions too;
    # the encoder is moved to `device` explicitly right after.
    # NOTE(review): strict=False ignores missing/unexpected keys silently - confirm the
    # checkpoint keys match this module, otherwise the saved weights are not applied.
    model.load_state_dict(torch.load(PATH, map_location='cpu'), strict=False)
    embedder = model.bert.to(device)
    embeddings_test = get_bert_embeddings(embedder, dataloader_test)
    filename = f'../input/roberta-base-xgb-5-folds-training/xgb_head_{k}.pkl'
    # Context manager so the file handle is closed as soon as the head is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load heads from trusted datasets.
    with open(filename, 'rb') as head_file:
        new_head = pickle.load(head_file)
    preds = new_head.predict(embeddings_test)
    preds_all.append(preds.reshape((-1, 1)))
len(preds_all)

roberta-base-svr-5-folds-training-skf

In [None]:
# Fold indices 0..4; they are substituted into the checkpoint and head file names.
model_ids = list(range(5))

In [None]:
# For each fold: load the saved weights, embed the test set with get_bert_embeddings, and
# predict with the fold's pickled head (svr_head_{k}.pkl).
# NOTE: re-running this cell appends the same folds to `preds_all` a second time.
for k in tqdm.notebook.tqdm(model_ids):
    model = BERTRegressor()
    PATH = f'../input/roberta-base-svr-5-folds-training-skf/roberta_base_{k}.pt'
    # map_location='cpu' lets checkpoints saved from a GPU load on CPU-only sessions too;
    # the encoder is moved to `device` explicitly right after.
    # NOTE(review): strict=False ignores missing/unexpected keys silently - confirm the
    # checkpoint keys match this module, otherwise the saved weights are not applied.
    model.load_state_dict(torch.load(PATH, map_location='cpu'), strict=False)
    embedder = model.bert.to(device)
    embeddings_test = get_bert_embeddings(embedder, dataloader_test)
    filename = f'../input/roberta-base-svr-5-folds-training-skf/svr_head_{k}.pkl'
    # Context manager so the file handle is closed as soon as the head is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load heads from trusted datasets.
    with open(filename, 'rb') as head_file:
        new_head = pickle.load(head_file)
    preds = new_head.predict(embeddings_test)
    preds_all.append(preds.reshape((-1, 1)))
len(preds_all)

roberta-base-mp-svr-5-folds-training

In [None]:
# Fold indices 0..4; they are substituted into the checkpoint and head file names.
model_ids = list(range(5))

In [None]:
# For each fold: load the saved weights, embed the test set with the mean-pooling embedder,
# and predict with the fold's pickled head (svr_head_{k}.pkl).
# NOTE: re-running this cell appends the same folds to `preds_all` a second time.
for k in tqdm.notebook.tqdm(model_ids):
    model = BERTRegressorMP()
    PATH = f'../input/roberta-base-mp-svr-5-folds-training/roberta_base_{k}.pt'
    # map_location='cpu' lets checkpoints saved from a GPU load on CPU-only sessions too;
    # the embedder is moved to `device` explicitly right after.
    # NOTE(review): strict=False ignores missing/unexpected keys silently - confirm the
    # checkpoint keys match this module, otherwise the saved weights are not applied.
    model.load_state_dict(torch.load(PATH, map_location='cpu'), strict=False)
    embedder = EmbedderMP(model).to(device)
    embeddings_test = get_bert_embeddings_MP(embedder, dataloader_test)
    filename = f'../input/roberta-base-mp-svr-5-folds-training/svr_head_{k}.pkl'
    # Context manager so the file handle is closed as soon as the head is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load heads from trusted datasets.
    with open(filename, 'rb') as head_file:
        new_head = pickle.load(head_file)
    preds = new_head.predict(embeddings_test)
    preds_all.append(preds.reshape((-1, 1)))
len(preds_all)

roberta-base-mp-xgb-5-folds-training

In [None]:
# Fold indices 0..4; they are substituted into the checkpoint and head file names.
model_ids = list(range(5))

In [None]:
# For each fold: load the saved weights, embed the test set with the mean-pooling embedder,
# and predict with the fold's pickled head (xgb_head_{k}.pkl).
# NOTE: re-running this cell appends the same folds to `preds_all` a second time.
for k in tqdm.notebook.tqdm(model_ids):
    model = BERTRegressorMP()
    PATH = f'../input/roberta-base-mp-xgb-5-folds-training/roberta_base_{k}.pt'
    # map_location='cpu' lets checkpoints saved from a GPU load on CPU-only sessions too;
    # the embedder is moved to `device` explicitly right after.
    # NOTE(review): strict=False ignores missing/unexpected keys silently - confirm the
    # checkpoint keys match this module, otherwise the saved weights are not applied.
    model.load_state_dict(torch.load(PATH, map_location='cpu'), strict=False)
    embedder = EmbedderMP(model).to(device)
    embeddings_test = get_bert_embeddings_MP(embedder, dataloader_test)
    filename = f'../input/roberta-base-mp-xgb-5-folds-training/xgb_head_{k}.pkl'
    # Context manager so the file handle is closed as soon as the head is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load heads from trusted datasets.
    with open(filename, 'rb') as head_file:
        new_head = pickle.load(head_file)
    preds = new_head.predict(embeddings_test)
    preds_all.append(preds.reshape((-1, 1)))
len(preds_all)

roberta-base-mp-svr-5-folds-skf-training

In [None]:
# Fold indices 0..4; they are substituted into the checkpoint and head file names.
model_ids = list(range(5))

In [None]:
# For each fold: load the saved weights, embed the test set with the mean-pooling embedder,
# and predict with the fold's pickled head (svr_head_{k}.pkl).
# NOTE: re-running this cell appends the same folds to `preds_all` a second time.
for k in tqdm.notebook.tqdm(model_ids):
    model = BERTRegressorMP()
    PATH = f'../input/roberta-base-mp-svr-5-folds-skf-training/roberta_base_{k}.pt'
    # map_location='cpu' lets checkpoints saved from a GPU load on CPU-only sessions too;
    # the embedder is moved to `device` explicitly right after.
    # NOTE(review): strict=False ignores missing/unexpected keys silently - confirm the
    # checkpoint keys match this module, otherwise the saved weights are not applied.
    model.load_state_dict(torch.load(PATH, map_location='cpu'), strict=False)
    embedder = EmbedderMP(model).to(device)
    embeddings_test = get_bert_embeddings_MP(embedder, dataloader_test)
    filename = f'../input/roberta-base-mp-svr-5-folds-skf-training/svr_head_{k}.pkl'
    # Context manager so the file handle is closed as soon as the head is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load heads from trusted datasets.
    with open(filename, 'rb') as head_file:
        new_head = pickle.load(head_file)
    preds = new_head.predict(embeddings_test)
    preds_all.append(preds.reshape((-1, 1)))
len(preds_all)

roberta-base-mp-xgb-5-folds-training-skf

In [None]:
# Fold indices 0..4; they are substituted into the checkpoint and head file names.
model_ids = list(range(5))

In [None]:
# For each fold: load the saved weights, embed the test set with the mean-pooling embedder,
# and predict with the fold's pickled head (xgb_head_{k}.pkl).
# NOTE: re-running this cell appends the same folds to `preds_all` a second time.
for k in tqdm.notebook.tqdm(model_ids):
    model = BERTRegressorMP()
    PATH = f'../input/roberta-base-mp-xgb-5-folds-training-skf/roberta_base_{k}.pt'
    # map_location='cpu' lets checkpoints saved from a GPU load on CPU-only sessions too;
    # the embedder is moved to `device` explicitly right after.
    # NOTE(review): strict=False ignores missing/unexpected keys silently - confirm the
    # checkpoint keys match this module, otherwise the saved weights are not applied.
    model.load_state_dict(torch.load(PATH, map_location='cpu'), strict=False)
    embedder = EmbedderMP(model).to(device)
    embeddings_test = get_bert_embeddings_MP(embedder, dataloader_test)
    filename = f'../input/roberta-base-mp-xgb-5-folds-training-skf/xgb_head_{k}.pkl'
    # Context manager so the file handle is closed as soon as the head is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load heads from trusted datasets.
    with open(filename, 'rb') as head_file:
        new_head = pickle.load(head_file)
    preds = new_head.predict(embeddings_test)
    preds_all.append(preds.reshape((-1, 1)))
len(preds_all)

In [None]:
# Delete the `rob` and `rob_tok` directories from the working directory; the next section
# unpacks its own archives under the same names.
!rm -r rob rob_tok

DistilRoBERTa

In [None]:
%%bash
cp ../input/distil-roberta-base-save/rob.zip .
cp ../input/distil-roberta-base-save/rob_tok.zip .
unzip rob.zip
unzip rob_tok.zip 
rm -r rob.zip rob_tok.zip 

In [None]:
# Load the tokenizer from the local `rob_tok` directory.
tokenizer = RobertaTokenizer.from_pretrained('rob_tok')

In [None]:
# Re-tokenize the test excerpts with the tokenizer loaded for this section (fixed length of
# 512 tokens) and rebuild the DataLoader.
batch_size = 32

encoded_data_test = tokenizer.batch_encode_plus(
    data_test,
    add_special_tokens=True,
    return_attention_mask=True,
    # Replaces the deprecated `pad_to_max_length=True`: pad every sequence up to `max_length`.
    padding='max_length',
    # Request truncation explicitly rather than relying on the library's implicit fallback,
    # which emits a warning when only `max_length` is given.
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
# Row positions travel along as a third tensor; the embedding helpers only read the first two.
ids_test_tensor = torch.tensor(df_test.index, dtype=torch.float)

dataset_test = TensorDataset(input_ids_test,
                             attention_masks_test,
                             ids_test_tensor)

# No sampler or shuffling: batches stay in file order so predictions line up with `ind_test`.
dataloader_test = DataLoader(
    dataset_test,
    batch_size=batch_size
)

distil-roberta-base-svr-5-folds-training

In [None]:
# Fold indices 0..4; they are substituted into the checkpoint and head file names.
model_ids = list(range(5))

In [None]:
# For each fold: load the saved weights, embed the test set with get_bert_embeddings, and
# predict with the fold's pickled head (svr_head_{k}.pkl).
# NOTE: re-running this cell appends the same folds to `preds_all` a second time.
for k in tqdm.notebook.tqdm(model_ids):
    # Default source 'rob' now refers to the directory unpacked for this section.
    model = BERTRegressor()
    PATH = f'../input/distil-roberta-base-svr-5-folds-training/distroberta_base_{k}.pt'
    # map_location='cpu' lets checkpoints saved from a GPU load on CPU-only sessions too;
    # the encoder is moved to `device` explicitly right after.
    # NOTE(review): strict=False ignores missing/unexpected keys silently - confirm the
    # checkpoint keys match this module, otherwise the saved weights are not applied.
    model.load_state_dict(torch.load(PATH, map_location='cpu'), strict=False)
    embedder = model.bert.to(device)
    embeddings_test = get_bert_embeddings(embedder, dataloader_test)
    filename = f'../input/distil-roberta-base-svr-5-folds-training/svr_head_{k}.pkl'
    # Context manager so the file handle is closed as soon as the head is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load heads from trusted datasets.
    with open(filename, 'rb') as head_file:
        new_head = pickle.load(head_file)
    preds = new_head.predict(embeddings_test)
    preds_all.append(preds.reshape((-1, 1)))
len(preds_all)

In [None]:
# Delete the `rob` and `rob_tok` directories from the working directory.
!rm -r rob rob_tok

ALBERT base

In [None]:
%%bash
# Unpack the saved model (al.zip) and tokenizer (al_tok.zip) archives into the working directory.
# Later cells load from the local `al` and `al_tok` paths, presumably created by these archives.
# Abort on the first failing command instead of carrying on with a half-unpacked model.
set -e
# Extract straight from the read-only input directory (no copy/delete of the archives needed);
# -o overwrites existing files without prompting, so the cell can be re-run safely.
unzip -o ../input/albert-base-save/al.zip
unzip -o ../input/albert-base-save/al_tok.zip

In [None]:
# Load the tokenizer from the local `al_tok` directory.
tokenizer = AlbertTokenizer.from_pretrained('al_tok')

In [None]:
# Re-tokenize the test excerpts with the tokenizer loaded for this section (fixed length of
# 512 tokens) and rebuild the DataLoader.
batch_size = 32

encoded_data_test = tokenizer.batch_encode_plus(
    data_test,
    add_special_tokens=True,
    return_attention_mask=True,
    # Replaces the deprecated `pad_to_max_length=True`: pad every sequence up to `max_length`.
    padding='max_length',
    # Request truncation explicitly rather than relying on the library's implicit fallback,
    # which emits a warning when only `max_length` is given.
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
# Row positions travel along as a third tensor; the embedding helpers only read the first two.
ids_test_tensor = torch.tensor(df_test.index, dtype=torch.float)

dataset_test = TensorDataset(input_ids_test,
                             attention_masks_test,
                             ids_test_tensor)

# No sampler or shuffling: batches stay in file order so predictions line up with `ind_test`.
dataloader_test = DataLoader(
    dataset_test,
    batch_size=batch_size
)

albert-base-svr-5-folds-training

In [None]:
class BERTRegressor(torch.nn.Module):
    """ALBERT encoder followed by dropout and a single-output linear layer.

    This definition rebinds the `BERTRegressor` name used by the earlier RoBERTa
    sections, so it must only be executed once those sections have finished.
    """

    # NOTE(review): the default source is still 'rob'; the call site in this notebook
    # passes pretrained_src='al' explicitly, so the default is never exercised here.
    def __init__(self, pretrained_src = 'rob'):
        """Create the encoder and head.

        pretrained_src: name or path passed to `AlbertModel.from_pretrained`.
        """
        super().__init__()
        self.bert = AlbertModel.from_pretrained(pretrained_src)
        self.linear = torch.nn.Linear(768, 1)
        self.dropout = torch.nn.Dropout(0.15)

    def forward(self, input_ids, attention_mask):
        """Return one value per input sequence, shaped (batch, 1)."""
        encoder_outputs = self.bert(input_ids, attention_mask=attention_mask)
        # First encoder output, sliced at position 0 of every sequence (CLS token output).
        first_token_state = encoder_outputs[0][:, 0, :]
        regularized = self.dropout(first_token_state)
        return self.linear(regularized)

In [None]:
# Fold indices 0..4; they are substituted into the checkpoint and head file names.
model_ids = list(range(5))

In [None]:
# For each fold: load the saved weights, embed the test set with get_bert_embeddings, and
# predict with the fold's pickled head (svr_head_{k}.pkl).
# NOTE: re-running this cell appends the same folds to `preds_all` a second time.
for k in tqdm.notebook.tqdm(model_ids):
    model = BERTRegressor(pretrained_src = 'al')
    PATH = f'../input/albert-base-svr-5-folds-training/albert_base_{k}.pt'
    # map_location='cpu' lets checkpoints saved from a GPU load on CPU-only sessions too;
    # the encoder is moved to `device` explicitly right after.
    # NOTE(review): strict=False ignores missing/unexpected keys silently - confirm the
    # checkpoint keys match this module, otherwise the saved weights are not applied.
    model.load_state_dict(torch.load(PATH, map_location='cpu'), strict=False)
    embedder = model.bert.to(device)
    embeddings_test = get_bert_embeddings(embedder, dataloader_test)
    filename = f'../input/albert-base-svr-5-folds-training/svr_head_{k}.pkl'
    # Context manager so the file handle is closed as soon as the head is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load heads from trusted datasets.
    with open(filename, 'rb') as head_file:
        new_head = pickle.load(head_file)
    preds = new_head.predict(embeddings_test)
    preds_all.append(preds.reshape((-1, 1)))
len(preds_all)

In [None]:
# Delete the `al` and `al_tok` directories from the working directory.
!rm -r al al_tok

Final predictions

In [None]:
# Put every model's prediction column side by side, then average across models per test row.
stacked_preds = np.hstack(preds_all)
predictions_mean = stacked_preds.mean(axis = 1)
predictions_mean

In [None]:
# Wrap the test ids in a Series named 'id' so it can serve as a named index for the submission.
id_frame = pd.DataFrame({'id': ind_test})
ids = id_frame['id']

In [None]:
# Submission frame: one 'target' column holding the averaged predictions, indexed by test id.
sub_df = pd.DataFrame({'target': predictions_mean}, index=ids)
sub_df

In [None]:
# Write the submission file; the index (built from the 'id' series) is written alongside 'target'.
sub_df.to_csv('submission.csv')