In [None]:
# Imports

import os
import numpy as np
import pandas as pd
import random
import gc
import sys
import cv2
import math
import time

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging
from transformers import AutoModelForSequenceClassification

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, SequentialSampler, RandomSampler, DataLoader
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
import torchvision
import torch.optim as optim


from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold, KFold

import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error

logging.set_verbosity_error()

## Mean Pooling

In [None]:
# Definitions
# Shared paths and hyperparameters for the mean-pooling RoBERTa inference cells.

# Competition data directory; test.csv and sample_submission.csv are read from here.
INPUT_DIR = '../input/commonlitreadabilityprize'
# Local copy of the pretrained backbone, handed to the model and tokenizer loaders.
MODEL_DIR = '../input/roberta-transformers-pytorch/roberta-large'

# Encoder hidden width expected for MODEL_DIR — presumably roberta-large's; TODO confirm.
HIDDEN_SIZE = 1024
# Not referenced in the visible cells — presumably the encoder depth; TODO confirm.
NUM_HIDDEN_LAYERS = 24
LAYER_START = 4   # for WeightedLayerPoolingModel

HIDDEN_DIM_FC = 128    # for AttentionPooling

# Token length that get_test_loader pads/truncates every excerpt to.
MAX_LENGTH = 300
# LR / EPS are not referenced in the visible cells — presumably optimizer
# settings kept from the training notebook; TODO confirm.
LR = 2e-5
EPS = 1e-8

# Not referenced in the visible cells (the inference loop iterates SEEDS instead).
SEED = 42

# Number of fold checkpoints loaded per seed by the inference loop.
NUM_FOLDS = 5

# Seeds whose checkpoints are ensembled; each entry is used to build the checkpoint file name.
SEEDS = [64]

# EPOCHS / TRAIN_BATCH_SIZE / VAL_BATCH_SIZE are not referenced in the visible
# cells — presumably training-time settings; TODO confirm.
EPOCHS = 5
TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 32

# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# K Fold
class ContinuousStratifiedKFold(StratifiedKFold):
    """StratifiedKFold variant that stratifies on a binned continuous target."""

    def split(self, x, y, groups=None):
        """Bin ``y`` into equal-width intervals and stratify the split on the bin ids.

        Args:
            x: samples, forwarded unchanged to ``StratifiedKFold.split``.
            y: continuous target values; must support ``len`` and ``pd.cut``.
            groups: forwarded unchanged to ``StratifiedKFold.split``.

        Returns:
            Whatever ``StratifiedKFold.split`` returns for ``(x, bins, groups)``.
        """
        # Number of bins grows with the sample count: floor(1 + log2(n)).
        num_bins = int(np.floor(1 + np.log2(len(y))))
        # labels=False makes pd.cut return integer bin codes instead of intervals.
        bins = pd.cut(y, bins=num_bins, labels=False)
        return super().split(x, bins, groups)
    
def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only takes
    # effect in processes launched after this call (e.g. DataLoader workers
    # started with spawn); it is still exported for those.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it has to stay off.
    torch.backends.cudnn.benchmark = False

In [None]:
class MeanPoolingModel(nn.Module):
    """Transformer encoder followed by masked mean pooling and a linear regressor.

    Sub-module names (``model``, ``linear``, ``loss``) are part of the saved
    checkpoints' state-dict keys and must not be renamed.
    """

    def __init__(self, model_name):
        """Load the pretrained encoder at ``model_name`` and build the regression head."""
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config=config)
        # Take the head width from the checkpoint's own config so the layer
        # always matches the encoder that was actually loaded.
        self.linear = nn.Linear(config.hidden_size, 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the MSE loss when ``labels`` is given, otherwise the predictions.

        Args:
            input_ids: token ids, forwarded to the encoder.
            attention_mask: mask over tokens; positions with 0 are excluded
                from the mean.
            labels: optional regression targets.

        Returns:
            A scalar loss tensor if ``labels`` is not None; otherwise the
            predictions with trailing singleton dimensions squeezed away
            (a one-element batch therefore comes back as a 0-d tensor).
        """
        # Keyword arguments keep the call correct regardless of the positional
        # order of the concrete encoder's forward signature.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]

        # Broadcast the mask over the hidden dimension so masked positions
        # contribute nothing to the sum.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # Clamp guards against division by zero for an all-masked row.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        logits = self.linear(mean_embeddings)

        # First squeeze drops the output dimension; the second collapses a
        # batch of one to a 0-d tensor and is a no-op for larger batches.
        preds = logits.squeeze(-1).squeeze(-1)

        if labels is not None:
            return self.loss(preds.view(-1).float(), labels.view(-1).float())
        return preds

In [None]:
# Load the competition test set from INPUT_DIR and preview its first two rows.
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
test.head(2)

In [None]:
# Batch size of the DataLoader built by get_test_loader (one excerpt per batch).
TEST_BATCH_SIZE = 1

def get_test_loader(data):
    """Tokenize ``data.excerpt`` and wrap the tensors in a sequential DataLoader.

    Args:
        data: object whose ``excerpt`` attribute supports ``.tolist()`` and
            yields the texts to score (the notebook passes the test DataFrame).

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask)`` batches of
        ``TEST_BATCH_SIZE`` rows, in the same order as ``data``.
    """
    x_test = data.excerpt.tolist()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

    # Call the tokenizer directly (as CLRPDataset does): this is the supported
    # entry point, whereas batch_encode_plus is deprecated in its favour.
    encoded_test = tokenizer(
        x_test,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    dataset_test = TensorDataset(
        encoded_test['input_ids'],
        encoded_test['attention_mask']
    )

    # SequentialSampler keeps predictions aligned with the input row order.
    dataloader_test = DataLoader(
        dataset_test,
        sampler=SequentialSampler(dataset_test),
        batch_size=TEST_BATCH_SIZE
    )

    return dataloader_test

# Tokenize the test set once; the same loader is reused for every fold checkpoint.
test_dataloader = get_test_loader(test)

In [None]:
# Directory holding the fine-tuned mean-pooling checkpoints.
DIR = '../input/clrp-mp/'


# One list of per-excerpt predictions per seed (already averaged over folds).
all_predictions = []
for seed in SEEDS:

    fold_predictions = []

    for fold in tqdm(range(NUM_FOLDS)):

        # NOTE(review): file names use seed + 1 and fold + 1 — confirm this
        # matches how the checkpoints were saved.
        model_path = DIR + f"model_{seed + 1}_{fold + 1}.pth"
        print(f"\nUsing {model_path}")

        model = MeanPoolingModel(MODEL_DIR)
        # map_location remaps stored tensors onto DEVICE, so the checkpoint
        # also loads on a CPU-only runtime if it was saved from a GPU.
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.to(DEVICE)
        model.eval()

        predictions = []
        # Inference only: disabling autograd avoids building a graph for
        # every forward pass and the memory that graph would hold.
        with torch.no_grad():
            for batch in test_dataloader:

                batch = tuple(b.to(DEVICE) for b in batch)

                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         None,
                         }

                preds = model(**inputs)
                # reshape(-1) covers both the 0-d output of a one-element
                # batch and the 1-d output of larger batches.
                predictions.extend(preds.reshape(-1).cpu().tolist())

        # Release the checkpoint before the next fold's model is loaded.
        del model
        gc.collect()

        fold_predictions.append(predictions)
    # Average the folds of this seed, per excerpt.
    all_predictions.append(np.mean(fold_predictions, axis=0).tolist())

# Average across seeds, per excerpt.
sub_pred1 = np.mean(all_predictions,axis=0)

## RoBERTa + XGBoost

In [None]:
# Read both splits through INPUT_DIR so the data location is defined in one place.
train_data = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test_data = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))

# Bin the continuous target into floor(1 + log2(n)) equal-width intervals;
# the integer bin ids are what the stratified K-fold split is keyed on.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

target = train_data['target'].to_numpy()
bins = train_data.bins.to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Settings for the RoBERTa-embedding + XGBoost part of the notebook.
config = {
    'batch_size':128,  # DataLoader batch size used when extracting embeddings
    'max_len':256,     # tokenizer max_length used by CLRPDataset
    'nfolds':5,        # intended number of cross-validation folds
    'seed':42,         # seed for seed_everything and the K-fold shuffling
}

# Re-seed all generators before the embedding / XGBoost cells run.
seed_everything(seed=config['seed'])

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per lookup."""

    def __init__(self,df,tokenizer):
        """Keep the ``excerpt`` column of ``df`` as a NumPy array, plus the tokenizer."""
        self.excerpt = df['excerpt'].to_numpy()
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of excerpts held by the dataset."""
        return len(self.excerpt)

    def __getitem__(self,idx):
        """Tokenize excerpt ``idx``, padded/truncated to ``config['max_len']``."""
        text = self.excerpt[idx]
        tokenizer_kwargs = dict(
            return_tensors='pt',
            max_length=config['max_len'],
            padding='max_length',
            truncation=True,
        )
        return self.tokenizer(text, **tokenizer_kwargs)

In [None]:
class AttentionHead(nn.Module):
    """Additive attention pooling over dimension 1 of the input features.

    ``num_targets`` is accepted but not used by this class.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        # Bookkeeping attributes; out_features mirrors hidden_dim.
        self.in_features = in_features
        self.middle_features = hidden_dim

        # W projects each position to hidden_dim, V scores it with one scalar.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` along dimension 1."""
        projected = self.W(features).tanh()
        # Softmax over dim 1 turns the per-position scores into weights.
        weights = self.V(projected).softmax(dim=1)
        return (weights * features).sum(dim=1)

In [None]:
class AttentionModel(nn.Module):
    """RoBERTa encoder whose token states are pooled by an AttentionHead.

    ``forward`` returns the pooled vector; ``dropout`` and ``linear`` are
    constructed here but are not applied in ``forward``.
    """

    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained('../input/roberta-base')
        self.head = AttentionHead(768,768,1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.head.out_features,1)

    def forward(self,**xb):
        """Encode the keyword inputs and return the attention-pooled features."""
        encoder_outputs = self.roberta(**xb)
        token_states = encoder_outputs[0]
        return self.head(token_states)

In [None]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Extract one pooled AttentionModel vector per excerpt in ``df``.

    Args:
        df: DataFrame with an ``excerpt`` column (consumed by CLRPDataset).
        path: checkpoint file loaded into a fresh AttentionModel.
        plot_losses: accepted for interface compatibility; not used here.
        verbose: when True (default) print the device and show a progress bar.

    Returns:
        ``np.ndarray`` built from the per-row model outputs, in ``df`` order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = AttentionModel()
    # map_location remaps stored tensors onto the active device, so the
    # checkpoint also loads on a CPU-only runtime if it was saved from a GPU.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

    ds = CLRPDataset(df,tokenizer)
    # shuffle=False / drop_last=False keep a one-to-one, in-order mapping
    # between rows of df and rows of the returned array.
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Wrapping the loader itself (not enumerate) lets tqdm see its length
        # and report progress against the total number of batches.
        for inputs in tqdm(dl, disable=not verbose):
            # Collated tensors carry an extra axis from return_tensors='pt';
            # flatten everything after the batch axis before the forward pass.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs.detach().cpu().numpy()
            embeddings.extend(outputs)
    return np.array(embeddings)

In [None]:
# For each of the five checkpoints, embed the training set and then the test
# set (same call order as writing the ten calls out one by one).
(
    (train_embeddings1, test_embeddings1),
    (train_embeddings2, test_embeddings2),
    (train_embeddings3, test_embeddings3),
    (train_embeddings4, test_embeddings4),
    (train_embeddings5, test_embeddings5),
) = [
    (get_embeddings(train_data, checkpoint), get_embeddings(test_data, checkpoint))
    for checkpoint in (
        '../input/clr-roberta/model0/model0.bin',
        '../input/clr-roberta/model1/model1.bin',
        '../input/clr-roberta/model2/model2.bin',
        '../input/clr-roberta/model3/model3.bin',
        '../input/clr-roberta/model4/model4.bin',
    )
]

In [None]:
def get_preds_xgb(X,y,X_test,bins=bins,nfolds=5):
    """Cross-validate an XGBRegressor on ``X`` and average its test predictions.

    Args:
        X: training feature matrix, indexable by integer index arrays.
        y: training targets aligned with ``X``.
        X_test: test feature matrix; ``X_test.shape[0]`` predictions are returned.
        bins: stratification labels for the K-fold split (defaults to the
            module-level ``bins`` captured when this function is defined).
        nfolds: number of folds to train and to average over.

    Returns:
        ``np.ndarray`` with the mean of the per-fold predictions on ``X_test``.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))

    # The split count and the divisor of the final average must be the same
    # number, so both are driven by ``nfolds``.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = XGBRegressor()
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        # Arguments in (y_true, y_pred) order, matching rmse_score's signature.
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        # Accumulate this fold's test predictions; averaged on return.
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

In [None]:
# One cross-validated XGBoost prediction vector per embedding checkpoint,
# computed in checkpoint order.
preds1, preds2, preds3, preds4, preds5 = [
    get_preds_xgb(train_features, target, test_features)
    for train_features, test_features in (
        (train_embeddings1, test_embeddings1),
        (train_embeddings2, test_embeddings2),
        (train_embeddings3, test_embeddings3),
        (train_embeddings4, test_embeddings4),
        (train_embeddings5, test_embeddings5),
    )
]

In [None]:
# Equal-weight average of the five XGBoost prediction vectors.
sub_preds2 = (preds1 + preds2 + preds3 + preds4 + preds5)/5

In [None]:
submit = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))
# Equal-weight blend of the mean-pooling ensemble and the XGBoost ensemble.
# Bracket assignment always writes a DataFrame column; attribute assignment
# would only set a Python attribute if the column were ever missing.
submit['target'] = (sub_pred1 + sub_preds2)/2
submit

In [None]:
# Write the blended predictions to submission.csv without the index column.
submit.to_csv('submission.csv',index=False)