In [None]:
import os
import re
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore")

import sklearn
from scipy import stats, special
from sklearn.metrics import mean_squared_error, accuracy_score

import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import RobertaTokenizer, RobertaModel, RobertaConfig

In [None]:
class CONFIG:
    """Static settings for the inference run.

    Holds the tokenizer lengths used by the different model families, the
    DataLoader batch size, the checkpoint paths of every ensemble member and
    the torch device that models and batches are moved to.
    """

    # Maximum token lengths passed to the tokenizer; each inference cell picks
    # one of these when it builds its dataset.
    len248 = 248
    len256 = 256
    len307 = 307

    # Number of excerpts per DataLoader batch.
    batch = 512

    # Checkpoints loaded into WholeNet (files are numbered 1..5).
    whole_model = [
        f"../input/clrpensemblemodels/whole-model/model_{fold}.pth"
        for fold in range(1, 6)
    ]

    # Checkpoints loaded into OneSidedNet (files are numbered 0..4).
    onesided_target = [
        f"../input/clrpensemblemodels/onesided-target/best_model_{fold}.pt"
        for fold in range(5)
    ]

    # Checkpoints loaded into LogitNet.
    logit_model = [
        "../input/clrpensemblemodels/logit-model/roberta-base-attention-logit-model-88.52.bin",
        "../input/clrpensemblemodels/logit-model/roberta-base-attention-logit-model-88.71.bin",
    ]

    # Single checkpoint loaded into CNNLogitNet.
    logit_cnn_model = "../input/clrpensemblemodels/logit-model/roberta-base-cnn-concat-conv3579-logit-model-88.18.bin"

    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def prep_text(str_):
    """Return `str_` with every newline character removed.

    Args:
        str_: The text to clean; must be a `str` (it is passed to `re.sub`).

    Returns:
        The cleaned string.
    """
    without_newlines = re.sub("\n", "", str_)
    # NOTE(review): pattern and replacement here are the same two characters
    # (apostrophe + "s"), so this substitution leaves the text unchanged. It is
    # kept as-is so the text fed to the models does not change; confirm what
    # the original intent was before altering it.
    return re.sub("\'s", r"'s", without_newlines)

In [None]:
# Only referenced by commented-out code in this notebook.
median_shift = -0.91332221

# Load the excerpts to score and normalise their text with prep_text.
test_csv_path = '../input/commonlitreadabilityprize/test.csv'
test_data = pd.read_csv(test_csv_path)
test_data["excerpt"] = test_data["excerpt"].apply(prep_text)


# test_data = test_data[~(test_data.target==0.0) & ~(test_data.standard_error==0.0)].reset_index(drop=True)
# test_data['mod_target'] = test_data['target']-median_shift
# test_data['is_difficult'] = test_data['mod_target'] < 0
# test_data['is_difficult'] = test_data['is_difficult'].astype(int)

# test_data['mod_abs'] = test_data['mod_target'].abs()

In [None]:
class CLRPDataset(Dataset):
    """Dataset that tokenizes one excerpt per item.

    Args:
        df: Object whose ``'excerpt'`` column provides the texts (``.values``
            is read once in the constructor).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``; its
            result must expose ``'input_ids'`` and ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text = df['excerpt'].values

    def __len__(self):
        """Number of excerpts held by the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize the excerpt at `index`.

        Returns:
            dict with ``'ids'`` and ``'mask'``: the tokenizer's ``input_ids``
            and ``attention_mask`` with all size-1 dimensions squeezed out.
        """
        encoded = self.tokenizer(
            self.text[index],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        renamed_fields = (('ids', 'input_ids'), ('mask', 'attention_mask'))
        return {
            out_key: encoded[tokenizer_key].squeeze()
            for out_key, tokenizer_key in renamed_fields
        }
    

In [None]:
class OneSidedNet(nn.Module):
    """RoBERTa encoder followed by attention pooling and a one-output linear head.

    Args:
        model_path: Location handed to ``RobertaModel.from_pretrained``.
        model_config: Config object handed to ``RobertaModel.from_pretrained``.
    """

    def __init__(self, model_path, model_config):
        super().__init__()

        self.base_model = RobertaModel.from_pretrained(model_path, config=model_config)

        # Produces one score per position and normalises the scores with a
        # softmax over dim 1.
        attention_layers = [
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        ]
        self.attention = nn.Sequential(*attention_layers)

        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, inputs):
        """Run the model on an ``(ids, mask)`` pair.

        Args:
            inputs: Two-item iterable ``(ids, mask)``; ``mask`` is passed to the
                encoder as ``attention_mask``.

        Returns:
            Output of the linear head applied to the attention-weighted sum of
            the encoder's first output over dim 1.
        """
        ids, mask = inputs

        # First element of the encoder output; assumed to be shaped
        # (batch, seq, 768) -- TODO confirm against the encoder in use.
        hidden = self.base_model(ids, attention_mask=mask)[0]

        # `mask` is not applied here, so every position (padding included)
        # takes part in the softmax.
        scores = self.attention(hidden)
        pooled = (scores * hidden).sum(dim=1)

        return self.regressor(pooled)

In [None]:
class WholeNet(nn.Module):
    """RoBERTa encoder followed by attention pooling and a one-output linear head.

    The encoder is stored under the attribute name ``roberta``.

    Args:
        model_path: Location handed to ``RobertaModel.from_pretrained``.
        model_config: Config object handed to ``RobertaModel.from_pretrained``.
    """

    def __init__(self, model_path, model_config):
        super().__init__()

        self.roberta = RobertaModel.from_pretrained(model_path, config=model_config)

        # Produces one score per position and normalises the scores with a
        # softmax over dim 1.
        attention_layers = [
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        ]
        self.attention = nn.Sequential(*attention_layers)

        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, inputs):
        """Run the model on an ``(ids, mask)`` pair.

        Args:
            inputs: Two-item iterable ``(ids, mask)``; ``mask`` is passed to the
                encoder as ``attention_mask``.

        Returns:
            Output of the linear head applied to the attention-weighted sum of
            the encoder's first output over dim 1.
        """
        ids, mask = inputs

        # First element of the encoder output; assumed to be shaped
        # (batch, seq, 768) -- TODO confirm against the encoder in use.
        hidden = self.roberta(ids, attention_mask=mask)[0]

        # `mask` is not applied here, so every position (padding included)
        # takes part in the softmax.
        scores = self.attention(hidden)
        pooled = (scores * hidden).sum(dim=1)

        return self.regressor(pooled)

In [None]:
class LogitNet(nn.Module):
    """RoBERTa encoder followed by attention pooling and a one-output linear head.

    The value returned by ``forward`` is the raw output of the linear head; no
    sigmoid is applied inside the module.

    Args:
        model_path: Location handed to ``RobertaModel.from_pretrained``.
        model_config: Config object handed to ``RobertaModel.from_pretrained``.
    """

    def __init__(self, model_path, model_config):
        super().__init__()

        self.base_model = RobertaModel.from_pretrained(model_path, config=model_config)

        # Produces one score per position and normalises the scores with a
        # softmax over dim 1.
        attention_layers = [
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        ]
        self.attention = nn.Sequential(*attention_layers)

        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, inputs):
        """Run the model on an ``(ids, mask)`` pair.

        Args:
            inputs: Two-item iterable ``(ids, mask)``; ``mask`` is passed to the
                encoder as ``attention_mask``.

        Returns:
            Output of the linear head applied to the attention-weighted sum of
            the encoder's first output over dim 1.
        """
        ids, mask = inputs

        # First element of the encoder output; assumed to be shaped
        # (batch, seq, 768) -- TODO confirm against the encoder in use.
        hidden = self.base_model(ids, attention_mask=mask)[0]

        # `mask` is not applied here, so every position (padding included)
        # takes part in the softmax.
        scores = self.attention(hidden)
        pooled = (scores * hidden).sum(dim=1)

        return self.regressor(pooled)

In [None]:
class CNNLogitNet(nn.Module):
    """RoBERTa encoder followed by four parallel Conv1d/MaxPool1d branches.

    Each branch convolves the encoder output with a different kernel size
    (3, 5, 7, 9), goes through a shared BatchNorm1d and ReLU, is max-pooled and
    reshaped to 307 columns. The four branches are concatenated and mapped
    through two linear layers (with dropout before each) to a single output.

    The reshape and the first linear layer hard-code 307, so the module only
    works with inputs whose pooled branch length is 307.

    Args:
        model_path: Location handed to ``RobertaModel.from_pretrained``.
        model_config: Config object handed to ``RobertaModel.from_pretrained``.
    """

    def __init__(self, model_path, model_config):
        super().__init__()

        self.base_model = RobertaModel.from_pretrained(model_path, config=model_config)

        # Every conv pads by kernel_size - 1 and is paired with a stride-1 pool
        # of the same kernel size, so a branch's output length equals its
        # input length.
        self.conv1 = nn.Conv1d(768, 1, kernel_size=3, stride=1, padding=2)
        self.conv2 = nn.Conv1d(768, 1, kernel_size=5, stride=1, padding=4)
        self.conv3 = nn.Conv1d(768, 1, kernel_size=7, stride=1, padding=6)
        self.conv4 = nn.Conv1d(768, 1, kernel_size=9, stride=1, padding=8)

        # A single BatchNorm1d instance is shared by all four branches.
        self.bn = nn.BatchNorm1d(1)

        self.pool1 = nn.MaxPool1d(kernel_size=3, stride=1)
        self.pool2 = nn.MaxPool1d(kernel_size=5, stride=1)
        self.pool3 = nn.MaxPool1d(kernel_size=7, stride=1)
        self.pool4 = nn.MaxPool1d(kernel_size=9, stride=1)

        self.hidden_layer = nn.Linear(4*307, 256)
        self.final_out = nn.Linear(256, 1)

        self.dropout = nn.Dropout(0.1)

    def forward(self, inputs):
        """Run the model on an ``(ids, mask)`` pair.

        Args:
            inputs: Two-item iterable ``(ids, mask)``; ``mask`` is passed to the
                encoder as ``attention_mask``.

        Returns:
            Output of ``final_out``; no sigmoid is applied inside the module.
        """
        ids, mask = inputs

        # Swap the last two axes of the encoder's first output so that Conv1d
        # receives the 768-wide axis as its channel axis.
        features = self.base_model(ids, attention_mask=mask)[0].permute(0, 2, 1)

        branches = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
            (self.conv4, self.pool4),
        )
        pooled = []
        for conv, pool in branches:
            activated = F.relu(self.bn(conv(features)))
            pooled.append(pool(activated).view(-1, 307))

        out = torch.cat(pooled, dim=1)

        # No non-linearity sits between the two linear layers.
        out = self.hidden_layer(self.dropout(out))
        out = self.final_out(self.dropout(out))

        return out

In [None]:
# Config and tokenizer for the CNN logit model are both read from the local
# roberta-base directory; the config is loaded with output_hidden_states
# switched on.
model_path = r"../input/clrpensemblemodels/roberta-base"
model_config = RobertaConfig.from_pretrained(model_path, output_hidden_states=True)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

In [None]:
# Score the test excerpts with the CNN logit model and store the sigmoid of
# its outputs in test_data['cnn_logits_0'].
gc.collect()
torch.cuda.empty_cache()

model = CNNLogitNet(model_path, model_config)
model.to(CONFIG.device)
model.load_state_dict(torch.load(CONFIG.logit_cnn_model, map_location=CONFIG.device))
# CNNLogitNet declares its own nn.Dropout and nn.BatchNorm1d layers, and a
# freshly built nn.Module is in training mode. Without this call dropout would
# randomly zero activations and batch norm would normalise with per-batch
# statistics (and overwrite its running statistics) while predicting.
model.eval()

test_dataset = CLRPDataset(test_data, tokenizer, CONFIG.len307)
test_dataloader = DataLoader(test_dataset, batch_size=CONFIG.batch, shuffle=False)

device = CONFIG.device
preds = []
with torch.no_grad():
    for batch in test_dataloader:
        ids = batch["ids"].to(device)
        mask = batch["mask"].to(device)
        output = model((ids, mask))

        # Move each batch off the device right away so results do not pile up
        # in GPU memory.
        preds.append(output.cpu())

# Apply the sigmoid on the CPU tensor and flatten to one value per excerpt so
# the result is always 1-D, whatever the number of rows.
preds = torch.sigmoid(torch.cat(preds)).reshape(-1).numpy()
test_data['cnn_logits_0'] = preds

In [None]:
# Config and tokenizer shared by LogitNet, OneSidedNet and WholeNet. The config
# is loaded with hidden-state output switched on, hidden dropout set to zero
# and a layer-norm epsilon of 1e-7.
model_path = r"../input/clrpensemblemodels/pretrained_roberta_base"
model_config = RobertaConfig.from_pretrained(
    model_path,
    output_hidden_states=True,
    hidden_dropout_prob=0.0,
    layer_norm_eps=1e-7,
)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

In [None]:
# Score the test excerpts with every LogitNet checkpoint and store the sigmoid
# of each checkpoint's outputs in test_data['cnn_logits_<idx>'].
gc.collect()
torch.cuda.empty_cache()

test_dataset = CLRPDataset(test_data, tokenizer, CONFIG.len256)
test_dataloader = DataLoader(test_dataset, batch_size=CONFIG.batch, shuffle=False)

# Build the network once and only swap weights per checkpoint, instead of
# re-reading the pretrained encoder from disk for every checkpoint.
model = LogitNet(model_path, model_config)
model.to(CONFIG.device)
device = CONFIG.device

# Numbering starts at 1 because 'cnn_logits_0' is written by the CNN logit cell.
for idx, trained_model in enumerate(CONFIG.logit_model, start=1):
    model.load_state_dict(torch.load(trained_model, map_location=CONFIG.device))
    # Make inference mode explicit rather than relying on whatever mode the
    # encoder was left in when it was loaded.
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            ids = batch["ids"].to(device)
            mask = batch["mask"].to(device)
            output = model((ids, mask))

            preds.append(output.cpu())

    # Sigmoid on the CPU tensor, flattened to one value per excerpt.
    preds = torch.sigmoid(torch.cat(preds)).reshape(-1).numpy()

    test_data['cnn_logits_'+str(idx)] = preds

In [None]:
# Score the test excerpts with every OneSidedNet checkpoint and store the raw
# outputs in test_data['onesided_model_<idx>'].
gc.collect()
torch.cuda.empty_cache()

test_dataset = CLRPDataset(test_data, tokenizer, CONFIG.len256)
test_dataloader = DataLoader(test_dataset, batch_size=CONFIG.batch, shuffle=False)

# One network instance is reused; each checkpoint overwrites its weights.
model = OneSidedNet(model_path, model_config)
model.to(CONFIG.device)
device = CONFIG.device

for idx, trained_model in enumerate(CONFIG.onesided_target):
    model.load_state_dict(torch.load(trained_model, map_location=CONFIG.device))
    # Make inference mode explicit rather than relying on whatever mode the
    # encoder was left in when it was loaded.
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            ids = batch["ids"].to(device)
            mask = batch["mask"].to(device)
            output = model((ids, mask))

            preds.append(output.cpu())

    # Flatten to one value per excerpt so a 1-D array (not an (N, 1) matrix)
    # is assigned to the column.
    preds = torch.cat(preds).reshape(-1).numpy()

    test_data['onesided_model_'+str(idx)] = preds

In [None]:
# Score the test excerpts with every WholeNet checkpoint and store the raw
# outputs in test_data['whole_model_<idx>'].
gc.collect()
torch.cuda.empty_cache()

test_dataset = CLRPDataset(test_data, tokenizer, CONFIG.len248)
test_dataloader = DataLoader(test_dataset, batch_size=CONFIG.batch, shuffle=False)

# One network instance is reused; each checkpoint overwrites its weights.
model = WholeNet(model_path, model_config)
model.to(CONFIG.device)
device = CONFIG.device

for idx, trained_model in enumerate(CONFIG.whole_model):
    model.load_state_dict(torch.load(trained_model, map_location=CONFIG.device))
    # Make inference mode explicit rather than relying on whatever mode the
    # encoder was left in when it was loaded.
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            ids = batch["ids"].to(device)
            mask = batch["mask"].to(device)
            output = model((ids, mask))

            preds.append(output.cpu())

    # Flatten to one value per excerpt so a 1-D array (not an (N, 1) matrix)
    # is assigned to the column.
    preds = torch.cat(preds).reshape(-1).numpy()

    test_data['whole_model_'+str(idx)] = preds
    
    
    

In [None]:
# Blend the per-model predictions: feed them to five saved XGBoost boosters
# and average the boosters' outputs into the submission target.
columns = [
    'cnn_logits_0', 'cnn_logits_1', 'cnn_logits_2',
    'onesided_model_0', 'onesided_model_1', 'onesided_model_2', 'onesided_model_3', 'onesided_model_4',
    'whole_model_0', 'whole_model_1', 'whole_model_2', 'whole_model_3', 'whole_model_4',
]

# The feature matrix is identical for every fold, so build it once instead of
# once per booster.
ensemble_values = xgb.DMatrix(test_data[columns].values)

xgb_preds = []
for k in range(5):
    xgb_model = xgb.Booster()
    xgb_model.load_model('../input/clrpensemblemodels/xgb_regression/xgb_fold{}.json'.format(str(k)))
    xgb_preds.append(xgb_model.predict(ensemble_values))

submission_df = pd.DataFrame()
submission_df['id'] = test_data['id']
# Element-wise mean across the five boosters.
submission_df['target'] = np.mean(xgb_preds, axis=0)

In [None]:
# Write the id/target table to submission.csv in the working directory,
# leaving out the DataFrame index.
submission_df.to_csv("submission.csv", index=False)

In [None]:
# submission_df = pd.DataFrame()
# submission_df['id'] = test_data['id']
# submission_df['logit'] = np.where(test_data.cnn_logits_0.round() + test_data.cnn_logits_0.round() + test_data.cnn_logits_0.round()>1, -1, 1)
# submission_df['onesided'] = test_data[['onesided_model_0', 'onesided_model_1', 'onesided_model_2', 'onesided_model_3', 'onesided_model_4']].mean(axis=1)
# submission_df['target'] = (submission_df['logit']*submission_df['onesided']) + median_shift

# submission_df = submission_df[['id', 'target']]

# submission_df['onesided_inf'] = submission_df['logit']*submission_df['onesided'] + median_shift
# submission_df['whole'] = test_data[['whole_model_0', 'whole_model_1', 'whole_model_2', 'whole_model_3', 'whole_model_4']].mean(axis=1)
# submission_df['target'] = test_data['target']