In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
        
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from sklearn.preprocessing import KBinsDiscretizer
import transformers
from transformers import AutoModel, BertTokenizerFast
from torch.utils.data import TensorDataset
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset
import random

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling that collapses dimension 1 of a feature tensor.

    Each position is scored by ``V(tanh(W(x)))``; the scores are softmax-ed
    along dimension 1 and used as weights for a sum of the input features
    over that same dimension.

    Args:
        hidden_dim: Accepted for interface compatibility but NOT used; the
            layer sizes are fixed at 768 -> 512 -> 1 regardless of this value.
    """

    def __init__(self, hidden_dim=512):
        super().__init__()
        # Sizes are hard-coded: inputs must have a last dimension of 768.
        self.W = nn.Linear(768, 512)
        self.V = nn.Linear(512, 1)

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dimension 1.

        Args:
            features: Tensor whose last dimension is 768.
                Presumably (batch, seq_len, 768) -- TODO confirm with caller.

        Returns:
            Tensor with dimension 1 summed out.
        """
        scores = self.V(torch.tanh(self.W(features)))
        # Normalise the scores across dimension 1 so the weights sum to one.
        weights = scores.softmax(dim=1)
        return (weights * features).sum(dim=1)

class CLRPModel(nn.Module):
    """Transformer encoder followed by attention pooling and a 1-unit linear layer.

    Args:
        transformer: Module called as ``transformer(input_ids, attention_mask)``
            whose result exposes ``last_hidden_state``.
        config: Object providing ``hidden_size``; used to size the final
            linear layer and passed to ``AttentionHead``.
    """

    def __init__(self, transformer, config):
        super().__init__()
        self.h_size = config.hidden_size
        self.transformer = transformer
        self.head = AttentionHead(self.h_size)
        self.linear = nn.Linear(self.h_size, 1)

    def forward(self, input_ids, attention_mask):
        """Encode the inputs, pool the hidden states and project to one value.

        Args:
            input_ids: Forwarded as the first positional argument of the encoder.
            attention_mask: Forwarded as the second positional argument.

        Returns:
            Output of the final ``nn.Linear(h_size, 1)`` layer.
        """
        hidden_states = self.transformer(input_ids, attention_mask).last_hidden_state
        return self.linear(self.head(hidden_states))



In [None]:

def convert_examples_to_features(text, tokenizer, max_len=None):
    """Tokenize one text into a fixed-length encoding.

    Args:
        text: The text to encode.
        tokenizer: Object exposing ``encode_plus(text, max_length=...,
            truncation=..., padding=...)``.
        max_len: Target length used for both truncation and padding. When
            omitted, the notebook-wide ``Config.max_len`` is used, which keeps
            existing two-argument calls working unchanged.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these arguments.
    """
    if max_len is None:
        # Resolved at call time, so Config may be defined in a later cell.
        max_len = Config.max_len

    return tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        padding='max_length',
    )


class CLRPDataset(Dataset):
    """Dataset that tokenizes one excerpt per item.

    Args:
        data: Table exposing an ``excerpt`` column and, unless ``is_test`` is
            set, a ``target`` column (both read via attribute access and
            ``.tolist()``).
        tokenizer: Passed through to ``convert_examples_to_features``.
        is_test: When true, no targets are read and items carry no ``label``.
    """

    def __init__(self, data, tokenizer, is_test=False):
        self.data = data
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.excerpts = data.excerpt.tolist()
        # Targets only exist for labelled data; the attribute is left unset otherwise.
        if not is_test:
            self.targets = data.target.tolist()

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the encoded excerpt at ``item`` as a dict of tensors.

        Keys are ``input_ids`` and ``attention_mask`` (both ``torch.long``),
        plus ``label`` (``torch.float``) when the dataset is not in test mode.
        """
        encoded = convert_examples_to_features(self.excerpts[item], self.tokenizer)
        sample = {
            name: torch.tensor(encoded[name], dtype=torch.long)
            for name in ('input_ids', 'attention_mask')
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.float)
        return sample
        
        
        

In [None]:

class Config:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""

    # Not referenced by the visible inference code; presumably kept in sync
    # with the training notebook -- TODO confirm.
    model_name = 'roberta-base'
    pretrained_model_path = '../input/clrp-roberta-base/clrp_roberta_base'
    output_hidden_states = True
    epochs = 3
    seed = 42

    # Batch size used by the test DataLoader.
    batch_size = 16
    # Prefer the GPU, but fall back to CPU so the notebook still runs on a
    # session without an accelerator instead of failing on the first `.to()`.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Length every excerpt is truncated / padded to during tokenization.
    max_len = 256


# Load the competition test set.
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

# SECURITY: torch.load is pickle-based and can execute arbitrary code while
# deserializing. Only load tokenizer / model files from sources you trust.
tokenizer = torch.load('../input/tokenizers/roberta-tokenizer.pt')

models_preds = []
n_models = 5

# The test data is identical for every model, so build the dataset and its
# loader once instead of once per checkpoint.
test_ds = CLRPDataset(data=test_df, tokenizer=tokenizer, is_test=True)
test_sampler = SequentialSampler(test_ds)
test_dataloader = DataLoader(test_ds, sampler=test_sampler, batch_size=Config.batch_size)

# Only checkpoint 0 is evaluated; iterate over range(n_models) to ensemble all of them.
for model_num in [0]:
    print(f'Inference#{model_num+1}/{n_models}')

    # map_location lets a checkpoint saved on a GPU be restored on whatever
    # device Config.device names (e.g. a CPU-only session).
    # NOTE(review): recent PyTorch releases default torch.load to
    # weights_only=True, which rejects whole-module pickles like this one;
    # on such versions pass weights_only=False explicitly.
    model = torch.load(
        f'../input/clrp-finetune/best_model_{model_num}.pt',
        map_location=Config.device,
    ).to(Config.device)
    model.eval()

    all_preds = []
    # Inference only: no gradients are needed for any batch.
    with torch.no_grad():
        for batch in test_dataloader:
            sent_id = batch['input_ids'].to(Config.device)
            mask = batch['attention_mask'].to(Config.device)
            preds = model(sent_id, mask)
            all_preds.extend(preds.flatten().cpu().tolist())

    models_preds.append(all_preds)
    



In [None]:
# Turn the list of per-model prediction lists into an array (one row per model).
models_preds = np.array(models_preds)
print(models_preds.shape)
print(models_preds)

# Average over the model axis, then rescale.
# NOTE(review): the 1.2000005 factor is not explained anywhere in this
# notebook -- confirm it is an intentional post-processing step.
ensemble_mean = np.mean(models_preds, axis=0)
all_preds = ensemble_mean * 1.2000005
print(all_preds.shape)

result_df = pd.DataFrame({'id': test_df.id, 'target': all_preds})

# Write the same submission file to both output locations.
for out_path in ('/kaggle/working/submission.csv', 'submission.csv'):
    result_df.to_csv(out_path, index=False)

result_df.head(10)