In [None]:
import pandas as pd
import numpy as np
import torch

# Load the competition test split; later cells read its `id` and `excerpt` columns.
test = pd.read_csv(
    filepath_or_buffer="../input/commonlitreadabilityprize/test.csv",
)

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModel


# Module-level tokenizer and encoder, both loaded from the same local
# directory; TextData and ReadModel pick these globals up by name.
tokenizer = AutoTokenizer.from_pretrained('../input/robertabase')
bert = AutoModel.from_pretrained(
    '../input/robertabase',
    output_hidden_states=False,
)

In [None]:
from torch.utils.data import Dataset, DataLoader

class TextData(Dataset):
    """Map-style dataset that tokenizes one text per item and pads it to a fixed length.

    Args:
        text: Indexable collection of strings.
        labels: Indexable collection of numeric targets aligned with ``text``.
        max_len: Length every returned sequence is truncated / padded to.

    The module-level ``tokenizer`` is captured at construction time and stored
    as ``self.tokenizer``; item lookup always goes through that attribute, so
    it can be replaced on an instance without touching the global.
    """

    def __init__(self, text, labels, max_len=250):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize, truncate and right-pad the text at index ``item``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (``torch.long``
            tensors of length ``max_len``) and ``label`` (``torch.double``
            tensor built from ``labels[item]``).
        """
        # Newlines are removed outright (not replaced by a space).
        # NOTE(review): presumably this matches the preprocessing the saved
        # weights were trained with -- confirm before changing it.
        encoded = self.tokenizer(
            self.text[item].replace('\n', ''),
            max_length=self.max_len,
            truncation=True,
            return_attention_mask=True,
        )

        # Right-pad with zeros up to max_len; the padded positions also get an
        # attention_mask of 0.
        padding = [0] * (self.max_len - len(encoded['input_ids']))

        return {
            'input_ids': torch.tensor(encoded['input_ids'] + padding, dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'] + padding, dtype=torch.long),
            'label': torch.tensor(self.labels[item], dtype=torch.double),
        }

In [None]:
from transformers import AutoModel

class ReadModel(torch.nn.Module):
    """Regressor: the module-level ``bert`` encoder followed by a small MLP head.

    The head maps a 768-wide vector to a single value
    (768 -> 384 -> 128 -> 1) with LeakyReLU activations and dropout.
    """

    def __init__(self):
        super().__init__()
        self.bert = bert
        self.dropout = torch.nn.Dropout(0.2)
        # Attribute names and the layer order inside the Sequential determine
        # the state_dict keys, so they are kept stable for saved weights.
        self.hidden = torch.nn.Sequential(
            torch.nn.Linear(768, 384),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(384, 128),
            torch.nn.LeakyReLU(),
        )
        self.regressor = torch.nn.Linear(128, 1)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and regress one value per sequence.

        Args:
            input_ids: Token-id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder.

        Returns:
            Output of the final ``Linear(128, 1)`` layer (last dimension 1).
        """
        # Pass the mask by keyword so it cannot be bound to a different
        # positional parameter of the encoder's forward().
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        # Keep only position 0 of every sequence.
        # NOTE(review): presumably the <s>/CLS token -- confirm for this encoder.
        pooled = encoded.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        features = self.hidden(pooled)
        logits = self.regressor(features)
        return logits




# Build the regressor around the already-loaded module-level `bert` encoder.
model = ReadModel()

In [None]:

# Deserialize onto the CPU first so the checkpoint loads no matter which device
# it was saved from; the model is moved to the GPU afterwards when one exists.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(
    torch.load('../input/commonlit-readability-lab2/n2_model.pth', map_location='cpu')
)

model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Inference loader settings: keep the original row order and keep the final
# partial batch, so predictions line up one-to-one with the rows of `test`.
test_params = dict(batch_size=128 * 4, shuffle=False, drop_last=False)

# predict() only reads 'input_ids' and 'attention_mask' from each batch, so
# an all-zero label array is passed as a placeholder.
f = TextData(
    test['excerpt'].values,
    np.zeros(len(test)),
)

f_generator = DataLoader(f, **test_params)

In [None]:
def predict(model, data_loader):
    """Run ``model`` over ``data_loader`` in eval mode and collect its outputs.

    Args:
        model: ``torch.nn.Module`` called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of batches, each a dict holding ``input_ids`` and
            ``attention_mask`` tensors; must expose ``.dataset`` whose length is
            the total number of samples.

    Returns:
        1-D float64 NumPy array of length ``len(data_loader.dataset)`` with the
        flattened model outputs in iteration order.
    """
    model.eval()

    # Send inputs to wherever the model's weights live instead of assuming
    # CUDA, so the function also works for a model kept on the CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    result = np.zeros(len(data_loader.dataset))
    index = 0

    with torch.no_grad():
        for item in data_loader:
            input_ids = item['input_ids'].to(device)
            attention_mask = item['attention_mask'].to(device)

            pred = model(input_ids, attention_mask)
            batch_size = pred.shape[0]

            # Explicit tensor -> ndarray conversion before writing the slice.
            result[index:index + batch_size] = pred.flatten().cpu().numpy()
            index += batch_size

    return result

In [None]:
# Pair each test id with the model's prediction for the same row.
submission_df = pd.DataFrame(
    data={
        'id': test['id'].values,
        'target': predict(model, f_generator),
    }
)

In [None]:
# Write the predictions to disk without the DataFrame index column.
submission_df.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Bare expression: the notebook renders the submission table as this cell's output.
submission_df