# Testing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import math

from collections import Counter
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader

from transformers import RobertaModel, RobertaTokenizer, BertTokenizer, BertModel, AdamW

In [2]:
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test = test[['id', 'excerpt']]
test.head()

Unnamed: 0,id,excerpt
0,c0f722661,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,It was a bright and cheerful scene that greete...
3,04caf4e0c,Cell division is the process by which a parent...
4,0e63f8bea,Debugging is the process of finding and resolv...


In [3]:
test['excerpt'] = test.excerpt.apply(lambda x: re.sub(r'[\n]', ' ', x))
X_test = test.excerpt.to_numpy()

In [4]:
class MyModel(nn.Module):

    def __init__(self):
        super().__init__()
        
        self.bert = torch.load("../input/roberta/bt")
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 1)
        self.do = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask,token_type_ids):
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        x = torch.tanh(self.fc1(x.pooler_output))
        x = self.do(x)
        x = torch.tanh(self.fc2(x))
        x = self.do(x)
        x = self.fc3(x)
        return x

In [5]:
tokenizer = torch.load("../input/roberta/tokenizer")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

num_folds = 5
MAX_SEQ_LEN = 256
preds_folds = []
for i in range(num_folds):
    model = MyModel()
    model.to(device)
    model_path = "../input/mytrainedmodels/model_%s" % i
    model.load_state_dict(torch.load(model_path))
    
    preds = []
    for i in test.excerpt:
        test_token = tokenizer([i], padding='max_length', truncation=True, max_length=MAX_SEQ_LEN, return_token_type_ids=True, return_tensors="pt")
        input_ids = test_token['input_ids'].to(device)
        attention_mask = test_token['attention_mask'].to(device)
        token_type_ids = test_token['token_type_ids'].to(device)
        test_pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        test_pred = test_pred.cpu().detach().numpy().astype("float").item()
        preds.append(test_pred)
    
    preds_folds.append(preds)
    
tgt = np.mean(np.array(preds_folds), axis=0)
ans = pd.DataFrame({'id': test.id, 'target': tgt})
ans

Unnamed: 0,id,target
0,c0f722661,-0.348936
1,f0953f0a5,-0.66228
2,0df072751,-0.48331
3,04caf4e0c,-2.568693
4,0e63f8bea,-1.785356
5,12537fe78,-1.268913
6,965e592c0,0.078832


In [6]:
ans.to_csv('submission.csv', index=False)