This is the inference notebook for models trained with the notebooks listed below.

This notebook uses the model produced by the pretraining notebook.

1. Pretrain Roberta Model: https://www.kaggle.com/maunish/clrp-pytorch-roberta-pretrain
2. Finetune Roberta Model: https://www.kaggle.com/maunish/clrp-pytorch-roberta-finetune <br/>
   Finetune Roberta Model TPU: https://www.kaggle.com/maunish/clrp-pytorch-roberta-finetune-tpu
3. Inference Notebook: this notebook
4. Roberta + SVM: https://www.kaggle.com/maunish/clrp-roberta-svm


In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader,Sampler

from transformers import (AutoModel, AutoTokenizer,
                          AutoConfig ,AutoModelForSequenceClassification)

from colorama import Fore, Back, Style
# Short aliases for colorama foreground colour codes and the style reset code.
# NOTE(review): none of these aliases are referenced in the cells visible here.
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

In [None]:
# Read the three competition CSV files: the training set, the test set and
# the sample submission (its 'target' column is filled in at the end).
train_data, test_data, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

In [None]:
# Shared settings. Only 'batch_size', 'max_len' and 'seed' are read by the
# cells visible here; the other keys look like training-time settings carried
# over from the fine-tuning notebook -- TODO confirm.
config = dict(
    learning_rate=2e-5,
    batch_size=16,
    epochs=3,
    nfolds=5,
    seed=42,
    max_len=256,
)

def seed_everything(seed=42):
    """Seed every random number generator used here for reproducible runs.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable Python reads is PYTHONHASHSEED (it was previously misspelt
    # as PYTHONASSEED, which nothing reads). It is consulted at interpreter
    # start-up, so it influences processes launched after this call rather
    # than the interpreter that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this is a silent
    # no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

# Seed all random number generators up front with the seed from `config`.
seed_everything(seed=config['seed'])

In [None]:
class CLRPDataset(Dataset):
    """Dataset that tokenizes the ``excerpt`` column of a DataFrame on access.

    Args:
        df: DataFrame containing an ``excerpt`` column of raw text.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``.
        max_len: Optional sequence length passed to the tokenizer as
            ``max_length``. When ``None`` (the default) the module-level
            ``config['max_len']`` is used, which is the previous behaviour.
    """

    def __init__(self, df, tokenizer, max_len=None):
        self.excerpt = df['excerpt'].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        # Resolved on every access so that the default keeps tracking the
        # current value of config['max_len'], exactly as before.
        max_len = config['max_len'] if self.max_len is None else self.max_len
        # NOTE(review): with return_tensors='pt' a Hugging Face tokenizer
        # presumably yields tensors with a leading dimension of 1 per item;
        # get_prediction flattens that after batching -- confirm.
        return self.tokenizer(
            self.excerpt[idx],
            return_tensors='pt',
            max_length=max_len,
            padding='max_length',
            truncation=True,
        )

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling: collapse dimension 1 into a single weighted sum.

    Each position is scored with ``V(tanh(W(x)))``, the scores are turned into
    weights by a softmax over dimension 1, and the input is summed along that
    dimension using those weights.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_dim: Width of the intermediate scoring layer.

    Note:
        ``out_features`` is set to ``hidden_dim``, although the vector
        returned by ``forward`` keeps the input's last dimension
        (``in_features``). The two coincide when both arguments are equal.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = self.out_features = hidden_dim

        # Scoring network: projection to hidden_dim, then one scalar score.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dim 1.

        Assumes ``features`` is (batch, seq_len, in_features) -- TODO confirm
        against callers.
        """
        scores = self.V(self.W(features).tanh())
        # Normalise over dim 1 so the weights along that axis sum to one.
        weights = scores.softmax(dim=1)
        # The trailing size-1 score dimension broadcasts across the features.
        return (weights * features).sum(dim=1)

In [None]:
class Model(nn.Module):
    """Transformer backbone with attention pooling and a one-unit linear head.

    Args:
        model_path: Name or directory passed to ``AutoModel.from_pretrained``
            and ``AutoConfig.from_pretrained``.
    """

    def __init__(self, model_path):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_path)
        self.config = AutoConfig.from_pretrained(model_path)
        width = self.config.hidden_size
        self.head = AttentionHead(width, width)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(width, 1)

    def forward(self, **xb):
        """Run the backbone on the keyword inputs and return the head output.

        ``xb`` is forwarded unchanged to the backbone; only the first element
        of its output is used (presumably the per-token hidden states --
        TODO confirm).
        """
        token_states = self.roberta(**xb)[0]
        pooled = self.head(token_states)
        return self.linear(self.dropout(pooled))

In [None]:
def get_prediction(df,path,model_path,device='cuda'):
    """Predict a target value for every excerpt in ``df`` with one checkpoint.

    Args:
        df: DataFrame with an ``excerpt`` column.
        path: Path of the fine-tuned ``state_dict`` file to load.
        model_path: Directory/name of the pretrained backbone; also used to
            load the tokenizer.
        device: Device the model and the batches are moved to.

    Returns:
        1-D ``np.ndarray`` with one prediction per row of ``df``, in row
        order (the loader does not shuffle).
    """
    # Build the network and restore the fine-tuned weights.
    model = Model(model_path)
    model.load_state_dict(torch.load(path,map_location=device))
    model.to(device)
    model.eval()  # disables dropout for inference

    # Load the tokenizer that matches the backbone.
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Wrap the frame in a dataset that tokenizes each excerpt on access.
    test_ds = CLRPDataset(df,tokenizer)
    test_dl = DataLoader(test_ds,
                        batch_size = config["batch_size"],
                        shuffle=False,
                        num_workers = 4,
                        pin_memory=True)

    predictions = []
    # No gradients are needed for inference; without no_grad() every forward
    # pass would build an autograd graph and waste memory.
    with torch.no_grad():
        # Iterating the loader directly lets tqdm read its length and show
        # a real progress bar.
        for inputs in tqdm(test_dl):
            # Flatten each tensor to (batch, -1) before moving it to the
            # device.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            # Append this batch's predictions as plain Python floats.
            predictions.extend(outputs.cpu().numpy().ravel().tolist())

    # Drop the last references to the model before clearing the CUDA cache;
    # otherwise its weights stay allocated while the next checkpoint loads.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    return np.array(predictions)

In [None]:
# Predict with each of the five fold checkpoints (model0 .. model4) of the
# roberta-large fine-tune, then average the five prediction vectors.
large_backbone_dir = '../input/clrp-pytorch-roberta-pretrain-roberta-large/clrp_roberta_large'
large_checkpoint = '../input/clrp-pytorch-roberta-finetune-roberta-large/model{fold}/model{fold}.bin'

pred1, pred2, pred3, pred4, pred5 = [
    get_prediction(test_data, large_checkpoint.format(fold=fold), large_backbone_dir)
    for fold in range(5)
]

predictions1 = (pred1 + pred2 + pred3 + pred4 + pred5)/5

In [None]:
# Same procedure for the TPU fine-tuned checkpoints (roberta-base backbone):
# predict with the five fold models and average them.
base_backbone_dir = '../input/clrp-pytorch-roberta-pretrain/clrp_roberta_base'
tpu_checkpoint = '../input/clrp-pytorch-roberta-finetune-tpu/model{fold}/model{fold}.bin'

pred1, pred2, pred3, pred4, pred5 = [
    get_prediction(test_data, tpu_checkpoint.format(fold=fold), base_backbone_dir)
    for fold in range(5)
]
predictions2 = (pred1 + pred2 + pred3 + pred4 + pred5)/5

In [None]:
# The final prediction is the mean of the two ensembles: `predictions1`
# (roberta-large checkpoints) and `predictions2` (TPU fine-tuned roberta-base
# checkpoints).
# NOTE(review): a second assignment used to overwrite this average with
# `predictions1` alone, discarding the second ensemble. If a large-only
# submission is really intended, assign `predictions1` here instead and drop
# the cell that computes `predictions2`.
sample['target'] = (predictions1 + predictions2)/2

sample.to_csv('submission.csv',index=False)

In [None]:
# Bare expression: shows the final submission frame as the cell output.
sample