# Inference for Pytorch BERT beginner's room


#### This page is the inference notebook for the following notebooks.
 * English
     https://www.kaggle.com/chumajin/pytorch-bert-beginner-s-room
 
 
 
 * Japanese
     https://www.kaggle.com/chumajin/pytorch-bert-beginner-s-room-version

#### The models were trained with random seed 508 (same code) and uploaded as a dataset.


#### If you trained your own models via Copy and Edit, add your output to this notebook's input and change the model path accordingly.


------------------日本語-------------------------------------------------------------------



#### このページは以下のノートブックのinferenceのページです。
 * English
     https://www.kaggle.com/chumajin/pytorch-bert-beginner-s-room
 
 
 
 * Japanese
     https://www.kaggle.com/chumajin/pytorch-bert-beginner-s-room-version

#### モデルは別途ランダムシード508で作成されたものをupしました。(コードは同じです)


#### Copy and editされた方は、その結果をインプットから登録してmodel pathに入れてご使用ください

In [None]:
import numpy as np 
import pandas as pd 
import os
       
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import matplotlib.pyplot as plt 

import transformers
import random


import warnings
# Silence library warnings to keep the notebook output readable.
warnings.simplefilter('ignore')

# Gradient scaler for mixed-precision (AMP) speed-ups on GPU.
scaler = torch.cuda.amp.GradScaler()

# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

# 0. Please enter the model path

(If you trained your own models via Copy and Edit, you may want to point this at your own output.)

## 0. modelパスに作成したモデルのフォルダパスを入れてください。


(Copy and editで作成された方は、ご自分の結果を入れるといいかもしれません)

In [None]:
# Directory that holds the trained checkpoints; every file whose name
# contains ".pth" is picked up from here further down.
result_path = "../input/pytorchbiginnersroommodel-v2"

In [None]:
SEED = 508


def random_seed(SEED):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``SEED``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(SEED)

    torch.backends.cudnn.deterministic = True


random_seed(SEED)

In [None]:
# Load the BERT tokenizer from the local bert-base-uncased input directory.
# BERTDataSet reads this module-level name when encoding each excerpt.
tokenizer = transformers.BertTokenizer.from_pretrained("../input/bert-base-uncased")

In [None]:
# Read the competition test set; its "excerpt" column is what gets tokenized.
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
test.head(3)

# 1. Creating a Dataset (this is inference, so there is no target and it is omitted)
## If you changed the tokenizer's max length during training, change the 314 below to match.

## 1.データセットの作成。インファレンスなので、targetは除去しています。
### もし、tokenizerのmax lengthを変えている場合は、314というところを変えてください。

In [None]:
class BERTDataSet(Dataset):
    """Dataset that tokenizes raw excerpts for BERT inference (no targets).

    Encoding is done with the module-level ``tokenizer``.

    Args:
        sentences: Sequence of texts, e.g. a pandas Series or a list.
        max_length: Token length every sample is padded/truncated to. It
            should match the value used when the model was trained.
    """

    def __init__(self, sentences, max_length=314):
        self.sentences = sentences
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the encoded tensors for the sample at position ``idx``."""
        # DataLoader samplers yield positions 0..len-1, so index by position.
        # Plain [] on a pandas Series is label-based and fails (or returns the
        # wrong row) when the Series does not have the default index.
        if hasattr(self.sentences, "iloc"):
            sentence = self.sentences.iloc[idx]
        else:
            sentence = self.sentences[idx]

        # Collapse every whitespace run (including newlines) to one space.
        sentence = " ".join(str(sentence).split())

        bert_sens = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,  # [CLS], [SEP]
            max_length=self.max_length,
            padding="max_length",  # replaces deprecated pad_to_max_length=True
            truncation=True,
        )

        return {
            'ids': torch.tensor(bert_sens['input_ids'], dtype=torch.long),
            'mask': torch.tensor(bert_sens['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(bert_sens['token_type_ids'], dtype=torch.long),
        }
        

In [None]:
# Wrap the test excerpts in the inference dataset (no targets).
test_dataset = BERTDataSet(test["excerpt"])

In [None]:
# Batch size used by the test DataLoader.
test_batch = 32

In [None]:
# shuffle=False keeps batches in dataset order, so the concatenated
# predictions line up row-for-row with the test excerpts.
test_dataloader = DataLoader(test_dataset,batch_size=test_batch,shuffle = False,num_workers=8,pin_memory=True)

# 2. model load

In [None]:
# Build BERT with a single-output head (num_labels=1). These weights are only
# a starting shell: predicting() overwrites them with each checkpoint's
# state_dict before running inference.
model = transformers.BertForSequenceClassification.from_pretrained('../input/bert-base-uncased',num_labels=1)

In [None]:
# Collect every checkpoint file in result_path. os.listdir returns entries in
# arbitrary order, so sort to make the checkpoint order reproducible.
pthes = sorted(
    os.path.join(result_path, s) for s in os.listdir(result_path) if ".pth" in s
)
pthes

In [None]:
# map_location remaps tensors saved on a GPU onto the active device, so the
# checkpoints also load on a CPU-only machine.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
states = [torch.load(s, map_location=device) for s in pthes]

# 3. prediction function

In [None]:
def predicting(
    test_dataloader,
    model,
    states,
    target_device=None,
):
    """Run inference with every checkpoint and collect the predictions.

    Args:
        test_dataloader: Iterable of batches; each batch is a mapping with
            "ids", "mask" and "token_type_ids" tensors.
        model: Module whose weights are overwritten by each checkpoint. It is
            called as ``model(ids, mask, token_type_ids)`` and must return a
            mapping with a "logits" entry.
        states: Iterable of checkpoints, each a mapping that stores the
            weights under the "state_dict" key.
        target_device: Device to run on. Defaults to the module-level
            ``device``.

    Returns:
        A list with one NumPy array per checkpoint, each holding that
        checkpoint's predictions for the whole dataloader in iteration order.
        An empty dataloader yields an empty array per checkpoint.
    """
    run_device = device if target_device is None else target_device

    # Moving the model once is enough: load_state_dict copies values into the
    # existing parameters, which stay on their current device.
    model.to(run_device)
    model.eval()

    allpreds = []

    with torch.no_grad():
        for state in states:
            model.load_state_dict(state["state_dict"])

            batch_preds = []
            for batch in test_dataloader:
                ids = batch["ids"].to(run_device)
                mask = batch["mask"].to(run_device)
                tokentype = batch["token_type_ids"].to(run_device)

                # Positional order: input ids, attention mask, token type ids.
                output = model(ids, mask, tokentype)
                logits = output["logits"].squeeze(-1)
                batch_preds.append(logits.cpu().numpy())

            if batch_preds:
                allpreds.append(np.concatenate(batch_preds))
            else:
                # np.concatenate raises on an empty list.
                allpreds.append(np.empty(0, dtype=np.float32))

    return allpreds


In [None]:
# One prediction array per checkpoint in `states`.
allpreds = predicting(test_dataloader,model,states)

# 4. Average the 5 models and make the submission.
## 5個のモデルを平均化してサブミッションファイルを作成します。

In [None]:
# Transpose so each column is one checkpoint and each row one test sample.
findf = pd.DataFrame(allpreds).T

In [None]:
# Display the per-checkpoint prediction table.
findf

In [None]:
# Ensemble: average the per-checkpoint predictions for each test row.
finpred = findf.mean(axis=1)
finpred

In [None]:
# Load the sample submission to use as the template for the output file.
sample = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
sample

In [None]:
# Series assignment aligns on index; both sides carry the default RangeIndex
# here, so row i of the submission receives prediction i.
sample["target"] = finpred

In [None]:
# Display the filled-in submission table.
sample

In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv("submission.csv",index = False)

# Thank you so much !