# About this notebook

* PyTorch DistilBERT inference code.
* Training notebook is [here](https://www.kaggle.com/snnclsr/commonlit-readability-training/).

If this notebook is helpful, feel free to upvote :)

**Some parts of this notebook are taken from [Y.Nakama](https://www.kaggle.com/yasufuminakama)'s notebooks. Please also check out his other notebooks [here](https://www.kaggle.com/yasufuminakama/code).**

# Imports

In [None]:
import os
import time
import math
import random
from pathlib import Path

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoConfig
from transformers import AutoModel, AutoTokenizer
from transformers import DistilBertTokenizer

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

# Data Loading

In [None]:
# Directory from which test.csv is read below.
BASE_DATA_PATH = Path("../input/commonlitreadabilityprize/")

# IPython shell escape (not plain Python): list the files in that directory.
!ls {BASE_DATA_PATH}

In [None]:
# Read test.csv into a DataFrame; its `excerpt` column is what CommonLitDataset tokenizes.
df_test = pd.read_csv(BASE_DATA_PATH / "test.csv")

In [None]:
# Display the first three rows as a quick look at the loaded data.
df_test.head(3)

# Dataset

In [None]:
class CommonLitDataset(Dataset):
    """Dataset that tokenizes the ``excerpt`` column of a DataFrame on access.

    Each item is a dict holding the tokenizer's ``input_ids`` under ``"ids"``
    and its ``attention_mask`` under ``"masks"``; in both cases only index 0
    along the first axis of the tokenizer output is returned.
    """

    def __init__(self, df, tokenizer, max_length):
        # Stored as given; tokenization happens lazily in __getitem__.
        self.df, self.tokenizer, self.max_length = df, tokenizer, max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the excerpt at positional row ``idx``.

        The text is padded to ``max_length`` and truncated beyond it, so the
        tokenizer is asked for fixed-length PyTorch tensors.
        """
        text = self.df.iloc[idx]["excerpt"]
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        # The tokenizer is called with a single text; keep entry 0 of each tensor.
        key_pairs = (("ids", "input_ids"), ("masks", "attention_mask"))
        return {out_key: encoded[src_key][0] for out_key, src_key in key_pairs}

# Model

In [None]:
class TextRegressionModel(nn.Module):
    """Pretrained encoder followed by a small regression head.

    The encoder's last hidden state at sequence position 0 is passed through
    ``Linear(768, 768) -> ReLU -> Dropout -> Linear(768, 1)``.

    NOTE(review): ``model_name`` is accepted but never read; the encoder is
    always loaded from ``CFG.model_path``.
    """

    def __init__(self, model_name, dropout_p=0.1):
        super().__init__()

        # Encoder weights come from the local directory in CFG.model_path.
        self.model = AutoModel.from_pretrained(CFG.model_path)
        # Head layers; the attribute names are part of the checkpoint keys.
        self.features = nn.Linear(768, 768)
        self.dropout = nn.Dropout(dropout_p)
        self.out = nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        """Return the head's output (last dimension of size 1) for a batch."""
        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state of the first token of every sequence.
        first_token_state = encoded.last_hidden_state[:, 0]
        hidden = self.dropout(F.relu(self.features(first_token_state)))
        return self.out(hidden)

# Config

In [None]:
class CFG:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""

    # --- Model -------------------------------------------------------------
    model_name = "distilbert-base-cased"
    # Local directory holding the default (HuggingFace) model files.
    model_path = "../input/pt-distilbert-base-cased/distilbert-base-cased/"
    dropout_p = 0.3

    # --- Data --------------------------------------------------------------
    max_length = 256  # passed to the tokenizer as its padding/truncation length
    batch_size = 16
    # num_workers = 4

    # --- Optimisation ------------------------------------------------------
    # NOTE(review): the settings below are not read by the inference cells;
    # presumably they are carried over from the training notebook.
    n_epochs = 10
    lr = 3e-4
    min_lr = 1e-6
    weight_decay = 1e-6
    scheduler = "CosineAnnealingLR"
    T_max = 10

    # --- Misc --------------------------------------------------------------
    seed = 42
    n_folds = 5
    print_freq = 500


# Inference

In [None]:
def inference(model, states, data_loader, device=None):
    """Predict with every checkpoint in ``states`` over all of ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``. Its
            weights are overwritten by each entry of ``states`` in turn, and
            it is left on ``device`` in eval mode.
        states: Iterable of state dicts accepted by ``model.load_state_dict``.
        data_loader: Iterable of batches, each a mapping with tensor entries
            ``"ids"`` and ``"masks"``. It is iterated once per state.
        device: Device to run on. ``None`` selects CUDA when available and
            the CPU otherwise.

    Returns:
        ``np.ndarray`` with one entry per state along axis 0. Each entry holds
        that checkpoint's outputs for every batch, concatenated along the
        batch axis in loader order.

    Raises:
        ValueError: If ``data_loader`` yields no batches (nothing to
            concatenate).
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # load_state_dict() copies into the existing parameters, so the device and
    # eval mode only need to be set once, not once per checkpoint.
    model.to(device)
    model.eval()

    results = []
    with torch.no_grad():
        for state in states:
            model.load_state_dict(state)
            batch_outputs = [
                model(batch["ids"].to(device), batch["masks"].to(device)).cpu().numpy()
                for batch in data_loader
            ]
            # Concatenate (not stack-and-take-first) so that every batch is
            # kept and a shorter final batch does not break the shapes.
            results.append(np.concatenate(batch_outputs))

    return np.array(results)

In [None]:
# The tokenizer is loaded from a local directory rather than by model name.
tokenizer = DistilBertTokenizer.from_pretrained("../input/distilbertbasecased/distilbert-base-cased_tokenizer/")
dataset = CommonLitDataset(df_test, tokenizer, CFG.max_length)
# shuffle=False keeps batches in df_test row order; the predictions are later
# assigned positionally to the submission frame.
data_loader = DataLoader(dataset, batch_size=CFG.batch_size, 
                         shuffle=False)

In [None]:
# One checkpoint file per fold; only the "model" entry of each file is kept.
# map_location="cpu" lets checkpoints that were saved from a GPU load on a
# CPU-only machine and keeps the copies off the GPU; load_state_dict() copies
# the values onto the model's own device when they are applied.
states = [
    torch.load(
        f"../input/distilbertbasecased/distilbert-base-cased_fold_{fold_idx}_best.pth",
        map_location="cpu",
    )["model"]
    for fold_idx in range(CFG.n_folds)
]

In [None]:
# Build a single model instance that is reused for every checkpoint.
model = TextRegressionModel(CFG.model_name, CFG.dropout_p)
# No load_state_dict() / to(device) here: inference() performs both.
# model.load_state_dict()
# model.to(device)

In [None]:
# Run each loaded checkpoint over the test loader; axis 0 of the result indexes the checkpoints.
outputs = inference(model, states, data_loader, device)

In [None]:
# Reuse the sample submission's rows and columns; only `target` is overwritten.
df_sub = pd.read_csv(BASE_DATA_PATH / "sample_submission.csv")
# Average over the checkpoints (axis 0), then flatten the trailing size-1
# output axis so that a 1-D array is assigned to the column.
df_sub["target"] = outputs.mean(axis=0).ravel()
df_sub.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame that was just written to submission.csv.
df_sub