# Importing Modules

In [None]:
# Standard imports
import os
from pprint import pprint
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import trange
from colorama import Fore, Back, Style
import time
from glob import glob

# For plotting
import plotly.express as px
import plotly.graph_objects as go

# For model building
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

# For Transformers
from transformers import AutoTokenizer, BertModel

import warnings
warnings.filterwarnings("ignore")

# CONFIG

In [None]:
# Every path and hyper-parameter used by the notebook lives in this one mapping.
cfg = {
    'train_csv': "../input/commonlitreadabilityprize/train.csv",
    'test_csv': "../input/commonlitreadabilityprize/test.csv",
    'sample_sub': "../input/commonlitreadabilityprize/sample_submission.csv",
    'epochs': 20,
    'max-len': 256,    # sequence length the tokenizer pads/truncates to
    'train_bs': 8,
    'val_bs': 16,      # batch size of the inference DataLoader
    'active-model': '../input/bert-base-uncased',
}

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

pprint(cfg)
print(f"\nCurrent Device : {DEVICE}")

# Building DataLoader

Thanks to [Abhishek Thakur](https://www.kaggle.com/abhishek) for the [bert-base-uncased dataset](https://www.kaggle.com/abhishek/bert-base-uncased) used in this notebook.

In [None]:
# Dataset class that tokenizes each excerpt for BERT (wrapped in a DataLoader at inference time)

class BERTDataset(Dataset):
    """Map-style dataset that converts raw text excerpts into BERT input tensors.

    The tokenizer checkpoint and the padded sequence length are both read from
    the global ``cfg`` mapping (``cfg['active-model']`` and ``cfg['max-len']``).
    """

    def __init__(self, txt):
        """Keep the indexable collection of texts and load the tokenizer."""
        self.txt = txt
        self.tokenizer = AutoTokenizer.from_pretrained(cfg['active-model'])
        self.max_len = cfg['max-len']

    def __len__(self):
        """Return how many texts the dataset holds."""
        return len(self.txt)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns a dict with the keys ``ids``, ``mask`` and ``token_type_ids``;
        each value is a ``torch.long`` tensor built from the tokenizer output.
        """
        # Some excerpts contain newline characters; they are removed outright
        # (no replacement character is inserted).
        cleaned = str(self.txt[idx]).replace("\n", "")

        # Pad or truncate every excerpt to exactly ``self.max_len`` tokens.
        encoded = self.tokenizer.encode_plus(
            cleaned,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
        )

        # Output key -> tokenizer key.
        key_pairs = (
            ('ids', "input_ids"),
            ('mask', "attention_mask"),
            ('token_type_ids', "token_type_ids"),
        )
        return {
            out_key: torch.tensor(encoded[src_key], dtype=torch.long)
            for out_key, src_key in key_pairs
        }

# Loading Models

In [None]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by a small two-layer regression head.

    The encoder weights are loaded from ``cfg['active-model']``. The head maps
    768 features -> 10 -> 1, with ReLU and dropout (p=0.2) in between.
    """

    def __init__(self):
        super().__init__()

        # Attribute names determine the parameter names in the state_dict,
        # so they must stay as they are for saved checkpoints to load.
        self.bert = BertModel.from_pretrained(cfg['active-model'])
        self.drop = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(768, 10)
        self.out = nn.Linear(10, 1)

    def forward(self, ids, mask):
        """Run the encoder and the head; returns the output of the final linear layer.

        ``ids`` and ``mask`` are forwarded to the encoder as the input ids and
        ``attention_mask`` respectively.
        """
        encoder_outputs = self.bert(ids, attention_mask=mask)

        # First encoder output -- presumably the per-token hidden states of shape
        # (batch, seq_len, 768); TODO confirm. It is averaged over dim 1 with a
        # plain mean: the attention mask is not applied to the average.
        token_states = encoder_outputs[0]
        pooled = token_states.mean(dim=1)

        hidden = self.drop(self.relu(self.linear(pooled)))
        return self.out(hidden)

# Loading Test Data

In [None]:
# Load the competition test set. Only the first rows are previewed so the cell
# output stays small no matter how large the test file is.
test_df = pd.read_csv(cfg['test_csv'])
test_df.head()

# Inference 

Here I build an ensemble from my trained models: every checkpoint predicts the test set and the predictions are averaged.
You can find my training notebook [here](https://www.kaggle.com/hotsonhonet/helpme).

In [None]:
def run():
    """Predict the test set with every saved checkpoint.

    Each ``*.pth`` file in ``../input/bestbertbaseuncasedmodelsclrp/`` is loaded
    into a fresh ``BERTBaseUncased`` model, which then predicts every excerpt in
    ``test_df``.

    Returns:
        list: one NumPy array of predictions per checkpoint, in sorted
        checkpoint-path order; each array follows the row order of ``test_df``.

    Raises:
        FileNotFoundError: if no checkpoint file is found.
    """
    # Look for checkpoints first so a missing model directory fails immediately
    # (before the tokenizer is loaded) instead of returning an empty list.
    # Sorting makes the order of the returned predictions reproducible.
    model_paths = sorted(glob("../input/bestbertbaseuncasedmodelsclrp/*.pth"))
    if not model_paths:
        raise FileNotFoundError(
            "No '*.pth' checkpoints found in ../input/bestbertbaseuncasedmodelsclrp/"
        )

    test_dataset = BERTDataset(test_df['excerpt'].values)
    test_dataloader = DataLoader(
                            test_dataset,
                            batch_size = cfg['val_bs'],
                            num_workers = 4,
                            shuffle = False,   # keep predictions aligned with test_df rows
                            pin_memory = DEVICE.type == 'cuda'   # pinning only helps GPU transfers
                        )

    test_predictions = []

    for idx in trange(len(model_paths), desc = "Making Predictions", bar_format="{l_bar}%s{bar:50}%s{r_bar}" % (Fore.CYAN, Fore.RESET), position = 0, leave = True):

        cur_model_path = model_paths[idx]

        # map_location lets checkpoints saved on a GPU load on a CPU-only
        # machine; the model is moved to DEVICE afterwards.
        cur_state = torch.load(cur_model_path, map_location = 'cpu')
        cur_model = BERTBaseUncased()
        cur_model.load_state_dict(cur_state['state_dict'])
        cur_model.to(DEVICE)
        cur_model.eval()   # disable dropout for inference

        cur_preds = []

        with torch.no_grad():
            for d in test_dataloader:
                ids = d['ids'].to(DEVICE)
                mask = d['mask'].to(DEVICE)

                output = cur_model(ids, mask)

                # Drop the trailing size-1 dimension produced by the final linear layer.
                output = output.squeeze(-1)

                cur_preds.append(output.cpu().numpy())

        # Join the per-batch arrays into one array for this checkpoint.
        test_predictions.append(np.concatenate(cur_preds))

    return test_predictions


In [None]:
# Run inference with every checkpoint; run() returns a list holding one
# prediction array per model.
test_preds = run()

In [None]:
# Inspect the raw per-model predictions before they are averaged.
pprint(test_preds)

In [None]:
# Warnings are silenced in this notebook, so averaging an empty list of
# predictions would yield NaN and fill every target row without any visible
# error. Fail loudly instead.
if len(test_preds) == 0:
    raise ValueError("test_preds is empty: there are no model predictions to average")

# Ensemble = element-wise mean over the models (axis 0 indexes the models).
sample_sub = pd.read_csv(cfg['sample_sub'])
sample_sub['target'] = np.array(test_preds).mean(axis = 0)
sample_sub.head()

In [None]:
# Write the submission file to the current working directory; index = False
# keeps the DataFrame row index out of the CSV.
sample_sub.to_csv("submission.csv", index = False)