In [None]:
import os
import gc
import torch
import joblib
import numpy as np
import pandas as pd
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig

In [None]:
# Test split that the inference cells further down read their excerpts and ids from.
test_df = pd.read_csv(filepath_or_buffer="../input/commonlitreadabilityprize/test.csv")

In [None]:
class Config:
    """Settings shared by the cells of this notebook.

    The inference code in this notebook reads the checkpoint paths, ``MAX_LEN``
    and ``DEVICE``. The training-oriented fields (``EPOCHS``, ``LR``,
    ``TRAIN_BATCH_SIZE``, the interval settings, ``FOLDS``, ``TARGET_COLS``)
    are presumably kept for parity with the training notebook -- TODO confirm.
    """

    def __init__(self):
        self.TARGET_COLS = ["target"]
        self.MAX_LEN = 256
        self.AUTOMODEL_CHECKPOINT = "../input/roberta-base"
        self.TOKENIZER_CHECKPOINT = "../input/roberta-base"
        self.EPOCHS = 3
        self.TRAIN_BATCH_SIZE = 16
        self.EVAL_BATCH_SIZE = 16
        self.LR = 5e-5
        # Prefer the GPU, but fall back to CPU so the notebook still runs
        # end-to-end on a runtime without CUDA instead of failing on the
        # first `.to(config.DEVICE)`.
        self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
        self.EVAL_INTERVAL = 20
        self.LOG_INTERVAL = 20
        self.FOLDS = 5


config = Config()

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per ``__getitem__`` call.

    Args:
        data: Object exposing an ``excerpt`` column (and a ``target`` column
            when ``is_test`` is False), e.g. a pandas DataFrame.
        tokenizer_checkpoint: Name or path passed to
            ``AutoTokenizer.from_pretrained``.
        max_length: Every sample is truncated / padded to this many tokens.
        is_test: When True no targets are read and no ``"label"`` is returned.

    Each item is a dict with 1-D ``torch.long`` tensors ``"input_ids"`` and
    ``"attention_mask"``; for non-test data it also carries a scalar
    ``torch.float`` tensor under ``"label"``.
    """

    def __init__(self, data, tokenizer_checkpoint, max_length: int = 256, is_test: bool = False):
        self.excerpts = data.excerpt.values.tolist()
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
        self.max_len = max_length
        self.is_test = is_test
        if not self.is_test:
            self.targets = data.target.values.tolist()

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.excerpts[idx],
            max_length=self.max_len,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )
        # The tokenizer already returns tensors (return_tensors="pt"), so use
        # `as_tensor` rather than `torch.tensor`: no redundant copy and no
        # "copy construct from a tensor" UserWarning on every sample.
        # squeeze(0) drops the leading batch dimension of size one.
        sample = {
            key: torch.as_tensor(encoded[key], dtype=torch.long).squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        if not self.is_test:
            sample["label"] = torch.tensor(self.targets[idx], dtype=torch.float)
        return sample

    def __len__(self):
        return len(self.excerpts)

In [None]:
class AttentionHead(nn.Module):
    """Pools along dim 1 of ``features`` with learned softmax weights.

    A score per position is computed as ``V(tanh(W(features)))``; the scores
    are softmax-normalised over dim 1 and used to take a weighted sum of the
    input, so the result keeps the input's last dimension.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        # NOTE(review): forward() returns vectors of width `in_features`; this
        # attribute records `hidden_dim` instead -- confirm nothing relies on it.
        self.out_features = hidden_dim
        # Attribute names W / V are part of the state-dict keys; keep them.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        scores = self.V(torch.tanh(self.W(features)))
        weights = scores.softmax(dim=1)
        return (weights * features).sum(dim=1)
    
class CLRPModel(nn.Module):
    """Pretrained encoder -> AttentionHead pooling -> dropout -> 1-unit linear.

    Args:
        path: Name or path handed to ``AutoModel.from_pretrained`` and
            ``AutoConfig.from_pretrained``.

    ``forward`` returns the linear layer's output when ``labels`` is None,
    otherwise ``(loss, output)``.
    """

    def __init__(self,path):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(path)
        self.config = AutoConfig.from_pretrained(path)
        width = self.config.hidden_size
        self.head = AttentionHead(width, width)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(width, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        # Element 0 of the encoder output is what gets pooled by the head.
        token_states = self.roberta(input_ids=input_ids, attention_mask=attention_mask)[0]
        prediction = self.linear(self.dropout(self.head(token_states)))
        if labels is None:
            return prediction
        # NOTE(review): `loss_fn` is not defined in the visible notebook, so
        # passing `labels` raises NameError unless it is defined elsewhere.
        loss = loss_fn(prediction, labels)
        return prediction if loss is None else (loss, prediction)

In [None]:
def predict(model, test_loader):
    """Run ``model`` over every batch of ``test_loader`` without gradients.

    Args:
        model: ``nn.Module`` called as ``model(input_ids=..., attention_mask=...)``
            and returning a tensor of predictions.
        test_loader: Iterable of dict batches holding ``"input_ids"`` and
            ``"attention_mask"`` tensors.

    Returns:
        A flat list with one numpy scalar per prediction, in loader order
        (empty when the loader yields nothing).

    Side effects:
        Leaves ``model`` in eval mode.
    """
    # Send inputs to wherever the weights live, so the two can never disagree.
    # A parameter-less model leaves the batches on their current device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # eval() is loop-invariant: switch dropout off once, not once per batch.
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(logits.detach().view(-1).cpu().numpy())
    # The former `del model` only removed this function's local reference and
    # freed nothing; the caller still owns the model.
    gc.collect()
    return preds

In [None]:
def run(model_dir, model_prefix, test_data):
    """Average the predictions of every matching checkpoint over ``test_data``.

    Args:
        model_dir: Directory walked recursively for checkpoint files.
        model_prefix: Substring a file name must contain to be used.
        test_data: Data handed to ``CLRPDataset`` in test mode.

    Returns:
        ``numpy.ndarray`` holding the per-sample mean over all checkpoints.

    Raises:
        FileNotFoundError: If no file under ``model_dir`` matches
            ``model_prefix``. Previously this yielded a NaN "mean" that was
            silently written into the submission.
    """
    dtest = CLRPDataset(
        data=test_data,
        tokenizer_checkpoint=config.TOKENIZER_CHECKPOINT,
        max_length=config.MAX_LEN,
        is_test=True,
    )
    test_loader = DataLoader(
        dataset=dtest,
        batch_size=config.EVAL_BATCH_SIZE,
        shuffle=False,
        drop_last=False,
    )

    # Sorted so the folds are processed (and logged) in the same order on
    # every run; os.walk gives no ordering guarantee.
    checkpoint_paths = sorted(
        os.path.join(root, filename)
        for root, _, filenames in os.walk(model_dir)
        for filename in filenames
        if model_prefix in filename
    )
    if not checkpoint_paths:
        raise FileNotFoundError(
            f"No checkpoint containing {model_prefix!r} found under {model_dir!r}"
        )

    # One model instance is reused; each fold only swaps the weights in place,
    # so it needs to be moved to the device just once.
    model = CLRPModel(path=config.AUTOMODEL_CHECKPOINT)
    model.to(config.DEVICE)

    print("Starting inference...")
    all_preds = []
    for checkpoint_path in checkpoint_paths:
        # Deserialize on CPU and let load_state_dict copy into the model's
        # device: avoids holding a second full copy of the weights on the GPU.
        # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
        state_dict = torch.load(checkpoint_path, map_location="cpu")
        model.load_state_dict(state_dict)
        print(f"Predicting using model: {os.path.basename(checkpoint_path)}")
        all_preds.append(predict(model, test_loader))

    final_preds = np.mean(np.array(all_preds), axis=0)
    print("Inference Completed")
    return final_preds

In [None]:
# Fold-averaged predictions for the test set.
predictions = run(
    model_dir="../input/clrp-roberta-base",
    model_prefix="model-fold-",
    test_data=test_df,
)

In [None]:
# Pair each test id with its prediction and write the table without the index.
submission = pd.DataFrame({"id": test_df["id"].values, "target": predictions})
submission.to_csv("submission.csv", index=False)