In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields (directory, subdirectories, files) for every directory below the root;
# the subdirectory list is ignored because only file paths are printed.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import torch
from copy import deepcopy
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig
from tqdm import tqdm
import torch.nn as nn
import gc

In [None]:
def get_substring_span(text, substring, min_length=10, fraction=0.999):
    """
    Locate `substring` in `text`, falling back to progressively shorter prefixes.

    The full substring is searched first with `str.find`. When it is absent, the
    candidate is cut down to its first `int(len(candidate) * fraction)` characters
    and searched again, until a prefix is found or the prefix would be shorter
    than `min_length`.

    The search is iterative, so very long unmatched substrings cannot exhaust
    Python's recursion limit.

    Args:
        text: String to search in.
        substring: String to look for.
        min_length: Shortest prefix length that is still searched for.
        fraction: Share of the candidate kept after each failed search.

    Returns:
        `[start, end]` character offsets of the match inside `text`, where `end`
        is exclusive and reflects the matched (possibly truncated) prefix, or
        `[-1, 0]` when no sufficiently long prefix occurs in `text`.
    """
    candidate = substring
    while True:
        position = text.find(candidate)
        if position != -1:
            return [position, position + len(candidate)]

        # The candidate is non-empty here: an empty string is always found at offset 0.
        shortened = candidate[:int(len(candidate) * fraction)]
        if len(shortened) >= len(candidate):
            # A `fraction` of 1 or more would never shrink the candidate;
            # drop one character so the search always terminates.
            shortened = candidate[:-1]

        if len(shortened) < min_length:
            return [-1, 0]
        candidate = shortened


def read_file(path, encoding="utf-8"):
    """
    Read a whole text file into a single string.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. It is passed explicitly
            so the result does not depend on the platform's locale default.

    Returns:
        The complete file content as a string.
    """
    with open(path, "r", encoding=encoding) as file:
        return file.read()

def preprocess(data_frame,
               essay_id_column="essay_id",
               essay_path_column="essay_path",
               essay_text_column="essay_text",
               discourse_text_column="discourse_text",
               compute_lengths=True,
               directory="./",
               file_format="txt"):
    """
    Return a copy of `data_frame` enriched with essay text and discourse spans.

    Added columns:
        * `essay_path_column`: `<directory>/<essay id>.<file_format>`.
        * `essay_text_column`: content of the file at that path.
        * `<discourse_text_column>_span`: result of `get_substring_span` for the
          discourse text inside the essay text.
        * `<essay_text_column>_length` and `<discourse_text_column>_length`
          (only when `compute_lengths` is true): whitespace-separated word counts.

    Args:
        data_frame: Frame holding at least the essay id and discourse text columns.
        essay_id_column: Name of the column with essay ids.
        essay_path_column: Name of the column to store essay file paths in.
        essay_text_column: Name of the column to store essay texts in.
        discourse_text_column: Name of the column with discourse texts.
        compute_lengths: Whether to add the two word-count columns.
        directory: Directory that contains the essay files.
        file_format: Extension of the essay files, without the dot.

    Returns:
        A new data frame; the input frame is left unmodified.
    """
    data_frame = deepcopy(data_frame)

    data_frame[essay_path_column] = data_frame[essay_id_column].apply(
        lambda essay_id: os.path.join(directory, f"{essay_id}.{file_format}")
    )

    # Rows that share an essay path reuse one read instead of reopening the file per row.
    essay_texts = {
        essay_path: read_file(essay_path)
        for essay_path in data_frame[essay_path_column].unique()
    }
    data_frame[essay_text_column] = data_frame[essay_path_column].map(essay_texts)

    data_frame[f"{discourse_text_column}_span"] = data_frame.apply(
        lambda sample: get_substring_span(text=sample[essay_text_column],
                                          substring=sample[discourse_text_column]),
        axis=1,
    )

    if compute_lengths:
        data_frame[f"{essay_text_column}_length"] = data_frame[essay_text_column].apply(lambda text: len(text.split()))
        data_frame[f"{discourse_text_column}_length"] = data_frame[discourse_text_column].apply(lambda text: len(text.split()))

    return data_frame

In [None]:
# Competition test table and the directory holding one text file per essay.
test_path = "../input/feedback-prize-effectiveness/test.csv"
test_directory = "../input/feedback-prize-effectiveness/test"

test = pd.read_csv(test_path)
# preprocess returns a copy with essay path/text, discourse span and word-count columns added.
test = preprocess(data_frame=test, directory=test_directory, compute_lengths=True)

In [None]:
# Inference configuration: backbone location, loader batch size and target device.
model_name = '../input/deberta-v3-large/deberta-v3-large'
batch_size = 2
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# One fine-tuned checkpoint per fold (folds 0 through 4).
model_path = [f'../input/fpe-deberta/Loss-Fold-{fold}.bin' for fold in range(5)]

In [None]:
# Preview only the first rows; the frame carries a full essay text in every row.
test.head()

In [None]:
class FeedBackDataset(Dataset):
    """
    Dataset that tokenizes "<discourse> <sep token> <essay>" strings.

    Every item is a dict with two long tensors: `ids` (the tokenizer's
    `input_ids`) and `mask` (its `attention_mask`). The tokenizer is asked to
    truncate and pad each text to `max_length`.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        # Keep the two text columns as arrays for positional lookup in __getitem__.
        self.discourse = df['discourse_text'].values
        self.essay = df['essay_text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Discourse first, then the separator token, then the whole essay as context.
        pieces = (self.discourse[index], self.tokenizer.sep_token, self.essay[index])
        encoded = self.tokenizer.encode_plus(
            " ".join(pieces),
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length'
        )

        return {
            key: torch.tensor(encoded[field], dtype=torch.long)
            for key, field in (('ids', 'input_ids'), ('mask', 'attention_mask'))
        }


In [None]:
# Load the tokenizer from the same location as the backbone (model_name).
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Wrap the preprocessed test frame; max_length is forwarded to the tokenizer as the truncation/padding length.
test_dataset = FeedBackDataset(test, tokenizer, max_length=512)

In [None]:
# Unshuffled batches, so predictions come out in the dataset's row order.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [None]:
class MeanPooling(nn.Module):
    """Attention-mask-weighted average of hidden states over dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """
        Pool `last_hidden_state` using `attention_mask` as per-position weights.

        The mask gets a trailing axis and is expanded to the hidden-state size,
        so it must have one dimension fewer than `last_hidden_state`.

        Returns:
            The mask-weighted sum over dimension 1 divided by the summed mask.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_total = (last_hidden_state * weights).sum(dim=1)
        # Clamp the denominator so a fully masked row does not divide by zero.
        weight_total = weights.sum(dim=1).clamp(min=1e-9)
        return weighted_total / weight_total


class FeedBackModel(nn.Module):
    """Pretrained backbone followed by mean pooling, dropout and a 3-output linear head."""

    def __init__(self, model_name):
        super().__init__()
        # Attribute names and creation order are part of the checkpoint layout
        # (state_dict keys), so they are kept exactly as they were.
        self.model = AutoModel.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 3)

    def forward(self, ids, mask):
        """Return the linear head's outputs (no softmax applied) for the given token ids and attention mask."""
        backbone_output = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        pooled = self.pooler(backbone_output.last_hidden_state, mask)
        return self.fc(self.drop(pooled))

In [None]:
def valid(model, dataloader, device):
    """
    Run `model` over `dataloader` in eval mode and collect softmax probabilities.

    Args:
        model: Module called as `model(ids, mask)` that returns per-class scores.
        dataloader: Iterable of batches, each a mapping with `ids` and `mask` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A numpy array with the per-batch softmax outputs (taken over dimension 1)
        concatenated along the first axis, in dataloader order.
    """
    model.eval()

    preds = []

    # Inference only: disable autograd so no graph or intermediate activations
    # are kept for a backward pass that never happens.
    with torch.no_grad():
        for data in tqdm(dataloader, total=len(dataloader)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)

            outputs = model(ids, mask)
            outputs = F.softmax(outputs, dim=1)
            preds.append(outputs.cpu().detach().numpy())

    preds = np.concatenate(preds)
    gc.collect()

    return preds

In [None]:
def inference(model_paths, dataloader, device):
    """
    Average the predictions of several checkpoints over the same dataloader.

    For every path a fresh `FeedBackModel` (built from the module-level
    `model_name`) is created, its weights are loaded from the checkpoint, and
    `valid` is run on `dataloader`.

    Args:
        model_paths: Checkpoint files, each holding a state dict for `FeedBackModel`.
        dataloader: Batches to predict on; it is iterated once per checkpoint.
        device: Device used for the model and the batches.

    Returns:
        The element-wise mean of the per-checkpoint prediction arrays.

    Raises:
        ValueError: If `model_paths` is empty (the mean would otherwise be NaN).
    """
    model_paths = list(model_paths)
    if not model_paths:
        raise ValueError("model_paths must contain at least one checkpoint path")

    final_preds = []
    for i, path in enumerate(model_paths):
        model = FeedBackModel(model_name)
        model.to(device)
        # map_location lets checkpoints saved on a GPU load on whatever `device`
        # resolved to, including the CPU fallback.
        # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
        model.load_state_dict(torch.load(path, map_location=device))

        print(f"Getting predictions for model {i+1}")
        preds = valid(model, dataloader, device)
        final_preds.append(preds)

        # Release this fold's weights before the next model is built.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    final_preds = np.array(final_preds)
    final_preds = np.mean(final_preds, axis=0)
    return final_preds

In [None]:
# Ensemble prediction: softmax probabilities averaged over all fold checkpoints.
model_preds = inference(model_path, test_loader, device)

In [None]:
# Load the competition's submission template; its columns are filled with predictions further below.
sample_submission = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")
print(sample_submission.head())



In [None]:
# Column i of model_preds is written to the i-th label listed here.
# NOTE(review): assumes the checkpoints were trained with this class order, and that
# the template rows are in the same order as the test rows; confirm both.
for class_index, label in enumerate(('Adequate', 'Effective', 'Ineffective')):
    sample_submission[label] = model_preds[:, class_index]

print(sample_submission)

In [None]:
# Write the filled template to the current directory, without the index column.
sample_submission.to_csv('submission.csv', index=False)