In [None]:
# Inference: discourse type added to the model input; epoch-2 checkpoints, 5-fold ensemble.

In [None]:
import gc
import os
import sys
import time
import pickle
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

import warnings
warnings.simplefilter('ignore')

In [None]:
def fetchEssay(essay_id: str, essay_dir: str = '../input/feedback-prize-effectiveness/test/') -> str:
    """
    Read the text file of the specific essay_id.

    Args:
        essay_id: Essay identifier; the file read is ``<essay_dir>/<essay_id>.txt``.
        essay_dir: Directory that holds the essay text files. Defaults to the
            test-essay folder this notebook reads from.

    Returns:
        The full contents of the essay file as a single string.

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    essay_path = os.path.join(essay_dir, essay_id + '.txt')
    # This function is applied once per row of the test table, so the handle
    # is closed deterministically instead of waiting for garbage collection.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()


In [None]:
class callback:
    """Collects (model, loss) pairs and hands back the model with the lowest loss."""

    def __init__(self):
        # Parallel lists: self.model[i] was recorded together with self.loss[i].
        self.loss = []
        self.model = []

    def put(self, model, loss):
        """Record `model` along with the `loss` value it was stored with."""
        self.model.append(model)
        self.loss.append(loss)

    def get_model(self):
        """Return the recorded model whose loss is the smallest."""
        best_position = int(np.argmin(self.loss))
        return self.model[best_position]

    
class FeedBackModel(nn.Module):
    """
    Pretrained transformer encoder followed by a linear classification head.

    The encoder output at sequence position 0 is passed through a linear
    layer that produces one logit per class.

    Args:
        model_path: Name or directory handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output logits. Defaults to 3.
    """

    def __init__(self, model_path, num_labels=3):
        super(FeedBackModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
        # Size the head from the loaded backbone instead of assuming 768, so a
        # checkpoint with a different width does not fail with a shape
        # mismatch. 768 stays as the fallback for configs without the field.
        hidden_size = getattr(self.model.config, 'hidden_size', 768)
        self.linear = nn.Linear(hidden_size, num_labels)

    def forward(self, ids, mask):
        """
        Compute class logits for a batch.

        Args:
            ids: Token-id tensor forwarded to the encoder as ``input_ids``.
            mask: Tensor forwarded to the encoder as ``attention_mask``.

        Returns:
            The output of the linear head applied to the encoder state at
            sequence position 0 (one row of logits per batch element).
        """
        # Keyword arguments keep the mask from landing in a different
        # positional slot if another backbone orders its forward() differently.
        encoder_output = self.model(input_ids=ids, attention_mask=mask)
        first_token_state = encoder_output[0][:, 0, :]
        return self.linear(first_token_state)


class FeedBackDataset(Dataset):
    """
    Dataset that turns each row of `data` into fixed-length token ids and an attention mask.

    Each item encodes the text pair
    (``discourse_type + " " + discourse_text``, ``essay``), with every piece
    lower-cased and stripped of surrounding whitespace.

    Args:
        data: Table with ``discourse_type``, ``discourse_text`` and ``essay``
            columns (plus the target columns when ``is_test`` is False).
        tokenizer: Object providing ``encode_plus`` and ``pad_token_id``.
        is_test: When True, items carry no ``'targets'`` entry.
        max_len: Length every item is truncated/padded to. ``None`` (default)
            falls back to the notebook-level ``MAX_LEN`` at item-access time.
        target_cols: Columns read as targets. ``None`` (default) falls back to
            the notebook-level ``y_cols`` at item-access time.
    """

    def __init__(self, data, tokenizer, is_test=False, max_len=None, target_cols=None):
        self.data = data
        self.is_test = is_test
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.target_cols = target_cols

    def __getitem__(self, idx):
        """
        Build the model inputs for row `idx`.

        Returns:
            dict with ``'ids'`` and ``'mask'`` (both ``torch.long``, length
            ``max_len``), plus ``'targets'`` (``torch.FloatTensor``) when
            ``is_test`` is False.
        """
        # Resolved lazily so instances created without max_len keep reading
        # the global, exactly as before.
        max_len = MAX_LEN if self.max_len is None else self.max_len

        discourse_type = self.data['discourse_type'].values[idx].lower().strip()
        discourse_text = self.data['discourse_text'].values[idx].lower().strip()
        essay_text = self.data['essay'].values[idx].lower().strip()

        # Use the tokenizer given to the constructor; the previous code read a
        # global named `tokenizer`, silently ignoring the injected one.
        token_ids = self.tokenizer.encode_plus(
            discourse_type + " " + discourse_text,
            essay_text,
            add_special_tokens=True,
            truncation=True,
            max_length=max_len
        )['input_ids']

        # Real tokens are marked 1 and padding positions 0 in the mask.
        pad_count = max_len - len(token_ids)
        mask = torch.tensor([1] * len(token_ids) + [0] * pad_count, dtype=torch.long)
        ids = torch.tensor(
            token_ids + [self.tokenizer.pad_token_id] * pad_count,
            dtype=torch.long
        )

        if self.is_test:
            return {
                'ids': ids,
                'mask': mask,
            }

        target_cols = y_cols if self.target_cols is None else self.target_cols
        targets = torch.FloatTensor(self.data[target_cols].values[idx])
        return {
            'ids': ids,
            'mask': mask,
            'targets': targets
        }

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.data)

In [None]:
# Load the test table, then attach the essay text for every row's essay_id.
test_df = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
test_df['essay'] = test_df['essay_id'].map(fetchEssay)

In [None]:
# Directory the tokenizer is loaded from.
model_path = '../input/roberta-base/'
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Label column(s) FeedBackDataset reads when it is not in test mode.
y_cols = ['discourse_effectiveness']


In [None]:
NFOLDS = 5       # number of fold models whose predictions are averaged
BATCH_SIZE = 32
MAX_LEN = 512    # token length every sample is truncated/padded to

# shuffle=False keeps predictions in the same row order as test_df.
test_loader = DataLoader(
    FeedBackDataset(test_df, tokenizer, True),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load model lists produced by a trusted training run.
with open("../input/feedback-roberta-ep1/roberta_modellist_ep2.pkl", "rb") as model_file:
    model_list = pickle.load(model_file)
assert len(model_list) >= NFOLDS, "model list holds fewer models than NFOLDS"

# Accumulator for the fold-averaged probabilities: one row per test sample,
# one column per class.
test_pred = np.zeros((len(test_df), 3))

In [None]:
# Start from zeros so that re-running this cell does not add a second set of
# averages on top of the previous run.
test_pred = np.zeros((len(test_df), 3))
# Use the GPU when one is present; otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for idx in range(NFOLDS):
    print(f'start to inference fold : {idx}')
    net = model_list[idx]
    net.eval()
    net.to(device)
    fold_batches = []
    with torch.no_grad():
        for data in test_loader:
            input_ids = data['ids'].to(device)
            input_masks = data['mask'].to(device)
            # dim=1 normalises over the class logits of each sample; calling
            # softmax without `dim` relies on a deprecated implicit choice.
            pred = F.softmax(net(input_ids, input_masks), dim=1)
            fold_batches.append(pred.cpu().numpy())
    if fold_batches:
        # Each fold contributes 1/NFOLDS of the final averaged probabilities.
        test_pred += np.concatenate(fold_batches) / NFOLDS
    # Move the finished model off the accelerator so the fold models do not
    # all stay resident in GPU memory at once.
    net.cpu()
    if device.type == 'cuda':
        torch.cuda.empty_cache()

In [None]:
# Use the sample submission as the template and overwrite its probability columns.
# NOTE(review): values are written positionally, so this presumably relies on the
# sample submission rows being in the same order as test_df — confirm.
submission_df = pd.read_csv('../input/feedback-prize-effectiveness/sample_submission.csv')
# assumes class indices 0/1/2 mean Ineffective/Adequate/Effective — TODO confirm
# against the label encoding used in training.
for class_idx, column_name in enumerate(('Ineffective', 'Adequate', 'Effective')):
    submission_df[column_name] = test_pred[:, class_idx]
submission_df

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)
