## Ensemble of models

1. feedback_deberta_large_LB0.619 / Score: 0.619
> https://www.kaggle.com/code/brandonhu0215/feedback-deberta-large-lb0-619

2. RoBerta-base Inference v2.0 / Score: 0.649
> https://www.kaggle.com/code/arvissu/roberta-base-inference-v2-0

#### If this notebook is helpful, please upvote the original notebooks linked above.


## Content

> I tried to reproduce each model's original result while unifying the shared steps and removing redundant code.
>  
> For these models, I use the same prepare_input and inference_fn.

```
def prepare_input(cfg, text, text_2=None):
    inputs = cfg.tokenizer(text, text_2,
                           padding="max_length",
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           truncation=True)

    [...]

def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
            
        with torch.no_grad():
            output = model(inputs)
        
    [...]
```


# 1. Imports, definitions, settings, and data loading

In [None]:
import gc
import os
import pickle
import glob

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

import numpy as np
import pandas as pd

from tqdm import tqdm

import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.nn import Parameter
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer, AutoConfig

import warnings
warnings.simplefilter('ignore')

In [None]:
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the offending slice as UTF-8 instead.

    Returns the replacement bytes together with ``error.end``, the position
    at which the codec should resume.
    """
    resume_at = error.end
    offending = error.object[error.start:resume_at]
    return offending.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 instead.

    Returns the replacement text together with ``error.end``, the position
    at which the codec should resume.
    """
    resume_at = error.end
    offending = error.object[error.start:resume_at]
    return offending.decode("cp1252"), resume_at

# Register both handlers under the names that resolve_encodings_and_normalize
# passes as `errors=` to str.encode / bytes.decode.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    """Round-trip ``text`` through UTF-8 / cp1252 and transliterate the result.

    The encode/decode steps use the error handlers registered under the names
    "replace_decoding_with_cp1252" and "replace_encoding_with_utf8", so those
    must be registered with ``codecs.register_error`` before this is called.
    The repaired string is finally passed through ``unidecode``.
    """
    raw_bytes = text.encode("raw_unicode_escape")
    first_pass = raw_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")

    return unidecode(repaired)


def fetch_essay(essay_id: str, txt_dir: str) -> str:
    """Return the full text of one essay file.

    The file is read from ``<COMP_DIR + txt_dir>/<essay_id>.txt``, where
    ``COMP_DIR`` is the module-level competition data folder.

    Args:
        essay_id: File name of the essay without the ``.txt`` extension.
        txt_dir: Sub-folder name appended directly to ``COMP_DIR``
            (the callers in this notebook pass 'train' or 'test').

    Raises:
        OSError: If the essay file cannot be opened.
    """
    essay_path = os.path.join(COMP_DIR + txt_dir, essay_id + '.txt')

    # Close the handle deterministically: this runs once per dataframe row,
    # so leaving it to the garbage collector can pile up open files.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()


def prepare_input(cfg, text, text_2=None):
    """Tokenize ``text`` (optionally paired with ``text_2``) into long tensors.

    ``cfg.tokenizer`` is called with padding to ``cfg.max_len``, truncation and
    special tokens enabled. Every field of the returned mapping is replaced in
    place by a ``torch.long`` tensor, and that same mapping is returned.
    """
    tokenizer_options = dict(
        padding="max_length",
        add_special_tokens=True,
        max_length=cfg.max_len,
        truncation=True,
    )
    encoded = cfg.tokenizer(text, text_2, **tokenizer_options)

    # Overwrite values in place so the tokenizer's own mapping object is returned.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)

    return encoded


def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and return stacked class probabilities.

    Args:
        test_loader: Sized iterable of batches; each batch is a mapping whose
            values are tensors.
        model: ``nn.Module`` called with one batch mapping and returning
            logits. Assumed to be 2-D ``(batch, n_classes)``, which is what
            the models in this notebook produce.
        device: Device the model and every batch are moved to.

    Returns:
        A NumPy array with the softmax probabilities of all batches,
        concatenated along axis 0.

    Raises:
        ValueError: From ``np.concatenate`` when the loader yields no batches.
    """
    model.eval()
    model.to(device)

    preds = []
    # Inference only: keep autograd off for the whole loop.
    with torch.no_grad():
        for inputs in tqdm(test_loader, total=len(test_loader)):
            batch = {k: v.to(device) for k, v in inputs.items()}
            logits = model(batch)
            # Name the class axis explicitly: calling softmax without `dim` is
            # deprecated, and dim=1 is what it picked implicitly for 2-D logits.
            preds.append(F.softmax(logits, dim=1).cpu().numpy())

    return np.concatenate(preds)


def show_gradient(df, n_row=None):
    """Style the first rows of ``df`` with a row-wise colour gradient.

    An ``all_mean`` column holding each row's mean is appended before styling.
    ``n_row`` falls back to 5 when it is falsy (``None`` or 0). The colormap
    is read from the module-level name ``cm``.
    """
    n_row = n_row or 5
    preview = df.head(n_row).assign(all_mean=lambda frame: frame.mean(axis=1))

    return preview.style.background_gradient(cmap=cm, axis=1)

In [None]:
# Show floats with 4 digits of precision in rendered dataframes.
pd.set_option('display.precision', 4)
# Light-green colormap; show_gradient reads this module-level name.
cm = sns.light_palette('green', as_cmap=True)
# CSS snippet for highlighted cells (not referenced elsewhere in the visible notebook).
props_param = "color:white; font-weight:bold; background-color:green;"

# Number of rows shown when previewing tables.
N_ROW = 10

# Root folder of the competition data; fetch_essay appends a sub-folder name to it.
COMP_DIR = "../input/feedback-prize-effectiveness/"
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Locations of the test set and the submission template inside COMP_DIR.
test_path = COMP_DIR + "test.csv"
submission_path = COMP_DIR + "sample_submission.csv"

# Kept as untouched originals; later cells work on copies of these frames.
test_origin = pd.read_csv(test_path)
submission_origin = pd.read_csv(submission_path)

In [None]:
# Preview the first rows of the test set.
test_origin.head()

# 2. Check unidecode(text)

```
def resolve_encodings_and_normalize(text: str) -> str:
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    
    text = unidecode(text)
```

In [None]:
# Build the path from COMP_DIR so it stays in sync with the test/submission
# paths instead of repeating the competition folder as a literal.
data_path = COMP_DIR + "train.csv"
cols_list = ['essay_id', 'discourse_text']
# Hand-picked row labels used for the normalisation check below
# (presumably rows with characters the repair changes - confirm).
idxs_list = [49, 80, 945, 947, 1870]

temp = pd.read_csv(data_path, usecols=cols_list).loc[idxs_list, :]
temp

In [None]:
# Normalised copy of each sampled discourse text, kept next to the original.
temp['discourse_text_UPD'] = temp['discourse_text'].apply(resolve_encodings_and_normalize)

# Load each row's essay from the 'train' folder; txt_dir is forwarded to
# fetch_essay as a keyword argument.
temp['essay_text'] = temp['essay_id'].transform(fetch_essay, txt_dir='train')
temp['essay_text_UPD'] = temp['essay_text'].apply(resolve_encodings_and_normalize)

temp

In [None]:
# Print the repr of every sampled discourse text before and after
# normalisation, so that escaped characters are visible.
for n, (indx, data) in enumerate(temp.iterrows()):
    print(f'\nN{n} === index: {indx} ===')

    for heading, value in (('\n>>> origin text:', data.discourse_text),
                           ('\n>>> updated text:', data.discourse_text_UPD)):
        print(heading)
        print(repr(value))

# 3. Extract predictions

## 3.1 DeBERTa

In [None]:
class TestDataset(Dataset):
    """Dataset over the ``text`` column of ``df``, tokenized on access."""

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['text'].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Single-sequence input: no second text is passed to the tokenizer.
        return prepare_input(self.cfg, self.text[item])

class CustomModel(nn.Module):
    """Transformer backbone with a 3-way linear head and multi-sample dropout.

    The classification input is the hidden state at the first sequence
    position of the backbone's first output.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the backbone and the head.

        Args:
            cfg: Settings object; ``cfg.model`` is the backbone name.
            config_path: If given, the backbone config is loaded from this
                file with ``torch.load``; otherwise it is fetched with
                ``AutoConfig.from_pretrained``.
            pretrained: If true, backbone weights are loaded with
                ``AutoModel.from_pretrained``; otherwise the backbone is only
                instantiated from the config.
        """
        super().__init__()
        self.cfg = cfg
        
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        
        # NOTE(review): this LSTM is never used in forward(); presumably it is
        # kept so the module's parameter names match the saved checkpoints - confirm.
        self.bilstm = nn.LSTM(self.config.hidden_size, (self.config.hidden_size) // 2, num_layers=2, 
                              dropout=self.config.hidden_dropout_prob, batch_first=True,
                              bidirectional=True)
        
        # self.dropout = nn.Dropout(0.2)
        # Five dropout rates; forward() averages the head's output over them.
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        
        # Head with three outputs (one per class).
        self.output = nn.Sequential(
            nn.Linear(self.config.hidden_size, 3)  # self.cfg.target_size
        )
                
    def _init_weights(self, module):
        """Initialise ``module`` by type (Linear / Embedding / LayerNorm).

        Not called anywhere in the visible notebook.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Return logits averaged over the five dropout branches.

        ``inputs`` is unpacked as keyword arguments into the backbone.
        """
        # First output of the backbone, sliced at sequence position 0.
        sequence_output = self.model(**inputs)[0][:, 0, :]

        # The same head is applied after each dropout layer.
        logits1 = self.output(self.dropout1(sequence_output))
        logits2 = self.output(self.dropout2(sequence_output))
        logits3 = self.output(self.dropout3(sequence_output))
        logits4 = self.output(self.dropout4(sequence_output))
        logits5 = self.output(self.dropout5(sequence_output))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5

        return logits

In [None]:
# Settings for the DeBERTa-large fold checkpoints.
class CFG:
    # Dataset folder holding the tokenizer, config.pth and the fold weights.
    path = "../input/feedback-deberta-large-051/"
    # Saved backbone config, loaded by CustomModel via torch.load.
    config_path = path+'config.pth'
    # Backbone name; also used to build the checkpoint file names.
    model = "microsoft/deberta-large"
    # DataLoader settings.
    num_workers = 2
    batch_size = 16
    # Sequences are padded / truncated to this many tokens.
    max_len = 512
    seed = 42
    # Number of fold checkpoints to run.
    n_fold = 4
    # trn_fold = [0, 1, 2, 3]
    # fc_dropout = 0.2
    # target_size = 3
    
# Tokenizer saved alongside the checkpoints.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path + 'tokenizer')

In [None]:
# Work on a copy so test_origin stays untouched for the second model.
df = test_origin.copy()
# Separator token string of the tokenizer loaded above.
SEP = CFG.tokenizer.sep_token

# Apply the same encoding repair to the discourse text and to the essay text.
df['discourse_text'] = df['discourse_text'].apply(resolve_encodings_and_normalize)
df['essay_text'] = df['essay_id'].transform(fetch_essay, txt_dir='test')
df['essay_text'] = df['essay_text'].apply(resolve_encodings_and_normalize)
# Single-sequence model input: "<type> <discourse text><SEP><essay text>".
df['text'] = df['discourse_type'] + ' ' + df['discourse_text'] + SEP + df['essay_text']

df.head()

In [None]:
# Order-preserving loader over the DeBERTa inputs; no sample is dropped.
test_dataset = TestDataset(CFG, df)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
# One prediction array per fold checkpoint, in fold order.
deberta_predictions = []

for fold in range(CFG.n_fold):
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)

    # Checkpoints are named after the backbone, with '/' replaced by '-'.
    weights_name = CFG.model.replace('/', '-') + f"_fold{fold}_best.pth"
    state = torch.load(CFG.path + weights_name, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    deberta_predictions.append(inference_fn(test_loader, model, DEVICE))

    # Release this fold before the next one is built.
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Split every fold's predictions by output column:
# 0 -> ineffective, 1 -> adequate, 2 -> effective.
deb_ineffective = [fold_pred[:, 0] for fold_pred in deberta_predictions]
deb_adequate = [fold_pred[:, 1] for fold_pred in deberta_predictions]
deb_effective = [fold_pred[:, 2] for fold_pred in deberta_predictions]

In [None]:
# Rows = test samples, columns = folds.
deb_ineffective = pd.DataFrame(deb_ineffective).T

show_gradient(deb_ineffective, N_ROW)

In [None]:
# Rows = test samples, columns = folds.
deb_adequate = pd.DataFrame(deb_adequate).T

show_gradient(deb_adequate, N_ROW)

In [None]:
# Rows = test samples, columns = folds.
deb_effective = pd.DataFrame(deb_effective).T

show_gradient(deb_effective, N_ROW)

## 3.2 RoBERTa

In [None]:
class TestDataset(Dataset):
    """Dataset pairing each ``discourse`` string with its ``essay`` string."""

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.discourse = df['discourse'].values
        self.essay = df['essay'].values

    def __len__(self):
        return len(self.discourse)

    def __getitem__(self, item):
        # Two-sequence input: discourse first, essay second.
        return prepare_input(self.cfg, self.discourse[item], self.essay[item])
        
class FeedBackModel(nn.Module):
    """Pretrained backbone followed by a linear layer with three outputs.

    The linear layer expects 768 input features (hard-coded).
    """

    def __init__(self, model_path):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.linear = nn.Linear(768, 3)

    def forward(self, inputs):
        """Return logits computed from the first sequence position."""
        backbone_out = self.model(**inputs)
        first_position = backbone_out[0][:, 0, :]

        return self.linear(first_position)

In [None]:
# SECURITY NOTE: unpickling executes arbitrary code from the file, so this is
# only acceptable because the dataset is a trusted input of the notebook.
# The context manager closes the file handle once the models are loaded.
with open("../input/feedback-roberta-ep1/roberta_modellist_ep2.pkl", "rb") as model_file:
    model_list = pickle.load(model_file)


# Settings for the RoBERTa-base fold models (replaces the DeBERTa CFG above).
class CFG:
    # Folder of the tokenizer files.
    path = "../input/roberta-base/"
    # Number of entries of model_list to run.
    n_fold = 5
    # DataLoader batch size.
    batch = 16
    # Sequences are padded / truncated to this many tokens.
    max_len = 512
    num_workers = 2


CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path)

In [None]:
# Fresh copy of the test set for the RoBERTa inputs.
df = test_origin.copy()

# First sequence: lower-cased, stripped discourse type and text joined by a space.
txt_sep = " "
df['discourse'] = df['discourse_type'].str.lower().str.strip() + txt_sep \
                + df['discourse_text'].str.lower().str.strip()

# Second sequence: the full essay, lower-cased and stripped.
df['essay'] = df['essay_id'].transform(fetch_essay, txt_dir='test').str.lower().str.strip()
df.head()

In [None]:
# Order-preserving loader over the RoBERTa inputs; no sample is dropped.
test_dataset = TestDataset(CFG, df)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
# One prediction array per fold model, in fold order.
roberta_predicts = []
for i in range(CFG.n_fold):
    model = model_list[i]

    prediction = inference_fn(test_loader, model, DEVICE)
    roberta_predicts.append(prediction)

    # inference_fn moves the module onto DEVICE in place and model_list still
    # references it, so `del model` alone frees nothing on the GPU. Move the
    # weights back to the CPU so only one fold occupies the device at a time.
    model.to('cpu')

    del model, prediction
    # Collect first so that empty_cache() can return the released blocks.
    gc.collect()
    torch.cuda.empty_cache()

del model_list
gc.collect()

In [None]:
# Split every fold's predictions by output column:
# 0 -> ineffective, 1 -> adequate, 2 -> effective.
rob_ineffective = [fold_pred[:, 0] for fold_pred in roberta_predicts]
rob_adequate = [fold_pred[:, 1] for fold_pred in roberta_predicts]
rob_effective = [fold_pred[:, 2] for fold_pred in roberta_predicts]

In [None]:
# Rows = test samples, columns = folds.
rob_ineffective = pd.DataFrame(rob_ineffective).T

show_gradient(rob_ineffective, N_ROW)

In [None]:
# Rows = test samples, columns = folds.
rob_adequate = pd.DataFrame(rob_adequate).T

show_gradient(rob_adequate, N_ROW)

In [None]:
# Rows = test samples, columns = folds.
rob_effective = pd.DataFrame(rob_effective).T

show_gradient(rob_effective, N_ROW)

# 4. Create submission

In [None]:
# Top-level column labels identifying which model a fold column came from.
level_names = ['deberta', 'roberta']

# One wide frame per class: DeBERTa fold columns followed by RoBERTa fold columns.
ineffective_ = pd.concat([deb_ineffective, rob_ineffective], axis=1, keys=level_names)
adequate_ = pd.concat([deb_adequate, rob_adequate], axis=1, keys=level_names)
effective_ = pd.concat([deb_effective, rob_effective], axis=1, keys=level_names)

In [None]:
# Per-fold "ineffective" probabilities of both models, side by side.
show_gradient(ineffective_, N_ROW)

In [None]:
# Per-fold "adequate" probabilities of both models, side by side.
show_gradient(adequate_, N_ROW)

In [None]:
# Per-fold "effective" probabilities of both models, side by side.
show_gradient(effective_, N_ROW)

In [None]:
submission = submission_origin.copy()

# Blend weights, listed in the same order as level_names.
w_ = [.75, .25]  # ['deberta', 'roberta']
d_ = [('Ineffective', ineffective_),
      ('Adequate', adequate_),
      ('Effective', effective_)]

for col_name, class_frame in d_:
    # Average the folds of each model, then take the weighted sum of the models.
    per_model_mean = pd.DataFrame(
        {col: class_frame[col].mean(axis=1) for col in level_names}
    )
    submission[col_name] = per_model_mean.mul(w_).sum(axis=1)

submission.head(N_ROW)

In [None]:
# 0	a261b6e14276	0.0102	0.3887	0.5911
# 1	5a88900e7dc1	0.0309	0.8405	0.1185
# 2	9790d835736b	0.0217	0.6997	0.2686
# 3	75ce6d68b67b	0.0512	0.6365	0.3023
# 4	93578d946723	0.0399	0.6053	0.3448
# 5	2e214524dbe3	0.0099	0.3721	0.6080
# 6	84812fc2ab9f	0.0084	0.2796	0.7020
# 7	c668ff840720	0.0171	0.5888	0.3841
# 8	739a6d00f44a	0.0178	0.4030	0.5692
# 9	bcfae2c9a244	0.0132	0.6373	0.3395

In [None]:
# Write the blended predictions without the dataframe index column.
submission.to_csv('submission.csv',index=False)