## Feedback Prize – Effectiveness: DeBERTa + RoBERTa + LightGBM ensemble (inference)







#### :


## Content

>  
>  
>  

```
def prepare_input(cfg, text, text_2=None):
    inputs = cfg.tokenizer(text, text_2,
                           padding="max_length",
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           truncation=True)

    [...]

def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
            
        with torch.no_grad():
            output = model(inputs)
        
    [...]
```


# 1. Import & Def & Set & Load

In [1]:
import gc
import os
import pickle
import glob

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

import numpy as np
import pandas as pd

from tqdm import tqdm

import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.nn import Parameter
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer, AutoConfig

import warnings
warnings.simplefilter('ignore')

In [2]:
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler that encodes the offending slice as UTF-8.

    Args:
        error: The encode error raised by the codec; its ``object``,
            ``start`` and ``end`` attributes locate the failing slice.

    Returns:
        A ``(replacement_bytes, resume_position)`` pair: the UTF-8 bytes of
        the failing slice and the index at which encoding should continue.
    """
    resume_at = error.end
    failing_slice = error.object[error.start:resume_at]
    return failing_slice.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler that decodes the offending bytes as cp1252.

    Args:
        error: The decode error raised by the codec; its ``object``,
            ``start`` and ``end`` attributes locate the failing bytes.

    Returns:
        A ``(replacement_text, resume_position)`` pair: the cp1252 decoding
        of the failing bytes and the index at which decoding should continue.
    """
    resume_at = error.end
    failing_bytes = error.object[error.start:resume_at]
    return failing_bytes.decode("cp1252"), resume_at

# Register both handlers under the names that are passed as `errors=...`
# to str.encode / bytes.decode in resolve_encodings_and_normalize.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed UTF-8 / cp1252 text and transliterate it with unidecode.

    The text is round-tripped through bytes three times. Every step that
    hits an un(en/de)codable slice falls back to the custom error handlers
    referenced by name in the ``errors=`` arguments, so those handlers must
    already be registered with ``codecs.register_error``.

    Args:
        text: The raw string to clean.

    Returns:
        The result of ``unidecode`` applied to the repaired string.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")

    return unidecode(repaired)


def fetch_essay(essay_id: str, txt_dir: str) -> str:
    """Read the full text of one essay from the competition directory.

    Args:
        essay_id: Essay identifier; the file read is ``<essay_id>.txt``.
        txt_dir: Sub-directory name appended to the module-level ``COMP_DIR``
            (the notebook passes ``'train'`` or ``'test'``).

    Returns:
        The entire file content as a string.

    Raises:
        OSError: If the essay file cannot be opened.
    """
    essay_path = os.path.join(COMP_DIR + txt_dir, essay_id + '.txt')

    # Use a context manager so the handle is closed deterministically; this
    # function is applied once per row, so leaked handles would accumulate.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()


def prepare_input(cfg, text, text_2=None):
    """Tokenize one text (or text pair) and convert every field to a tensor.

    Args:
        cfg: Object exposing ``tokenizer`` (callable) and ``max_len``.
        text: First text segment.
        text_2: Optional second segment forwarded as the tokenizer's second
            positional argument.

    Returns:
        The mapping returned by ``cfg.tokenizer`` with each value replaced
        in place by a ``torch.long`` tensor.
    """
    tokenizer_options = dict(
        padding="max_length",
        add_special_tokens=True,
        max_length=cfg.max_len,
        truncation=True,
    )
    encoded = cfg.tokenizer(text, text_2, **tokenizer_options)

    # Snapshot the items first, then overwrite each existing key in place.
    for field, values in list(encoded.items()):
        encoded[field] = torch.tensor(values, dtype=torch.long)

    return encoded


def inference_fn(test_loader, model, device):
    """Run a model over a loader and return softmax probabilities.

    Args:
        test_loader: Iterable of batches; each batch is a mapping whose
            values expose ``.to(device)``.
        model: Module called as ``model(inputs)`` that returns logits.
        device: Target device for the model and every batch tensor.

    Returns:
        A NumPy array with the per-batch probabilities concatenated along
        the first axis.

    Raises:
        ValueError: From ``np.concatenate`` if ``test_loader`` yields no batch.
    """
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))

    for inputs in tk0:
        # Move every tensor of the batch to the inference device (in place).
        for k, v in inputs.items():
            inputs[k] = v.to(device)

        with torch.no_grad():
            output = model(inputs)

        # Normalize over the class axis explicitly. Calling softmax without
        # `dim` is deprecated and picks the axis implicitly from the rank.
        preds.append(F.softmax(output, dim=-1).to('cpu').numpy())

    return np.concatenate(preds)


def show_gradient(df, n_row=None):
    """Style the first rows of a frame with a row-wise colour gradient.

    An ``all_mean`` column holding the row-wise mean is appended before
    styling. The colour map is the module-level ``cm``.

    Args:
        df: DataFrame of numeric columns.
        n_row: Number of leading rows to show; any falsy value means 5.

    Returns:
        The pandas Styler produced by ``background_gradient``.
    """
    n_row = n_row or 5

    preview = df.head(n_row)
    preview_with_mean = preview.assign(all_mean=lambda frame: frame.mean(axis=1))

    return preview_with_mean.style.background_gradient(cmap=cm, axis=1)

In [3]:
# Display settings: 4 decimal places for DataFrame output.
pd.set_option('display.precision', 4)
# Green colour map read by show_gradient() as a module-level global.
cm = sns.light_palette('green', as_cmap=True)
# CSS property string; not referenced by any other cell visible in this notebook.
props_param = "color:white; font-weight:bold; background-color:green;"

# Number of rows passed to show_gradient() / head() in the preview cells.
N_ROW = 10

# Competition data root (trailing slash is relied on by string concatenation).
COMP_DIR = "../input/feedback-prize-effectiveness/"
# Use the GPU when one is available, otherwise fall back to CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Paths are built by plain concatenation, so COMP_DIR must end with '/'.
test_path = COMP_DIR + "test.csv"
submission_path = COMP_DIR + "sample_submission.csv"

# Keep the raw frames untouched; later cells work on copies of them.
test_origin = pd.read_csv(test_path)
submission_origin = pd.read_csv(submission_path)

In [5]:
# Preview the first rows of the raw test frame.
test_origin.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


# 2. Check unidecode(text)

```
def resolve_encodings_and_normalize(text: str) -> str:
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    
    text = unidecode(text)
```

In [6]:
# Load only the two columns needed for the normalization check.
data_path = "../input/feedback-prize-effectiveness/train.csv"
cols_list = ['essay_id', 'discourse_text']
# Row labels of the training rows inspected in the cells below.
idxs_list = [49, 80, 945, 947, 1870]

temp = pd.read_csv(data_path, usecols=cols_list).loc[idxs_list, :]
temp

Unnamed: 0,essay_id,discourse_text
49,0158970BC5D2,"Often times throughout middle school, my teach..."
80,01AFC67DF935,President Obama has done nothing to improve ou...
945,11B9AC1814C8,"The article says, ¨... humans have sent numero..."
947,11B9AC1814C8,"However, the article says that this can only g..."
1870,22E8627A3CB9,Many people are able to tell when a peer is an...


In [7]:
# Normalized version of each discourse text, stored next to the original.
temp['discourse_text_UPD'] = temp['discourse_text'].apply(resolve_encodings_and_normalize)

# Load the full essay for every sampled row and normalize it the same way.
temp['essay_text'] = temp['essay_id'].transform(fetch_essay, txt_dir='train')
temp['essay_text_UPD'] = temp['essay_text'].apply(resolve_encodings_and_normalize)

temp

Unnamed: 0,essay_id,discourse_text,discourse_text_UPD,essay_text,essay_text_UPD
49,0158970BC5D2,"Often times throughout middle school, my teach...","Often times throughout middle school, my teach...",Students around the world are forgetting more ...,Students around the world are forgetting more ...
80,01AFC67DF935,President Obama has done nothing to improve ou...,President Obama has done nothing to improve ou...,President Obama has done nothing to improve ou...,President Obama has done nothing to improve ou...
945,11B9AC1814C8,"The article says, ¨... humans have sent numero...","The article says, ""... humans have sent numero...",People have studied the many planets in our so...,People have studied the many planets in our so...
947,11B9AC1814C8,"However, the article says that this can only g...","However, the article says that this can only g...",People have studied the many planets in our so...,People have studied the many planets in our so...
1870,22E8627A3CB9,Many people are able to tell when a peer is an...,Many people are able to tell when a peer is an...,The use of the facial software in class rooms ...,The use of the facial software in class rooms ...


In [8]:
# Print original vs. normalized discourse text for each sampled row.
# repr() is used so non-printable characters (e.g. '\xa0') stay visible.
for n, row in enumerate(temp.iterrows()):
    indx, data = row
    disc_text = data.discourse_text
    disc_text_upd = data.discourse_text_UPD

    print(f'\nN{n} === index: {indx} ===')
    print(f'\n>>> origin text:')
    print(repr(disc_text))
    print(f'\n>>> updated text:')
    print(repr(disc_text_upd))


N0 === index: 49 ===

>>> origin text:
"Often times throughout middle school, my teachers would give the students each other's assignments to grade themselves. And afterwards, we would see in the grade book\xa0that the grade changed because the students weren't grading the assignments the way they should have been. The student grading was meant to engage the classroom, but it turned into a showcase of how students dont really know how to grade the way teachers do. If students are unaware of how a project is supposed to be graded, they can not successfully design a project that tests whether or not they understand the material. "

>>> updated text:
"Often times throughout middle school, my teachers would give the students each other's assignments to grade themselves. And afterwards, we would see in the grade book that the grade changed because the students weren't grading the assignments the way they should have been. The student grading was meant to engage the classroom, but it turned

# 3. Extract predictions

## 3.1 DeBERTa

In [9]:
class TestDataset(Dataset):
    """Dataset that tokenizes the single ``text`` column of a frame.

    Args:
        cfg: Configuration object forwarded to ``prepare_input``.
        df: Frame-like object whose ``'text'`` column exposes ``.values``.
    """

    def __init__(self, cfg, df):
        self.text = df['text'].values
        self.cfg = cfg

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the tokenized inputs for the text at position ``item``."""
        return prepare_input(self.cfg, self.text[item])

class CustomModel(nn.Module):
    """Transformer backbone with a 3-way linear head and multi-sample dropout.

    Args:
        cfg: Configuration object; ``cfg.model`` names the backbone.
        config_path: Optional path to a saved config loaded with
            ``torch.load``; when ``None`` the config is fetched with
            ``AutoConfig.from_pretrained``.
        pretrained: When true the backbone weights come from
            ``AutoModel.from_pretrained``; otherwise the backbone is built
            from the config only.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        self.config = (
            AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            if config_path is None
            else torch.load(config_path)
        )

        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )

        hidden_size = self.config.hidden_size

        # Not referenced by forward(); presumably kept so the module's
        # parameters match the saved checkpoints -- TODO confirm.
        self.bilstm = nn.LSTM(
            hidden_size,
            hidden_size // 2,
            num_layers=2,
            dropout=self.config.hidden_dropout_prob,
            batch_first=True,
            bidirectional=True,
        )

        # Five dropout layers with increasing rates, all feeding one head.
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)

        self.output = nn.Sequential(nn.Linear(hidden_size, 3))

    def _init_weights(self, module):
        """Initialize Linear, Embedding or LayerNorm weights in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Return logits averaged over the five dropout branches.

        Args:
            inputs: Mapping unpacked as keyword arguments into the backbone.

        Returns:
            The mean of the head's output over the five dropout layers,
            computed from position 0 of the backbone's first output.
        """
        first_token_state = self.model(**inputs)[0][:, 0, :]

        # Accumulate in the same order as dropout1..dropout5.
        logits_sum = self.output(self.dropout1(first_token_state))
        for dropout in (self.dropout2, self.dropout3, self.dropout4, self.dropout5):
            logits_sum = logits_sum + self.output(dropout(first_token_state))

        return logits_sum / 5

In [10]:
class CFG:
    # Settings for the DeBERTa inference stage. Used as a plain namespace:
    # attributes are read from the class itself, it is never instantiated.
    path = "../input/feedback-deberta-large-051/"  # dataset with weights, config and tokenizer
    config_path = path+'config.pth'  # saved config loaded via torch.load in CustomModel
    model = "microsoft/deberta-large"  # also used to build the checkpoint file names
    num_workers = 2
    batch_size = 16
    max_len = 512  # tokenizer pads/truncates every sample to this length
    seed = 42
    n_fold = 4  # number of fold checkpoints loaded at inference
    # trn_fold = [0, 1, 2, 3]
    # fc_dropout = 0.2
    # target_size = 3
    
# The tokenizer is attached after the class body because it needs CFG.path.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path + 'tokenizer')

In [11]:
# Work on a copy so the raw test frame stays untouched.
df = test_origin.copy()
SEP = CFG.tokenizer.sep_token

# Normalize the discourse text and the full essay text the same way.
df['discourse_text'] = df['discourse_text'].apply(resolve_encodings_and_normalize)
df['essay_text'] = df['essay_id'].transform(fetch_essay, txt_dir='test')
df['essay_text'] = df['essay_text'].apply(resolve_encodings_and_normalize)
# Model input: "<discourse_type> <discourse_text><SEP><essay_text>".
df['text'] = df['discourse_type'] + ' ' + df['discourse_text'] + SEP + df['essay_text']

df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,essay_text,text
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,Making choices in life can be very difficult. ...,Lead Making choices in life can be very diffic...
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,Making choices in life can be very difficult. ...,Position Seeking multiple opinions can help a ...
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,Making choices in life can be very difficult. ...,Claim it can decrease stress levels [SEP]Makin...
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,Making choices in life can be very difficult. ...,Claim a great chance to learn something new [S...
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,Making choices in life can be very difficult. ...,Claim can be very helpful and beneficial. [SEP...


In [12]:
# shuffle=False and drop_last=False keep predictions aligned with df rows.
test_dataset = TestDataset(CFG, df)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers,
                         pin_memory=True, drop_last=False)

In [13]:
# One probability array per fold checkpoint, in fold order.
deberta_predictions = []

for fold in range(CFG.n_fold):
    # Build the architecture from the saved config, then load fold weights.
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # Checkpoint name: CFG.model with '/' replaced by '-', plus the fold index.
    # Loaded onto CPU first; inference_fn moves the model to DEVICE.
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, DEVICE)
    
    deberta_predictions.append(prediction)
    
    # Release the model before the next fold is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()

100%|██████████| 1/1 [00:02<00:00,  2.02s/it]
100%|██████████| 1/1 [00:01<00:00,  1.10s/it]
100%|██████████| 1/1 [00:01<00:00,  1.10s/it]
100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


In [14]:
# Per-class lists; each holds one 1-D array per fold.
deb_ineffective = []
deb_effective = []
deb_adequate = []

# Prediction columns 0/1/2 are treated as Ineffective/Adequate/Effective.
for x in deberta_predictions:
    deb_ineffective.append(x[:, 0])
    deb_adequate.append(x[:, 1])
    deb_effective.append(x[:, 2])

In [15]:
# Transpose so rows are samples and columns are folds.
# Note: rebinding the name makes this cell non-idempotent on re-run.
deb_ineffective = pd.DataFrame(deb_ineffective).T

show_gradient(
    deb_ineffective,
    N_ROW)

Unnamed: 0,0,1,2,3,all_mean
0,0.0026,0.0037,0.0154,0.0025,0.006
1,0.0443,0.023,0.042,0.0186,0.032
2,0.028,0.0279,0.027,0.019,0.0255
3,0.0525,0.079,0.0743,0.0773,0.0708
4,0.0318,0.0547,0.0745,0.0489,0.0525
5,0.0076,0.0084,0.0217,0.0078,0.0114
6,0.0052,0.0093,0.0092,0.0045,0.007
7,0.0142,0.013,0.038,0.0209,0.0215
8,0.011,0.0194,0.0474,0.0149,0.0232
9,0.0089,0.0106,0.0102,0.01,0.0099


In [16]:
# Transpose so rows are samples and columns are folds.
# Note: rebinding the name makes this cell non-idempotent on re-run.
deb_adequate = pd.DataFrame(deb_adequate).T

show_gradient(
    deb_adequate,
    N_ROW)

Unnamed: 0,0,1,2,3,all_mean
0,0.2618,0.3429,0.4421,0.2671,0.3285
1,0.8806,0.9091,0.8394,0.9201,0.8873
2,0.7148,0.8394,0.6311,0.8366,0.7555
3,0.7037,0.746,0.6031,0.7203,0.6933
4,0.596,0.7368,0.6416,0.5543,0.6322
5,0.3068,0.4171,0.4339,0.3624,0.3801
6,0.2087,0.3003,0.2288,0.1859,0.2309
7,0.5164,0.7484,0.6227,0.7748,0.6656
8,0.2912,0.4925,0.5889,0.3951,0.4419
9,0.4598,0.6757,0.5322,0.6476,0.5788


In [17]:
# Transpose so rows are samples and columns are folds.
# Note: rebinding the name makes this cell non-idempotent on re-run.
deb_effective = pd.DataFrame(deb_effective).T

show_gradient(
    deb_effective,
    N_ROW)

Unnamed: 0,0,1,2,3,all_mean
0,0.7356,0.6535,0.5426,0.7304,0.6655
1,0.0751,0.0679,0.1186,0.0613,0.0807
2,0.2572,0.1327,0.3419,0.1444,0.219
3,0.2438,0.1749,0.3226,0.2025,0.2359
4,0.3722,0.2085,0.2839,0.3968,0.3154
5,0.6856,0.5745,0.5444,0.6297,0.6086
6,0.7861,0.6904,0.762,0.8096,0.762
7,0.4695,0.2386,0.3393,0.2042,0.3129
8,0.6978,0.4881,0.3636,0.59,0.5349
9,0.5313,0.3137,0.4576,0.3424,0.4113


## 3.2 RoBERTa

In [18]:
class TestDataset(Dataset):
    """Dataset that tokenizes (discourse, essay) text pairs.

    This definition replaces the earlier single-text ``TestDataset`` from
    the DeBERTa section.

    Args:
        cfg: Configuration object forwarded to ``prepare_input``.
        df: Frame-like object whose ``'discourse'`` and ``'essay'`` columns
            expose ``.values``.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.discourse, self.essay = df['discourse'].values, df['essay'].values

    def __len__(self):
        """Return the number of discourse elements."""
        return len(self.discourse)

    def __getitem__(self, item):
        """Return the tokenized pair for the row at position ``item``."""
        return prepare_input(self.cfg, self.discourse[item], self.essay[item])
        
class FeedBackModel(nn.Module):
    """Transformer backbone followed by a 768 -> 3 linear head.

    Args:
        model_path: Name or path given to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_path):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.linear = nn.Linear(768, 3)

    def forward(self, inputs):
        """Return logits computed from position 0 of the backbone output.

        Args:
            inputs: Mapping unpacked as keyword arguments into the backbone.
        """
        backbone_output = self.model(**inputs)
        first_token_state = backbone_output[0][:, 0, :]

        return self.linear(first_token_state)

In [19]:
# Load the list of fine-tuned RoBERTa fold models.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# from a trusted source.
# The context manager closes the file handle once loading has finished.
with open("../input/feedback-roberta-ep1/roberta_modellist_ep2.pkl", "rb") as model_file:
    model_list = pickle.load(model_file)

class CFG:
    # Settings for the RoBERTa inference stage. This rebinds the name CFG,
    # replacing the DeBERTa settings class defined earlier in the notebook.
    path = "../input/roberta-base/"  # tokenizer location
    n_fold = 5  # number of entries read from model_list
    batch = 16  # DataLoader batch size
    max_len = 512  # tokenizer pads/truncates every sample to this length
    num_workers = 2
    
# The tokenizer is attached after the class body because it needs CFG.path.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path)

In [20]:
# Fresh copy of the raw test frame; rebinds `df` from the DeBERTa section.
df = test_origin.copy()

# First segment: lower-cased "<discourse_type> <discourse_text>".
txt_sep = " "
df['discourse'] = df['discourse_type'].str.lower().str.strip() + txt_sep \
                + df['discourse_text'].str.lower().str.strip()

# Second segment: the lower-cased essay. Unlike the DeBERTa inputs, no
# encoding normalization is applied here.
df['essay'] = df['essay_id'].transform(fetch_essay, txt_dir='test').str.lower().str.strip()
df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse,essay
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,lead making choices in life can be very diffic...,making choices in life can be very difficult. ...
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,position seeking multiple opinions can help a ...,making choices in life can be very difficult. ...
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,claim it can decrease stress levels,making choices in life can be very difficult. ...
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,claim a great chance to learn something new,making choices in life can be very difficult. ...
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,claim can be very helpful and beneficial.,making choices in life can be very difficult. ...


In [21]:
# shuffle=False and drop_last=False keep predictions aligned with df rows.
test_dataset = TestDataset(CFG, df)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch,
                         shuffle=False, num_workers=CFG.num_workers,
                         pin_memory=True, drop_last=False)

In [22]:
# One probability array per model in model_list, in list order.
roberta_predicts = []
for i in range(CFG.n_fold):
    model = model_list[i]
    
    prediction = inference_fn(test_loader, model, DEVICE)
    roberta_predicts.append(prediction)
    
    # Drop the local references; model_list still holds the model object.
    del model, prediction
    torch.cuda.empty_cache()    
    gc.collect()
    
# Drop the list itself once every model has been used.
del model_list
gc.collect()

100%|██████████| 1/1 [00:00<00:00,  2.20it/s]
100%|██████████| 1/1 [00:00<00:00,  2.27it/s]
100%|██████████| 1/1 [00:00<00:00,  2.29it/s]
100%|██████████| 1/1 [00:00<00:00,  2.28it/s]
100%|██████████| 1/1 [00:00<00:00,  2.26it/s]


0

In [23]:
# Per-class lists; each holds one 1-D array per fold.
rob_ineffective = []
rob_effective = []
rob_adequate = []

# Prediction columns 0/1/2 are treated as Ineffective/Adequate/Effective.
for x in roberta_predicts:
    rob_ineffective.append(x[:, 0])
    rob_adequate.append(x[:, 1])
    rob_effective.append(x[:, 2])

In [24]:
# Transpose so rows are samples and columns are folds.
# Note: rebinding the name makes this cell non-idempotent on re-run.
rob_ineffective = pd.DataFrame(rob_ineffective).T

show_gradient(
    rob_ineffective,
    N_ROW)

Unnamed: 0,0,1,2,3,4,all_mean
0,0.0055,0.0284,0.0119,0.0101,0.0377,0.0187
1,0.0191,0.0364,0.019,0.0087,0.0658,0.0298
2,0.0269,0.0066,0.0186,0.0045,0.0178,0.0149
3,0.0164,0.0064,0.0149,0.0032,0.0269,0.0135
4,0.0209,0.0101,0.0255,0.0023,0.0209,0.0159
5,0.007,0.0068,0.0057,0.0021,0.0145,0.0072
6,0.009,0.0107,0.0068,0.0029,0.0274,0.0114
7,0.0105,0.0054,0.0077,0.0014,0.019,0.0088
8,0.0119,0.0058,0.0093,0.0023,0.0088,0.0076
9,0.0251,0.0126,0.0141,0.0136,0.035,0.0201


In [25]:
# Transpose so rows are samples and columns are folds.
# Note: rebinding the name makes this cell non-idempotent on re-run.
rob_adequate = pd.DataFrame(rob_adequate).T

show_gradient(
    rob_adequate,
    N_ROW)

Unnamed: 0,0,1,2,3,4,all_mean
0,0.3181,0.6565,0.4918,0.4863,0.6528,0.5211
1,0.7711,0.8861,0.8163,0.5262,0.8626,0.7725
2,0.8514,0.595,0.6534,0.3326,0.6136,0.6092
3,0.693,0.5173,0.5038,0.2652,0.7316,0.5422
4,0.7414,0.5963,0.6665,0.233,0.6127,0.57
5,0.3776,0.4833,0.3316,0.1943,0.4502,0.3674
6,0.3994,0.4984,0.2886,0.2227,0.518,0.3854
7,0.5818,0.5313,0.388,0.1684,0.5965,0.4532
8,0.4484,0.4189,0.3826,0.1857,0.2508,0.3373
9,0.8813,0.8387,0.7919,0.7005,0.656,0.7737


In [26]:
# Transpose so rows are samples and columns are folds.
# Note: rebinding the name makes this cell non-idempotent on re-run.
rob_effective = pd.DataFrame(rob_effective).T

show_gradient(
    rob_effective,
    N_ROW)

Unnamed: 0,0,1,2,3,4,all_mean
0,0.6763,0.3151,0.4963,0.5036,0.3095,0.4602
1,0.2098,0.0775,0.1647,0.4651,0.0716,0.1977
2,0.1218,0.3984,0.328,0.663,0.3686,0.3759
3,0.2906,0.4764,0.4813,0.7316,0.2415,0.4443
4,0.2377,0.3935,0.3081,0.7647,0.3664,0.4141
5,0.6155,0.5099,0.6627,0.8036,0.5353,0.6254
6,0.5916,0.4909,0.7046,0.7744,0.4546,0.6032
7,0.4077,0.4634,0.6043,0.8302,0.3845,0.538
8,0.5397,0.5754,0.6081,0.8121,0.7404,0.6551
9,0.0936,0.1487,0.194,0.2859,0.309,0.2063


## 3.3 LightGBM

In [27]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import log_loss

import gensim
from scipy import sparse
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')


In [28]:
class CFG:
    # Settings for the LightGBM stage. This rebinds the name CFG again,
    # replacing the RoBERTa settings class.
    seed = 42  # seed for set_seed() and the fold splitter
    n_folds = 4  # number of cross-validation folds
    
# Same directory as COMP_DIR defined near the top of the notebook.
INPUT_DIR = "../input/feedback-prize-effectiveness/"

def get_train_essay(essay_id):
    """Read the full text of one training essay.

    Args:
        essay_id: Essay identifier; the file read is
            ``train/<essay_id>.txt`` under the module-level ``INPUT_DIR``.

    Returns:
        The entire file content as a string.

    Raises:
        OSError: If the essay file cannot be opened.
    """
    essay_path = os.path.join(INPUT_DIR,f'train/{essay_id}.txt')
    # Context manager closes the handle; this function runs once per row.
    with open(essay_path,'r') as essay_file:
        return essay_file.read()

def get_test_essay(essay_id):
    """Read the full text of one test essay.

    Args:
        essay_id: Essay identifier; the file read is
            ``test/<essay_id>.txt`` under the module-level ``INPUT_DIR``.

    Returns:
        The entire file content as a string.

    Raises:
        OSError: If the essay file cannot be opened.
    """
    essay_path = os.path.join(INPUT_DIR,f'test/{essay_id}.txt')
    # Context manager closes the handle; this function runs once per row.
    with open(essay_path,'r') as essay_file:
        return essay_file.read()

# Load both splits and attach the full essay text to every row.
train = pd.read_csv(INPUT_DIR+'train.csv')
test = pd.read_csv(INPUT_DIR+'test.csv')
train['essay_text'] = train['essay_id'].apply(get_train_essay)
test['essay_text'] = test['essay_id'].apply(get_test_essay)

def set_seed(seed=42):
    """Seed NumPy's global RNG and set ``PYTHONHASHSEED``.

    Args:
        seed: Value passed to ``np.random.seed`` and stored, as a string,
            in the ``PYTHONHASHSEED`` environment variable.
    """
    np.random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))

set_seed(CFG.seed)

# Integer class ids; the order matches the column order used when the
# predictions are split into Ineffective / Adequate / Effective.
effectiveness_map = {'Ineffective':0, 'Adequate':1, 'Effective':2}
train['target'] = train['discourse_effectiveness'].map(effectiveness_map)

# Stratify on the target while grouping by essay_id, so all rows of one
# essay land in the same fold.
sgkf = StratifiedGroupKFold(n_splits=CFG.n_folds,shuffle=True,random_state=CFG.seed)

# Store each row's validation-fold index in a new 'kfold' column.
for fold, (_,val_idx) in enumerate(sgkf.split(X=train, y=train['target'], groups=train.essay_id)):
    train.loc[val_idx,'kfold'] = fold

# Pre-trained word vectors in binary word2vec format.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('../input/google-news/GoogleNews-vectors-negative300.bin', binary=True)
print(word2vec_model.vectors.shape)

def avg_feature_vector(sentence, model, num_features):
    """Average the embedding vectors of the in-vocabulary words of a text.

    Newlines, commas and periods are treated as separators before the text
    is split on whitespace. Words for which ``model[word]`` raises
    ``KeyError`` are skipped and do not count towards the average.

    Args:
        sentence: Text to embed.
        model: Mapping-like object; ``model[word]`` returns a vector of
            length ``num_features`` or raises ``KeyError``.
        num_features: Dimensionality of the embedding vectors.

    Returns:
        The mean vector of the words that were found. A zero vector is
        returned when the text is empty or no word is in the vocabulary,
        instead of the NaN vector a division by zero would produce.
    """
    words = sentence.replace('\n', " ").replace(',', ' ').replace('.', " ").split()
    feature_vec = np.zeros((num_features,), dtype="float32")

    n_found = 0
    for word in words:
        try:
            word_vec = model[word]
        except KeyError:
            # Out-of-vocabulary word: leave it out of the sum and the count.
            continue
        feature_vec = np.add(feature_vec, word_vec)
        n_found += 1

    # Divide only when at least one word contributed to the sum.
    if n_found > 0:
        feature_vec = np.divide(feature_vec, n_found)
    return feature_vec

# LightGBM training parameters for a 3-class model.
params = {}
params["objective"] = 'multiclass'
params['metric'] = 'multi_logloss'
params['boosting'] = 'gbdt'
params['num_class'] = 3
# NOTE(review): LightGBM documents is_unbalance for binary / multiclassova
# objectives -- confirm it has any effect with 'multiclass'.
params['is_unbalance'] = True
params["learning_rate"] = 0.05
params["lambda_l2"] = 0.0256
params["num_leaves"] = 52
params["max_depth"] = 10
params["feature_fraction"] = 0.503
params["bagging_fraction"] = 0.741
params["bagging_freq"] = 8
params["bagging_seed"] = 10
params["min_data_in_leaf"] = 10
params["verbosity"] = -1
params["random_state"] = 42
# Upper bound on boosting rounds; early stopping may end training sooner.
num_rounds = 1000

# Running sum of per-fold validation log-loss (averaged after the loop).
oof_score = 0
# Running sum of per-fold test probabilities (averaged after the loop).
y_test_pred = np.zeros((test.shape[0], 3))

for fold in range(CFG.n_folds):
    print(f'=============fold:{fold}==================')
    # Rows whose 'kfold' equals the current fold form the validation split.
    train_fold=train[train['kfold']!=fold].reset_index(drop=True)
    valid_fold=train[train['kfold']==fold].reset_index(drop=True)

    # --- word2vec features ---
    # NOTE(review): the test-set features below do not depend on `fold`
    # but are recomputed on every iteration.

    # Averaged 300-d embedding of the discourse text.
    word2vec_train_disc_text = np.zeros((len(train_fold.index),300),dtype="float32")
    word2vec_valid_disc_text = np.zeros((len(valid_fold.index),300),dtype="float32")
    word2vec_test_disc_text = np.zeros((len(test.index),300),dtype="float32")
    for i in range(len(train_fold.index)):
        word2vec_train_disc_text[i] = avg_feature_vector(train_fold["discourse_text"][i], word2vec_model, 300)
    for i in range(len(valid_fold.index)):
        word2vec_valid_disc_text[i] = avg_feature_vector(valid_fold["discourse_text"][i], word2vec_model, 300)
    for i in range(len(test.index)):
        word2vec_test_disc_text[i] = avg_feature_vector(test["discourse_text"][i], word2vec_model, 300)

    # Averaged 300-d embedding of the full essay text.
    word2vec_train_essay_text = np.zeros((len(train_fold.index),300),dtype="float32")
    word2vec_valid_essay_text = np.zeros((len(valid_fold.index),300),dtype="float32")
    word2vec_test_essay_text = np.zeros((len(test.index),300),dtype="float32")
    for i in range(len(train_fold.index)):
        word2vec_train_essay_text[i] = avg_feature_vector(train_fold["essay_text"][i], word2vec_model, 300)
    for i in range(len(valid_fold.index)):
        word2vec_valid_essay_text[i] = avg_feature_vector(valid_fold["essay_text"][i], word2vec_model, 300)
    for i in range(len(test.index)):
        word2vec_test_essay_text[i] = avg_feature_vector(test["essay_text"][i], word2vec_model, 300)

    # One-hot encoding of discourse_type, fitted on the training split only.
    ohe = OneHotEncoder()
    train_type_ohe=sparse.csr_matrix(ohe.fit_transform(train_fold['discourse_type'].values.reshape(-1,1)))
    valid_type_ohe=sparse.csr_matrix(ohe.transform(valid_fold['discourse_type'].values.reshape(-1,1)))
    test_type_ohe=sparse.csr_matrix(ohe.transform(test['discourse_type'].values.reshape(-1,1)))


    # Feature matrix: [one-hot type | discourse embedding | essay embedding].
    Xtrain_word2vec = sparse.hstack((train_type_ohe,word2vec_train_disc_text,word2vec_train_essay_text))
    Xvalid_word2vec = sparse.hstack((valid_type_ohe,word2vec_valid_disc_text,word2vec_valid_essay_text))
    test_word2vec = sparse.hstack((test_type_ohe,word2vec_test_disc_text,word2vec_test_essay_text))

    # --- LightGBM ---
    lgtrain = lgb.Dataset(Xtrain_word2vec, label=train_fold['target'].ravel())
    lgvalidation = lgb.Dataset(Xvalid_word2vec, label=valid_fold['target'].ravel())

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of older lgb.train signatures -- confirm against the
    # installed LightGBM version.
    model = lgb.train(params, lgtrain, num_rounds, 
                    valid_sets=[lgtrain, lgvalidation], 
                    early_stopping_rounds=100, verbose_eval=100)

    # Predict with the best iteration found by early stopping.
    y_pred = model.predict(Xvalid_word2vec, num_iteration=model.best_iteration)
    y_test_pred += model.predict(test_word2vec, num_iteration=model.best_iteration)

    score = log_loss(valid_fold['target'], y_pred)
    oof_score += score

    print(f'Fold:{fold},valid score:{score}')
    
# Turn the accumulated sums into means over the folds.
y_test_pred = y_test_pred / float(CFG.n_folds)
oof_score /= float(CFG.n_folds)
print("Aggregate OOF Score: {}".format(oof_score))



(3000000, 300)
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.460002	valid_1's multi_logloss: 0.748048
[200]	training's multi_logloss: 0.313471	valid_1's multi_logloss: 0.732728
[300]	training's multi_logloss: 0.221816	valid_1's multi_logloss: 0.735344
Early stopping, best iteration is:
[236]	training's multi_logloss: 0.275861	valid_1's multi_logloss: 0.731422
Fold:0,valid score:0.7314221251391888
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.462338	valid_1's multi_logloss: 0.738764
[200]	training's multi_logloss: 0.316004	valid_1's multi_logloss: 0.724543
[300]	training's multi_logloss: 0.224546	valid_1's multi_logloss: 0.725092
Early stopping, best iteration is:
[255]	training's multi_logloss: 0.261299	valid_1's multi_logloss: 0.723197
Fold:1,valid score:0.7231967256840189
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.461596	valid_1's mu

In [29]:
# Fold-averaged LightGBM probabilities, one column per class.
lgbm_ineffective = y_test_pred[:,0]
lgbm_adequate = y_test_pred[:,1]
lgbm_effective = y_test_pred[:,2]

# Wrap each in a single-column DataFrame so it can be concatenated with the
# per-fold frames of the transformer models.
lgbm_ineffective = pd.DataFrame(lgbm_ineffective)
lgbm_adequate = pd.DataFrame(lgbm_adequate)
lgbm_effective = pd.DataFrame(lgbm_effective)


In [30]:
# Display the LightGBM "Ineffective" probabilities.
lgbm_ineffective

Unnamed: 0,0
0,0.0246
1,0.0208
2,0.0245
3,0.079
4,0.0554
5,0.0275
6,0.032
7,0.0287
8,0.0539
9,0.0208


# 4. Create submission

In [31]:
# Top-level column keys, in the same order as the frames passed to concat.
level_names = ['deberta', 'roberta', 'lgbm']

# For each class, place all per-fold columns side by side under a two-level
# column index (model name, fold).
ineffective_ = pd.concat(
    [deb_ineffective, rob_ineffective, lgbm_ineffective],
    keys=level_names, axis=1
)

adequate_ = pd.concat(
    [deb_adequate, rob_adequate, lgbm_adequate],
    keys=level_names, axis=1
)

effective_ = pd.concat(
    [deb_effective, rob_effective, lgbm_effective],
    keys=level_names, axis=1
)

In [32]:
# "Ineffective" probabilities of every model/fold plus their plain mean.
show_gradient(
    ineffective_,
    N_ROW
)

Unnamed: 0_level_0,deberta,deberta,deberta,deberta,roberta,roberta,roberta,roberta,roberta,lgbm,all_mean
Unnamed: 0_level_1,0,1,2,3,0,1,2,3,4,0,Unnamed: 11_level_1
0,0.0026,0.0037,0.0154,0.0025,0.0055,0.0284,0.0119,0.0101,0.0377,0.0246,0.0142
1,0.0443,0.023,0.042,0.0186,0.0191,0.0364,0.019,0.0087,0.0658,0.0208,0.0298
2,0.028,0.0279,0.027,0.019,0.0269,0.0066,0.0186,0.0045,0.0178,0.0245,0.0201
3,0.0525,0.079,0.0743,0.0773,0.0164,0.0064,0.0149,0.0032,0.0269,0.079,0.043
4,0.0318,0.0547,0.0745,0.0489,0.0209,0.0101,0.0255,0.0023,0.0209,0.0554,0.0345
5,0.0076,0.0084,0.0217,0.0078,0.007,0.0068,0.0057,0.0021,0.0145,0.0275,0.0109
6,0.0052,0.0093,0.0092,0.0045,0.009,0.0107,0.0068,0.0029,0.0274,0.032,0.0117
7,0.0142,0.013,0.038,0.0209,0.0105,0.0054,0.0077,0.0014,0.019,0.0287,0.0159
8,0.011,0.0194,0.0474,0.0149,0.0119,0.0058,0.0093,0.0023,0.0088,0.0539,0.0185
9,0.0089,0.0106,0.0102,0.01,0.0251,0.0126,0.0141,0.0136,0.035,0.0208,0.0161


In [33]:
# "Adequate" probabilities of every model/fold plus their plain mean.
show_gradient(
    adequate_,
    N_ROW
)

Unnamed: 0_level_0,deberta,deberta,deberta,deberta,roberta,roberta,roberta,roberta,roberta,lgbm,all_mean
Unnamed: 0_level_1,0,1,2,3,0,1,2,3,4,0,Unnamed: 11_level_1
0,0.2618,0.3429,0.4421,0.2671,0.3181,0.6565,0.4918,0.4863,0.6528,0.4002,0.4319
1,0.8806,0.9091,0.8394,0.9201,0.7711,0.8861,0.8163,0.5262,0.8626,0.8129,0.8224
2,0.7148,0.8394,0.6311,0.8366,0.8514,0.595,0.6534,0.3326,0.6136,0.5181,0.6586
3,0.7037,0.746,0.6031,0.7203,0.693,0.5173,0.5038,0.2652,0.7316,0.3895,0.5873
4,0.596,0.7368,0.6416,0.5543,0.7414,0.5963,0.6665,0.233,0.6127,0.5239,0.5903
5,0.3068,0.4171,0.4339,0.3624,0.3776,0.4833,0.3316,0.1943,0.4502,0.2372,0.3594
6,0.2087,0.3003,0.2288,0.1859,0.3994,0.4984,0.2886,0.2227,0.518,0.3215,0.3172
7,0.5164,0.7484,0.6227,0.7748,0.5818,0.5313,0.388,0.1684,0.5965,0.4701,0.5398
8,0.2912,0.4925,0.5889,0.3951,0.4484,0.4189,0.3826,0.1857,0.2508,0.3437,0.3798
9,0.4598,0.6757,0.5322,0.6476,0.8813,0.8387,0.7919,0.7005,0.656,0.432,0.6616


In [34]:
# "Effective" probabilities of every model/fold plus their plain mean.
show_gradient(
    effective_,
    N_ROW
)

Unnamed: 0_level_0,deberta,deberta,deberta,deberta,roberta,roberta,roberta,roberta,roberta,lgbm,all_mean
Unnamed: 0_level_1,0,1,2,3,0,1,2,3,4,0,Unnamed: 11_level_1
0,0.7356,0.6535,0.5426,0.7304,0.6763,0.3151,0.4963,0.5036,0.3095,0.5752,0.5538
1,0.0751,0.0679,0.1186,0.0613,0.2098,0.0775,0.1647,0.4651,0.0716,0.1662,0.1478
2,0.2572,0.1327,0.3419,0.1444,0.1218,0.3984,0.328,0.663,0.3686,0.4574,0.3213
3,0.2438,0.1749,0.3226,0.2025,0.2906,0.4764,0.4813,0.7316,0.2415,0.5315,0.3697
4,0.3722,0.2085,0.2839,0.3968,0.2377,0.3935,0.3081,0.7647,0.3664,0.4206,0.3753
5,0.6856,0.5745,0.5444,0.6297,0.6155,0.5099,0.6627,0.8036,0.5353,0.7353,0.6297
6,0.7861,0.6904,0.762,0.8096,0.5916,0.4909,0.7046,0.7744,0.4546,0.6464,0.6711
7,0.4695,0.2386,0.3393,0.2042,0.4077,0.4634,0.6043,0.8302,0.3845,0.5012,0.4443
8,0.6978,0.4881,0.3636,0.59,0.5397,0.5754,0.6081,0.8121,0.7404,0.6024,0.6017
9,0.5313,0.3137,0.4576,0.3424,0.0936,0.1487,0.194,0.2859,0.309,0.5472,0.3224


In [35]:
# Start from the sample submission so the id column and column layout match.
submission = submission_origin.copy()

# Blend weights, listed in the same order as level_names.
w_ = [.65, .25, .10]  # ['deberta', 'roberta', 'gbm']
d_ = [('Ineffective', ineffective_),
      ('Adequate', adequate_),
      ('Effective', effective_)]

# For each class: average every model over its folds, multiply the three
# per-model means by w_, then sum them into one blended probability.
# Note: `df` is rebound here and no longer refers to the test frame.
for x in d_:
    col_name, df = x
    submission[col_name] = pd.DataFrame(
        {col: df[col].mean(axis=1) for col in level_names}
    ).mul(w_).sum(axis=1)    

submission.head(N_ROW)

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.0111,0.3838,0.6052
1,5a88900e7dc1,0.0303,0.8512,0.1185
2,9790d835736b,0.0227,0.6952,0.2821
3,75ce6d68b67b,0.0573,0.6251,0.3176
4,93578d946723,0.0436,0.6058,0.3506
5,2e214524dbe3,0.0119,0.3626,0.6255
6,84812fc2ab9f,0.0106,0.2786,0.7108
7,c668ff840720,0.0191,0.5929,0.388
8,739a6d00f44a,0.0224,0.4059,0.5717
9,bcfae2c9a244,0.0135,0.6128,0.3736


In [36]:
# 0	a261b6e14276	0.0102	0.3887	0.5911
# 1	5a88900e7dc1	0.0309	0.8405	0.1185
# 2	9790d835736b	0.0217	0.6997	0.2686
# 3	75ce6d68b67b	0.0512	0.6365	0.3023
# 4	93578d946723	0.0399	0.6053	0.3448
# 5	2e214524dbe3	0.0099	0.3721	0.6080
# 6	84812fc2ab9f	0.0084	0.2796	0.7020
# 7	c668ff840720	0.0171	0.5888	0.3841
# 8	739a6d00f44a	0.0178	0.4030	0.5692
# 9	bcfae2c9a244	0.0132	0.6373	0.3395

In [37]:
# Write the blended predictions without the index column.
submission.to_csv('submission.csv',index=False)