In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

!ls /kaggle/input/

# Any results you write to the current directory are saved as output.

In [None]:
!ls /kaggle/input/e127-roberta-answer
!ls /kaggle/input/huggingface-model-configs
!ls /kaggle/input/sub-e078-e083-bert2-roberta1-xlnet1

In [None]:
# Load the competition test set; this frame is what the prediction cells
# below wrap in QUESTDataset(mode='test').
test_df = pd.read_csv('/kaggle/input/google-quest-challenge/test.csv')

## pre-settings

In [None]:
import os

def OSprint(string):
    """Echo ``string`` through the system shell and also print it from Python.

    The message is emitted twice on purpose: once via ``echo`` in a shell
    subprocess and once via ``print`` in the notebook cell output.

    Args:
        string: The message to emit. Non-string values are converted with
            ``str`` for the shell call and passed unchanged to ``print``.

    Returns:
        None.
    """
    import shlex  # local import: the surrounding cell only imports ``os``

    # Quote the message so that quotes, ``$(...)`` or backticks inside it are
    # echoed literally instead of being interpreted (or executed) by the shell.
    os.system(f'echo {shlex.quote(str(string))}')
    print(string)

In [None]:
!ls /kaggle/input/

In [None]:
# --- inference settings -----------------------------------------------------
NUM_FOLDS = 5
DEVICE = 'cuda'  # set to 'cpu' to run without a GPU
BATCH_SIZE = 8

# --- input locations --------------------------------------------------------
# Every dataset is mounted under the same Kaggle input root.
_INPUT_ROOT = '/kaggle/input'
_CONFIG_DIR = f'{_INPUT_ROOT}/huggingface-model-configs'

# Pickled model configs, one per architecture.
BERT_BASE_MODEL_PATH = f'{_CONFIG_DIR}/bert-model-uncased-config.pkl'
ROBERTA_BASE_MODEL_PATH = f'{_CONFIG_DIR}/roberta-model-base-config.pkl'
XLNET_BASE_MODEL_PATH = f'{_CONFIG_DIR}/xlnet-model-base-cased-config.pkl'

# Experiment directories (question / answer variants per architecture).
BERT_Q_BASE_PATH = f'{_INPUT_ROOT}/e121-bert-question/'
BERT_A_BASE_PATH = f'{_INPUT_ROOT}/e125-bert-answer/'
ROBERTA_Q_BASE_PATH = f'{_INPUT_ROOT}/e126-roberta-question/'
ROBERTA_A_BASE_PATH = f'{_INPUT_ROOT}/e127-roberta-answer/'
XLNET_Q_BASE_PATH = f'{_INPUT_ROOT}/e128-xlnet-question/'
XLNET_A_BASE_PATH = f'{_INPUT_ROOT}/e129-xlnet-answer/'

In [None]:
# Target columns predicted by the question models (21 labels).
# One name per line; ``split()`` turns the block into a plain list of str.
Q_LABEL_COL = """
    question_asker_intent_understanding
    question_body_critical
    question_conversational
    question_expect_short_answer
    question_fact_seeking
    question_has_commonly_accepted_answer
    question_interestingness_others
    question_interestingness_self
    question_multi_intent
    question_not_really_a_question
    question_opinion_seeking
    question_type_choice
    question_type_compare
    question_type_consequence
    question_type_definition
    question_type_entity
    question_type_instructions
    question_type_procedure
    question_type_reason_explanation
    question_type_spelling
    question_well_written
""".split()

# Target columns predicted by the answer models (9 labels).
A_LABEL_COL = """
    answer_helpful
    answer_level_of_information
    answer_plausible
    answer_relevance
    answer_satisfaction
    answer_type_instructions
    answer_type_procedure
    answer_type_reason_explanation
    answer_well_written
""".split()

In [None]:
!pip install /kaggle/input/sacremoses-master/sacremoses > /dev/null
#!pip install /kaggle/input/transformers/transformers-master #> /dev/null
!pip install --no-deps /kaggle/input/guchio-transformers/*.whl #> /dev/null

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer, XLNetModel, XLNetTokenizer
import random
from math import ceil, floor
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
import pickle
import gc

In [None]:
OSprint('start training!')

## utils

In [None]:
class QUESTDataset(Dataset):
    """Dataset that tokenizes question/answer rows on the fly.

    Each item is the tuple
    ``(qa_id, input_ids, attention_mask, token_type_ids, position_ids, labels)``.
    Rows are read from ``df`` and must expose ``qa_id``, ``question_title``,
    ``question_body``, ``answer`` and ``category``.

    Args:
        df: Source frame; kept as ``self.original_df`` and tokenized lazily.
        mode: ``"test"`` fills the labels with ``-1`` placeholders; any other
            value reads ``df[LABEL_COL]``.
        tokens: Extra tokens to register in the tokenizer (empty strings are
            skipped, non-ASCII characters are replaced).
        augment: Stored on the instance; not used by this class.
        tokenizer_type: ``'bert'``, ``'roberta'`` or ``'xlnet'``.
        pretrained_model_name_or_path: Forwarded to ``from_pretrained``.
        do_lower_case: Forwarded to ``from_pretrained``.
        LABEL_COL: Label column names.
        t_max_len, q_max_len, a_max_len: Token budgets for title, question
            body and answer used when the row has to be trimmed.
        tqa_mode: ``'tq_a'`` (title + TBSEP + body vs. answer), ``'t_q'``
            (title vs. body) or ``'t_a'`` (title vs. answer).
        TBSEP: Separator token placed between title and body in ``'tq_a'``.
        pos_id_type: Only ``'arange'`` is supported.
        MAX_SEQUENCE_LENGTH: Required total sequence length.
        use_category: Prepend the ``cat_<category>`` token to the title.
        logger: Optional logger; ``print`` is used when it is falsy.

    Raises:
        NotImplementedError: For a missing ``MAX_SEQUENCE_LENGTH`` or an
            unsupported ``tokenizer_type`` (at construction), and for an
            unsupported ``pos_id_type`` or ``tqa_mode`` (at item access).
    """

    def __init__(self, df, mode, tokens, augment,
                 tokenizer_type, pretrained_model_name_or_path, do_lower_case,
                 LABEL_COL, t_max_len, q_max_len, a_max_len, tqa_mode,
                 TBSEP, pos_id_type, MAX_SEQUENCE_LENGTH=None,
                 use_category=True, logger=None):
        self.mode = mode
        self.augment = augment
        self.len = len(df)
        self.t_max_len = t_max_len
        self.q_max_len = q_max_len
        self.a_max_len = a_max_len
        self.tqa_mode = tqa_mode
        self.TBSEP = TBSEP
        self.pos_id_type = pos_id_type
        if MAX_SEQUENCE_LENGTH:
            self.MAX_SEQUENCE_LENGTH = MAX_SEQUENCE_LENGTH
        else:
            raise NotImplementedError
        self.use_category = use_category
        self.logger = logger
        # Category name -> integer id. Not read anywhere in this class.
        self.cat_dict = {
            'CAT_TECHNOLOGY'.casefold(): 0,
            'CAT_STACKOVERFLOW'.casefold(): 1,
            'CAT_CULTURE'.casefold(): 2,
            'CAT_SCIENCE'.casefold(): 3,
            'CAT_LIFE_ARTS'.casefold(): 4,
        }

        if mode == "test":
            # No ground truth at test time: use -1 placeholders of the right shape.
            self.labels = pd.DataFrame([[-1] * len(LABEL_COL)] * len(df))
        else:  # train or valid
            self.labels = df[LABEL_COL]

        self.tokenizer_type = tokenizer_type
        if tokenizer_type == 'bert':
            tokenizer = BertTokenizer
        elif tokenizer_type == 'roberta':
            tokenizer = RobertaTokenizer
        elif tokenizer_type == 'xlnet':
            tokenizer = XLNetTokenizer
        else:
            raise NotImplementedError
        self.tokenizer = tokenizer.from_pretrained(
            pretrained_model_name_or_path, do_lower_case=do_lower_case)
        self.tokenizer.add_tokens([self.TBSEP])

        tokens = [token.encode('ascii', 'replace').decode()
                  for token in tokens if token != '']
        added_num = self.tokenizer.add_tokens(tokens)
        if logger:
            logger.info(f'additional_tokens : {added_num}')
        else:
            print(f'additional_tokens : {added_num}')
        # Rows are preprocessed online, in __getitem__.
        self.original_df = df

    def __len__(self):
        """Return the number of rows in the source frame."""
        return self.len

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its model inputs and labels."""
        row = self.original_df.iloc[idx].copy()
        encoded = self.__preprocess_text_row(row,
                                             t_max_len=self.t_max_len,
                                             q_max_len=self.q_max_len,
                                             a_max_len=self.a_max_len)
        input_ids = encoded['input_ids'].squeeze()
        if self.tokenizer_type == 'roberta':
            # RoBERTa inputs always use segment id 0.
            token_type_ids = torch.zeros(self.MAX_SEQUENCE_LENGTH, dtype=torch.long)
        else:
            token_type_ids = encoded['token_type_ids'].squeeze()
        attention_mask = encoded['attention_mask'].squeeze()
        qa_id = encoded['qa_id'].squeeze()

        if self.pos_id_type == 'arange':
            position_ids = torch.arange(self.MAX_SEQUENCE_LENGTH)
        else:
            raise NotImplementedError

        labels = self.labels.iloc[idx].values
        return qa_id, input_ids, attention_mask, \
            token_type_ids, position_ids, labels

    def _trim_input(self, title, question, answer,
                    t_max_len, q_max_len, a_max_len):
        """Trim the three token lists so that they fit ``MAX_SEQUENCE_LENGTH``.

        Nothing is changed when ``len(title) + len(question) + len(answer) + 4``
        already fits. Otherwise unused budget of a short part is handed to the
        other parts, and each over-long part keeps its head and its tail.

        Returns:
            The ``(title, question, answer)`` token lists.

        Raises:
            ValueError: If the redistributed budgets plus 4 do not add up to
                ``MAX_SEQUENCE_LENGTH``.
        """
        t_len = len(title)
        q_len = len(question)
        a_len = len(answer)

        if (t_len + q_len + a_len + 4) > self.MAX_SEQUENCE_LENGTH:
            if t_max_len > t_len:
                # Short title: split its spare budget between answer and question.
                t_new_len = t_len
                a_max_len = a_max_len + floor((t_max_len - t_len) / 2)
                q_max_len = q_max_len + ceil((t_max_len - t_len) / 2)
            else:
                t_new_len = t_max_len

            if a_max_len > a_len:
                a_new_len = a_len
                q_new_len = q_max_len + (a_max_len - a_len)
            elif q_max_len > q_len:
                a_new_len = a_max_len + (q_max_len - q_len)
                q_new_len = q_len
            else:
                a_new_len = a_max_len
                q_new_len = q_max_len

            if t_new_len + a_new_len + q_new_len + 4 != self.MAX_SEQUENCE_LENGTH:
                raise ValueError("New sequence length should be %d, but is %d"
                                 % (self.MAX_SEQUENCE_LENGTH,
                                     (t_new_len + a_new_len + q_new_len + 4)))
            # Head + tail truncation. ``-n // 2`` is ``(-n) // 2``, so for odd n
            # the tail gets the extra token and the total is exactly n.
            # NOTE(review): when a budget is 0 (e.g. a_max_len=0), the slice
            # ``x[:0] + x[-0:]`` keeps the ENTIRE sequence rather than nothing.
            # Left unchanged because the checkpoints were presumably trained
            # with the same preprocessing -- confirm before fixing.
            if len(title) > t_new_len:
                title = title[:t_new_len // 2] + title[-t_new_len // 2:]
            else:
                title = title[:t_new_len]
            if len(question) > q_new_len:
                question = question[:q_new_len // 2] + \
                    question[-q_new_len // 2:]
            else:
                question = question[:q_new_len]
            if len(answer) > a_new_len:
                answer = answer[:a_new_len // 2] + answer[-a_new_len // 2:]
            else:
                answer = answer[:a_new_len]
        return title, question, answer

    def __preprocess_text_row(self, row, t_max_len, q_max_len, a_max_len):
        """Tokenize one row and encode it according to ``self.tqa_mode``.

        Returns:
            The tokenizer's ``encode_plus`` result with ``'qa_id'`` added.

        Raises:
            NotImplementedError: If ``self.tqa_mode`` is not one of
                ``'tq_a'``, ``'t_q'``, ``'t_a'``.
        """
        qa_id = row.qa_id
        title = self.tokenizer.tokenize(row.question_title)
        body = self.tokenizer.tokenize(row.question_body)
        answer = self.tokenizer.tokenize(row.answer)
        category = ('CAT_' + row.category).casefold()

        # Inject the category into the text itself, as the first title token.
        if self.use_category:
            title = [category] + title

        title, body, answer = self._trim_input(title, body, answer,
                                               t_max_len=t_max_len,
                                               q_max_len=q_max_len,
                                               a_max_len=a_max_len)

        # Empty parts are replaced by a placeholder token.
        if len(title) == 0:
            title = ['_']
        if len(body) == 0:
            body = ['_']
        if len(answer) == 0:
            answer = ['_']

        if self.tqa_mode == 'tq_a':
            text = title + [self.TBSEP] + body
            text_pair = answer
        elif self.tqa_mode == 't_q':
            text = title
            text_pair = body
        elif self.tqa_mode == 't_a':
            text = title
            text_pair = answer
        else:
            # Previously fell through and crashed with UnboundLocalError below.
            raise NotImplementedError(
                f'unsupported tqa_mode: {self.tqa_mode!r}')

        encoded_texts_dict = self.tokenizer.encode_plus(
            text=text,
            text_pair=text_pair,
            add_special_tokens=True,
            max_length=self.MAX_SEQUENCE_LENGTH,
            pad_to_max_length=True,
            return_tensors='pt',
            return_token_type_ids=True,
            return_attention_mask=True,
            return_overflowing_tokens=True,
        )
        encoded_texts_dict['qa_id'] = qa_id
        return encoded_texts_dict

class BertModelForBinaryMultiLabelClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    The encoder's first output is averaged over dim 1 and fed to the head,
    which emits one logit per label.

    Args:
        num_labels: Number of output logits.
        config_path: Path to a pickled model config used to build ``BertModel``.
        state_dict: Accepted for interface compatibility; not used here (the
            caller loads weights with ``load_state_dict`` on this wrapper).
        token_size: If truthy, the token embeddings are resized to this size.
        MAX_SEQUENCE_LENGTH: Accepted for interface compatibility; not used.
    """

    def __init__(self, num_labels, config_path, state_dict,
                 token_size=None, MAX_SEQUENCE_LENGTH=512):
        super(BertModelForBinaryMultiLabelClassifier, self).__init__()
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # config_path at trusted files.
        with open(config_path, 'rb') as fin:
            config = pickle.load(fin)
        # Built from the config only, so the weights are freshly initialized.
        self.model = BertModel(config)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

        # resize
        if token_size:
            self.model.resize_token_embeddings(token_size)

        # Register the same Linear under a second name. This adds
        # ``my_fc_output.*`` keys to the state dict, so it must stay for
        # checkpoints saved from this class to load (presumably they contain
        # both prefixes -- verify before removing).
        self.add_module('my_fc_output', self.classifier)

    def forward(self, input_ids=None, input_cats=None, labels=None, attention_mask=None,
                token_type_ids=None, position_ids=None, head_mask=None,
                inputs_embeds=None, encoder_hidden_states=None,
                encoder_attention_mask=None):
        """Run the encoder and return ``(logits, *encoder_outputs[2:])``.

        ``input_cats`` and ``labels`` are accepted but ignored; no loss is
        computed here.
        """
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
                             position_ids=position_ids,
                             head_mask=head_mask,
                             inputs_embeds=inputs_embeds,
                             encoder_hidden_states=encoder_hidden_states,
                             encoder_attention_mask=encoder_attention_mask)

        # Plain mean over dim 1 of the first encoder output (no masking).
        pooled_output = torch.mean(outputs[0], dim=1)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # add hidden states and attention if they are here
        outputs = (logits,) + outputs[2:]

        return outputs  # logits, (hidden_states), (attentions)

    def resize_token_embeddings(self, token_num):
        """Resize the encoder's token embeddings to ``token_num`` entries."""
        self.model.resize_token_embeddings(token_num)

    def freeze_unfreeze_bert(self, freeze=True, logger=None):
        """Freeze (``freeze=True``) or unfreeze the encoder parameters.

        The head is not touched. The status message goes to ``logger.info``
        when a logger is given, otherwise to ``print``.
        """
        message = 'FREEZE bert model !' if freeze else 'UNFREEZE bert model !'
        if logger:
            logger.info(message)
        else:
            print(message)
        # Covers every encoder parameter, including any registered directly
        # on the encoder rather than on one of its child modules.
        for param in self.model.parameters():
            param.requires_grad = not freeze

    # def _resize_embeddings(self, old_embeddings, new_num_tokens):
    #     old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
    #     if old_num_tokens == new_num_tokens:
    #         return old_embeddings

    #     # Build new embeddings
    #     new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
    #     new_embeddings.to(old_embeddings.weight.device)

    #     # Copy word embeddings from the previous weights
    #     num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
    #     new_embeddings.weight.data[:num_tokens_to_copy,
    #                                :] = old_embeddings.weight.data[:num_tokens_to_copy, :]

    #     return new_embeddings


class RobertaModelForBinaryMultiLabelClassifier(nn.Module):
    """RoBERTa encoder followed by dropout and a linear multi-label head.

    The encoder's first output is averaged over dim 1 and fed to the head,
    which emits one logit per label.

    Args:
        num_labels: Number of output logits.
        config_path: Path to a pickled model config used to build
            ``RobertaModel``.
        state_dict: Accepted for interface compatibility; not used here (the
            caller loads weights with ``load_state_dict`` on this wrapper).
        token_size: If truthy, the token embeddings are resized to this size.
        MAX_SEQUENCE_LENGTH: Accepted for interface compatibility; not used.
    """

    def __init__(self, num_labels, config_path, state_dict,
                 token_size=None, MAX_SEQUENCE_LENGTH=512):
        super(RobertaModelForBinaryMultiLabelClassifier, self).__init__()
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # config_path at trusted files.
        with open(config_path, 'rb') as fin:
            config = pickle.load(fin)
        # Built from the config only, so the weights are freshly initialized.
        self.model = RobertaModel(config)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

        # resize
        if token_size:
            self.model.resize_token_embeddings(token_size)

        # Register the same Linear under a second name. This adds
        # ``my_fc_output.*`` keys to the state dict, so it must stay for
        # checkpoints saved from this class to load (presumably they contain
        # both prefixes -- verify before removing).
        self.add_module('my_fc_output', self.classifier)

    def forward(self, input_ids=None, input_cats=None, labels=None, attention_mask=None,
                token_type_ids=None, position_ids=None, head_mask=None,
                inputs_embeds=None, encoder_hidden_states=None,
                encoder_attention_mask=None):
        """Run the encoder and return ``(logits, *encoder_outputs[2:])``.

        ``input_cats`` and ``labels`` are accepted but ignored; no loss is
        computed here.
        """
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
                             position_ids=position_ids,
                             head_mask=head_mask,
                             inputs_embeds=inputs_embeds,
                             encoder_hidden_states=encoder_hidden_states,
                             encoder_attention_mask=encoder_attention_mask)

        # Plain mean over dim 1 of the first encoder output (no masking).
        pooled_output = torch.mean(outputs[0], dim=1)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # add hidden states and attention if they are here
        outputs = (logits,) + outputs[2:]

        return outputs  # logits, (hidden_states), (attentions)

    def resize_token_embeddings(self, token_num):
        """Resize the encoder's token embeddings to ``token_num`` entries."""
        self.model.resize_token_embeddings(token_num)

    def freeze_unfreeze_bert(self, freeze=True, logger=None):
        """Freeze (``freeze=True``) or unfreeze the encoder parameters.

        The head is not touched. The status message goes to ``logger.info``
        when a logger is given, otherwise to ``print``.
        """
        message = 'FREEZE bert model !' if freeze else 'UNFREEZE bert model !'
        if logger:
            logger.info(message)
        else:
            print(message)
        # Covers every encoder parameter, including any registered directly
        # on the encoder rather than on one of its child modules.
        for param in self.model.parameters():
            param.requires_grad = not freeze

    # def _resize_embeddings(self, old_embeddings, new_num_tokens):
    #     old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
    #     if old_num_tokens == new_num_tokens:
    #         return old_embeddings

    #     # Build new embeddings
    #     new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
    #     new_embeddings.to(old_embeddings.weight.device)

    #     # Copy word embeddings from the previous weights
    #     num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
    #     new_embeddings.weight.data[:num_tokens_to_copy,
    #                                :] = old_embeddings.weight.data[:num_tokens_to_copy, :]

    #     return new_embeddings


class XLNetModelForBinaryMultiLabelClassifier(nn.Module):
    """XLNet encoder followed by dropout and a linear multi-label head.

    The encoder's first output is averaged over dim 1 and fed to the head,
    which emits one logit per label. The head width comes from
    ``config.d_model``.

    Args:
        num_labels: Number of output logits.
        config_path: Path to a pickled model config used to build
            ``XLNetModel``.
        state_dict: Accepted for interface compatibility; not used here (the
            caller loads weights with ``load_state_dict`` on this wrapper).
        token_size: If truthy, the token embeddings are resized to this size.
        MAX_SEQUENCE_LENGTH: Accepted for interface compatibility; not used.
    """

    def __init__(self, num_labels, config_path, state_dict,
                 token_size=None, MAX_SEQUENCE_LENGTH=512):
        super(XLNetModelForBinaryMultiLabelClassifier, self).__init__()
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # config_path at trusted files.
        with open(config_path, 'rb') as fin:
            config = pickle.load(fin)
        # Built from the config only, so the weights are freshly initialized.
        self.model = XLNetModel(config)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.model.config.d_model, num_labels)

        # resize
        if token_size:
            self.model.resize_token_embeddings(token_size)

        # Register the same Linear under a second name. This adds
        # ``my_fc_output.*`` keys to the state dict, so it must stay for
        # checkpoints saved from this class to load (presumably they contain
        # both prefixes -- verify before removing).
        self.add_module('my_fc_output', self.classifier)

    def forward(self, input_ids=None, input_cats=None, labels=None, attention_mask=None,
                token_type_ids=None, position_ids=None, head_mask=None,
                inputs_embeds=None, encoder_hidden_states=None,
                encoder_attention_mask=None):
        """Run the encoder and return ``(logits, *encoder_outputs[2:])``.

        ``input_cats``, ``labels``, ``position_ids``,
        ``encoder_hidden_states`` and ``encoder_attention_mask`` are accepted
        but not forwarded to the encoder; no loss is computed here.
        """
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
                             head_mask=head_mask,
                             inputs_embeds=inputs_embeds)

        # Plain mean over dim 1 of the first encoder output (no masking).
        pooled_output = torch.mean(outputs[0], dim=1)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # add hidden states and attention if they are here
        outputs = (logits,) + outputs[2:]

        return outputs  # logits, (hidden_states), (attentions)

    def resize_token_embeddings(self, token_num):
        """Resize the encoder's token embeddings to ``token_num`` entries."""
        self.model.resize_token_embeddings(token_num)

    def freeze_unfreeze_bert(self, freeze=True, logger=None):
        """Freeze (``freeze=True``) or unfreeze the encoder parameters.

        The head is not touched. The status message goes to ``logger.info``
        when a logger is given, otherwise to ``print``.
        """
        message = 'FREEZE bert model !' if freeze else 'UNFREEZE bert model !'
        if logger:
            logger.info(message)
        else:
            print(message)
        # Covers every encoder parameter, including any registered directly
        # on the encoder rather than on one of its child modules.
        for param in self.model.parameters():
            param.requires_grad = not freeze

    # def _resize_embeddings(self, old_embeddings, new_num_tokens):
    #     old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
    #     if old_num_tokens == new_num_tokens:
    #         return old_embeddings

    #     # Build new embeddings
    #     new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
    #     new_embeddings.to(old_embeddings.weight.device)

    #     # Copy word embeddings from the previous weights
    #     num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
    #     new_embeddings.weight.data[:num_tokens_to_copy,
    #                                :] = old_embeddings.weight.data[:num_tokens_to_copy, :]

    #     return new_embeddings

def compute_spearmanr(trues, preds):
    """Return the Spearman rank correlation of each label column.

    Args:
        trues: 2-D array of targets; columns are iterated via ``trues.T``.
        preds: 2-D array of predictions with the same layout as ``trues``.

    Returns:
        list: One correlation value per column, in column order.
    """
    # ``spearmanr`` is not imported by the visible cells of this notebook, so
    # import it here to keep the function usable on a fresh kernel.
    from scipy.stats import spearmanr

    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        # Add tiny Gaussian noise (std 1e-7) to the predictions -- presumably
        # to break ties so a constant column does not yield NaN.
        jittered = col_pred + np.random.normal(0, 1e-7, col_pred.shape[0])
        rhos.append(spearmanr(col_trues, jittered).correlation)
    return rhos


def train_one_epoch(model, fobj, optimizer, loader, DEVICE):
    """Train ``model`` for one pass over ``loader`` and return the mean loss.

    Args:
        model: Module called with ``input_ids``, ``labels``,
            ``attention_mask``, ``token_type_ids`` and ``position_ids``; its
            first output is used as the logits.
        fobj: Loss callable invoked as ``fobj(logits, labels.float())``.
        optimizer: Optimizer stepped once per batch.
        loader: Iterable of ``(qa_id, input_ids, attention_mask,
            token_type_ids, position_ids, labels)`` batches.
        DEVICE: Device the batch tensors are moved to.

    Returns:
        The summed batch losses divided by ``len(loader)``, as a tensor that
        is detached from the autograd graph.
    """
    model.train()

    running_loss = 0
    for (qa_id, input_ids, attention_mask,
         token_type_ids, position_ids, labels) in tqdm(loader):
        # send them to DEVICE
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)
        position_ids = position_ids.to(DEVICE)
        labels = labels.to(DEVICE)

        # forward
        outputs = model(
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids
        )
        loss = fobj(outputs[0], labels.float())

        # backward and update
        optimizer.zero_grad()
        loss.backward()

        optimizer.step()

        # Accumulate a detached copy: adding the live ``loss`` would chain
        # every batch's autograd history onto ``running_loss``.
        running_loss += loss.detach()

    loss_mean = running_loss / len(loader)

    return loss_mean


def test(model, fobj, loader, DEVICE, mode):
    model.eval()

    with torch.no_grad():
        y_preds, y_trues, qa_ids = [], [], []

        running_loss = 0
        for (qa_id, input_ids, attention_mask,
             token_type_ids, position_ids, labels) in tqdm(loader):
            # send them to DEVICE
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            token_type_ids = token_type_ids.to(DEVICE)
            position_ids = position_ids.to(DEVICE)
            labels = labels.to(DEVICE)

            # forward
            outputs = model(
                input_ids=input_ids,
                labels=labels,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids
            )
            logits = outputs[0]
            if mode != 'test':
                loss = fobj(logits, labels.float())

                running_loss += loss

            y_preds.append(torch.sigmoid(logits))
            y_trues.append(labels)
            qa_ids.append(qa_id)

        loss_mean = running_loss / len(loader)

        y_preds = torch.cat(y_preds).to('cpu').numpy()
        y_trues = torch.cat(y_trues).to('cpu').numpy()
        qa_ids = torch.cat(qa_ids).to('cpu').numpy()

        if mode == 'valid':
            metric_raws = compute_spearmanr(y_trues, y_preds)
            metric = np.mean(metric_raws)
        elif mode != 'test':
            raise NotImplementedError
        else:
            metric_raws = None
            metric = None

    return loss_mean, metric, metric_raws, y_preds, y_trues, qa_ids


def seed_everything(seed=71):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` with the same value.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The generators are independent of each other, so seed them in one sweep.
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_everything()

## BERT

In [None]:
#%debug
# prediction_loop
import os
import gc
import pickle

from torch.utils.data import DataLoader
# Kept even though this cell no longer samples randomly: later cells use it.
from torch.utils.data.sampler import RandomSampler


FOLD_NUM = 5
RANK_NUM = 1

state_dict = {}

# Category markers registered as extra tokenizer tokens for every model.
_CATEGORY_TOKENS = [
    'CAT_TECHNOLOGY'.casefold(),
    'CAT_STACKOVERFLOW'.casefold(),
    'CAT_CULTURE'.casefold(),
    'CAT_SCIENCE'.casefold(),
    'CAT_LIFE_ARTS'.casefold(),
]


def _predict_folds(model_cls, num_labels, config_path, base_path, token_size,
                   tokenizer_type, do_lower_case, label_col,
                   q_max_len, a_max_len, fold_num, rank_num):
    """Predict ``test_df`` with every (fold, rank) checkpoint under ``base_path``.

    For each fold the predictions of its ``rank_num`` checkpoints are summed
    (aligned on ``qa_id``) and divided by ``rank_num``.

    Args:
        model_cls: Classifier class, called as
            ``model_cls(num_labels, config_path, state_dict, token_size=...)``.
        num_labels: Number of logits the checkpoints were trained with.
        config_path: Pickled model config handed to ``model_cls``.
        base_path: Experiment directory; holds ``state_dicts/`` and is also
            used as the tokenizer's ``pretrained_model_name_or_path``.
        token_size: Embedding size passed to ``model_cls``.
        tokenizer_type: Forwarded to ``QUESTDataset``.
        do_lower_case: Forwarded to ``QUESTDataset``.
        label_col: Label names (only their count matters in test mode).
        q_max_len, a_max_len: Question / answer token budgets.
        fold_num, rank_num: Number of folds and of checkpoints per fold.

    Returns:
        dict: ``fold -> DataFrame`` of averaged predictions indexed by
        ``qa_id``.
    """
    fold_predictions = {}
    for fold in range(fold_num):
        for rank in range(rank_num):
            OSprint(f'fold -- {fold}')
            state_dict_path = f'{base_path}/state_dicts/fold_{fold}_rank_{rank}_state_dict.pkl'
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # checkpoints from trusted datasets.
            with open(state_dict_path, 'rb') as fin:
                fold_state_dict = pickle.load(fin)

            model = model_cls(num_labels, config_path, fold_state_dict,
                              token_size=token_size)
            model.load_state_dict(fold_state_dict)
            model.to(DEVICE)
            # The weights now live in the model; free the loaded copy early.
            del fold_state_dict
            gc.collect()

            test_dataset = QUESTDataset(
                df=test_df,
                mode='test',
                tokens=_CATEGORY_TOKENS,
                augment=[],
                tokenizer_type=tokenizer_type,
                pretrained_model_name_or_path=base_path,
                do_lower_case=do_lower_case,
                LABEL_COL=label_col,
                t_max_len=30,
                q_max_len=q_max_len,
                a_max_len=a_max_len,
                tqa_mode='tq_a',
                TBSEP='[TBSEP]',
                pos_id_type='arange',
                MAX_SEQUENCE_LENGTH=512,
            )

            # Inference needs no shuffling: iterate in frame order so every
            # checkpoint yields its rows in the same, reproducible order.
            test_loader = DataLoader(
                test_dataset,
                batch_size=BATCH_SIZE,
                shuffle=False,
                num_workers=os.cpu_count() or 0,
                drop_last=False,
                pin_memory=True
            )

            _, _, _, y_preds, _, qa_ids = test(model, None, test_loader, DEVICE, 'test')

            fold_df = pd.DataFrame(y_preds)
            fold_df['qa_id'] = qa_ids
            fold_df = fold_df.set_index('qa_id')
            if fold in fold_predictions:
                fold_predictions[fold] += fold_df
            else:
                fold_predictions[fold] = fold_df

            del model, test_dataset, test_loader
            gc.collect()
        fold_predictions[fold] = fold_predictions[fold] / rank_num  # AVG
    return fold_predictions


#############################
#  BERT QUESTION
#############################
bert_question_fold_prediction_dict = _predict_folds(
    model_cls=BertModelForBinaryMultiLabelClassifier,
    num_labels=21,
    config_path=BERT_BASE_MODEL_PATH,
    base_path=BERT_Q_BASE_PATH,
    token_size=30528,
    tokenizer_type='bert',
    do_lower_case=True,
    label_col=Q_LABEL_COL,
    q_max_len=239 * 2,
    a_max_len=239 * 0,
    fold_num=FOLD_NUM,
    rank_num=RANK_NUM,
)


#############################
#  BERT ANSWER
#############################
FOLD_NUM = 5
RANK_NUM = 1

bert_answer_fold_prediction_dict = _predict_folds(
    model_cls=BertModelForBinaryMultiLabelClassifier,
    num_labels=9,
    config_path=BERT_BASE_MODEL_PATH,
    base_path=BERT_A_BASE_PATH,
    token_size=30528,
    tokenizer_type='bert',
    do_lower_case=True,
    label_col=A_LABEL_COL,
    q_max_len=239 * 0,
    a_max_len=239 * 2,
    fold_num=FOLD_NUM,
    rank_num=RANK_NUM,
)

# Join question and answer predictions per fold on qa_id.
bert_fold_prediction_dict = {}
for fold in bert_question_fold_prediction_dict:
    bert_fold_prediction_dict[fold] = bert_question_fold_prediction_dict[fold].reset_index().merge(
                                      bert_answer_fold_prediction_dict[fold].reset_index(), on='qa_id', how='left')

In [None]:
bert_fold_prediction_dict[0]

## ROBERTA

In [None]:
# %debug
FOLD_NUM = 5
RANK_NUM = 2

# Rebinding the global drops whatever an earlier cell left in `state_dict`
# (presumably the previous model's weights -- TODO confirm) before inference.
state_dict = {}


def predict_roberta_folds(base_path, num_labels, label_col, q_max_len, a_max_len,
                          fold_num, rank_num):
    """
    Run test-set inference with every (fold, rank) RoBERTa checkpoint found
    under ``base_path`` and average the ranks of each fold.

    :param base_path: directory containing
        ``state_dicts/fold_{fold}_rank_{rank}_state_dict.pkl``; it is also
        handed to QUESTDataset as ``pretrained_model_name_or_path``
    :param num_labels: first positional argument of the classifier
        (21 for the question model, 9 for the answer model; presumably the
        number of output labels -- TODO confirm)
    :param label_col: label column list handed to QUESTDataset
    :param q_max_len: ``q_max_len`` handed to QUESTDataset
    :param a_max_len: ``a_max_len`` handed to QUESTDataset
    :param fold_num: number of folds to iterate over
    :param rank_num: number of checkpoints ("ranks") per fold
    :return: dict mapping fold -> DataFrame indexed by ``qa_id`` that holds
        the mean prediction over the ranks of that fold
    """
    fold_prediction_dict = {}
    for fold in range(fold_num):
        rank_sum = None
        for rank in range(rank_num):
            OSprint(f'fold -- {fold}, rank -- {rank}')
            state_dict_path = f'{base_path}/state_dicts/fold_{fold}_rank_{rank}_state_dict.pkl'
            # NOTE: pickle.load can execute arbitrary code; only load trusted checkpoints.
            with open(state_dict_path, 'rb') as fin:
                weights = pickle.load(fin)

            model = RobertaModelForBinaryMultiLabelClassifier(
                num_labels, ROBERTA_BASE_MODEL_PATH, weights, token_size=50271)
            model.load_state_dict(weights)
            # load_state_dict copied the tensors into the model, so the loaded
            # dict is released before the model is moved to the device.
            del weights
            gc.collect()
            model.to(DEVICE)

            test_dataset = QUESTDataset(
                df=test_df,
                mode='test',
                tokens=[
                    'CAT_TECHNOLOGY'.casefold(),
                    'CAT_STACKOVERFLOW'.casefold(),
                    'CAT_CULTURE'.casefold(),
                    'CAT_SCIENCE'.casefold(),
                    'CAT_LIFE_ARTS'.casefold(),
                ],
                augment=[],
                tokenizer_type='roberta',
                pretrained_model_name_or_path=base_path,
                do_lower_case=False,
                LABEL_COL=label_col,
                t_max_len=30,
                q_max_len=q_max_len,
                a_max_len=a_max_len,
                tqa_mode='tq_a',
                TBSEP='[TBSEP]',
                pos_id_type='arange',
                MAX_SEQUENCE_LENGTH=512,
            )

            # Inference gains nothing from shuffling: predictions are keyed by
            # qa_id below, so the loader iterates in dataset order, which keeps
            # the batches reproducible between runs.
            test_loader = DataLoader(
                test_dataset,
                batch_size=BATCH_SIZE,
                shuffle=False,
                num_workers=os.cpu_count(),
                worker_init_fn=lambda x: np.random.seed(),
                drop_last=False,
                pin_memory=True,
            )

            _, _, _, y_preds, _, qa_ids = test(model, None, test_loader, DEVICE, 'test')

            fold_df = pd.DataFrame(y_preds)
            fold_df['qa_id'] = qa_ids
            fold_df = fold_df.set_index('qa_id')
            # DataFrame addition aligns on the qa_id index, so the ranks are
            # summed per question/answer pair.
            rank_sum = fold_df if rank_sum is None else rank_sum + fold_df

            del model, test_dataset, test_loader
            gc.collect()
        fold_prediction_dict[fold] = rank_sum / rank_num  # AVG
    return fold_prediction_dict


#############################
#  ROBERTA QUESTION
#############################
roberta_question_fold_prediction_dict = predict_roberta_folds(
    ROBERTA_Q_BASE_PATH, 21, Q_LABEL_COL,
    q_max_len=239 * 2, a_max_len=239 * 0,
    fold_num=FOLD_NUM, rank_num=RANK_NUM)

#############################
#  ROBERTA ANSWER
#############################
roberta_answer_fold_prediction_dict = predict_roberta_folds(
    ROBERTA_A_BASE_PATH, 9, A_LABEL_COL,
    q_max_len=239 * 0, a_max_len=239 * 2,
    fold_num=FOLD_NUM, rank_num=RANK_NUM)

# Join question and answer predictions of the same fold on qa_id.
roberta_fold_prediction_dict = {}
for fold in roberta_question_fold_prediction_dict:
    roberta_fold_prediction_dict[fold] = roberta_question_fold_prediction_dict[fold].reset_index().merge(
                                      roberta_answer_fold_prediction_dict[fold].reset_index(), on='qa_id', how='left')

In [None]:
# Inspect the merged question + answer RoBERTa predictions of fold 0.
roberta_fold_prediction_dict[0]

In [None]:
# Show the fold-0 BERT predictions again, presumably for comparison with RoBERTa.
bert_fold_prediction_dict[0]

## XLNet

In [None]:
# %debug
FOLD_NUM = 5
RANK_NUM = 1

# Rebinding the global drops whatever an earlier cell left in `state_dict`
# (presumably the previous model's weights -- TODO confirm) before inference.
state_dict = {}


def predict_xlnet_folds(base_path, num_labels, label_col, q_max_len, a_max_len,
                        fold_num, rank_num):
    """
    Run test-set inference with every (fold, rank) XLNet checkpoint found
    under ``base_path`` and average the ranks of each fold.

    :param base_path: directory containing
        ``state_dicts/fold_{fold}_rank_{rank}_state_dict.pkl``; it is also
        handed to QUESTDataset as ``pretrained_model_name_or_path``
    :param num_labels: first positional argument of the classifier
        (21 for the question model, 9 for the answer model; presumably the
        number of output labels -- TODO confirm)
    :param label_col: label column list handed to QUESTDataset
    :param q_max_len: ``q_max_len`` handed to QUESTDataset
    :param a_max_len: ``a_max_len`` handed to QUESTDataset
    :param fold_num: number of folds to iterate over
    :param rank_num: number of checkpoints ("ranks") per fold
    :return: dict mapping fold -> DataFrame indexed by ``qa_id`` that holds
        the mean prediction over the ranks of that fold
    """
    fold_prediction_dict = {}
    for fold in range(fold_num):
        rank_sum = None
        for rank in range(rank_num):
            OSprint(f'fold -- {fold}, rank -- {rank}')
            state_dict_path = f'{base_path}/state_dicts/fold_{fold}_rank_{rank}_state_dict.pkl'
            # NOTE: pickle.load can execute arbitrary code; only load trusted checkpoints.
            with open(state_dict_path, 'rb') as fin:
                weights = pickle.load(fin)

            model = XLNetModelForBinaryMultiLabelClassifier(
                num_labels, XLNET_BASE_MODEL_PATH, weights, token_size=32006)
            model.load_state_dict(weights)
            # load_state_dict copied the tensors into the model, so the loaded
            # dict is released before the model is moved to the device.
            del weights
            gc.collect()
            model.to(DEVICE)

            test_dataset = QUESTDataset(
                df=test_df,
                mode='test',
                tokens=[
                    'CAT_TECHNOLOGY'.casefold(),
                    'CAT_STACKOVERFLOW'.casefold(),
                    'CAT_CULTURE'.casefold(),
                    'CAT_SCIENCE'.casefold(),
                    'CAT_LIFE_ARTS'.casefold(),
                ],
                augment=[],
                tokenizer_type='xlnet',
                pretrained_model_name_or_path=base_path,
                do_lower_case=False,
                LABEL_COL=label_col,
                t_max_len=30,
                q_max_len=q_max_len,
                a_max_len=a_max_len,
                tqa_mode='tq_a',
                TBSEP='[TBSEP]',
                pos_id_type='arange',
                MAX_SEQUENCE_LENGTH=512,
            )

            # Inference gains nothing from shuffling: predictions are keyed by
            # qa_id below, so the loader iterates in dataset order, which keeps
            # the batches reproducible between runs.
            test_loader = DataLoader(
                test_dataset,
                batch_size=BATCH_SIZE,
                shuffle=False,
                num_workers=os.cpu_count(),
                worker_init_fn=lambda x: np.random.seed(),
                drop_last=False,
                pin_memory=True,
            )

            _, _, _, y_preds, _, qa_ids = test(model, None, test_loader, DEVICE, 'test')

            fold_df = pd.DataFrame(y_preds)
            fold_df['qa_id'] = qa_ids
            fold_df = fold_df.set_index('qa_id')
            # DataFrame addition aligns on the qa_id index, so the ranks are
            # summed per question/answer pair.
            rank_sum = fold_df if rank_sum is None else rank_sum + fold_df

            del model, test_dataset, test_loader
            gc.collect()
        fold_prediction_dict[fold] = rank_sum / rank_num  # AVG
    return fold_prediction_dict


#############################
#  XLNET QUESTION
#############################
xlnet_question_fold_prediction_dict = predict_xlnet_folds(
    XLNET_Q_BASE_PATH, 21, Q_LABEL_COL,
    q_max_len=239 * 2, a_max_len=239 * 0,
    fold_num=FOLD_NUM, rank_num=RANK_NUM)

#############################
#  XLNET ANSWER
#############################
xlnet_answer_fold_prediction_dict = predict_xlnet_folds(
    XLNET_A_BASE_PATH, 9, A_LABEL_COL,
    q_max_len=239 * 0, a_max_len=239 * 2,
    fold_num=FOLD_NUM, rank_num=RANK_NUM)

# Join question and answer predictions of the same fold on qa_id.
xlnet_fold_prediction_dict = {}
for fold in xlnet_question_fold_prediction_dict:
    xlnet_fold_prediction_dict[fold] = xlnet_question_fold_prediction_dict[fold].reset_index().merge(
                                      xlnet_answer_fold_prediction_dict[fold].reset_index(), on='qa_id', how='left')

In [None]:
# Fold-0 BERT predictions, shown next to the other two models for comparison.
bert_fold_prediction_dict[0]

In [None]:
# Fold-0 RoBERTa predictions, shown next to the other two models for comparison.
roberta_fold_prediction_dict[0]

In [None]:
# Fold-0 XLNet predictions (merged question + answer frame).
xlnet_fold_prediction_dict[0]

#### merge all fold prediction dicts

In [None]:
# Average the predictions of the three model families fold by fold.
fold_prediction_dict = {}

dicts = [bert_fold_prediction_dict, roberta_fold_prediction_dict, xlnet_fold_prediction_dict]

for _fold_prediction_dict in dicts:
    for fold in _fold_prediction_dict:
        # Use qa_id as the index: the addition then aligns rows by id instead
        # of by position, and the id itself is kept out of the arithmetic.
        fold_df = _fold_prediction_dict[fold].set_index('qa_id').sort_index()
        if fold in fold_prediction_dict:
            fold_prediction_dict[fold] = fold_prediction_dict[fold] + fold_df
        else:
            fold_prediction_dict[fold] = fold_df

for fold in fold_prediction_dict:
    # reset_index restores qa_id as the first column, which the later cells
    # rely on (sort_values('qa_id') / set_index('qa_id')).
    fold_prediction_dict[fold] = (fold_prediction_dict[fold] / len(dicts)).reset_index()

In [None]:
# Inspect the model-averaged predictions of fold 0.
fold_prediction_dict[0]

#### ensemble

In [None]:
def convert_folds_prediction_to_sub(folds_prediction_dict):
    """
    Average the per-fold prediction frames into one prediction matrix.

    :param folds_prediction_dict: mapping fold -> DataFrame with a ``qa_id``
        column plus one column per predicted label; every frame must cover
        the same qa_ids
    :return: ndarray of shape (n_rows, n_labels) holding the mean over the
        folds, rows ordered by ascending ``qa_id``
    :raises ValueError: if ``folds_prediction_dict`` is empty
    """
    if len(folds_prediction_dict) == 0:
        raise ValueError('folds_prediction_dict must contain at least one fold')

    total = None
    for fold in folds_prediction_dict:
        fold_values = folds_prediction_dict[fold].sort_values('qa_id').set_index('qa_id').values
        # Out-of-place arithmetic: `.values` may be an integer array or a
        # read-only view of the frame, both of which break `+=` / `/=`.
        total = fold_values if total is None else total + fold_values
    return total / len(folds_prediction_dict)

In [None]:
# Collapse the per-fold frames into a single matrix whose rows follow ascending qa_id.
prediction = convert_folds_prediction_to_sub(fold_prediction_dict)

In [None]:
# Keep an untouched copy of the averaged predictions; later cells read it
# after `prediction` has been replaced by the rounded values.
raw_prediction = prediction.copy()

## opt: round predictions with pre-fitted thresholds

In [None]:
import pickle
from functools import partial
from glob import glob

import numpy as np
import pandas as pd
import scipy as sp
import torch
from scipy.stats import spearmanr
from tqdm import tqdm

class OptimizedRounder(object):
    """
    Threshold optimizer that turns continuous predictions into discrete values.

    Predictions are bucketed with ``pd.cut`` at the sorted thresholds and every
    bucket is replaced by the matching entry of ``labels``. ``fit`` searches
    (Nelder-Mead) for the thresholds that maximize the Spearman rank
    correlation with the ground truth.

    Adapted from a Quadratic Weighted Kappa (QWK) rounder:
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """

    def __init__(self, labels=None):
        """
        :param labels: optional bucket values; ``pd.cut`` needs exactly one
            more label than there are thresholds. Can also be supplied later
            through ``set_labels``.
        """
        # Replaced by the scipy optimization result once ``fit`` has run.
        self.coef_ = 0
        self.labels = labels

    def _require_labels(self):
        """
        Return the configured labels.

        :raises AttributeError: if no labels have been set
        """
        # getattr also covers unpickled instances, which bypass __init__.
        labels = getattr(self, 'labels', None)
        if labels is None:
            raise AttributeError(
                'labels are not set; call set_labels() before fit()/predict()')
        return labels

    def _cut(self, X, coef, labels):
        """Bucket ``X`` at the sorted thresholds ``coef`` and map the buckets to ``labels``."""
        bins = [-np.inf] + list(np.sort(coef)) + [np.inf]
        return pd.cut(X, bins, labels=labels)

    def _spearmanr_loss(self, coef, X, y, labels):
        """
        Loss for the optimizer: negative Spearman correlation between the
        ground truth and the predictions rounded with ``coef``.

        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        :param labels: The values assigned to the buckets
        """
        X_p = self._cut(X, coef, labels)

        # return -np.mean(spearmanr(y, X_p).correlation)
        return -spearmanr(y, X_p).correlation

    def fit(self, X, y, initial_coef):
        """
        Optimize rounding thresholds and store the optimizer result in ``coef_``.

        :param X: The raw predictions
        :param y: The ground truth labels
        :param initial_coef: Starting thresholds for the Nelder-Mead search
        :raises AttributeError: if no labels have been set
        """
        # Import the submodule explicitly instead of relying on another
        # import having loaded ``scipy.optimize`` as a side effect.
        from scipy.optimize import minimize

        labels = self._require_labels()
        loss_partial = partial(self._spearmanr_loss, X=X, y=y, labels=labels)
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds

        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        :return: the ``pd.cut`` result with one label per element of ``X``
        :raises AttributeError: if no labels have been set
        """
        return self._cut(X, coef, self._require_labels())

    def coefficients(self):
        """
        Return the optimized coefficients (the ``'x'`` entry of the result
        stored by ``fit``); only meaningful after ``fit`` has run.
        """
        return self.coef_['x']

    def set_labels(self, labels):
        """
        Set the values assigned to the buckets.

        :param labels: bucket values, one more than the number of thresholds
        """
        self.labels = labels

In [None]:
# Disabled alternative: thresholds taken from separate question / answer snapshots.
#with open('/kaggle/input/e059-question-snapshot/optRs.pkl', 'rb') as fin:
#    optRs = pickle.load(fin)
#with open('/kaggle/input/e060-answer-snapshot/optRs.pkl', 'rb') as fin:
#    optRs += pickle.load(fin)   

# Load the pre-fitted rounders; later cells index them by label position (optRs[i]).
# NOTE(review): unpickling presumably needs the OptimizedRounder class to be
# defined beforehand -- confirm. pickle.load must only be used on trusted files.
with open('/kaggle/input/sub-e121-e129-bert1-roberta2-xlnet1/optRs.pkl', 'rb') as fin:
    optRs = pickle.load(fin)    

In [None]:
# Label positions whose predictions are snapped to discrete values with the
# pre-fitted thresholds in optRs; all other labels keep their continuous values.
ROUNDED_LABEL_INDICES = [2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 19, 23, 25]

res_prediction = []
for i in tqdm(list(range(raw_prediction.shape[1]))):
    # Always start from the untouched raw predictions so that re-running this
    # cell does not round values that were already rounded.
    y_pred = raw_prediction[:, i]
    if i not in ROUNDED_LABEL_INDICES:
        res_prediction.append(y_pred)
        continue

    optR = optRs[i]
    res = optR.predict(y_pred, optR.coefficients()).astype(float)

    res_prediction.append(res)

# One row per label was collected above; transpose back to (n_rows, n_labels).
prediction = np.asarray(res_prediction).T

In [None]:
# Unrounded predictions of the first label column, for a sanity check.
raw_prediction[:, 0]

In [None]:
# Thresholds stored in the first rounder.
optRs[0].coefficients()

In [None]:
from matplotlib import pyplot as plt

# Draw one histogram per label column of the (partly rounded) predictions.
for i in range(30):
    label_values = prediction[:, i]
    fig, ax = plt.subplots()
    ax.hist(label_values)
    ax.set_title(f'{i}th label')
    plt.show()

In [None]:
# Start from the sample submission so the output has the expected columns and rows.
sub_df = pd.read_csv('/kaggle/input/google-quest-challenge/sample_submission.csv')
sub_df.head()

In [None]:
# Overwrite every column after the first one with the predictions.
# NOTE(review): `prediction` rows are ordered by ascending qa_id; this assumes the
# sample submission rows are in the same order -- TODO confirm.
sub_df.iloc[:, 1:] = prediction
sub_df.head()

In [None]:
# Post-process using the correlation between labels: rows whose
# question_fact_seeking value lies below the smallest fact-seeking threshold
# get question_opinion_seeking forced to 1.
#sub_df.loc[(sub_df['question_opinion_seeking'] < np.min(optRs[10].coefficients())).values, 'question_fact_seeking'] = 1.
lowest_fact_seeking_threshold = np.min(optRs[4].coefficients())
below_fact_seeking_threshold = (sub_df['question_fact_seeking'] < lowest_fact_seeking_threshold).values
sub_df.loc[below_fact_seeking_threshold, 'question_opinion_seeking'] = 1.

## avoid scoring error

In [None]:
pub_qa_id = [39, 46, 70, 132, 200, 245, 257, 267, 284, 292, 296, 312, 322, 327, 334, 340, 357, 374, 375, 387, 391, 395, 444, 482, 483, 513, 542, 579, 589, 625, 641, 683, 725, 727, 728, 740, 748, 765, 811, 830, 851, 856, 885, 905, 929, 938, 939, 962, 1082, 1091, 1101, 1119, 1153, 1226, 1230, 1238, 1247, 1249, 1266, 1282, 1297, 1331, 1359, 1398, 1423, 1477, 1502, 1544, 1567, 1654, 1676, 1700, 1701, 1727, 1764, 1794, 1795, 1807, 1812, 1816, 1833, 1847, 1868, 1877, 1885, 1934, 1959, 1983, 1990, 2005, 2018, 2027, 2042, 2066, 2070, 2075, 2094, 2128, 2163, 2180, 2203, 2230, 2244, 2257, 2277, 2278, 2286, 2303, 2335, 2374, 2387, 2395, 2455, 2465, 2474, 2487, 2493, 2534, 2569, 2573, 2580, 2592, 2607, 2621, 2655, 2666, 2669, 2670, 2676, 2691, 2748, 2763, 2774, 2789, 2793, 2795, 2797, 2806, 2844, 2868, 2895, 2912, 2922, 2931, 2968, 3034, 3062, 3087, 3107, 3173, 3207, 3229, 3290, 3336, 3378, 3399, 3437, 3461, 3463, 3502, 3504, 3524, 3526, 3527, 3532, 3543, 3544, 3560, 3592, 3671, 3682, 3696, 3720, 3787, 3854, 3871, 3876, 3881, 3901, 3941, 3943, 3949, 3961, 3962, 3967, 4006, 4039, 4057, 4070, 4099, 4139, 4157, 4160, 4176, 4182, 4183, 4211, 4213, 4250, 4263, 4269, 4281, 4284, 4285, 4286, 4346, 4387, 4388, 4401, 4416, 4441, 4497, 4546, 4547, 4564, 4569, 4575, 4598, 4607, 4663, 4679, 4743, 4751, 4778, 4792, 4835, 4881, 4901, 4954, 4973, 5001, 5003, 5019, 5035, 5050, 5057, 5076, 5091, 5095, 5141, 5152, 5253, 5367, 5373, 5403, 5417, 5422, 5499, 5503, 5530, 5541, 5542, 5660, 5663, 5697, 5753, 5783, 5790, 5835, 5847, 5862, 5878, 5891, 5904, 5907, 5914, 5936, 5947, 5958, 5972, 5979, 5998, 6016, 6042, 6058, 6079, 6087, 6111, 6126, 6132, 6159, 6204, 6212, 6226, 6258, 6271, 6280, 6285, 6301, 6319, 6325, 6331, 6332, 6336, 6346, 6378, 6379, 6420, 6445, 6481, 6494, 6495, 6502, 6560, 6580, 6583, 6621, 6643, 6646, 6688, 6694, 6715, 6723, 6737, 6744, 6745, 6766, 6770, 6774, 6821, 6832, 6838, 6856, 6888, 6889, 6955, 6957, 6964, 6988, 6994, 7012, 7018, 7036, 7064, 7072, 7114, 7116, 7123, 7150, 
7165, 7176, 7194, 7201, 7216, 7247, 7254, 7272, 7278, 7281, 7293, 7302, 7326, 7404, 7410, 7438, 7477, 7481, 7485, 7519, 7520, 7525, 7531, 7544, 7546, 7589, 7595, 7614, 7640, 7654, 7672, 7711, 7727, 7739, 7758, 7766, 7815, 7838, 7852, 7869, 7878, 7899, 7935, 7939, 7970, 8021, 8032, 8045, 8070, 8089, 8115, 8143, 8146, 8191, 8197, 8206, 8212, 8242, 8245, 8250, 8258, 8271, 8273, 8339, 8350, 8355, 8376, 8395, 8412, 8427, 8437, 8464, 8496, 8516, 8517, 8551, 8564, 8591, 8621, 8626, 8629, 8672, 8684, 8685, 8690, 8738, 8755, 8756, 8760, 8771, 8773, 8778, 8823, 8834, 8842, 8846, 8875, 8916, 8921, 8932, 8934, 8938, 8973, 8987, 9001, 9006, 9018, 9033, 9065, 9140, 9141, 9174, 9213, 9225, 9228, 9237, 9240, 9256, 9259, 9263, 9298, 9324, 9350, 9391, 9400, 9439, 9454, 9476, 9478, 9497, 9545, 9567, 9569, 9590, 9597, 9623, 9640]
# NOTE: an older, disabled variant of pub_qa_id used to be kept here; it started
# at 70 (i.e. it lacked at least the ids 39 and 46). Only its first physical
# line was commented out, which left a dangling list tail behind.

In [None]:
# Label column names: every column of the submission frame except the first.
sub_df_cols = sub_df.columns[1:]
sub_df_cols

In [None]:
# The public/private split of the rows does not change inside the loop.
is_public = sub_df.qa_id.isin(pub_qa_id).values


def _restore_raw_if_constant(row_mask, column, raw_column):
    """
    Replace the values of ``column`` by the raw predictions when they are
    identical for all rows selected by ``row_mask``.

    Presumably a constant column makes the rank-correlation score undefined,
    which is the "scoring error" this section avoids -- TODO confirm.

    :param row_mask: boolean array selecting rows of ``sub_df``
    :param column: name of the label column in ``sub_df``
    :param raw_column: raw predictions of that label for all rows
    """
    # An empty subset (e.g. no public rows in the test set) has nothing to fix.
    if not row_mask.any():
        return
    if len(np.unique(sub_df.loc[row_mask, column].values)) == 1:
        sub_df.loc[row_mask, column] = raw_column[row_mask]


# NOTE(review): indexing raw_prediction with masks built from sub_df assumes both
# share the same row order (ascending qa_id) -- TODO confirm.
for i in range(30):
    sub_df_col = sub_df_cols[i]
    # The two subsets are checked separately.
    _restore_raw_if_constant(is_public, sub_df_col, raw_prediction[:, i])
    _restore_raw_if_constant(~is_public, sub_df_col, raw_prediction[:, i])

In [None]:
# Flag the test rows whose URL points at english.stackexchange.com.
n = [('english.stackexchange.com' in url) for url in test_df['url']]
# n = (test_df['host'].isin(['english.stackexchange.com', 'ell.stackexchange.com']).values & (test_df['category'] == 'CULTURE').values).tolist()
# Flagged rows get 0.5, every other row 0.
spelling = [0.5 if is_english else 0. for is_english in n]

In [None]:
# Replace the model output for question_type_spelling by the URL-based rule.
# NOTE(review): `spelling` follows the row order of test_df; this assumes sub_df
# has the same row order -- TODO confirm.
sub_df['question_type_spelling'] = spelling

In [None]:
# Draw one histogram per label column of the final submission frame
# (column 0 is skipped, hence the +1 offset).
for i in range(30):
    column_position = i + 1
    fig, ax = plt.subplots()
    ax.hist(sub_df.iloc[:, column_position])
    ax.set_title(f'{i}th : {sub_df.columns[i+1]}')
    plt.show()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
sub_df.to_csv("submission.csv", index=False)