## Prediction
### Parameters

In [None]:
# Batch size used by every test DataLoader created in this notebook.
BS = 8
# Random seed constant; no cell shown in this notebook reads it.
SEED = 42

### Library Install & Import

In [None]:
# Install sacremoses and transformers from the source trees under ../input.
# stdout is redirected to /dev/null to keep the cell output short.
!pip install ../input/sacremoses/sacremoses-master/ > /dev/null
!pip install ../input/transformers/transformers-master/ > /dev/null

In [None]:
import numpy as np
import pandas as pd
import pickle

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

from scipy import stats
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

import torch
import torch.utils.data
from transformers import *
import torch.nn as nn
from pytorch_transformers.modeling_bert import BertPreTrainedModel, BertModel

from matplotlib import pyplot as plt
import re
import math
from math import floor, ceil

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing at the first .to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the competition tables; missing values are replaced by a single space.
train = pd.read_csv('/kaggle/input/google-quest-challenge/train.csv').fillna(' ')
test = pd.read_csv('/kaggle/input/google-quest-challenge/test.csv').fillna(' ')
# `sub` is the sample submission: its columns after the first one are used
# below as the list of target columns, and it is finally written out as the submission.
sub = pd.read_csv('../input/google-quest-challenge/sample_submission.csv').fillna(' ')

In [None]:
def _get_segments_xlnet(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "<sep>":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return [0] * (max_seq_length - len(tokens)) + segments

def _get_segments_bert(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids_xlnet(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids =  [5] * (max_seq_length-len(token_ids)) + token_ids
    return input_ids

def _get_ids_bert(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def _trim_input(title, question, answer, max_sequence_length=512-1,
                t_max_len=70-1, q_max_len=219, a_max_len=219):
    """Tokenize title, question and answer and trim them to a shared budget.

    Uses the module-level ``tokenizer``.  Four extra positions are counted
    on top of the three token lists.  If everything fits into
    ``max_sequence_length`` the token lists are returned untouched.
    Otherwise budget a shorter part does not use is handed to the others,
    the title is cut from the right, and question and answer are shortened
    with ``norm_token_length``.

    Returns:
        Tuple ``(title_tokens, question_tokens, answer_tokens)``.

    Raises:
        ValueError: if the redistributed lengths plus 4 do not add up to
            exactly ``max_sequence_length``.
    """
    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)

    # Fast path: nothing needs trimming.
    if len(t) + len(q) + len(a) + 4 <= max_sequence_length:
        return t, q, a

    # Title: keep it whole when it is under budget and split the spare
    # tokens between answer (rounded down) and question (rounded up).
    if len(t) < t_max_len:
        t_new_len = len(t)
        spare = t_max_len - len(t)
        a_max_len = a_max_len + floor(spare / 2)
        q_max_len = q_max_len + ceil(spare / 2)
    else:
        t_new_len = t_max_len

    # Answer / question: whichever is under budget donates its slack to the
    # other one; the answer is checked first.
    if len(a) < a_max_len:
        a_new_len, q_new_len = len(a), q_max_len + (a_max_len - len(a))
    elif len(q) < q_max_len:
        a_new_len, q_new_len = a_max_len + (q_max_len - len(q)), len(q)
    else:
        a_new_len, q_new_len = a_max_len, q_max_len

    total_len = t_new_len + a_new_len + q_new_len + 4
    if total_len != max_sequence_length:
        raise ValueError("New sequence length should be %d, but is %d"
                         % (max_sequence_length, total_len))

    trimmed_q = norm_token_length(q, q_new_len)
    trimmed_a = norm_token_length(a, a_new_len)
    return t[:t_new_len], trimmed_q, trimmed_a

def norm_token_length(tokens, l):
    """Shorten ``tokens`` to at most ``l`` items, keeping both ends.

    When the list is longer than ``l``, the first ``l // 2`` tokens and the
    last ``l - l // 2`` tokens are kept and the middle is dropped.  Shorter
    lists are returned as a copy.

    Args:
        tokens: list of tokens.
        l: maximum number of tokens to keep; values <= 0 yield an empty list.

    Returns:
        A new list with at most ``l`` tokens.
    """
    if l <= 0:
        # Without this guard tail would be 0 and tokens[-0:] is the WHOLE
        # list, so an empty budget would return everything.
        return []
    if len(tokens) <= l:
        return tokens[:l]
    head = l // 2
    tail = l - head  # always >= 1 here because l >= 1
    return tokens[:head] + tokens[-tail:]

def _convert_to_bert_inputs(title, question, answer, cate, pretrained_weights, max_sequence_length=512):
    """Assemble token ids and segment ids for one example.

    Uses the module-level ``tokenizer``.

    Args:
        title, question, answer: lists of tokens (already trimmed).
        cate: category token inserted before the title.
        pretrained_weights: model name; any name containing "bert-base"
            selects the BERT layout, "xlnet-base-cased" the XLNet layout.
        max_sequence_length: length both output lists are padded to.

    Returns:
        ``[input_ids, input_segments]``, two lists of ints.

    Raises:
        ValueError: if ``pretrained_weights`` names an unsupported model.
        IndexError: if the assembled sequence exceeds ``max_sequence_length``.
    """
    if "bert-base" in pretrained_weights:
        stoken = ["[CLS]"] + [cate] + title + ["[SEP]"] + question + ["[SEP]"] + answer + ["[SEP]"]
        input_ids = _get_ids_bert(stoken, tokenizer, max_sequence_length)
        input_segments = _get_segments_bert(stoken, max_sequence_length)
    elif pretrained_weights == "xlnet-base-cased":
        stoken = [cate] + title + ["<sep>"] + question + ["<sep>"] + answer + ["<sep>", "<cls>"]
        input_ids = _get_ids_xlnet(stoken, tokenizer, max_sequence_length)
        input_segments = _get_segments_xlnet(stoken, max_sequence_length)
        # "<cls>" is the final token and padding is on the left, so it sits
        # at position -1; it gets its own segment id 2.  The previous code
        # searched input_segments for the value 5, which can never occur
        # there (segments are only 0/1), and therefore always used -1 too.
        input_segments[-1] = 2
    else:
        # Fail with a clear message instead of an unbound-local error below.
        raise ValueError(f"Unsupported pretrained_weights: {pretrained_weights!r}")

    return [input_ids, input_segments]

def convert_row(row, pretrained_weights):
    """Turn one dataframe row into model inputs.

    Args:
        row: mapping with "category", "question_title", "question_body"
            and "answer" entries.
        pretrained_weights: "bert-base-uncased", "bert-base-cased" or
            "xlnet-base-cased".

    Returns:
        ``np.array([[ids, segments]])`` as built by ``_convert_to_bert_inputs``.

    Raises:
        ValueError: if ``pretrained_weights`` is not one of the supported names.
    """
    category = row["category"]
    if pretrained_weights == "bert-base-uncased":
        # The uncased model uses a lower-cased category token.
        c = f"[{category.lower()}]"
    elif pretrained_weights in ("bert-base-cased", "xlnet-base-cased"):
        c = f"[{category}]"
    else:
        # Previously `c` was simply left unbound for unknown names.
        raise ValueError(f"Unsupported pretrained_weights: {pretrained_weights!r}")

    t, q, a = _trim_input(row["question_title"], row["question_body"], row["answer"])
    ids, segments = _convert_to_bert_inputs(t, q, a, c, pretrained_weights)
    return np.array([[ids, segments]])

## BERT

In [None]:
class CustomBert(BertPreTrainedModel):
    """BERT encoder with a 30-output head on concatenated first-token vectors.

    The head takes position 0 of the last ``n_use_layer`` entries of
    ``outputs[2]`` (presumably the per-layer hidden states switched on via
    ``config.output_hidden_states``), concatenates them, and passes them
    through two linear layers, dropout and a final linear classifier.
    """

    def __init__(self, config):
        super(CustomBert, self).__init__(config)

        config.num_labels = 30
        config.output_hidden_states = True
        self.n_use_layer = 4

        # Width of the concatenated feature vector (768 per layer used).
        concat_dim = 768 * self.n_use_layer

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dense1 = nn.Linear(concat_dim, concat_dim)
        self.dense2 = nn.Linear(concat_dim, concat_dim)
        self.classifier = nn.Linear(concat_dim, config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Return ``(logits,) + outputs[2:]`` of the encoder.

        ``inputs_embeds`` and ``labels`` are accepted but not used.
        """
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)

        layer_outputs = outputs[2]
        first_token_vectors = []
        for depth in range(1, self.n_use_layer + 1):
            # depth=1 is the last entry, depth=2 the one before it, ...
            first_token_vectors.append(layer_outputs[-depth][:, 0])
        features = torch.cat(first_token_vectors, dim=1)

        features = self.dense1(features)
        features = self.dense2(features)
        features = self.dropout(features)
        logits = self.classifier(features)

        return (logits,) + outputs[2:]

### uncased

In [None]:
# Load the tokenizer for the uncased model from its local directory.
# `tokenizer` is read as a global by _trim_input and _convert_to_bert_inputs.
tokenizer = BertTokenizer.from_pretrained('../input/quest-bert-swa/')
# Display the added-token mapping (presumably the category tokens) as the cell output.
tokenizer.added_tokens_encoder

In [None]:
pretrained_weights = 'bert-base-uncased'

# Encode every test row; each row yields one (ids, segments) pair which is
# flattened into a single 1024-wide row (ids followed by segments).
input_columns = ["question_title", "question_body", "answer", "category"]
encoded_rows = test[input_columns].progress_apply(
    lambda row: convert_row(row, pretrained_weights), axis=1
).values
X_test = np.vstack(encoded_rows).reshape((len(encoded_rows), 1024))

test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=BS)

In [None]:
model = CustomBert.from_pretrained("../input/quest-bert-swa/")

# Resize BEFORE freezing: when the vocabulary size changes,
# resize_token_embeddings builds a new embedding matrix, and a freeze done
# earlier would not cover it (leaving it with requires_grad=True).
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
for param in model.parameters():
    param.requires_grad=False
model.eval()

In [None]:
model_dir_target = "../input/quest-bertuncased-10fold/"

# Collect one (n_test, n_targets) logit matrix per fold checkpoint.
bert_pred_lst = []
for fold in range(10):
    bert_path = f"{model_dir_target}/bert-base-uncased_f{fold}_best"
    # map_location makes the load independent of the device the checkpoint was saved from.
    model.load_state_dict(torch.load(bert_path, map_location=device))

    lst = []
    # Inference only: no_grad guarantees that no autograd graph is kept.
    with torch.no_grad():
        for (x_batch,) in test_loader:
            # Each row is 512 token ids followed by 512 segment ids.
            input_ids = x_batch[:, :512]
            token_ids = x_batch[:, 512:]
            pred = model(input_ids.to(device), attention_mask=(input_ids > 0).to(device), token_type_ids=token_ids.to(device))
            # Keep the batch dimension (no squeeze) so np.vstack always sees 2-D arrays.
            lst.append(pred[0].detach().cpu().numpy())
    test_pred = np.vstack(lst)
    bert_pred_lst.append(test_pred)

## BERT cased

In [None]:
# Replace the global tokenizer with the one for the cased model.
# `tokenizer` is read as a global by _trim_input and _convert_to_bert_inputs.
tokenizer = BertTokenizer.from_pretrained('../input/quest-bertcased-tokenizer/')
# Display the added-token mapping (presumably the category tokens) as the cell output.
tokenizer.added_tokens_encoder

In [None]:
pretrained_weights = 'bert-base-cased'

# Encode every test row; each row yields one (ids, segments) pair which is
# flattened into a single 1024-wide row (ids followed by segments).
input_columns = ["question_title", "question_body", "answer", "category"]
encoded_rows = test[input_columns].progress_apply(
    lambda row: convert_row(row, pretrained_weights), axis=1
).values
X_test = np.vstack(encoded_rows).reshape((len(encoded_rows), 1024))

test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=BS)

In [None]:
# NOTE(review): this initialises from the same directory as the uncased
# model; the per-fold cased checkpoints loaded afterwards overwrite the
# weights. Confirm this is intended.
model = CustomBert.from_pretrained("../input/quest-bert-swa/")

# Resize BEFORE freezing: when the vocabulary size changes,
# resize_token_embeddings builds a new embedding matrix, and a freeze done
# earlier would not cover it (leaving it with requires_grad=True).
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
for param in model.parameters():
    param.requires_grad=False
model.eval()

In [None]:
model_dir_target = "../input/quest-bertcased-10fold/"

# Collect one (n_test, n_targets) logit matrix per fold checkpoint.
cased_pred_lst = []
for fold in range(10):
    bert_path = f"{model_dir_target}/bert-base-cased_f{fold}_best"
    # map_location makes the load independent of the device the checkpoint was saved from.
    model.load_state_dict(torch.load(bert_path, map_location=device))

    lst = []
    # Inference only: no_grad guarantees that no autograd graph is kept.
    with torch.no_grad():
        for (x_batch,) in test_loader:
            # Each row is 512 token ids followed by 512 segment ids.
            input_ids = x_batch[:, :512]
            token_ids = x_batch[:, 512:]
            pred = model(input_ids.to(device), attention_mask=(input_ids > 0).to(device), token_type_ids=token_ids.to(device))
            # Keep the batch dimension (no squeeze) so np.vstack always sees 2-D arrays.
            lst.append(pred[0].detach().cpu().numpy())
    test_pred = np.vstack(lst)
    cased_pred_lst.append(test_pred)

## XLNet

In [None]:
class CustomXLNet(XLNetForSequenceClassification):
    """XLNet encoder with a 30-output head on concatenated last-token vectors.

    The head takes the final position of the last ``n_use_layer`` entries of
    ``transformer_outputs[1]`` (presumably the per-layer hidden states
    switched on via ``config.output_hidden_states``), concatenates them and
    passes them through two linear layers and a final projection.
    """

    def __init__(self, config):
        super(CustomXLNet, self).__init__(config)
        config.num_labels = 30
        config.output_hidden_states = True
        self.n_use_layer = 4
        self.n_labels = config.num_labels
        self.transformer = XLNetModel(config)

        # Width of the concatenated feature vector (768 per layer used).
        concat_dim = 768 * self.n_use_layer

        self.dense1 = nn.Linear(concat_dim, concat_dim)
        self.dense2 = nn.Linear(concat_dim, concat_dim)
        self.logits_proj = nn.Linear(concat_dim, config.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        mems=None,
        perm_mask=None,
        target_mapping=None,
        token_type_ids=None,
        input_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Return ``(logits,) + transformer_outputs[1:]``; ``labels`` is unused."""
        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            mems=mems,
            perm_mask=perm_mask,
            target_mapping=target_mapping,
            token_type_ids=token_type_ids,
            input_mask=input_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        layer_outputs = transformer_outputs[1]
        last_token_vectors = []
        for depth in range(1, self.n_use_layer + 1):
            # depth=1 is the last entry, depth=2 the one before it, ...
            last_token_vectors.append(layer_outputs[-depth][:, -1])
        features = torch.cat(last_token_vectors, dim=1)

        features = self.dense1(features)
        features = self.dense2(features)
        logits = self.logits_proj(features)

        return (logits,) + transformer_outputs[1:]

In [None]:
# Replace the global tokenizer with the one for XLNet.
# `tokenizer` is read as a global by _trim_input and _convert_to_bert_inputs.
tokenizer = XLNetTokenizer.from_pretrained('../input/quest-xlnet-tokenizer/')
# Display the added-token mapping (presumably the category tokens) as the cell output.
tokenizer.added_tokens_encoder

In [None]:
pretrained_weights = "xlnet-base-cased"

# Encode every test row; each row yields one (ids, segments) pair which is
# flattened into a single 1024-wide row (ids followed by segments).
input_columns = ["question_title", "question_body", "answer", "category"]
encoded_rows = test[input_columns].progress_apply(
    lambda row: convert_row(row, pretrained_weights), axis=1
).values
X_test = np.vstack(encoded_rows).reshape((len(encoded_rows), 1024))

test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=BS)

In [None]:
model = CustomXLNet.from_pretrained("../input/quest-xlnet-tokenizer/")

# Resize BEFORE freezing: when the vocabulary size changes,
# resize_token_embeddings builds a new embedding matrix, and a freeze done
# earlier would not cover it (leaving it with requires_grad=True).
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
for param in model.parameters():
    param.requires_grad=False
model.eval()

In [None]:
model_dir_target = "../input/quest-xlnetcased-10fold/"

# Collect one (n_test, n_targets) logit matrix per fold checkpoint.
xlnet_pred_lst = []
for fold in range(10):
    xlnet_path = f"{model_dir_target}/xlnet-base-cased_f{fold}_best"
    # map_location makes the load independent of the device the checkpoint was saved from.
    model.load_state_dict(torch.load(xlnet_path, map_location=device))

    lst = []
    # Inference only: no_grad guarantees that no autograd graph is kept.
    with torch.no_grad():
        for (x_batch,) in test_loader:
            # Each row is 512 token ids followed by 512 segment ids.
            input_ids = x_batch[:, :512]
            token_ids = x_batch[:, 512:]
            # Id 5 is the left-padding value written by _get_ids_xlnet.
            pred = model(input_ids.to(device), attention_mask=(input_ids != 5).int().to(device), token_type_ids=token_ids.to(device))
            # Keep the batch dimension (no squeeze) so np.vstack always sees 2-D arrays.
            lst.append(pred[0].detach().cpu().numpy())
    test_pred = np.vstack(lst)
    xlnet_pred_lst.append(test_pred)

## Ensemble

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise via NumPy."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

# Average the per-fold logits of each model family, then map them to (0, 1).
bert_pred = sigmoid(np.mean(np.array(bert_pred_lst), axis=0))
cased_pred = sigmoid(np.mean(np.array(cased_pred_lst), axis=0))
xlnet_pred = sigmoid(np.mean(np.array(xlnet_pred_lst), axis=0))

# Fixed blend weights for the three model families (they sum to 1.0).
w_bert, w_cased, w_xlnet = 0.4, 0.2, 0.4
test_pred = bert_pred * w_bert + cased_pred * w_cased + xlnet_pred * w_xlnet

## LGBM

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# `with` closes each file handle as soon as its object has been loaded.
with open("../input/quest-lgbm/lgbm_question_type_spelling.pkl", 'rb') as model_file:
    lgbm_models = pickle.load(model_file)
with open("../input/quest-lgbm/tfidf_vectorizers.pkl", 'rb') as vectorizer_file:
    count_vectorizers = pickle.load(vectorizer_file)

In [None]:
# One dense feature frame per text column, built with the vectorizer at the
# same position; column names are "<column>_<feature>" encoded as UTF-8 bytes.
text_columns = ["question_title", "question_body", "answer"]
dfs = [
    pd.DataFrame(
        count_vectorizers[position].transform(test[column]).toarray(),
        columns=[
            f"{column}_{feature}".encode("utf-8")
            for feature in count_vectorizers[position].get_feature_names()
        ],
    )
    for position, column in enumerate(text_columns)
]
test_x = pd.concat(dfs, axis=1)
test_x.shape

In [None]:
# Average the predictions of the four fold models.
fold_preds = [lgbm_models[fold].predict(test_x) for fold in range(4)]
lgbm_pred = np.zeros(len(test_x))
for fold_pred in fold_preds:
    lgbm_pred += fold_pred / 4
lgbm_pred.shape

## Post Processing

In [None]:
# For every target column, find the smallest denominator `num` (1..90) such
# that all values seen in train (plus 0 and 1) lie on the grid k/num.
norm_dict = {}
for c in sub.columns[1:]:
    observed = list(set(list(train[c].unique()) + [0, 1]))
    observed_rounded = [round(value, 8) for value in observed]
    matching_denominators = []
    for num in range(90, 0, -1):
        grid = [round((1/num)*n, 8) for n in range(num+1)]
        if all(value in grid for value in observed_rounded):
            matching_denominators.append(num)
    norm_dict[c] = min(matching_denominators)
norm_dict

In [None]:
def spearman_corr(y_true, y_pred):
    """Spearman rank correlation between targets and predictions.

    For 2-D ``y_pred`` the correlation is computed per column, NaN results
    are replaced by 0 and the column scores are averaged.  Otherwise a
    single correlation over the two inputs is returned.
    """
    if np.ndim(y_pred) != 2:
        return stats.spearmanr(y_true, y_pred)[0]

    per_column = [
        stats.spearmanr(y_true[:, col], y_pred[:, col])[0]
        for col in range(y_true.shape[1])
    ]
    return np.nan_to_num(per_column).mean()


# ref: https://qiita.com/kaggle_master-arai-san/items/d59b2fb7142ec7e270a5
# Thanks to Kaggle Master Arai-san!
class OptimizedRounder(object):
    """Turns continuous scores into class indices via tunable thresholds.

    ``coef`` holds ``n_classes - 1`` thresholds, initialised evenly spaced
    on (0, 1).  ``fit`` tunes them one at a time with a bracket-shrinking
    search that minimises the negated metric; ``predict`` bins scores with
    ``np.digitize``.  The ``metric`` argument is only stored as
    ``metric_str``; the metric actually used is always ``spearman_corr``.
    """

    def __init__(self,
                 n_overall: int = 5,
                 n_classwise: int = 5,
                 n_classes: int = 7,
                 metric: str = "qwk"):
        self.n_overall = n_overall        # passes over all thresholds
        self.n_classwise = n_classwise    # shrink steps per threshold
        self.n_classes = n_classes
        step = 1.0 / n_classes
        self.coef = [step * k for k in range(1, n_classes)]
        self.metric_str = metric
        self.metric = spearman_corr

    def _loss(self, X: np.ndarray, y: np.ndarray) -> float:
        """Negated metric of the binned ``X`` against ``y`` (lower is better)."""
        binned = np.digitize(X, self.coef)
        return -self.metric(y, binned)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Tune ``self.coef`` in place; returns None."""
        shrink_low = 0.618
        shrink_high = 1 - shrink_low
        n = self.n_classes

        # Initial search bracket for each threshold position.
        brackets = [(0.01, 1.0 / n + 0.05)]
        brackets.extend(
            (k * 1.0 / n + 0.05, (k + 1) * 1.0 / n + 0.05)
            for k in range(1, n)
        )

        def loss_at(position, threshold):
            # Writes the candidate into coef before scoring, so the last
            # evaluated candidate is what stays in self.coef.
            self.coef[position] = threshold
            return self._loss(X, y)

        for _ in range(self.n_overall):
            for idx in range(n - 1):
                lo, hi = brackets[idx]
                loss_lo = loss_at(idx, lo)
                loss_hi = loss_at(idx, hi)
                for _step in range(self.n_classwise):
                    if loss_lo > loss_hi:
                        # Lower end is worse: pull it towards the upper end.
                        lo = hi - (hi - lo) * shrink_low
                        loss_lo = loss_at(idx, lo)
                    else:
                        # Otherwise pull the upper end towards the lower end.
                        hi = hi - (hi - lo) * shrink_high
                        loss_hi = loss_at(idx, hi)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the bin index of every score in ``X`` w.r.t. ``self.coef``."""
        return np.digitize(X, self.coef)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# Unpickling needs the OptimizedRounder class to be defined in this notebook.
with open("../input/quest-optimizedrounder/optR_lst_10fold_ensemble_v3.pkl", 'rb') as rounder_file:
    optR_lst = pickle.load(rounder_file)

# Snap every target column to its grid: the rounder yields a bin index,
# which is divided by that column's denominator from norm_dict.
target_columns = sub.columns[1:]
lst = []
for idx, optR in enumerate(optR_lst):
    coeff = optR.predict(test_pred[:, idx])
    lst.append(coeff/norm_dict[target_columns[idx]])
opt_preds = np.array(lst).T
# NOTE(review): test_pred is overwritten here, so re-running this cell
# alone would round already-rounded values; re-run the ensemble cell first.
test_pred = opt_preds

# Column 19 is taken from the LightGBM model instead (presumably the
# question_type_spelling target, going by the pickle's file name).
test_pred[:,19] = lgbm_pred

In [None]:
# Zero out column 19 for rows outside the CULTURE category.  Boolean masks
# are positional, so this stays correct even if `test` does not carry a
# default 0..n-1 index (index labels were previously used as row positions).
not_culture = (test["category"] != "CULTURE").values
test_pred[not_culture, 19] = 0.0

# Host name is the third "/"-separated piece of the user page URL, with the
# ".stackexchange.com" suffix removed.
test["host_info"] = test["question_user_page"].map(lambda x: x.split("/")[2].replace(".stackexchange.com", ""))
# Keep column 19 only for the "english" and "ell" hosts.
other_host = (~test["host_info"].isin(["english", "ell"])).values
test_pred[other_host, 19] = 0.0

In [None]:
# Fill every column after the first with the final predictions and write the submission file.
sub[sub.columns[1:]] = test_pred
sub.to_csv("submission.csv", index=False)
# Show the first rows as a quick visual check.
sub.head()