In [None]:
from typing import List, Tuple
import random
import html

import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold, KFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow.keras.backend as K
import os
from scipy.stats import spearmanr
from scipy.optimize import minimize
from math import floor, ceil
from transformers import *
import torch
import torch.nn as nn
import torch.nn.functional as F

def tqdm(it, *args, **kwargs):
    """No-op stand-in for ``tqdm``: hand back the iterable untouched.

    Defining this function shadows the ``tqdm`` imported above, so every loop
    wrapped in ``tqdm(...)`` runs without a progress bar.  Extra positional
    and keyword arguments (for example ``total=``) are accepted and ignored.
    """
    iterable = it
    return iterable


def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and asks cuDNN for deterministic kernels.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed of
    # the running interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (a no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels from run to run, so switch it off.
    torch.backends.cudnn.benchmark = False


# Seed all RNGs once, with the default seed, before any data or model code runs.
seed_everything()
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Record the framework versions in the cell output.
print(tf.__version__)
print(torch.__version__)

## 1. Read data and tokenizer

In [None]:
# Directory holding the competition CSV files (train / test / sample submission).
PATH = '../input/google-quest-challenge/'

# Directory of the pretrained BERT checkpoint; the tokenizer is loaded from it.
BERT_PATH = '../input/bertpretrained/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12/'
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

# Fixed length, in tokens, that every encoded example is padded to.
MAX_SEQUENCE_LENGTH = 512

df_train = pd.read_csv(PATH+'train.csv')
df_test = pd.read_csv(PATH+'test.csv')
sub = pd.read_csv(PATH+'sample_submission.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Every training column from position 11 onward is treated as a prediction target.
output_categories = list(df_train.columns[11:])
# Columns at positions 1, 2 and 5 are recorded as the input text columns
# (in the visible code this list is only printed).
input_categories = list(df_train.columns[[1,2,5]])
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

## 2. Preprocessing

In [None]:
# Decode HTML entities (e.g. "&amp;" -> "&") in every text column of both
# splits so the tokenizer sees the literal characters.
for _frame in (df_train, df_test):
    for _column in ("question_body", "question_title", "answer"):
        _frame[_column] = _frame[_column].apply(html.unescape)

In [None]:
def _preprocess_text(s: str) -> str:
    return s


def _trim_input(question_tokens: List[str], answer_tokens: List[str], max_sequence_length: int, q_max_len: int, a_max_len: int) -> Tuple[List[str], List[str]]:
    q_len = len(question_tokens)
    a_len = len(answer_tokens)
    if q_len + a_len + 3 > max_sequence_length:
        if a_max_len <= a_len and q_max_len <= q_len:
            ## Answer も Question も長過ぎる場合、どちらも限界まで切り詰めるしかない
            q_new_len_head = floor((q_max_len - q_max_len/2))
            question_tokens = question_tokens[:q_new_len_head] + question_tokens[q_new_len_head - q_max_len:]
            a_new_len_head = floor((a_max_len - a_max_len/2))
            answer_tokens = answer_tokens[:a_new_len_head] + answer_tokens[a_new_len_head - a_max_len:]
        elif q_len <= a_len and q_len < q_max_len:
            ## Answer のほうが長く、Question が十分短いなら、その分 Answer にまわす
            a_max_len = a_max_len + (q_max_len - q_len - 1)
            a_new_len_head = floor((a_max_len - a_max_len/2))
            answer_tokens = answer_tokens[:a_new_len_head] + answer_tokens[a_new_len_head - a_max_len:]
        elif a_len < q_len:
            assert a_len <= a_max_len
            q_max_len = q_max_len + (a_max_len - a_len - 1)
            q_new_len_head = floor((q_max_len - q_max_len/2))
            question_tokens = question_tokens[:q_new_len_head] + question_tokens[q_new_len_head - q_max_len:]
        else:
            raise ValueError("unreachable: q_len: {}, a_len: {}, q_max_len: {}, a_max_len: {}".format(q_len, a_len, q_max_len, a_max_len))
    return question_tokens, answer_tokens


def _convert_to_transformer_inputs(title: str, question: str, answer: str, tokenizer: BertTokenizer, question_only=False):
    """Encode one (title, question, answer) triple as fixed-length BERT inputs.

    The title and question body are joined with a literal ``[SEP]`` and form
    the first segment; the answer forms the second segment and is left empty
    when ``question_only`` is true.  The layout is
    ``[CLS] question [SEP] answer [SEP]`` padded to ``MAX_SEQUENCE_LENGTH``.

    Returns:
        ``(padded_ids, token_type_ids, attention_mask)`` as Python lists.
    """
    title, question, answer = (_preprocess_text(text) for text in (title, question, answer))

    question_tokens = tokenizer.tokenize("{} [SEP] {}".format(title, question))
    answer_tokens = [] if question_only else tokenizer.tokenize(answer)

    # Each segment gets half of what is left after the three special tokens.
    half_budget = (MAX_SEQUENCE_LENGTH - 3) // 2
    question_tokens, answer_tokens = _trim_input(
        question_tokens, answer_tokens, MAX_SEQUENCE_LENGTH, half_budget, half_budget
    )

    sequence = ["[CLS]"] + question_tokens + ["[SEP]"] + answer_tokens + ["[SEP]"]
    ids = tokenizer.convert_tokens_to_ids(sequence)
    n_pad = MAX_SEQUENCE_LENGTH - len(ids)

    padded_ids = ids + [tokenizer.pad_token_id] * n_pad
    # Segment 0: [CLS] + question + first [SEP]; segment 1: answer + final [SEP];
    # padding positions are labelled 0 again.
    token_type_ids = [0] * (len(question_tokens) + 2) + [1] * (len(answer_tokens) + 1) + [0] * n_pad
    attention_mask = [1] * len(ids) + [0] * n_pad
    return padded_ids, token_type_ids, attention_mask

# Smoke test: encode the first training row in question-only mode, then print
# the (ids, token type ids, attention mask) triple and the decoded tokens.
sample_args = df_train["question_title"].values[0], df_train["question_body"].values[0], df_train["answer"].values[0]
sample_ids = _convert_to_transformer_inputs(*sample_args, tokenizer, question_only=True)
print(sample_ids)
print(tokenizer.convert_ids_to_tokens(sample_ids[0]))

In [None]:
def compute_input_arrays(df, question_only=False):
    """Encode every row of ``df`` with the module-level ``tokenizer``.

    Args:
        df: Frame with ``question_title``, ``question_body`` and ``answer`` columns.
        question_only: Forwarded to ``_convert_to_transformer_inputs``; when
            true the answer segment is left empty.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_masks)`` of ``int32``
        arrays with one row per row of ``df``.
    """
    columns = (df["question_title"].values, df["question_body"].values, df["answer"].values)
    encoded = [
        _convert_to_transformer_inputs(title, body, answer, tokenizer, question_only=question_only)
        for title, body, answer in zip(*columns)
    ]
    # Each encoded row is (ids, type_ids, mask); regroup by field.
    return tuple(
        np.asarray([row[field] for row in encoded], dtype=np.int32)
        for field in range(3)
    )


def compute_output_arrays(df):
    """Return the columns listed in ``output_categories`` as a NumPy array."""
    target_frame = df[output_categories]
    return np.asarray(target_frame)

## 3. Modeling

In [None]:
class Model(torch.nn.Module):
    """BERT encoder with two pooled heads feeding a 30-way linear output.

    One head summarises the ``[CLS]`` position and the other the ``[SEP]``
    that closes the question segment.  Each head consumes the concatenation
    of the last four hidden layers at its position.
    """

    def __init__(self):
        super().__init__()
        config = BertConfig.from_json_file(BERT_PATH + "/bert_config.json")
        # forward() needs the per-layer hidden states, not just the last layer.
        config.output_hidden_states = True
        self.bert = BertForPreTraining.from_pretrained(BERT_PATH + "/bert_model.ckpt.index", from_tf=True, config=config).bert
        # Take the width from the checkpoint config instead of hard-coding 768.
        hidden_size = config.hidden_size
        self.cls_token_head = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size * 4, hidden_size),
            nn.ReLU(inplace=True),
        )
        self.qa_sep_token_head = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size * 4, hidden_size),
            nn.ReLU(inplace=True),
        )
        self.linear = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size * 2, 30),
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw logits, one row per example and 30 columns.

        Args:
            input_ids: Token ids, indexed as (batch, position).
            attention_mask: 1 for real tokens, 0 for padding.
            token_type_ids: 0 for the question segment (and padding), 1 for the answer.
        """
        # Attended tokens of segment 0 are [CLS] + question + [SEP]; their
        # count minus one is the position of the [SEP] closing the question.
        question_answer_seps = (torch.sum((token_type_ids == 0) * attention_mask, -1) - 1)

        # The encoder output is unpacked as a 3-tuple whose last item holds
        # the per-layer hidden states (enabled in __init__).
        _, _, hidden_states = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_four = hidden_states[-4:]

        cls_features = torch.cat([layer[:, 0] for layer in last_four], dim=-1)
        x_cls = self.cls_token_head(cls_features)

        # Gather the [SEP] hidden state of each example.  The row index is
        # created on the same device as the inputs to avoid a CPU/GPU mix.
        batch_index = torch.arange(input_ids.size(0), dtype=torch.long, device=input_ids.device)
        sep_features = torch.cat([layer[batch_index, question_answer_seps] for layer in last_four], dim=-1)
        x_qa_sep = self.qa_sep_token_head(sep_features)

        x = torch.cat([x_cls, x_qa_sep], -1)
        x = self.linear(x)
        return x

## 4. Training

In [None]:
# Targets as a float tensor.  Each input set is a list of three long tensors in
# the order returned by compute_input_arrays: ids, token type ids, attention mask.
# NOTE(review): `outputs` is rebound to a NumPy array later in the notebook, so
# this cell has to be re-run before the training cells are re-run.
outputs = torch.tensor(compute_output_arrays(df_train), dtype=torch.float)
inputs = [torch.tensor(x, dtype=torch.long) for x in compute_input_arrays(df_train)]
question_only_inputs = [torch.tensor(x, dtype=torch.long) for x in compute_input_arrays(df_train, question_only=True)]
test_inputs = [torch.tensor(x, dtype=torch.long) for x in compute_input_arrays(df_test)]
test_question_only_inputs = [torch.tensor(x, dtype=torch.long) for x in compute_input_arrays(df_test, question_only=True)]

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# List the parameter names of a freshly built model.  The optimizer groups in
# train_and_predict select parameters by substrings of these names.
# NOTE(review): this instantiates the whole model just to print names.
for n, _ in Model().named_parameters():
    print(n)

In [None]:
# Per-target loss weights: inverse standard deviation of each target column,
# rescaled so that the weights sum to 30.
LABEL_WEIGHTS = torch.tensor(1.0 / df_train[output_categories].std().values, dtype=torch.float32).to(device)
LABEL_WEIGHTS = LABEL_WEIGHTS / LABEL_WEIGHTS.sum() * 30
for name, weight in zip(output_categories, LABEL_WEIGHTS.cpu().numpy()):
    print(name, "\t", weight)

In [None]:
# Default number of bins per output column, indexed by column position.
BEST_BINS = [400, 400, 15, 100, 400, 7, 1600, 100, 100, 400, 100, 9, 8, 50, 9, 8, 15, 400, 400, 5, 400, 400, 800, 50, 200, 1600, 20, 200, 1600, 1600]


def binning_output(preds, n_bins=BEST_BINS):
    """Snap each prediction column down onto a grid of ``1 / n_bins[i]`` steps.

    Works on a copy, so the caller's array is left untouched.  When a binned
    column is almost constant (fewer than 0.5 % of the rows fall outside the
    most common bin), the highest-scoring 0.5 % of rows (plus one) keep their
    original values so the column does not collapse to a single value.

    Args:
        preds: 2-D array of predictions, one column per target.
        n_bins: Sequence giving the bin count for each column.

    Returns:
        A new array with the same shape as ``preds``.
    """
    binned_preds = preds.copy()
    n_rows = len(binned_preds)
    for col in range(binned_preds.shape[-1]):
        bins = n_bins[col]
        column = binned_preds[:, col]
        # Truncate towards zero onto the grid, then scale back.
        snapped = (column * bins).astype(np.int32).astype(np.float32) / bins
        _, counts = np.unique(snapped, return_counts=True)
        total = counts.sum()
        # Stop binning when everything outside the majority bin is below 0.5 %.
        if (total - counts.max()) / total < 0.005:
            top = np.argsort(column)[::-1][:int(n_rows * 0.005) + 1]
            snapped[top] = column[top]
        binned_preds[:, col] = snapped
    return binned_preds


def compute_spearmanr(trues, preds, n_bins=None):
    """Mean column-wise Spearman correlation between targets and predictions.

    Args:
        trues: 2-D array of targets, one column per label.
        preds: 2-D array of predictions with the same shape.  It is never
            modified; the function works on a private copy.
        n_bins: Optional per-column bin counts.  When given, predictions are
            passed through ``binning_output`` before scoring.

    Returns:
        The mean of the per-column correlations.
    """
    if n_bins:
        preds = binning_output(preds, n_bins)
    else:
        # Copy so the constant-column patch below cannot leak into the
        # caller's array (e.g. stored out-of-fold predictions).
        preds = np.array(preds)
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        if len(np.unique(col_pred)) == 1:
            # Spearman is undefined for a constant column, so bump one random
            # entry.  ``randint``'s upper bound is exclusive, hence len(...).
            col_pred[np.random.randint(0, len(col_pred))] = col_pred.max() + 1
        rhos.append(spearmanr(col_trues, col_pred).correlation)
    return np.mean(rhos)



def compute_loss(outputs, targets, alpha=0.5, margin=0.1, question_only=False):
    """Blend weighted BCE with a pairwise margin ranking loss.

    Args:
        outputs: Raw logits, indexed as (batch, label).
        targets: Target values with the same shape as ``outputs``.
        alpha: Weight of the BCE term; the ranking term gets ``1 - alpha``.
        margin: Margin of the ranking hinge.
        question_only: When true, only the first 21 label columns are scored.

    Returns:
        The combined scalar loss.
    """
    if question_only:
        outputs, targets = outputs[:, :21], targets[:, :21]
    n_labels = outputs.size(-1)
    batch_size = outputs.size(0)

    elementwise_bce = F.binary_cross_entropy_with_logits(outputs, targets, reduction="none")
    bce = (elementwise_bce * LABEL_WEIGHTS[:elementwise_bce.size(-1)]).mean()

    if batch_size % 2 != 0:
        # An odd batch cannot be divided into pairs, so skip the ranking term.
        margin_rank_loss = 0.0
    else:
        half = batch_size // 2
        # Pair the first half of the batch with the second half.
        probs_a, probs_b = outputs.sigmoid().contiguous().view(2, half, n_labels)
        targets_a, targets_b = targets.contiguous().view(2, half, n_labels)
        # +1 where the first target is larger, -1 where the second is, 0 on ties.
        ordering = (targets_a > targets_b).float() - (targets_a < targets_b).float()
        hinge = (-ordering * (probs_a - probs_b) + margin).clamp(min=0.0)
        margin_rank_loss = (hinge * LABEL_WEIGHTS[:n_labels]).mean()

    return alpha * bce + (1 - alpha) * margin_rank_loss


def _build_optimizer_and_scheduler(model, steps_per_epoch, n_epochs):
    """Create AdamW with per-group settings and a linear warm-up/decay schedule."""
    no_decay = ["bias", "LayerNorm.weight"]
    trainable = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer_grouped_parameters = [
        {
            # Encoder weights: small learning rate, with weight decay.
            "params": [p for n, p in trainable if "bert" in n and not any(nd in n for nd in no_decay)],
            "weight_decay": 1e-2,
            "lr": 5e-5
        },
        {
            # Encoder biases / LayerNorm weights: same rate, no decay.
            "params": [p for n, p in trainable if "bert" in n and any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
            "lr": 5e-5
        },
        {
            # Freshly initialised heads: ten times larger learning rate.
            "params": [p for n, p in trainable if "bert" not in n],
            "weight_decay": 1e-2,
            "lr": 5e-4
        }
    ]
    optimizer = AdamW(optimizer_grouped_parameters)
    total_steps = steps_per_epoch * n_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(total_steps * 0.05),
        num_training_steps=total_steps
    )
    return optimizer, scheduler


def _evaluate(model, loader, question_only, with_targets):
    """Score ``loader`` with ``model``; return (probabilities, targets, losses) as lists.

    Must be called inside ``torch.no_grad()``.  Targets and losses stay empty
    when ``with_targets`` is false (test batches carry no targets).
    """
    probabilities, all_targets, losses = [], [], []
    for batch in tqdm(loader, total=len(loader)):
        batch = [tensor.to(device) for tensor in batch]
        input_ids, token_type_ids, attention_mask = batch[:3]
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        prob = outputs.sigmoid()
        if question_only:
            # Columns from 21 onward are not trained in this phase; zero them.
            prob[:, 21:] = 0.0
        probabilities.extend(prob.cpu().numpy())
        if with_targets:
            targets = batch[3]
            all_targets.extend(targets.cpu().numpy())
            loss = compute_loss(outputs, targets, question_only=question_only)
            losses.append(loss.detach().cpu().item())
    return probabilities, all_targets, losses


def _run_phase(model, train_loader, valid_loader, test_loader, n_epochs, question_only, valid_predictions, test_predictions):
    """Train ``model`` for ``n_epochs``; append per-epoch validation/test predictions to the given lists."""
    import time

    optimizer, scheduler = _build_optimizer_and_scheduler(model, len(train_loader), n_epochs)
    for epoch in range(n_epochs):
        start = time.time()
        model.train()
        train_losses, train_preds, train_targets = [], [], []
        for input_ids, token_type_ids, attention_mask, targets in tqdm(train_loader, total=len(train_loader)):
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            attention_mask = attention_mask.to(device)
            targets = targets.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            train_preds.extend(outputs.detach().sigmoid().cpu().numpy())
            train_targets.extend(targets.detach().cpu().numpy())
            loss = compute_loss(outputs, targets, question_only=question_only)
            model.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_losses.append(loss.detach().cpu().item())

        model.eval()
        with torch.no_grad():
            valid_preds, valid_targets, valid_losses = _evaluate(model, valid_loader, question_only, with_targets=True)
            valid_predictions.append(np.stack(valid_preds))
            test_preds, _, _ = _evaluate(model, test_loader, question_only, with_targets=False)
            test_predictions.append(np.stack(test_preds))
            print()
        print("Epoch {}: Train Loss {}, Valid Loss {}".format(epoch + 1, np.mean(train_losses), np.mean(valid_losses)))
        print("\t Train Spearmanr {:.4f}, Valid Spearmanr (avg) {:.4f}, Valid Spearmanr (last) {:.4f}".format(
            compute_spearmanr(np.stack(train_targets), np.stack(train_preds)),
            compute_spearmanr(np.stack(valid_targets), sum(valid_predictions) / len(valid_predictions)),
            # Defensive copy: the metric may perturb constant columns in place,
            # and these stored predictions are returned to the caller.
            compute_spearmanr(np.stack(valid_targets), valid_predictions[-1].copy())
        ))
        print("\t elapsed: {}s".format(time.time() - start))


def train_and_predict(train_data, valid_data, test_data, q_train_data, q_valid_data, q_test_data, q_epochs, epochs, batch_size, fold):
    """Train a question-only model, then a fresh question+answer model, and collect predictions.

    Both phases append to the same lists, so each returned list holds
    ``q_epochs + epochs`` arrays: question-only epochs first (columns from 21
    onward zeroed), then the question+answer epochs.

    Args:
        train_data, valid_data, test_data: Datasets with question+answer inputs.
            Train/valid items are (ids, token type ids, mask, targets); test
            items omit the targets.
        q_train_data, q_valid_data, q_test_data: The same, encoded question-only.
        q_epochs: Number of epochs of the question-only phase.
        epochs: Number of epochs of the question+answer phase.
        batch_size: Batch size for every loader.
        fold: Fold number; accepted for the caller's benefit but not used.

    Returns:
        ``(valid_predictions, test_predictions)``: two lists of per-epoch
        NumPy arrays of sigmoid probabilities.
    """
    def make_loader(dataset, shuffle):
        return torch.utils.data.DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)

    dataloader = make_loader(train_data, True)
    valid_dataloader = make_loader(valid_data, False)
    test_dataloader = make_loader(test_data, False)
    q_dataloader = make_loader(q_train_data, True)
    q_valid_dataloader = make_loader(q_valid_data, False)
    q_test_dataloader = make_loader(q_test_data, False)

    valid_predictions = []
    test_predictions = []

    ## Question Only
    model = Model().to(device)
    _run_phase(model, q_dataloader, q_valid_dataloader, q_test_dataloader,
               q_epochs, True, valid_predictions, test_predictions)

    # Release the first model before building the second, so two encoders
    # never sit on the device at the same time.
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    ## Q and A
    model = Model().to(device)
    _run_phase(model, dataloader, valid_dataloader, test_dataloader,
               epochs, False, valid_predictions, test_predictions)

    return valid_predictions, test_predictions

In [None]:
class Fold(object):
    """Builds K-fold splits in which all rows sharing a group value stay together."""

    def __init__(self, n_splits=5, shuffle=True, random_state=71):
        """Store the settings forwarded to ``KFold``.

        Args:
            n_splits: Number of folds.
            shuffle: Whether the unique groups are shuffled before splitting.
            random_state: Seed for the shuffle; ignored when ``shuffle`` is false.
        """
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_groupkfold(self, train, group_name):
        """Split ``train`` so that no value of ``train[group_name]`` spans two folds.

        The unique group values are split with ``KFold`` and each side is
        mapped back to the rows carrying those values.

        Args:
            train: DataFrame to split.
            group_name: Name of the grouping column.

        Returns:
            List of ``(train_index, valid_index)`` pairs.  The entries are
            index *labels* of ``train`` (``train[mask].index``).
        """
        group = train[group_name]
        unique_group = group.unique()

        kf = KFold(
            n_splits=self.n_splits,
            shuffle=self.shuffle,
            # Recent scikit-learn rejects a random_state when shuffle is off.
            random_state=self.random_state if self.shuffle else None
        )
        folds_ids = []
        for trn_group_idx, val_group_idx in kf.split(unique_group):
            trn_group = unique_group[trn_group_idx]
            val_group = unique_group[val_group_idx]
            is_trn = group.isin(trn_group)
            is_val = group.isin(val_group)
            trn_idx = train[is_trn].index
            val_idx = train[is_val].index
            folds_ids.append((trn_idx, val_idx))

        return folds_ids

In [None]:
# Three shuffled folds in which rows sharing a "url" stay in the same fold.
gkf = Fold(n_splits=3, shuffle=True, random_state=71)
fold_ids = gkf.get_groupkfold(df_train, group_name="url")

# Sanity check: per fold, count the rows with a non-zero
# "question_type_spelling" target on the train side and on the validation side.
for train_idx, valid_idx in fold_ids:
    print((df_train.loc[train_idx, "question_type_spelling"] > 0).sum())
    print((df_train.loc[valid_idx, "question_type_spelling"] > 0).sum())

In [None]:
# One entry per fold: the (valid_predictions, test_predictions) pair returned
# by train_and_predict.
histories = []
# The test datasets are identical for every fold, so build them once.
test_dataset = torch.utils.data.TensorDataset(*test_inputs)
q_test_dataset = torch.utils.data.TensorDataset(*test_question_only_inputs)

for fold, (train_idx, valid_idx) in enumerate(fold_ids):
    import gc
    gc.collect()

    # Slice the three input tensors (ids, token type ids, mask) and the targets
    # for this fold, in both the question+answer and question-only encodings.
    train_inputs = [inputs[i][train_idx] for i in range(3)]
    q_train_inputs = [question_only_inputs[i][train_idx] for i in range(3)]
    train_outputs = outputs[train_idx]
    train_dataset = torch.utils.data.TensorDataset(*train_inputs, train_outputs)
    q_train_dataset = torch.utils.data.TensorDataset(*q_train_inputs, train_outputs)

    valid_inputs = [inputs[i][valid_idx] for i in range(3)]
    q_valid_inputs = [question_only_inputs[i][valid_idx] for i in range(3)]
    valid_outputs = outputs[valid_idx]
    valid_dataset = torch.utils.data.TensorDataset(*valid_inputs, valid_outputs)
    q_valid_dataset = torch.utils.data.TensorDataset(*q_valid_inputs, valid_outputs)

    history = train_and_predict(
        train_data=train_dataset, 
        valid_data=valid_dataset,
        test_data=test_dataset, 
        q_train_data=q_train_dataset, 
        q_valid_data=q_valid_dataset,
        q_test_data=q_test_dataset, 
        q_epochs=3, epochs=3, batch_size=8, fold=fold
        )

    histories.append(history)

## 5. Submit

In [None]:
# Collect the out-of-fold validation predictions for each recorded epoch.
val_preds_list = []
# Number of per-epoch prediction arrays stored for fold 0.  Both training
# phases append to the same list, so this counts q_epochs + epochs entries.
n_epochs = len(histories[0][0])

for epoch in range(n_epochs):
    val_preds_one_epoch = np.zeros([len(df_train), 30])    

    # Each fold fills in the rows of its own validation split.
    for fold, (train_idx, valid_idx) in enumerate(fold_ids):
        val_pred = histories[fold][0][epoch]
        val_preds_one_epoch[valid_idx, :] += val_pred

    val_preds_list.append(val_preds_one_epoch)

In [None]:
# Pack the per-epoch out-of-fold predictions into a single float32 array
# indexed as (epoch, row, target).
oof_predictions = np.zeros((n_epochs, len(df_train), len(output_categories)), dtype=np.float32)

for j, name in enumerate(output_categories):
    for epoch in range(n_epochs):
        # NOTE(review): `col` is computed but never used.
        col = "{}_{}".format(epoch, name)
        oof_predictions[epoch, :, j] = val_preds_list[epoch][:, j]

oof_predictions.shape

In [None]:
# For each recorded epoch, average the test predictions over all folds.
# NOTE(review): `test_preds_list` is rebound to a new list in the later
# stacking cell, so this cell has to be re-run before the next one is re-run.
test_preds_list = []

for epoch in range(n_epochs):
    test_preds_one_epoch = 0

    for fold in range(len(fold_ids)):
        test_preds = histories[fold][1][epoch]
        test_preds_one_epoch += test_preds

    test_preds_one_epoch = test_preds_one_epoch / len(fold_ids)
    test_preds_list.append(test_preds_one_epoch)

In [None]:
# Pack the fold-averaged test predictions into a single float32 array indexed
# as (epoch, row, target), mirroring oof_predictions.
test_predictions = np.zeros((n_epochs, len(df_test), len(output_categories)), dtype=np.float32)

for j, name in enumerate(output_categories):
    for epoch in range(n_epochs):
        # NOTE(review): `col` is computed but never used.
        col = "{}_{}".format(epoch, name)
        test_predictions[epoch, :, j] = test_preds_list[epoch][:, j]

test_predictions.shape

In [None]:
import numpy as np
import pandas as pd
from abc import abstractmethod
from sklearn.metrics import roc_auc_score


class Base_Model(object):
    """Template for a model wrapper that can be cross-validated via ``cv``.

    Subclasses provide ``fit``, ``get_best_iteration``, ``predict`` and
    ``get_feature_importance``; the versions here only raise
    ``NotImplementedError``.
    """

    @abstractmethod
    def fit(self, x_train, y_train, x_valid, y_valid, config):
        """Train on one fold and return ``(model, best_score)``."""
        raise NotImplementedError

    @abstractmethod
    def get_best_iteration(self, model):
        """Return the best iteration number of a fitted ``model``."""
        raise NotImplementedError

    @abstractmethod
    def predict(self, model, features):
        """Return the predictions of ``model`` for ``features``."""
        raise NotImplementedError

    @abstractmethod
    def get_feature_importance(self, model):
        """Return one importance value per feature for ``model``."""
        raise NotImplementedError

    def cv(self, y_train, train_features, test_features, feature_name, folds_ids, config):
        """Fit one model per fold and gather out-of-fold and test predictions.

        Args:
            y_train: 1-D target array, indexed with the fold index arrays.
            train_features: Training DataFrame, indexed positionally per fold.
            test_features: Test DataFrame, predicted by every fold model.
            feature_name: Feature names, used as the index of the importance table.
            folds_ids: Sequence of ``(train_index, valid_index)`` pairs.
            config: Configuration object handed through to ``fit``.

        Returns:
            ``(models, oof_preds, test_preds, feature_importance, evals_results)``;
            test predictions and the best iteration are averaged over the
            folds, and the feature importance is the mean over the folds.
        """
        n_folds = len(folds_ids)
        oof_preds = np.zeros(len(train_features))
        test_preds = np.zeros(len(test_features))
        importances = pd.DataFrame(index=feature_name)
        best_iteration = 0
        cv_score_list, models = [], []

        for fold_no, (trn_idx, val_idx) in enumerate(folds_ids, start=1):
            x_val = train_features.iloc[val_idx]

            # Fit on the training part of the fold, validating on the rest.
            model, best_score = self.fit(
                train_features.iloc[trn_idx], y_train[trn_idx],
                x_val, y_train[val_idx],
                config,
            )
            models.append(model)
            cv_score_list.append(best_score)
            best_iteration += self.get_best_iteration(model) / n_folds

            # Out-of-fold rows come from this fold's model alone; the test
            # prediction is the running average over the folds.
            oof_preds[val_idx] = self.predict(model, x_val)
            test_preds += self.predict(model, test_features) / n_folds

            # One importance column per fold.
            fold_gain = pd.DataFrame(
                self.get_feature_importance(model),
                columns=['gain_{}'.format(fold_no)],
                index=feature_name,
            )
            importances = importances.join(fold_gain, how='inner')

        feature_importance = importances.mean(axis=1)

        cv_scores = {}
        for position, score in enumerate(cv_score_list, start=1):
            cv_scores["cv{}".format(position)] = score

        evals_results = {"evals_result": {
            "cv_score": cv_scores,
            "n_data": len(train_features),
            "best_iteration": best_iteration,
            "n_features": len(train_features.columns),
            "feature_importance": feature_importance.sort_values(ascending=False).to_dict()
        }}

        return models, oof_preds, test_preds, feature_importance, evals_results

In [None]:
def lgb_compute_spearmanr(preds, trues):
    """Spearman correlation as a custom evaluation function for ``lgb.train``.

    Args:
        preds: Predicted values.
        trues: Object exposing the true labels through ``get_label()``.

    Returns:
        The triple ``("spearmanr", correlation, True)``.
    """
    labels = trues.get_label()
    correlation = spearmanr(labels, preds).correlation
    return ("spearmanr", correlation, True)


def compute_spearmanr_each_col(trues, preds, n_bins=None):
    """Spearman correlation between ``trues`` and ``preds``.

    When ``n_bins`` is given, the predictions go through ``binning_output``
    before the correlation is computed.
    """
    scored = binning_output(preds, n_bins) if n_bins else preds
    return spearmanr(trues, scored).correlation

In [None]:
import lightgbm as lgb
from pathlib import Path


class LightGBM(Base_Model):
    """``Base_Model`` implementation backed by ``lgb.train``."""

    def fit(self, x_train, y_train, x_valid, y_valid, config):
        """Train with a validation set, scored by ``lgb_compute_spearmanr``.

        Model parameters come from ``config["model"]["model_params"]`` and the
        extra ``lgb.train`` keyword arguments from
        ``config["model"]["train_params"]``.

        Returns:
            ``(model, best_score)`` where ``best_score`` is a plain dict copy
            of ``model.best_score``.
        """
        train_set = lgb.Dataset(x_train, label=y_train)
        valid_set = lgb.Dataset(x_valid, label=y_valid)
        model_section = config["model"]
        booster = lgb.train(
            params=model_section["model_params"],
            train_set=train_set,
            valid_sets=[valid_set],
            valid_names=['valid'],
            feval=lgb_compute_spearmanr,
            **model_section["train_params"]
        )
        return booster, dict(booster.best_score)

    def full_train(self, x_train, y_train, config, iteration):
        """Train on all given rows for a fixed number of boosting rounds.

        No validation set is passed.  ``iteration`` is truncated to an int.

        Returns:
            ``(model, best_score)`` as in ``fit``.
        """
        booster = lgb.train(
            params=config["model"]["model_params"],
            train_set=lgb.Dataset(x_train, label=y_train),
            feval=lgb_compute_spearmanr,
            num_boost_round=int(iteration)
        )
        return booster, dict(booster.best_score)

    def get_best_iteration(self, model):
        """Return ``model.best_iteration``."""
        return model.best_iteration

    def predict(self, model, features):
        """Return ``model.predict(features)``."""
        return model.predict(features)

    def get_feature_importance(self, model):
        """Return the gain-based feature importance of ``model``."""
        return model.feature_importance(importance_type='gain')

In [None]:
# Second-stage (stacking) setup: one LightGBM regressor per target, fitted on
# the first-stage predictions.
config = {
    "model": {
        "name": "lightgbm",
        "model_params": {
            "boosting_type": "gbdt",
            "objective": "rmse",
            "tree_learner": "serial",
            "learning_rate": 0.1,
            # Depth-1 trees (stumps).
            "max_depth": 1,
            "seed": 71,
            "bagging_seed": 71,
            "feature_fraction_seed": 71,
            "drop_seed": 71,
            "verbose": -1
        },
        # Forwarded verbatim as keyword arguments to lgb.train.
        "train_params": {
            "num_boost_round": 5000,
            "early_stopping_rounds": 200,
            "verbose_eval": 500
        }
    }
}


# NOTE(review): this rebinds `outputs` (previously a torch tensor) to a NumPy
# array and `test_preds_list` to a fresh list; the earlier cells that use the
# old values have to be re-run before they are executed again.
outputs = compute_output_arrays(df_train)
oof_preds_list = []
test_preds_list = []

for i_col in range(len(output_categories)):
    y_train = outputs[:, i_col]
    #x_train = pd.DataFrame(oof_predictions[:, :, 2].T)
    # Features: for each of the 30 targets, the predictions of every recorded
    # epoch, laid out side by side.
    # NOTE(review): x_train / x_test do not depend on i_col and are rebuilt on
    # every iteration.
    x_train = pd.DataFrame(np.concatenate([oof_predictions[:, :, i].T for i in range(30)], axis=1))
    x_test = pd.DataFrame(np.concatenate([test_predictions[:, :, i].T for i in range(30)], axis=1))
    feature_name = x_train.columns

    # Cross-validate on the same folds that produced the first-stage predictions.
    model = LightGBM()
    models, oof_preds, test_preds, feature_importance, evals_results = model.cv(
            y_train, x_train, x_test, feature_name, fold_ids, config
    )
    oof_preds_list.append(oof_preds.reshape(-1, 1))
    test_preds_list.append(test_preds.reshape(-1, 1))

    # Per-target report: out-of-fold Spearman score and the number of distinct
    # predicted values.
    print(i_col, output_categories[i_col])
    print(compute_spearmanr_each_col(oof_preds, y_train))
    print(len(oof_preds), len(np.unique(oof_preds)))
    print(len(test_preds), len(np.unique(test_preds)))

In [None]:
def compute_spearmanr(trues, preds, n_bins=None):
    """Mean column-wise Spearman correlation between targets and predictions.

    This re-declares a function of the same name defined earlier in the
    notebook.

    Args:
        trues: 2-D array of targets, one column per label.
        preds: 2-D array of predictions with the same shape.  It is never
            modified; the function works on a private copy.
        n_bins: Optional per-column bin counts.  When given, predictions are
            passed through ``binning_output`` before scoring.

    Returns:
        The mean of the per-column correlations.
    """
    if n_bins:
        preds = binning_output(preds, n_bins)
    else:
        # Copy so the constant-column patch below cannot leak into the
        # caller's array.
        preds = np.array(preds)
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        if len(np.unique(col_pred)) == 1:
            # Spearman is undefined for a constant column, so bump one random
            # entry.  ``randint``'s upper bound is exclusive, hence len(...).
            col_pred[np.random.randint(0, len(col_pred))] = col_pred.max() + 1
        rhos.append(spearmanr(col_trues, col_pred).correlation)
    return np.mean(rhos)


# Join the per-target out-of-fold columns into one (rows, targets) array and
# report the mean Spearman score of the stacked model.
oof_preds_fi = np.concatenate(oof_preds_list, axis=1)
print(compute_spearmanr(outputs, oof_preds_fi))

In [None]:
# Join the per-target test columns, write them into every submission column
# after the first one, and save the file.
test_preds_fi = np.concatenate(test_preds_list, axis=1)
sub.iloc[:, 1:] = test_preds_fi
sub.to_csv('submission.csv', index=False)