In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, GroupKFold, KFold, RepeatedKFold
from tqdm import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow.keras.backend as K
import gc
import re
import os
import sys
import six
import collections
from scipy.stats import spearmanr, rankdata
from math import floor, ceil

sys.path.extend(['../input/bert-joint-baseline/'])
# from transformers import *
import tokenization
# Global switches, paths and raw data for the whole notebook.
# DEBUG=True loads only the first 30 rows of each CSV (see the read_csv calls below).
DEBUG = False
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# print('tokenizer: ', tokenizer)

if not DEBUG:
    # Must be set before TensorFlow initialises the GPU to have any effect.
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = "true"
    # Print floats without scientific notation.
    np.set_printoptions(suppress=True)

PATH = '../input/google-quest-challenge/'
BERT_PATH = '../input/bert-en-uncased-l12-h768-a12'
# BERT_PATH = '../input/bertenuncasedl24h1024a16'
# NOTE(review): the second positional argument is presumably `do_lower_case`
# of tokenization.FullTokenizer - confirm against tokenization.py. The
# prepare_* functions lower-case the text themselves before tokenizing.
tokenizer = tokenization.FullTokenizer(BERT_PATH+'/assets/vocab.txt', False)
# Fixed length of every id / mask / segment vector fed to the model.
MAX_SEQUENCE_LENGTH = 512

if DEBUG:
    df_train = pd.read_csv(PATH+'train.csv', nrows=30)
    df_test = pd.read_csv(PATH+'test.csv', nrows=30)
else:
    df_train = pd.read_csv(PATH + 'train.csv')
    df_test = pd.read_csv(PATH + 'test.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Every train column from position 11 onwards is treated as a target.
# NOTE(review): relies on the column order of train.csv - verify if the file changes.
output_categories = list(df_train.columns[11:])
print('\noutput categories:\n\t', output_categories)

# Target columns predicted by the question-only model ("Qbert" weights);
# the list length is passed as out_dim when that model is built.
q_targets = [
        'question_asker_intent_understanding',
        'question_body_critical',
        'question_conversational',
        'question_expect_short_answer',
        'question_fact_seeking',
        'question_has_commonly_accepted_answer',
        'question_interestingness_others',
        'question_interestingness_self',
        'question_multi_intent',
        'question_not_really_a_question',
        'question_opinion_seeking',
        'question_type_choice',
        'question_type_compare',
        'question_type_consequence',
        'question_type_definition',
        'question_type_entity',
        'question_type_instructions',
        'question_type_procedure',
        'question_type_reason_explanation',
        'question_type_spelling',
        'question_well_written',
    ]
# Target columns predicted by the question+answer model ("Abert" weights).
a_targets = [
        'answer_helpful',
        'answer_level_of_information',
        'answer_plausible',
        'answer_relevance',
        'answer_satisfaction',
        'answer_type_instructions',
        'answer_type_procedure',
        'answer_type_reason_explanation',
        'answer_well_written'
    ]

In [None]:
class MyLabelEncoder(object):
    """Label-to-integer encoder in which unseen labels map to 0.

    Fitted labels receive the ids 1..K in the order returned by
    ``np.unique``; id 0 is never handed out by ``fit`` and therefore stands
    for any label that was not present when the encoder was fitted.
    """

    def __init__(self):
        # label -> positive integer id; empty until fit() has been called
        self.mapper = {}

    def fit(self, X):
        """Learn the label vocabulary from ``X`` and return ``self``."""
        known_labels = np.unique(X)
        self.mapper = {
            label: code for code, label in enumerate(known_labels, start=1)
        }
        return self

    def fit_transform(self, X):
        """Fit on ``X`` and return its encoded form as a list of ints."""
        return self.fit(X).transform(X)

    def _map(self, x):
        """Id of a single label, or 0 when the label is unknown."""
        return self.mapper.get(x, 0)

    def transform(self, X):
        """Encode every element of ``X``; returns a plain Python list."""
        return [self._map(label) for label in X]
def get_cate_feat(df, isTrain):
    """Encode the ``category`` and ``host`` columns of ``df`` as integer arrays.

    With ``isTrain`` true a fresh ``MyLabelEncoder`` per column is fitted and
    stored in the module-level ``label_encoder`` dict; otherwise the encoders
    already stored there are reused (a ``KeyError`` results if none exist).

    Returns:
        ``[category_feat, host_feat]`` - two 1-D numpy arrays.
    """
    encoded = []
    for column in ("category", "host"):
        if isTrain:
            label_encoder[column] = MyLabelEncoder()
            codes = label_encoder[column].fit_transform(df[column])
        else:
            codes = label_encoder[column].transform(df[column])
        encoded.append(np.array(codes))
    return encoded
# Registry of fitted encoders keyed by column name; get_cate_feat reads and writes it.
label_encoder = {}
# Fit on train first so that the test call can reuse the fitted encoders.
cate_feat_tr = get_cate_feat(df_train, isTrain=True)
cate_feat_te = get_cate_feat(df_test, isTrain=False)

In [None]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1] * len(tokens) + [0] * (max_seq_length - len(tokens))


def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length - len(token_ids))
    return input_ids

def my_pad(text, max_length, tokenizer):
    """Tokenize ``text`` and keep at most ``max_length`` tokens (head + tail).

    Despite the name nothing is padded: a text that tokenizes to more than
    ``max_length`` tokens is shortened to its first 25% (rounded down) plus
    as many trailing tokens as are needed to reach ``max_length``; shorter
    texts are returned unchanged.

    Args:
        text: raw string to tokenize.
        max_length: maximum number of tokens to return.
        tokenizer: object exposing ``tokenize(str) -> list``.

    Returns:
        List of tokens, never longer than ``max_length``.
    """
    tokens = tokenizer.tokenize(text)
    if len(tokens) <= max_length:
        return tokens
    if max_length <= 0:
        # With a zero budget the tail slice would be tokens[-0:], i.e. the
        # whole list, so handle the empty budget explicitly.
        return []
    head_length = int(0.25 * max_length)
    tail_length = max_length - head_length  # >= 1 whenever max_length >= 1
    return tokens[:head_length] + tokens[-tail_length:]

def my_padQ(q, a, max_length, tokenizer):
    """Tokenize a question/answer pair, giving the question priority.

    The question is shortened first (to ``max_length - 10`` tokens, 25% head
    and the rest tail) whenever it would leave fewer than ten positions for
    the answer; the answer then gets whatever room remains, again split 25%
    head / 75% tail.

    Returns:
        ``(question_tokens, answer_tokens)``.
    """
    question_tokens = tokenizer.tokenize(q)
    answer_tokens = tokenizer.tokenize(a)

    if max_length <= len(question_tokens) + 10:
        budget = max_length - 10
        n_head = int(0.25 * budget)
        n_tail = budget - n_head
        question_tokens = question_tokens[:n_head] + question_tokens[-n_tail:]

    if max_length < len(question_tokens) + len(answer_tokens):
        budget = max_length - len(question_tokens)
        n_head = int(0.25 * budget)
        n_tail = budget - n_head
        answer_tokens = answer_tokens[:n_head] + answer_tokens[-n_tail:]
    return question_tokens, answer_tokens
def my_padA(q, a, max_length, tokenizer):
    """Tokenize a question/answer pair, giving the answer priority.

    Mirror image of ``my_padQ``: the answer is capped at ``max_length - 10``
    tokens first, then the question is cut to the room that is left. Both
    cuts keep 25% of the budget from the head and the rest from the tail.

    Returns:
        ``(question_tokens, answer_tokens)``.
    """
    def head_and_tail(tokens, budget):
        # keep int(25%) leading tokens and fill the remaining budget from the end
        n_head = int(0.25 * budget)
        n_tail = budget - n_head
        return tokens[:n_head] + tokens[-n_tail:]

    question_tokens = tokenizer.tokenize(q)
    answer_tokens = tokenizer.tokenize(a)

    if len(answer_tokens) + 10 >= max_length:
        answer_tokens = head_and_tail(answer_tokens, max_length - 10)

    if len(question_tokens) + len(answer_tokens) > max_length:
        question_tokens = head_and_tail(question_tokens, max_length - len(answer_tokens))
    return question_tokens, answer_tokens

def prepare_Qdata(df, tokenizer):
    """Build question-only BERT inputs for every row of ``df``.

    Title and body are lower-cased, joined with a space, shortened by
    ``my_pad`` and wrapped as ``[CLS] question [SEP]``. The answer text is
    not part of this input.

    Args:
        df: DataFrame with ``question_title`` and ``question_body`` columns.
        tokenizer: tokenizer passed on to ``my_pad`` and ``_get_ids``.

    Returns:
        ``[input_ids, input_masks, input_segments]`` - three int32 arrays with
        one row per row of ``df`` and ``MAX_SEQUENCE_LENGTH`` columns.
    """
    input_ids, input_masks, input_segments = [], [], []
    # Walk the two columns in lock-step instead of building a Series per row
    # with iterrows(); total= gives the progress bar a known length.
    rows = zip(df['question_title'], df['question_body'])
    for title, body in tqdm(rows, total=len(df)):
        question = str(title).lower() + ' ' + str(body).lower()
        # Three positions are held back from the budget although only two
        # special tokens are added below.
        question = my_pad(question, MAX_SEQUENCE_LENGTH - 3, tokenizer)

        stoken = ["[CLS]"] + question + ["[SEP]"]

        input_ids.append(_get_ids(stoken, tokenizer, MAX_SEQUENCE_LENGTH))
        input_masks.append(_get_masks(stoken, MAX_SEQUENCE_LENGTH))
        input_segments.append(_get_segments(stoken, MAX_SEQUENCE_LENGTH))

    return [np.asarray(input_ids, dtype=np.int32),
            np.asarray(input_masks, dtype=np.int32),
            np.asarray(input_segments, dtype=np.int32)]

def prepare_Adata(df, tokenizer):
    """Build question+answer BERT inputs for every row of ``df``.

    Title and body are lower-cased and joined into one question string; the
    pair is then shortened by ``my_padA`` (answer has priority) and laid out
    as ``[CLS] question [SEP] answer [SEP]``.

    Args:
        df: DataFrame with ``question_title``, ``question_body`` and
            ``answer`` columns.
        tokenizer: tokenizer passed on to ``my_padA`` and ``_get_ids``.

    Returns:
        ``[input_ids, input_masks, input_segments]`` - three int32 arrays with
        one row per row of ``df`` and ``MAX_SEQUENCE_LENGTH`` columns.
    """
    input_ids, input_masks, input_segments = [], [], []
    # Walk the columns in lock-step instead of building a Series per row with
    # iterrows(); total= gives the progress bar a known length.
    rows = zip(df['question_title'], df['question_body'], df['answer'])
    for title, body, answer_text in tqdm(rows, total=len(df)):
        question = str(title).lower() + ' ' + str(body).lower()
        # -3 leaves room for the [CLS] and the two [SEP] tokens added below
        question, answer = my_padA(question, str(answer_text).lower(),
                                   MAX_SEQUENCE_LENGTH - 3, tokenizer)

        stoken = ["[CLS]"] + question + ["[SEP]"] + answer + ["[SEP]"]

        input_ids.append(_get_ids(stoken, tokenizer, MAX_SEQUENCE_LENGTH))
        input_masks.append(_get_masks(stoken, MAX_SEQUENCE_LENGTH))
        input_segments.append(_get_segments(stoken, MAX_SEQUENCE_LENGTH))

    return [np.asarray(input_ids, dtype=np.int32),
            np.asarray(input_masks, dtype=np.int32),
            np.asarray(input_segments, dtype=np.int32)]

def compute_output_arrays(df, columns):
    """Return the ``columns`` selection of ``df`` as a numpy array."""
    selected = df[columns]
    return np.asarray(selected)

In [None]:
# Question-only inputs: [ids, masks, segments] followed by [category, host].
# NOTE(review): list.extend mutates in place, so re-running this cell without
# re-running the prepare_* lines above it is not possible in isolation; the
# lists are rebuilt on every run here, which keeps the cell idempotent.
Qinputs_tr = prepare_Qdata(df_train, tokenizer)
Qinputs_te = prepare_Qdata(df_test, tokenizer)
Qinputs_tr.extend(cate_feat_tr)
Qinputs_te.extend(cate_feat_te)
Qoutputs = compute_output_arrays(df_train, q_targets)

# Question+answer inputs with the same five-element layout.
# NOTE(review): only the *_te inputs are consumed by the inference cell that
# is visible in this notebook; the *_tr inputs and Qoutputs/Aoutputs look
# like leftovers from training - confirm before removing.
Ainputs_tr = prepare_Adata(df_train, tokenizer)
Ainputs_te = prepare_Adata(df_test, tokenizer)
Ainputs_tr.extend(cate_feat_tr)
Ainputs_te.extend(cate_feat_te)
Aoutputs = compute_output_arrays(df_train, a_targets)

In [None]:
import tensorflow as tf
from tensorflow.python.keras.optimizer_v2.optimizer_v2 import OptimizerV2
from tensorflow.python import ops, math_ops, state_ops, control_flow_ops
from tensorflow.python.keras import backend_config

# Export list naming the optimizer class defined below.
# NOTE(review): presumably carried over from a standalone module; it has no
# effect on how notebook cells see each other's names.
__all__ = ['AdamWarmup']


class AdamWarmup(OptimizerV2):
    """Adam optimizer with linear warmup followed by linear learning-rate decay.

    For steps up to ``warmup_steps`` the learning rate grows linearly from 0
    towards ``learning_rate``; afterwards it falls linearly towards ``min_lr``
    and stays there once ``decay_steps`` is reached. The usual Adam bias
    correction is applied on top of that schedule.

    NOTE(review): ``weight_decay_pattern`` is stored on the instance but is not
    read by any method of this class, so weight decay (when > 0) is applied
    to every variable.
    NOTE(review): with ``amsgrad=True`` the running maximum is computed but no
    assignment back to the ``vhat`` slot is visible, so the slot appears to
    keep its initial value - confirm before relying on AMSGrad.
    """

    def __init__(self,
                 decay_steps,
                 warmup_steps,
                 min_lr=0.0,
                 learning_rate=0.001,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-7,
                 weight_decay=0.,
                 weight_decay_pattern=None,
                 amsgrad=False,
                 name='Adam',
                 **kwargs):
        r"""Construct a new Adam optimizer with warmup.

        Args:
            decay_steps: Step at which the linearly decaying learning rate
                reaches ``min_lr``.
            warmup_steps: Number of initial steps over which the learning rate
                increases linearly up to ``learning_rate``.
            min_lr: float >= 0. Floor of the learning rate after decay.
            learning_rate: float >= 0. Peak learning rate. A ``lr`` keyword
                argument, when given, takes precedence over this value.
            beta_1: float, 0 < beta < 1. Generally close to 1.
            beta_2: float, 0 < beta < 1. Generally close to 1.
            epsilon: float >= 0. Fuzz factor. A falsy value (``None`` or 0)
                falls back to ``backend_config.epsilon()``.
            weight_decay: float >= 0. Weight decay.
            weight_decay_pattern: A list of strings. Stored, but not consulted
                by the update methods of this class (see class docstring).
            amsgrad: boolean. Whether to apply the AMSGrad variant of this
                algorithm from the paper "On the Convergence of Adam and
                Beyond".
            name: Name passed to the base optimizer.
            **kwargs: Forwarded to the base optimizer constructor.
        """

        super(AdamWarmup, self).__init__(name, **kwargs)
        self._set_hyper('decay_steps', float(decay_steps))
        self._set_hyper('warmup_steps', float(warmup_steps))
        self._set_hyper('min_lr', min_lr)
        # `lr` is accepted as an alias and wins over `learning_rate`
        self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
        self._set_hyper('decay', self._initial_decay)
        self._set_hyper('beta_1', beta_1)
        self._set_hyper('beta_2', beta_2)
        self._set_hyper('weight_decay', weight_decay)
        self.epsilon = epsilon or backend_config.epsilon()
        self.amsgrad = amsgrad
        # Python-level copy, used to decide at graph-build time whether the
        # decay term is added at all
        self._initial_weight_decay = weight_decay
        self._weight_decay_pattern = weight_decay_pattern

    def _create_slots(self, var_list):
        """Create the per-variable slots 'm', 'v' and, with amsgrad, 'vhat'."""
        for var in var_list:
            self.add_slot(var, 'm')
        for var in var_list:
            self.add_slot(var, 'v')
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, 'vhat')

    def set_weights(self, weights):
        """Set optimizer weights, truncating a list that carries 3 values per variable.

        When ``weights`` has ``3 * num_vars + 1`` entries it is cut down to
        the number of weights this optimizer currently holds before being
        passed to the base implementation.
        NOTE(review): presumably this lets weights saved with amsgrad slots be
        loaded into an optimizer without them - confirm.
        """
        params = self.weights
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[:len(params)]
        super(AdamWarmup, self).set_weights(weights)

    def _resource_apply_dense(self, grad, var):
        """Apply one Adam step with the warmup/decay schedule to a dense gradient."""
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
        # 1-based step counter
        local_step = math_ops.cast(self.iterations + 1, var_dtype)
        beta_1_power = math_ops.pow(beta_1_t, local_step)
        beta_2_power = math_ops.pow(beta_2_t, local_step)

        decay_steps = self._get_hyper('decay_steps', var_dtype)
        warmup_steps = self._get_hyper('warmup_steps', var_dtype)
        min_lr = self._get_hyper('min_lr', var_dtype)
        # Warmup: lr * step / warmup_steps. Afterwards: linear interpolation
        # from lr to min_lr, with the step clipped at decay_steps.
        lr_t = tf.where(
            local_step <= warmup_steps,
            lr_t * (local_step / warmup_steps),
            min_lr + (lr_t - min_lr) * (1.0 - tf.minimum(local_step, decay_steps) / decay_steps),
        )
        # Adam bias correction folded into the step size
        lr_t = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))

        # First-moment estimate
        m_t = state_ops.assign(m,
                               beta_1_t * m + (1.0 - beta_1_t) * grad,
                               use_locking=self._use_locking)

        # Second-moment estimate
        v_t = state_ops.assign(v,
                               beta_2_t * v + (1.0 - beta_2_t) * math_ops.square(grad),
                               use_locking=self._use_locking)

        if self.amsgrad:
            v_hat = self.get_slot(var, 'vhat')
            # NOTE(review): the maximum is not assigned back to the slot here
            v_hat_t = math_ops.maximum(v_hat, v_t)
            var_update = m_t / (math_ops.sqrt(v_hat_t) + epsilon_t)
        else:
            var_update = m_t / (math_ops.sqrt(v_t) + epsilon_t)

        if self._initial_weight_decay > 0.0:
            # Decay term is added to the Adam step and scaled by the same lr_t
            weight_decay = self._get_hyper('weight_decay', var_dtype)
            var_update += weight_decay * var
        var_update = state_ops.assign_sub(var, lr_t * var_update, use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        if self.amsgrad:
            updates.append(v_hat_t)
        return control_flow_ops.group(*updates)

    def _resource_apply_sparse(self, grad, var, indices):
        """Apply one Adam step with the warmup/decay schedule to a sparse gradient.

        Same schedule and bias correction as the dense path; the moment
        slots are first scaled by their beta and then receive the gradient
        contribution only at ``indices``.
        """
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
        # 1-based step counter
        local_step = math_ops.cast(self.iterations + 1, var_dtype)
        beta_1_power = math_ops.pow(beta_1_t, local_step)
        beta_2_power = math_ops.pow(beta_2_t, local_step)

        decay_steps = self._get_hyper('decay_steps', var_dtype)
        warmup_steps = self._get_hyper('warmup_steps', var_dtype)
        min_lr = self._get_hyper('min_lr', var_dtype)
        # Warmup then linear decay, identical to the dense path
        lr_t = tf.where(
            local_step <= warmup_steps,
            lr_t * (local_step / warmup_steps),
            min_lr + (lr_t - min_lr) * (1.0 - tf.minimum(local_step, decay_steps) / decay_steps),
        )
        # Adam bias correction folded into the step size
        lr_t = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))

        m = self.get_slot(var, 'm')
        m_scaled_g_values = grad * (1 - beta_1_t)
        # Scale the whole slot first, then scatter-add the new gradient part;
        # the control dependency enforces that order.
        m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

        v = self.get_slot(var, 'v')
        v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
        v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
        with ops.control_dependencies([v_t]):
            v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

        if self.amsgrad:
            v_hat = self.get_slot(var, 'vhat')
            # NOTE(review): the maximum is not assigned back to the slot here
            v_hat_t = math_ops.maximum(v_hat, v_t)
            var_update = m_t / (math_ops.sqrt(v_hat_t) + epsilon_t)
        else:
            var_update = m_t / (math_ops.sqrt(v_t) + epsilon_t)

        if self._initial_weight_decay > 0.0:
            # Decay term is added to the Adam step and scaled by the same lr_t
            weight_decay = self._get_hyper('weight_decay', var_dtype)
            var_update += weight_decay * var
        var_update = state_ops.assign_sub(var, lr_t * var_update, use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        if self.amsgrad:
            updates.append(v_hat_t)
        return control_flow_ops.group(*updates)

    def get_config(self):
        """Return the base config extended with this optimizer's hyperparameters.

        NOTE(review): ``weight_decay_pattern`` is not included in the config.
        """
        config = super(AdamWarmup, self).get_config()
        config.update({
            'decay_steps': self._serialize_hyperparameter('decay_steps'),
            'warmup_steps': self._serialize_hyperparameter('warmup_steps'),
            'min_lr': self._serialize_hyperparameter('min_lr'),
            'learning_rate': self._serialize_hyperparameter('learning_rate'),
            'decay': self._serialize_hyperparameter('decay'),
            'beta_1': self._serialize_hyperparameter('beta_1'),
            'beta_2': self._serialize_hyperparameter('beta_2'),
            'weight_decay': self._serialize_hyperparameter('weight_decay'),
            'epsilon': self.epsilon,
            'amsgrad': self.amsgrad,
        })
        return config

def calc_train_steps(num_example, batch_size, epochs, warmup_proportion=0.1):
    """Total number of optimizer steps and how many of them are warmup.

    One epoch takes ``ceil(num_example / batch_size)`` steps; the warmup count
    is the total multiplied by ``warmup_proportion``, truncated to an int.

    :param num_example: Number of examples in one epoch.
    :param batch_size: Batch size.
    :param epochs: Number of epochs.
    :param warmup_proportion: The proportion of warmup steps.
    :return: Tuple ``(total_steps, warmup_steps)``.
    """
    steps_per_epoch = (num_example + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * epochs
    return total_steps, int(total_steps * warmup_proportion)

In [None]:
class SpearmanRhoCallback(tf.keras.callbacks.Callback):
    """Validation Spearman rho with best-weights checkpointing and early stopping.

    At the end of every epoch the model predicts on the validation inputs,
    the Spearman correlation is computed per output column and averaged, and
    the mean is stored in ``logs['val_rho']``. Weights are saved to
    ``model_name`` whenever the score is at least as good as the best so
    far; training stops after ``patience`` consecutive epochs without that.

    Args:
        training_data: ``(x, y)`` pair; stored but not used by the callback.
        validation_data: ``(x_val, y_val)`` pair used for scoring.
        patience: number of non-improving epochs tolerated before stopping.
        model_name: path handed to ``model.save_weights``.
    """

    def __init__(self, training_data, validation_data, patience, model_name):
        # Initialise the Keras base class as well.
        super(SpearmanRhoCallback, self).__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

        self.patience = patience
        self.value = -1  # best score so far; -1 is the lowest possible rho
        self.bad_epochs = 0
        self.model_name = model_name

    # The hooks below are intentional no-ops. `logs` defaults to None rather
    # than a shared mutable dict.
    def on_train_begin(self, logs=None):
        return

    def on_train_end(self, logs=None):
        return

    def on_epoch_begin(self, epoch, logs=None):
        return

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set, checkpoint on improvement, maybe stop.

        Returns:
            The mean per-column Spearman rho for this epoch (NaN if any
            column's correlation is undefined, which counts as a bad epoch).
        """
        y_pred_val = self.model.predict(self.x_val)
        column_rhos = [
            spearmanr(self.y_val[:, ind], y_pred_val[:, ind]).correlation
            for ind in range(y_pred_val.shape[1])
        ]
        rho_val = np.mean(column_rhos)
        if rho_val >= self.value:
            self.value = rho_val
            self.bad_epochs = 0
            self.model.save_weights(self.model_name)
        else:
            self.bad_epochs += 1
        if self.bad_epochs >= self.patience:
            print("Epoch %05d: early stopping Threshold" % epoch)
            self.model.stop_training = True
        print('\rval_spearman-rho: %s' % (str(round(rho_val, 4))), end=100 * ' ' + '\n')
        if logs is not None:
            # Only write into a dict the caller actually supplied
            logs['val_rho'] = rho_val
        return rho_val

    def on_batch_begin(self, batch, logs=None):
        return

    def on_batch_end(self, batch, logs=None):
        return

In [None]:
def bert_model(bert_trainabel, learning_rate, len_tr, BATCH_SIZE, NUM_EPOCHS, out_dim):
    """Build and compile the BERT + categorical-embedding regression model.

    The model takes five inputs (word ids, masks, segments, category id, host
    id), mean-pools the BERT sequence output, concatenates it with the
    flattened category/host embeddings and maps the result to ``out_dim``
    sigmoid outputs. It is compiled with binary cross-entropy and the
    ``AdamWarmup`` optimizer.

    Args:
        bert_trainabel: passed as ``trainable`` to the hub BERT layer.
        learning_rate: peak learning rate for ``AdamWarmup``.
        len_tr: number of training examples; with ``BATCH_SIZE`` and
            ``NUM_EPOCHS`` it only determines the optimizer's step schedule.
        BATCH_SIZE: batch size used for the step computation.
        NUM_EPOCHS: number of epochs used for the step computation.
        out_dim: number of output units.

    Returns:
        The compiled ``tf.keras`` model.

    NOTE(review): fold weights are restored into this model from .h5 files
    elsewhere in the notebook; that is presumably sensitive to the order in
    which layers are created here, so avoid reordering - confirm.
    """
    EMB_SIZE = 32

    input_word_ids = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')

    input_category = tf.keras.layers.Input((1,), dtype=tf.int32, name='input_category')
    input_host = tf.keras.layers.Input((1,), dtype=tf.int32, name='input_host')

    # NOTE(review): input_dim 6 / 65 are hard-coded; presumably the number of
    # distinct categories / hosts plus the reserved id 0 - verify against the
    # fitted label encoders.
    category_emb = tf.keras.layers.SpatialDropout1D(0.1)(
        tf.keras.layers.Embedding(input_dim=6, output_dim=EMB_SIZE)(input_category))
    host_emb = tf.keras.layers.SpatialDropout1D(0.1)(
        tf.keras.layers.Embedding(input_dim=65, output_dim=EMB_SIZE)(input_host))
    features_dense = tf.keras.layers.concatenate([category_emb, host_emb], axis=1)
    features_dense = tf.keras.layers.Flatten()(features_dense)

    bert_layer = hub.KerasLayer(BERT_PATH, trainable=bert_trainabel)
    # pooled_output is not used below; only the per-token sequence output is
    pooled_output, sequence_output = bert_layer([input_word_ids, input_masks, input_segments])

    x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.concatenate([x, features_dense])

    out = tf.keras.layers.Dense(out_dim, activation="sigmoid", name="dense_output")(x)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_masks, input_segments, input_category, input_host],
                                  outputs=out)
    # Total and warmup step counts for the learning-rate schedule
    decay_steps, warmup_steps = calc_train_steps(
        len_tr,
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHS,
        )
    adamW_opt = AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=learning_rate, min_lr=0,)
    # Nadam = tf.keras.optimizers.Nadam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, schedule_decay=0.004)
    model.compile(loss='binary_crossentropy', optimizer=adamW_opt)

    return model

In [None]:
# Sanity check: list the dataset directory from which the fold weights are loaded.
!ls ../input/qa2bert1230

In [None]:
import time
# Directory holding the fine-tuned weights, one .h5 file per model and fold.
MODEL_PATH = '../input/qa2bert1230'
N_FOLDS = 5
all_predictions_q = []
all_predictions_a = []
start_time = time.time()

# One entry per model family:
# (weight-file prefix, test inputs, target names, list collecting fold predictions)
_inference_jobs = [
    ('Qbert', Qinputs_te, q_targets, all_predictions_q),
    ('Abert', Ainputs_te, a_targets, all_predictions_a),
]
for weight_prefix, fold_inputs, fold_targets, fold_predictions in _inference_jobs:
    for fold_ in range(0, N_FOLDS):
        print('fold: ', fold_)
        # Drop the previous fold's graph before building the next model
        K.clear_session()

        model_name = f'{MODEL_PATH}/{weight_prefix}_fold{fold_}.h5'

        # learning_rate / len_tr / BATCH_SIZE / NUM_EPOCHS only configure the
        # optimizer, which is not exercised during prediction.
        model = bert_model(bert_trainabel=False, learning_rate=5e-5, len_tr=len(df_test), BATCH_SIZE=4, NUM_EPOCHS=4, out_dim=len(fold_targets))
        model.load_weights(model_name)

        fold_predictions.append(model.predict(fold_inputs))
        # '.2f' prints two decimals; the previous '.2' meant two significant
        # digits and switched to scientific notation for large values.
        print("time elapsed: {:<5.2f}m".format((time.time() - start_time) / 60))

In [None]:
# Average the per-fold predictions (axis 0 runs over the list of folds).
test_preds_q = np.mean(all_predictions_q, axis=0)
test_preds_a = np.mean(all_predictions_a, axis=0)
print(test_preds_q.shape, test_preds_a.shape)

In [None]:
# Put question predictions first and answer predictions after them, column-wise.
test_preds1 = np.concatenate([test_preds_q, test_preds_a], axis=1)
print(test_preds1.shape)

In [None]:
# Target column names taken by position from the training frame.
# NOTE(review): the slice end 42 relies on the column layout of train.csv -
# confirm it matches the number of target columns.
target_col = df_train.columns.tolist()[11:42]

# Re-derives output_categories the same way as the first cell of the notebook.
output_categories = list(df_train.columns[11:])
print('\noutput categories:\n\t', output_categories)

# Explicit target list: the question targets followed by the answer targets.
targets = [
        'question_asker_intent_understanding',
        'question_body_critical',
        'question_conversational',
        'question_expect_short_answer',
        'question_fact_seeking',
        'question_has_commonly_accepted_answer',
        'question_interestingness_others',
        'question_interestingness_self',
        'question_multi_intent',
        'question_not_really_a_question',
        'question_opinion_seeking',
        'question_type_choice',
        'question_type_compare',
        'question_type_consequence',
        'question_type_definition',
        'question_type_entity',
        'question_type_instructions',
        'question_type_procedure',
        'question_type_reason_explanation',
        'question_type_spelling',
        'question_well_written',
        'answer_helpful',
        'answer_level_of_information',
        'answer_plausible',
        'answer_relevance',
        'answer_satisfaction',
        'answer_type_instructions',
        'answer_type_procedure',
        'answer_type_reason_explanation',
        'answer_well_written'
    ]

In [None]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1] * len(tokens) + [0] * (max_seq_length - len(tokens))


def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length - len(token_ids))
    return input_ids

def my_pad(text, max_length, tokenizer):
    """Tokenize ``text`` and keep at most ``max_length`` tokens (head + tail).

    Nothing is padded despite the name. Over-long token lists keep their
    first ``int(0.25 * max_length)`` tokens and fill the rest of the budget
    from the end; shorter lists are returned as they are.

    Args:
        text: raw string to tokenize.
        max_length: maximum number of tokens to return.
        tokenizer: object exposing ``tokenize(str) -> list``.

    Returns:
        List of tokens, never longer than ``max_length``.
    """
    tokens = tokenizer.tokenize(text)
    if len(tokens) <= max_length:
        return tokens
    if max_length <= 0:
        # A zero budget would make the tail slice tokens[-0:], which is the
        # whole list rather than nothing.
        return []
    head_length = int(0.25 * max_length)
    tail_length = max_length - head_length  # >= 1 whenever max_length >= 1
    return tokens[:head_length] + tokens[-tail_length:]

def _trim_input(title, question, answer, max_sequence_length,
                t_max_len=30, q_max_len=239, a_max_len=239):
    """Tokenize title, question and answer and cut them to a shared length budget.

    Tokenization uses the module-level ``tokenizer``. If the three token
    lists plus 4 extra positions fit into ``max_sequence_length`` they are
    returned untouched. Otherwise each part gets a budget (``t_max_len``,
    ``q_max_len``, ``a_max_len``), unused budget of a short part is handed to
    the others, and every part is rebuilt from its first 25% (rounded down)
    and the remaining budget taken from its end.

    Args:
        title, question, answer: raw strings.
        max_sequence_length: total length the budgets plus 4 must add up to.
        t_max_len, q_max_len, a_max_len: per-part token budgets; the
            defaults sum to 508, i.e. 512 minus the 4 extra positions.

    Returns:
        Tuple ``(t, q, a)`` of token lists.

    Raises:
        ValueError: if the redistributed budgets plus 4 do not equal
            ``max_sequence_length``.

    NOTE(review): the redistributed budgets are not capped by the actual
    part lengths. When the title is over budget but the total overshoot is
    small, ``q_new_len`` or ``a_new_len`` can exceed the real token count, in
    which case the head and tail slices overlap and tokens appear twice.
    Confirm whether this is intended before changing it - presumably the
    saved weights were trained with exactly this preprocessing.
    """
    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)

    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    # +4: positions reserved besides the three parts (prepare_data adds
    # [CLS], ",", [SEP] and [SEP] around them)
    if (t_len + q_len + a_len + 4) > max_sequence_length:

        if t_max_len > t_len:
            # Short title: split its unused budget between answer (floor)
            # and question (ceil)
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len) / 2)
            q_max_len = q_max_len + ceil((t_max_len - t_len) / 2)
        else:
            t_new_len = t_max_len

        if a_max_len > a_len:
            # Short answer: the question inherits the unused answer budget
            a_new_len = a_len
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            # Short question: the answer inherits the unused question budget
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len

        # The budgets must fill the sequence exactly
        if t_new_len + a_new_len + q_new_len + 4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d"
                             % (max_sequence_length, (t_new_len + a_new_len + q_new_len + 4)))

        # Each part keeps 25% of its budget from the head, the rest from the tail
        head_t_new_len = int(0.25 * t_new_len)
        tail_t_new_len = t_new_len - head_t_new_len

        head_q_new_len = int(0.25 * q_new_len)
        tail_q_new_len = q_new_len - head_q_new_len

        head_a_new_len = int(0.25 * a_new_len)
        tail_a_new_len = a_new_len - head_a_new_len

        t = t[:head_t_new_len] + t[-tail_t_new_len:]
        q = q[:head_q_new_len] + q[-tail_q_new_len:]
        a = a[:head_a_new_len] + a[-tail_a_new_len:]

    return t, q, a

def prepare_data(df, tokenizer):
    """Build joint title/question/answer BERT inputs for every row of ``df``.

    Each row is lower-cased, trimmed by ``_trim_input`` with budgets of
    15 / 245 / 248 tokens and laid out as
    ``[CLS] title , body [SEP] answer [SEP]``.

    Args:
        df: DataFrame with ``question_title``, ``question_body`` and
            ``answer`` columns. The user-name columns are no longer required
            because nothing here reads them.
        tokenizer: tokenizer used for the id lookup. ``_trim_input`` itself
            tokenizes with the module-level ``tokenizer``.

    Returns:
        ``[input_ids, input_masks, input_segments]`` - three int32 arrays with
        one row per row of ``df`` and ``MAX_SEQUENCE_LENGTH`` columns.
    """
    input_ids, input_masks, input_segments = [], [], []
    # Walk the columns in lock-step instead of building a Series per row with
    # iterrows(); total= gives the progress bar a known length.
    rows = zip(df['question_title'], df['question_body'], df['answer'])
    for title, body, answer in tqdm(rows, total=len(df)):
        title_tokens, body_tokens, answer_tokens = _trim_input(
            str(title).lower(), str(body).lower(), str(answer).lower(),
            MAX_SEQUENCE_LENGTH, t_max_len=15, q_max_len=245, a_max_len=248)

        stoken = ["[CLS]"] + title_tokens + [","] + body_tokens + ["[SEP]"] + answer_tokens + ["[SEP]"]

        input_ids.append(_get_ids(stoken, tokenizer, MAX_SEQUENCE_LENGTH))
        input_masks.append(_get_masks(stoken, MAX_SEQUENCE_LENGTH))
        input_segments.append(_get_segments(stoken, MAX_SEQUENCE_LENGTH))

    return [np.asarray(input_ids, dtype=np.int32),
            np.asarray(input_masks, dtype=np.int32),
            np.asarray(input_segments, dtype=np.int32)]

def compute_output_arrays(df, columns):
    """Select ``columns`` from ``df`` and hand them back as a numpy array."""
    target_frame = df[columns]
    return np.asarray(target_frame)

# Target matrix built from every output column of the training frame.
outputs = compute_output_arrays(df_train, output_categories)

# Only the test set is tokenized here; the train-side call is left disabled.
# inputs = prepare_data(df_train, tokenizer)
test_inputs = prepare_data(df_test, tokenizer)

In [None]:
class MyLabelEncoder(object):
    """Integer label encoder that sends labels unseen at fit time to 0.

    ``fit`` numbers the distinct labels 1..K in ``np.unique`` order, leaving
    0 free as the code for anything not in the fitted vocabulary.
    """

    def __init__(self):
        # label -> positive integer id; filled by fit()
        self.mapper = {}

    def fit(self, X):
        """Build the label-to-id table from ``X``; returns ``self``."""
        vocabulary = np.unique(X)
        ids = range(1, len(vocabulary) + 1)
        self.mapper = {label: idx for label, idx in zip(vocabulary, ids)}
        return self

    def fit_transform(self, X):
        """Fit on ``X``, then encode it."""
        self.fit(X)
        return self.transform(X)

    def _map(self, x):
        """Look up one label, falling back to 0 for unknown labels."""
        return self.mapper.get(x, 0)

    def transform(self, X):
        """Encode each element of ``X`` into a list of ints."""
        return [self._map(item) for item in X]
def get_cate_feat(df, isTrain):
    """Integer-encode the ``category`` and ``host`` columns of ``df``.

    In training mode new ``MyLabelEncoder`` objects are fitted and kept in
    the module-level ``label_encoder`` dict; in test mode the stored
    encoders are applied as they are.

    Returns:
        ``[category_feat, host_feat]`` as two 1-D numpy arrays.
    """
    features = []
    for name in ("category", "host"):
        if isTrain:
            label_encoder[name] = MyLabelEncoder()
            values = label_encoder[name].fit_transform(df[name])
        else:
            values = label_encoder[name].transform(df[name])
        features.append(np.array(values))
    return features
# Fresh encoder registry; fitted on train, then applied to test.
label_encoder = {}
cate_feat_tr = get_cate_feat(df_train, isTrain=True)
cate_feat_te = get_cate_feat(df_test, isTrain=False)

# inputs.extend(cate_feat_tr)
# NOTE(review): extend mutates test_inputs in place, so running this cell
# twice without rebuilding test_inputs appends the features a second time.
test_inputs.extend(cate_feat_te)

In [None]:
class SpearmanRhoCallback(tf.keras.callbacks.Callback):
    """Validation Spearman rho with best-weights checkpointing and early stopping.

    At the end of every epoch the model predicts on the validation inputs,
    the Spearman correlation is computed per output column and averaged, and
    the mean is stored in ``logs['val_rho']``. Weights are saved to
    ``model_name`` whenever the score is at least as good as the best so
    far; training stops after ``patience`` consecutive epochs without that.

    Args:
        training_data: ``(x, y)`` pair; stored but not used by the callback.
        validation_data: ``(x_val, y_val)`` pair used for scoring.
        patience: number of non-improving epochs tolerated before stopping.
        model_name: path handed to ``model.save_weights``.
    """

    def __init__(self, training_data, validation_data, patience, model_name):
        # Initialise the Keras base class as well.
        super(SpearmanRhoCallback, self).__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

        self.patience = patience
        self.value = -1  # best score so far; -1 is the lowest possible rho
        self.bad_epochs = 0
        self.model_name = model_name

    # The hooks below are intentional no-ops. `logs` defaults to None rather
    # than a shared mutable dict.
    def on_train_begin(self, logs=None):
        return

    def on_train_end(self, logs=None):
        return

    def on_epoch_begin(self, epoch, logs=None):
        return

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set, checkpoint on improvement, maybe stop.

        Returns:
            The mean per-column Spearman rho for this epoch (NaN if any
            column's correlation is undefined, which counts as a bad epoch).
        """
        y_pred_val = self.model.predict(self.x_val)
        column_rhos = [
            spearmanr(self.y_val[:, ind], y_pred_val[:, ind]).correlation
            for ind in range(y_pred_val.shape[1])
        ]
        rho_val = np.mean(column_rhos)
        if rho_val >= self.value:
            self.value = rho_val
            self.bad_epochs = 0
            self.model.save_weights(self.model_name)
        else:
            self.bad_epochs += 1
        if self.bad_epochs >= self.patience:
            print("Epoch %05d: early stopping Threshold" % epoch)
            self.model.stop_training = True
        print('\rval_spearman-rho: %s' % (str(round(rho_val, 4))), end=100 * ' ' + '\n')
        if logs is not None:
            # Only write into a dict the caller actually supplied
            logs['val_rho'] = rho_val
        return rho_val

    def on_batch_begin(self, batch, logs=None):
        return

    def on_batch_end(self, batch, logs=None):
        return
import itertools

def spearman_loss(y_true, y_pred):
  """Variance of the prediction error scaled by the spreads of both inputs.

  Computes, along the last axis, ``var(y_pred - y_true)`` divided by
  ``std(y_pred) * std(y_true)``. No ranking is performed despite the name,
  and nothing guards against a zero standard deviation.
  """
  error_variance = K.var(y_pred - y_true, axis=-1)
  spread_product = K.std(y_pred, axis=-1) * K.std(y_true, axis=-1)
  return error_variance / spread_product

def bert_model(bert_trainabel, learning_rate, len_tr, BATCH_SIZE, NUM_EPOCHS):
    """Build and compile the BERT + categorical-embedding model.

    The network takes five integer inputs (word ids, masks, segments, category
    id, host id). The BERT sequence output is average-pooled, passed through
    dropout, concatenated with the flattened category/host embeddings and fed
    to a 30-unit sigmoid layer named ``dense_output``.

    Args:
        bert_trainabel: forwarded as ``trainable`` to ``hub.KerasLayer``.
        learning_rate: forwarded as ``lr`` to ``AdamWarmup``.
        len_tr: example count given to ``calc_train_steps``.
        BATCH_SIZE: batch size given to ``calc_train_steps``.
        NUM_EPOCHS: epoch count given to ``calc_train_steps``.

    Returns:
        A ``tf.keras`` model compiled with binary cross-entropy and an
        ``AdamWarmup`` optimizer.
    """
    layers = tf.keras.layers
    EMB_SIZE = 32

    def int_input(width, name):
        # Every model input is an int32 tensor of fixed width.
        return layers.Input((width,), dtype=tf.int32, name=name)

    def embed(ids, vocab_size):
        # Embedding lookup followed by spatial dropout.
        dropout = layers.SpatialDropout1D(0.1)
        return dropout(layers.Embedding(input_dim=vocab_size, output_dim=EMB_SIZE)(ids))

    # Token-level inputs for BERT.
    input_word_ids = int_input(MAX_SEQUENCE_LENGTH, 'input_word_ids')
    input_masks = int_input(MAX_SEQUENCE_LENGTH, 'input_masks')
    input_segments = int_input(MAX_SEQUENCE_LENGTH, 'input_segments')

    # One integer id per example for each categorical feature.
    input_category = int_input(1, 'input_category')
    input_host = int_input(1, 'input_host')

    # Vocabulary sizes 6 and 65 are hard-coded; presumably they match the
    # number of distinct category/host labels -- TODO confirm.
    category_emb = embed(input_category, 6)
    host_emb = embed(input_host, 65)
    stacked_embeddings = layers.concatenate([category_emb, host_emb], axis=1)
    features_dense = layers.Flatten()(stacked_embeddings)

    bert_layer = hub.KerasLayer(BERT_PATH, trainable=bert_trainabel)
    pooled_output, sequence_output = bert_layer([input_word_ids, input_masks, input_segments])

    # pooled_output is only reported; the head is built on sequence_output.
    print('pooled_output: ', pooled_output)
    print('sequence_output: ', sequence_output)

    text_features = layers.GlobalAveragePooling1D()(sequence_output)
    text_features = layers.Dropout(0.1)(text_features)
    merged = layers.concatenate([text_features, features_dense])

    out = layers.Dense(30, activation="sigmoid", name="dense_output")(merged)

    model = tf.keras.models.Model(
        inputs=[input_word_ids, input_masks, input_segments, input_category, input_host],
        outputs=out,
    )

    # Learning-rate schedule lengths are derived from dataset size, batch size
    # and number of epochs.
    decay_steps, warmup_steps = calc_train_steps(len_tr, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)
    adamW_opt = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=learning_rate,
        min_lr=0,
    )
    model.compile(loss='binary_crossentropy', optimizer=adamW_opt)

    return model

In [None]:
# List the contents of the pretrained-weights dataset; the inference cell
# further down loads bert_fold<N>.h5 files from this same directory.
!ls ../input/bert-weights-repeatkfold1224

In [None]:
import time

# Directory holding one saved weight file per fold (bert_fold1.h5 ... bert_fold10.h5).
MODEL_PATH = '../input/bert-weights-repeatkfold1224'

# test_inputs is a list of model input arrays, so len(test_inputs) is the
# number of inputs, not the number of test rows. Size the placeholder from
# df_test so that len(y_test) really is the example count.
y_test = np.zeros((len(df_test), len(targets)))
all_predictions = []
start_time = time.time()
for fold_ in range(1, 11):
    print('fold: ', fold_)
    # Drop the previous fold's graph before building the next model.
    K.clear_session()

    model_name = f'{MODEL_PATH}/bert_fold{fold_}.h5'

    # The optimizer settings only matter for compile(); this loop never trains.
    model = bert_model(bert_trainabel=False, learning_rate=5e-5, len_tr=len(y_test), BATCH_SIZE=4, NUM_EPOCHS=4)
    model.load_weights(model_name)

    all_predictions.append(model.predict(test_inputs))
    # '.2f' gives two decimals; a bare '.2' means two significant digits and
    # flips to scientific notation from 10 minutes on.
    print("time elapsed: {:<5.2f}m".format((time.time() - start_time) / 60))

In [None]:
# Ensemble the folds: put the per-fold prediction arrays along a new leading
# axis and average across it.
test_preds2 = np.asarray(all_predictions).mean(axis=0)

In [None]:
# Equal-weight blend of the two prediction sets. test_preds1 comes from an
# earlier cell -- presumably another model's averaged predictions with the
# same shape; verify before changing the weights.
test_preds = 0.5 * test_preds1 + 0.5 * test_preds2

In [None]:
# Post-process the blended predictions and write submission.csv.
#
# hyper2.csv supplies, per target column, two scores (score1, score2), a
# fraction `pct` and a direction `choice` ('low' or 'up'):
#   * columns whose score1 is the row-wise maximum of (score1, score2) are
#     floored onto a grid of multiples of 1/90;
#   * every other column has its lowest ('low') or highest ('up') `pct` share
#     of rows overwritten with 0 or 1 respectively.
pred = np.copy(test_preds)

hyper = pd.read_csv('../input/bert-base/hyper2.csv')
submission = pd.read_csv(PATH+'sample_submission.csv')

# The scalar lookups below need exactly one hyper row per target column.
if not hyper['col'].is_unique:
    raise ValueError('hyper2.csv lists a target column more than once')
hyper_by_col = hyper.set_index('col')

col1 = hyper.loc[hyper[['score1', 'score2']].max(axis=1) == hyper.score1, 'col'].tolist()

for col in col1:
    colidx = target_col.index(col)
    pred[:, colidx] = (pred[:, colidx]//(1/90))/90

for col in [x for x in target_col if x not in col1]:
    # .at returns plain scalars and raises KeyError for a column that is
    # missing from hyper2.csv, instead of failing later on an empty selection.
    pct = hyper_by_col.at[col, 'pct']
    choice = hyper_by_col.at[col, 'choice']

    # Pull the exact extremes 1 and 0 inwards by 0.005 -- presumably so that
    # a column is never overwritten entirely or left entirely untouched.
    if pct == 1:
        pct = pct - 0.005
    elif pct == 0:
        pct = pct + 0.005

    changerow = int(len(df_test) * pct)
    colidx = target_col.index(col)
    order = pred[:, colidx].argsort()

    if choice == 'low':
        rowidx = order[:changerow]
        pred[rowidx, colidx] = 0
    elif choice == 'up':
        # Use an explicit start index: order[-changerow:] turns into
        # order[0:] when changerow == 0 and would set the whole column to 1.
        rowidx = order[max(len(order) - changerow, 0):]
        pred[rowidx, colidx] = 1
    else:
        print('Wrong!')

submission[target_col] = pred


submission.to_csv("submission.csv", index=False)