In [None]:
# Install transformers from a local directory under ../input instead of PyPI; pip's stdout is discarded.
!pip install ../input/transformers280/transformers/ > /dev/null

In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertConfig
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from keras.callbacks import Callback
import tensorflow.keras.backend as K
from sklearn.model_selection import GroupKFold

In [None]:
max_sequence_length = 380   # Tokens per encoded sequence; higher values cause Google Colab to crash
n_epoch = 3                 # Training epochs; 3 or 4 is good enough
learning_rate = 2e-5        # Candidate values: 5e-5, 3e-5, 2e-5
n_fold = 5                  # Train/valid split: 1/n_fold of the data is held out for validation in each fold
batch_size = 8              # Larger batches cause Google Colab to crash
dropout_rate = 0.1          # Dropout rate applied right before the output layer in create_model()

In [None]:
# Checkpoint whose weights are loaded for inference; the path is relative to '../input/'.
model_version = 'bertv11-3e/best_model_2.h5'
# Directory with the competition CSV files (train.csv, test.csv, sample_submission.csv).
DATA_PATH = '../input/google-quest-challenge'
# Directory with the local BERT vocabulary file and TensorFlow weights file.
BERT_PATH = '../input/bertbaseuncased/bert-base-uncased'

In [None]:
# The tokenizer is built directly from the local vocabulary file.
tokenizer = BertTokenizer(os.path.join(BERT_PATH, 'bert-base-uncased-vocab.txt'))

# Read the train and test tables from the competition data directory.
df_train, df_test = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Text columns fed to the model, and the label columns (every train column from position 11 onward).
input_columns = df_train[['question_title', 'question_body', 'answer']].columns.tolist()
output_labels = df_train.columns[11:].tolist()

In [None]:
def process_sequence(str1, str2, length, truncation_strategy='longest_first', bert_tokenizer=None):
    """
    Encode a sequence (or sequence pair) into fixed-length ids, mask and segments.

    Args:
        str1: First text to encode.
        str2: Optional second text; pass ``None`` to encode a single sequence.
        length: Target length. It is forwarded to the tokenizer as ``max_length``
            and the result is right-padded with zeros up to this length.
        truncation_strategy: Forwarded unchanged to ``encode_plus``.
        bert_tokenizer: Object exposing ``encode_plus``. When ``None`` (the
            default) the notebook-level ``tokenizer`` is used, which keeps
            existing three- and four-argument calls working.

    Returns:
        A tuple ``(input_ids, attention_mask, token_type_ids)`` of three lists.
        Each list has ``length`` entries as long as the tokenizer returned at
        most ``length`` tokens.
    """
    if bert_tokenizer is None:
        # Backward-compatible fallback to the tokenizer defined at notebook level.
        bert_tokenizer = tokenizer

    inputs = bert_tokenizer.encode_plus(str1, str2,
        add_special_tokens=True,
        max_length=length,
        truncation_strategy=truncation_strategy)

    input_ids = inputs["input_ids"]
    segment_ids = inputs["token_type_ids"]

    # Real tokens get mask 1; all three lists are then zero-padded by the same amount.
    attention_mask = [1] * len(input_ids)
    padding = [0] * (length - len(input_ids))

    return input_ids + padding, attention_mask + padding, segment_ids + padding

def convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
    """
    Encode one (title, question, answer) example into the six BERT input lists.

    The title and question body are joined with a single space and encoded as
    one sequence; the answer is encoded as a second, independent sequence.

    Args:
        title: Question title text.
        question: Question body text.
        answer: Answer text.
        tokenizer: Tokenizer used for both sequences. It is forwarded to
            ``process_sequence``; ``None`` falls back to the notebook-level
            ``tokenizer``.
        max_sequence_length: Length each returned list is padded to.

    Returns:
        ``(id_q, mask_q, segment_q, id_a, mask_a, segment_a)``.
    """
    question_text = title + ' ' + question
    id_q, mask_q, segment_q = process_sequence(
        question_text, None, max_sequence_length, bert_tokenizer=tokenizer)

    id_a, mask_a, segment_a = process_sequence(
        answer, None, max_sequence_length, bert_tokenizer=tokenizer)

    return id_q, mask_q, segment_q, id_a, mask_a, segment_a

In [None]:
def compute_input(df, columns, tokenizer, max_sequence_length):
    """
    Encode every row of ``df`` into the six int32 arrays consumed by the model.

    Args:
        df: DataFrame holding the text columns.
        columns: Columns to select before iterating; must include
            ``question_title``, ``question_body`` and ``answer``.
        tokenizer: Forwarded to ``convert_to_bert_inputs``.
        max_sequence_length: Forwarded to ``convert_to_bert_inputs``.

    Returns:
        A list of six ``np.int32`` arrays in the order
        ``[ids_q, masks_q, segments_q, ids_a, masks_a, segments_a]``, with one
        row per row of ``df``. An empty ``df`` yields six empty 1-D arrays.
    """
    # itertuples avoids building a Series per row; total=len(df) lets tqdm
    # show percentage and ETA instead of a bare counter.
    rows = tqdm(df[columns].itertuples(index=False), total=len(df))
    encoded_rows = [
        convert_to_bert_inputs(row.question_title, row.question_body, row.answer,
                               tokenizer, max_sequence_length)
        for row in rows
    ]

    # Transpose the per-row 6-tuples into six per-feature sequences.
    feature_columns = zip(*encoded_rows) if encoded_rows else [[] for _ in range(6)]

    return [np.asarray(feature, dtype=np.int32) for feature in feature_columns]

def compute_output(df, columns):
    """Return the ``columns`` selection of ``df`` converted to a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)

In [None]:
def create_model():
    """
    Build the question/answer Keras model.

    The question and the answer are each described by three int32 inputs of
    length ``max_sequence_length`` (token ids, attention mask, segment ids).
    Both sides are passed through the same ``TFBertModel`` instance, so the
    encoder weights are shared. The first output of each BERT call is
    average-pooled, the two pooled vectors are concatenated, and the result
    goes through two ``Dense(1500)`` layers, dropout and a 30-unit sigmoid
    output layer.

    Returns:
        An uncompiled ``tf.keras.models.Model`` whose inputs are ordered
        ``[q_id, q_mask, q_seg, a_id, a_mask, a_seg]`` -- the same order in
        which ``compute_input`` returns its arrays.
    """
    # Token ids for the question and the answer.
    q_id = tf.keras.layers.Input((max_sequence_length,), dtype=tf.int32)
    a_id = tf.keras.layers.Input((max_sequence_length,), dtype=tf.int32)
    
    # Attention masks (1 for real tokens, 0 for padding, as produced by process_sequence).
    q_mask = tf.keras.layers.Input((max_sequence_length,), dtype=tf.int32)
    a_mask = tf.keras.layers.Input((max_sequence_length,), dtype=tf.int32)
    
    # Segment (token type) ids.
    q_seg = tf.keras.layers.Input((max_sequence_length,), dtype=tf.int32)
    a_seg = tf.keras.layers.Input((max_sequence_length,), dtype=tf.int32)
    
    # Library-default BertConfig; nothing is read from BERT_PATH for the config.
    # NOTE(review): presumably the defaults match the bert-base-uncased checkpoint -- confirm.
    config = BertConfig() 
    config.output_hidden_states = False # Set to True to obtain hidden states
    
    # Normally ".from_pretrained('bert-base-uncased')", but there is no internet access, so the
    # pretrained model was downloaded manually and uploaded to Kaggle.
    bert_model = TFBertModel.from_pretrained(os.path.join(BERT_PATH, 'bert-base-uncased-tf_model.h5'), config=config)
    
    # Get the hidden embedding of the question/answer sequence (first element of the BERT output).
    # The same bert_model object is called twice, so both branches share one encoder.
    q_emb = bert_model(q_id, attention_mask=q_mask, token_type_ids=q_seg)[0]
    a_emb = bert_model(a_id, attention_mask=a_mask, token_type_ids=a_seg)[0]
    
    # NOTE(review): pooling is called without a mask, so padded positions presumably
    # contribute to the average -- confirm this is intended.
    q = tf.keras.layers.GlobalAveragePooling1D()(q_emb)
    a = tf.keras.layers.GlobalAveragePooling1D()(a_emb)
    
    x = tf.keras.layers.Concatenate()([q, a])

    # NOTE(review): no `activation` is passed, so these two layers are presumably linear.
    # Do not change this without retraining: the weights loaded later in this notebook
    # come from a checkpoint that has to match this architecture.
    x = tf.keras.layers.Dense(1500)(x)
    x = tf.keras.layers.Dense(1500)(x)
    
    x = tf.keras.layers.Dropout(dropout_rate)(x)
    
    # 30 sigmoid units: one output per target.
    x = tf.keras.layers.Dense(30, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=[q_id, q_mask, q_seg, a_id, a_mask, a_seg], outputs=x)
    
    return model
  
def compute_rho(trues, preds):
    """
    Mean Spearman rank correlation over matching columns of ``trues`` and ``preds``.

    Columns whose correlation is NaN are ignored by the mean.
    """
    column_pairs = zip(np.transpose(trues), np.transpose(preds))
    per_column = [
        spearmanr(true_col, pred_col).correlation
        for true_col, pred_col in column_pairs
    ]
    return np.nanmean(per_column)

In [None]:
%%time
# Inference: rebuild the network, restore the saved weights and predict on the test set.
test_preds = []  # one prediction array per model; a single checkpoint is used here
K.clear_session()
model = create_model()
model.load_weights(os.path.join('../input/', model_version))
test_inputs = compute_input(df_test, input_columns, tokenizer, max_sequence_length)
test_preds.append(model.predict(test_inputs))

# Overwrite every column after the first in the sample submission with the averaged predictions.
df_submission = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))
df_submission.iloc[:, 1:] = np.average(test_preds, axis=0) # for weighted average set weights=[...]
df_submission.to_csv('submission.csv', index=False)