In [None]:
! pip install ../input/sacremoses/sacremoses-master/ 
! pip install ../input/transformers/transformers-master/

In [None]:
!pip install ../input/tensorflowv1/tensorboard-1.15.0-py3-none-any.whl
!pip install ../input/tensorflowv1/tensorflow_estimator-1.15.1-py2.py3-none-any.whl
!pip install ../input/tensorflowv1/tensorflow-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl
import os
import sys

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from scipy.stats import spearmanr
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler
from transformers import AdamW, BertConfig, BertModel
import time
import datetime
import  gc
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
PATH = '../input/google-quest-challenge/'
df_train = pd.read_csv(PATH+'train.csv')
df_test = pd.read_csv(PATH+'test.csv')
df_sub = pd.read_csv(PATH+'sample_submission.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

output_categories = list(df_train.columns[11:])
input_categories = list(df_train.columns[[1,2,5]])
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

MAX_LEN = 512
SEP_TOKEN_ID = 102
BERT_PATH = '../input/bert-base-from-tfhub/bert_en_uncased_L-12_H-768_A-12'
configPath = '../input/bert-model-config/'

In [None]:
use_cuda = torch.cuda.is_available()
if use_cuda:
    print('GPU available!')
    print(torch.cuda.get_device_name(0))
else:
    print('GPU not available')
device = torch.device("cuda:0" if use_cuda else "cpu")
# torch.backends.cudnn.benchmark = True

In [None]:
# helper time function
def format_time(elapsed_time):
    """Render a duration given in seconds as a ``timedelta``-style string.

    The input is rounded to the nearest whole second before formatting, so
    the result looks like ``h:mm:ss`` (with a ``N day(s), `` prefix once the
    duration reaches 24 hours).
    """
    whole_seconds = int(round(elapsed_time))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
class QuestDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns one Q&A row into BERT inputs.

    Each item is ``(token_ids, seg_ids)`` or, when ``labeled`` is true,
    ``(token_ids, seg_ids, labels)``. Rows are read from ``df`` by position
    and must expose ``question_title``, ``question_body`` and ``answer``.
    """

    def __init__(self, df, train_mode=True, labeled=True, target_columns=output_categories):
        """Store the frame and build a lower-casing BERT tokenizer.

        Args:
            df: DataFrame holding the text columns (and the target columns
                when ``labeled`` is true).
            train_mode: stored on the instance; not read anywhere in this class.
            labeled: when true, ``__getitem__`` also returns the label tensor.
            target_columns: column names read by ``get_labels``.
        """
        self.df = df
        self.train_mode = train_mode
        self.labeled = labeled
        # The vocabulary file is loaded from the BERT_PATH assets directory.
        self.tokenizer = BertTokenizer.from_pretrained(BERT_PATH+'/assets/vocab.txt', do_lower_case=True)
        self.target_columns = target_columns

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the tensors for the row at positional ``index``."""
        row = self.df.iloc[index]
        token_ids, seg_ids = self.get_token_ids(row)
        if self.labeled:
            labels = self.get_labels(row, self.target_columns)
            return token_ids, seg_ids, labels
        else:
            return token_ids, seg_ids

    def get_token_ids(self, row):
        """Tokenize one row into ``[CLS] title [SEP] question [SEP] answer [SEP]``.

        Returns:
            ``(token_ids, seg_ids)`` tensors. ``token_ids`` is right-padded
            with 0 up to ``MAX_LEN`` when the sequence is shorter.
        """
        title_tokens, ques_tokens, ans_tokens = self.trim_input(row.question_title, row.question_body, row.answer)
        tokens = ['[CLS]'] + title_tokens + ['[SEP]'] + ques_tokens + ['[SEP]'] + ans_tokens + ['[SEP]']
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if (len(token_ids)) < MAX_LEN:
            # Right-pad with id 0 so every sample has exactly MAX_LEN ids.
            token_ids += [0] * (MAX_LEN - len(token_ids))

        token_ids_torch = torch.tensor(token_ids)
        seg_ids = self.get_seg_ids(token_ids_torch)

        return token_ids_torch, seg_ids

    def get_seg_ids(self, ids):
        """Build segment ids from a token-id tensor.

        Everything up to and including the second ``SEP_TOKEN_ID`` gets
        segment 0 (title and question); everything after it gets segment 1
        (answer and its closing separator). Positions whose id is 0
        (padding) are reset to segment 0.
        """
        seg_ids = torch.zeros_like(ids)
        seg_idx = 0
        first_step = True
        for i, e in enumerate(ids):
            # The current position is written before the separator check, so a
            # separator itself keeps the segment of the text preceding it.
            seg_ids[i] = seg_idx
            if e == SEP_TOKEN_ID:
                if first_step:
                    # First separator (after the title): stay in segment 0.
                    first_step = False
                else:
                    seg_idx = 1
        # Padding positions (token id 0) always belong to segment 0.
        pad_idx = torch.nonzero(ids == 0)
        seg_ids[pad_idx] = 0

        return seg_ids


    def get_labels(self, row, target_columns):
        """Return the row's ``target_columns`` values as a float32 tensor."""
        return torch.tensor(row[target_columns].values.astype(np.float32))

    def trim_input(self, title, question, answer, max_seq_length=MAX_LEN,
                   title_max_len=30, ques_max_len=239, ans_max_len=239,
                   head_len=120):
        """Tokenize the three texts and trim them to fit ``max_seq_length``.

        Trimming only happens when the token counts plus 4 special tokens
        exceed ``max_seq_length``. Budget that a short title or a short
        answer/question does not use is handed to the other fields. Fields
        that are still too long keep a head and a tail slice.

        Args:
            title, question, answer: raw strings to tokenize.
            max_seq_length: total length including the 4 special tokens.
            title_max_len, ques_max_len, ans_max_len: starting per-field budgets.
            head_len: minimum number of leading tokens kept when a field is cut.

        Returns:
            ``(title_tokens, ques_tokens, ans_tokens)`` lists of token strings.

        Raises:
            ValueError: if the redistributed budgets plus 4 do not add up to
                exactly ``max_seq_length``.
        """
        title_tokens = self.tokenizer.tokenize(title)
        ques_tokens = self.tokenizer.tokenize(question)
        ans_tokens = self.tokenizer.tokenize(answer)

        title_len = len(title_tokens)
        ques_len = len(ques_tokens)
        ans_len = len(ans_tokens)

        if (title_len + ques_len + ans_len + 4) > max_seq_length:
            if title_max_len > title_len:
                # Short title: split its unused budget between the answer
                # (rounded up) and the question (rounded down).
                final_title_len = title_len
                ans_max_len = int(ans_max_len + np.ceil((title_max_len - title_len) / 2))
                ques_max_len = int(ques_max_len + np.floor((title_max_len - title_len) / 2))
            else:
                final_title_len = title_max_len

            if ans_max_len > ans_len:
                # Short answer: the question inherits the leftover budget.
                final_ans_len = ans_len
                final_ques_len = ques_max_len + (ans_max_len - ans_len)
            elif ques_max_len > ques_len:
                # Short question: the answer inherits the leftover budget.
                final_ques_len = ques_len
                final_ans_len = ans_max_len + (ques_max_len - ques_len)
            else:
                final_ans_len = ans_max_len
                final_ques_len = ques_max_len

            # Sanity check: budgets plus the 4 special tokens must fill the
            # sequence exactly.
            if final_ans_len + final_ques_len + final_title_len + 4 != max_seq_length:
                print('Token Sequence Length:', final_ans_len + final_ques_len + final_title_len + 4)
                raise ValueError('New sequence length does not match.')

            title_tokens = title_tokens[:final_title_len]
            # Head + tail truncation: keep the first max_head_len tokens and
            # the last (final_len - max_head_len) tokens of an over-long field.
            # NOTE(review): the tail slice start is only a negative index while
            # max_head_len < final_len; this holds for the default budgets but
            # should be confirmed before changing head_len or the budgets.
            if ques_len > final_ques_len:
                max_head_len = max(head_len, int(0.5 * final_ques_len))
                ques_tokens = ques_tokens[:max_head_len] + ques_tokens[max_head_len - final_ques_len:]
            else:
                ques_tokens = ques_tokens[:final_ques_len]
            if ans_len > final_ans_len:
                max_head_len = max(head_len, int(0.5 * final_ans_len))
                ans_tokens = ans_tokens[:max_head_len] + ans_tokens[max_head_len - final_ans_len:]
            else:
                ans_tokens = ans_tokens[:final_ans_len]


        return title_tokens, ques_tokens, ans_tokens

In [None]:
params = {'batch_size': 8,
          'shuffle': False,
          'num_workers': 6}

In [None]:
def prediction_v7(test_dataloader, model, classifier, shape, batch_size=8):
    """Predict with a BERT encoder + classifier head using CLS pooling.

    The ``[CLS]`` vectors of the last four hidden layers are concatenated
    and fed to ``classifier``.

    Args:
        test_dataloader: yields ``(token_ids, segment_ids)`` batches in
            dataset order (no shuffling).
        model: encoder returning ``(last_layer, pooled, hidden_layers)``;
            only the hidden layers (index 2) are used.
        classifier: head mapping the concatenated CLS vector to 30 outputs.
        shape: total number of rows to predict.
        batch_size: kept for backward compatibility. Rows are now placed with
            a running offset, so the value no longer has to match the loader.

    Returns:
        ``numpy`` array of shape ``(shape, 30)`` with the predictions.
    """
    print("Running Prediction...")

    t0 = time.time()

    # Evaluation mode disables dropout in both the encoder and the head.
    model.eval()
    classifier.eval()

    test_preds = np.zeros((shape, 30))
    offset = 0  # index of the first row of the current batch

    # No gradients are needed for inference; this saves memory and time.
    with torch.no_grad():
        for batch in test_dataloader:
            test_input_ids, test_seg_ids = (t.to(device) for t in batch)

            # Token id 0 is padding, so it is excluded from attention.
            attention_mask = (test_input_ids > 0)
            # Index instead of tuple-unpacking so only the hidden states are read.
            hidden_layers = model(input_ids=test_input_ids,
                                  token_type_ids=test_seg_ids,
                                  attention_mask=attention_mask)[2]

            cls = torch.cat(tuple(hidden_layers[i][:, 0, :] for i in [-1, -2, -3, -4]), dim=-1)
            batch_preds = classifier(cls).cpu().numpy()

            # Use the real batch length so a short final batch (or a loader
            # with a different batch size) lands in the right rows.
            n_rows = batch_preds.shape[0]
            test_preds[offset:offset + n_rows] = batch_preds
            offset += n_rows
    print(" Prediction took: {:}".format(format_time(time.time() - t0)))

    return test_preds

In [None]:
def prediction(test_dataloader, model, classifier, shape, batch_size=8):
    """Predict with a BERT encoder + classifier head using mean pooling.

    The last four hidden layers are concatenated along the feature axis,
    averaged over the sequence axis and fed to ``classifier``.

    Args:
        test_dataloader: yields ``(token_ids, segment_ids)`` batches in
            dataset order (no shuffling).
        model: encoder returning ``(last_layer, pooled, hidden_layers)``;
            only the hidden layers (index 2) are used.
        classifier: head mapping the pooled 3072-d vector to 30 outputs.
        shape: total number of rows to predict.
        batch_size: kept for backward compatibility. Rows are now placed with
            a running offset, so the value no longer has to match the loader.

    Returns:
        ``numpy`` array of shape ``(shape, 30)`` with the predictions.
    """
    print("Running Prediction...")

    t0 = time.time()

    # Evaluation mode disables dropout in both the encoder and the head.
    model.eval()
    classifier.eval()

    test_preds = np.zeros((shape, 30))
    offset = 0  # index of the first row of the current batch

    # Stateless layer, so it is built once instead of once per batch.
    pooling_layer = torch.nn.AdaptiveAvgPool2d((1, 3072))

    with torch.no_grad():
        for batch in test_dataloader:
            test_input_ids, test_seg_ids = (t.to(device) for t in batch)

            # Token id 0 is padding, so it is excluded from attention.
            attention_mask = (test_input_ids > 0)
            hidden_layers = model(input_ids=test_input_ids,
                                  token_type_ids=test_seg_ids,
                                  attention_mask=attention_mask)[2]

            outputs = torch.cat(tuple(hidden_layers[i] for i in [-1, -2, -3, -4]), dim=-1)
            # Squeeze only the pooled sequence axis: a bare squeeze() would
            # also drop the batch axis when the batch holds a single row.
            cls = pooling_layer(outputs).squeeze(1)
            batch_preds = classifier(cls).cpu().numpy()

            n_rows = batch_preds.shape[0]
            test_preds[offset:offset + n_rows] = batch_preds
            offset += n_rows
    print(" Prediction took: {:}".format(format_time(time.time() - t0)))

    return test_preds

In [None]:
ds_test = QuestDataset(df_test, labeled=False)
test_dataloader = torch.utils.data.DataLoader(ds_test, **params)

In [None]:
# classifier = torch.nn.Sequential(torch.nn.Linear(768, 30),
#                                  torch.nn.Sigmoid())
classifier = torch.nn.Sequential(torch.nn.Linear(3072, 3072),
                                 torch.nn.Tanh(),
                                 torch.nn.Linear(3072, 30),
                                 torch.nn.Sigmoid())
# classifier = torch.nn.Sequential(torch.nn.Linear(3072 + num_external_features, 3072 + num_external_features),
#                                  torch.nn.Tanh(),
#                                  torch.nn.Linear(3072 + num_external_features, 30),
#                                  torch.nn.Sigmoid())

In [None]:
# Load the BERT configuration and ask the model to return every layer's
# hidden states, which the prediction functions index into.
config = BertConfig.from_pretrained(configPath)
config.output_hidden_states = True

In [None]:
test_predictions = []

In [None]:
# Run the five fold checkpoints over the test set; one prediction matrix is
# collected per fold.
for i in range(5):
    model_path = f'../input/google-quest-pytorch-v7/bert-base-{i}.h5py'
    classifier_path = f'../input/google-quest-pytorch-v7/bert-base-classifier-{i}.h5py'
    # map_location keeps loading consistent with the `device` chosen earlier,
    # so the cell also works when no GPU is available.
    bert_model_weights = torch.load(model_path, map_location=device)
    model = BertModel.from_pretrained(configPath, state_dict=bert_model_weights, config=config)
    classifier.load_state_dict(torch.load(classifier_path, map_location=device))
    model.to(device)
    classifier.to(device)

    test_predictions.append(prediction_v7(test_dataloader, model, classifier, len(df_test)))
    print('Fold Done')
    print(' ')

In [None]:
final_predictions = np.mean(test_predictions, axis=0)

In [None]:
del test_predictions, ds_test, df_test, test_dataloader, model, classifier; gc.collect()

In [None]:
torch.cuda.empty_cache() ## Clear_Memory

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from torch.utils.data import DataLoader, Dataset
import numpy as np 
import pandas as pd
import os
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold
import datetime
import pkg_resources
import time
import scipy.stats as stats
import gc
import re
import operator 
import sys
from sklearn import metrics
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from tqdm import tqdm, tqdm_notebook
from scipy.stats import spearmanr
import os
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')
import pickle
import random
import shutil
import transformers
from spacy.lang.en import English
from math import floor, ceil
#from comment_parser import comment_parser
from xml.sax.saxutils import unescape
import torch.nn.init as init
import glob
from numba import cuda


In [None]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic
    algorithms.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

In [None]:
! ls /kaggle/input/roberta-large

In [None]:
SEED                = 1414
DATA_DIR            = "../input/google-quest-challenge/"
WORK_DIR            = "../working/"
BERT_VOCAB_PATH     = "../input/roberta-large/roberta-large-vocab.json"
BERT_MERGES_PATH    = "../input/roberta-large/roberta-large-merges.txt"
BERT_MODEL_PATH     = "../input/roberta-large/roberta-large-pytorch_model.bin"
BERT_CONFIG_PATH    = "../input/roberta-large/roberta-large-config.json"
input_columns       = ['question_title', 'question_body', 'answer']
seed_everything(SEED)
batch_size          = 32

In [None]:
test_df  = pd.read_csv(os.path.join(DATA_DIR,"test.csv"))

In [None]:
from nltk.corpus import stopwords
eng_stopwords = set(stopwords.words("english"))

def add_external_features(df, stop_words=None):
    """Add hand-crafted length and word-overlap features to ``df`` in place.

    Args:
        df: DataFrame with ``question_title``, ``question_body`` and
            ``answer`` columns. The three columns are cast to ``str`` and the
            new feature columns are appended; the same object is returned.
        stop_words: optional collection of words to ignore when computing the
            overlap features. Defaults to the NLTK English stop-word list.

    Returns:
        ``df`` with these columns added: ``question_num_words``,
        ``question_title_num_words``, ``answer_num_words``,
        ``*_num_unique_words``, ``*_num_chars``, ``qa_word_overlap``,
        ``qt_word_overlap``, ``qa_word_overlap_norm`` and
        ``qta_word_overlap_norm``.
    """
    if stop_words is None:
        stop_words = set(stopwords.words("english"))

    text_cols = ('question_title', 'question_body', 'answer')

    # Cast every text column to str (the title too), so a missing value
    # cannot crash the ``.split()`` calls below.
    for col in text_cols:
        df[col] = df[col].apply(str)

    # Whitespace-separated word counts (raw string avoids an invalid escape).
    df['question_num_words'] = df['question_body'].str.count(r'\S+')
    df['question_title_num_words'] = df['question_title'].str.count(r'\S+')
    df['answer_num_words'] = df['answer'].str.count(r'\S+')

    name_prefix = {'question_title': 'question_title',
                   'question_body': 'question_body',
                   'answer': 'answer'}
    for col in text_cols:
        df[name_prefix[col] + '_num_unique_words'] = df[col].apply(lambda x: len(set(x.split())))
    for col in text_cols:
        df[name_prefix[col] + '_num_chars'] = df[col].apply(len)

    def content_words(text):
        # Words of ``text`` that are not stop words, duplicates preserved.
        return [w for w in text.split() if w not in stop_words]

    def n_common(words_a, words_b):
        # Number of distinct words shared by both lists.
        if not words_a or not words_b:
            return 0
        return len(np.intersect1d(words_a, words_b))

    def normalised(common, words_a, words_b):
        # Overlap divided by the combined size; 0.0 when both lists are empty
        # (the original expression divided by zero in that case).
        denom = len(words_a) + len(words_b) - common
        return common / denom if denom else 0.0

    qt_words = [content_words(t) for t in df['question_title']]
    q_words = [content_words(t) for t in df['question_body']]
    a_words = [content_words(t) for t in df['answer']]

    qa_common = [n_common(q, a) for q, a in zip(q_words, a_words)]
    qt_common = [n_common(t, a) for t, a in zip(qt_words, a_words)]

    df['qa_word_overlap'] = qa_common
    df['qt_word_overlap'] = qt_common
    df['qa_word_overlap_norm'] = [normalised(c, a, q)
                                  for c, a, q in zip(qa_common, a_words, q_words)]
    df['qta_word_overlap_norm'] = [normalised(c, a, t)
                                   for c, a, t in zip(qt_common, a_words, qt_words)]

    return df

In [None]:
test_df = add_external_features(test_df)

In [None]:
# Word-count feature columns that are rescaled with the pickled scaler below.
handmade_cols = ["question_body_num_unique_words", 'question_num_words',
                 "question_title_num_unique_words", "question_title_num_words", 
                 "answer_num_unique_words", "answer_num_words"]

# NOTE(review): pickle.load can execute arbitrary code; only load artifacts
# from a trusted dataset. The scaler is presumably the one fitted at training
# time -- confirm against the training notebook.
with open('/kaggle/input/d-large-roberta-b32-steplr3-drop-tb-rmf-h56-2/C_scaler.pickle', mode='rb') as f:
    num_words_scaler = pickle.load(f)

# The scaled values overwrite the raw counts in place, so re-running this
# cell without rebuilding test_df would scale the columns a second time.
test_df[handmade_cols]=  num_words_scaler.transform(test_df[handmade_cols].values)
# Despite the "train_" prefix this holds the TEST features: the six scaled
# counts followed by the two normalised overlap ratios (8 columns in total).
train_handmade_features= test_df[handmade_cols + ['qa_word_overlap_norm', 'qta_word_overlap_norm']].values

In [None]:
def label_encoder(x, dict_reverse):
    """Map a raw label to its integer code, falling back to 0.

    Args:
        x: label value to encode.
        dict_reverse: mapping from label value to integer code.

    Returns:
        ``dict_reverse[x]`` when the label is known, otherwise ``0`` (also
        used when ``x`` is unhashable).
    """
    try:
        return dict_reverse[x]
    except (KeyError, TypeError):
        # Unknown or unhashable labels share the reserved code 0. Catching
        # only these two keeps real errors (and Ctrl-C) visible.
        return 0

with open('/kaggle/input/d-large-roberta-b32-steplr3-drop-tb-rmf-h56-2/C_category.pickle', mode='rb') as f:
    category_dict_reverse = pickle.load(f)
test_df['category'] = test_df['category'].apply(lambda x: label_encoder(x, category_dict_reverse))

with open('/kaggle/input/d-large-roberta-b32-steplr3-drop-tb-rmf-h56-2/C_host.pickle', mode='rb') as f:
    host_dict_reverse = pickle.load(f)
test_df['host'] = test_df['host'].apply(lambda x: label_encoder(x, host_dict_reverse))

n_cat    = len(category_dict_reverse) + 1
cat_emb  = 256
n_host   = len(host_dict_reverse) + 1
host_emb = 256

In [None]:
n_host

## RoBERTa config and tokenizer

In [None]:
bert_config = transformers.RobertaConfig.from_json_file(BERT_CONFIG_PATH)
bert_config.output_hidden_states = True
tokenizer = transformers.RobertaTokenizer(BERT_VOCAB_PATH,BERT_MERGES_PATH)

In [None]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_segments2(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=238, a_max_len=238):

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+6) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+6 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+6)))
        
        t = t[:t_new_len]
        q_len_head = round(q_new_len/3)
        q_len_tail = -1* (q_new_len -q_len_head)
        a_len_head = round(a_new_len/3)
        a_len_tail = -1* (a_new_len -a_len_head) 
        
        q = q[:q_len_head]+q[q_len_tail:]
        a = a[:a_len_head]+a[a_len_tail:]
    
    return t, q, a


def convert_lines(question, answer, max_sequence_length, tokenizer):
    """Encode question/answer pairs as ``[CLS] q [SEP] a [SEP]`` model inputs.

    Pairs that exceed ``max_sequence_length`` (including 3 special tokens)
    are truncated proportionally to their original lengths, keeping the
    leading tokens of each part.

    Args:
        question, answer: iterables of raw strings, consumed in parallel.
        max_sequence_length: fixed length of every output row.
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        Three ``numpy`` arrays ``(token_ids, attention_masks, segment_ids)``,
        one row per pair. The number of truncated pairs is printed.

    Raises:
        ValueError: if the proportional budgets do not add up to
            ``max_sequence_length``.
    """
    all_tokens   = []
    all_masks    = []
    all_segments = []
    longer = 0  # number of pairs that had to be truncated
    for q, a in tqdm(zip(question, answer)):

        tokens_q = tokenizer.tokenize(q)
        tokens_a = tokenizer.tokenize(a)

        q_len = len(tokens_q)
        a_len = len(tokens_a)

        if (q_len+a_len+3) > max_sequence_length:
            longer += 1
            # Share the available slots proportionally to the original lengths.
            new_q_len = q_len/(a_len+q_len) * (max_sequence_length-3)
            new_a_len = a_len/(q_len+a_len) * (max_sequence_length-3)
            new_q_len, new_a_len = int(np.ceil(new_q_len)), int(np.floor(new_a_len))

            if new_a_len+new_q_len+3 != max_sequence_length:
                raise ValueError("too small %s" % str(new_a_len+new_q_len+3))

            tokens_q = tokens_q[:new_q_len]
            tokens_a = tokens_a[:new_a_len]

        stoken = ["[CLS]"] + tokens_q + ["[SEP]"] + tokens_a + ["[SEP]"]

        # The special tokens are already part of ``stoken``, so the tokens are
        # mapped to ids directly (the previous ``encode(..., Ture)`` call
        # raised NameError and would have added the markers a second time).
        token_ids = tokenizer.convert_tokens_to_ids(stoken)
        input_ids = token_ids + [0] * (max_sequence_length-len(token_ids))

        attention_masks = _get_masks(stoken, max_sequence_length)
        input_segments = _get_segments2(stoken, max_sequence_length)

        all_tokens.append(input_ids)
        all_masks.append(attention_masks)
        all_segments.append(input_segments)

    print(longer)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [None]:
def convert_lines2(title, question, answer, max_sequence_length, tokenizer, t_max_len_seq=30, q_max_len_seq=238, a_max_len_seq=238):
    """Encode title/question/answer triples as RoBERTa-style model inputs.

    Each row is laid out as
    ``<s> title </s></s> question </s></s> answer </s>`` and right-padded to
    ``max_sequence_length``.

    Args:
        title, question, answer: iterables of raw strings, consumed in parallel.
        max_sequence_length: fixed length of every output row.
        tokenizer: object exposing ``convert_tokens_to_ids``.
        t_max_len_seq, q_max_len_seq, a_max_len_seq: per-field token budgets
            forwarded to ``_trim_input`` (previously accepted but ignored;
            the defaults equal ``_trim_input``'s own defaults).

    Returns:
        Three ``numpy`` arrays ``(token_ids, attention_masks, segment_ids)``.
    """
    all_tokens   = []
    all_masks    = []
    all_segments = []

    for t, q, a in tqdm(zip(title, question, answer)):

        # NOTE(review): _trim_input tokenizes with the module-level tokenizer,
        # not the ``tokenizer`` argument -- confirm they are the same object.
        tokens_t, tokens_q, tokens_a = _trim_input(t, q, a,
                                                   max_sequence_length=max_sequence_length,
                                                   t_max_len=t_max_len_seq,
                                                   q_max_len=q_max_len_seq,
                                                   a_max_len=a_max_len_seq)

        stoken = ["<s>"] + tokens_t + ["</s>"] + ["</s>"] + tokens_q + ["</s>"] + ["</s>"] + tokens_a + ["</s>"]

        token_ids = tokenizer.convert_tokens_to_ids(stoken)
        # NOTE(review): padding uses id 0; the tokenizer's own pad id may
        # differ. Left unchanged because the checkpoints were presumably
        # trained with this exact padding -- verify before changing.
        input_ids = token_ids + [0] * (max_sequence_length-len(token_ids))

        attention_masks = _get_masks(stoken, max_sequence_length)

        # stoken contains no "[SEP]" marker, so _get_segments2 returns all
        # zeros here (a single segment for the whole sequence).
        input_segments = _get_segments2(stoken, max_sequence_length)

        all_tokens.append(input_ids)
        all_masks.append(attention_masks)
        all_segments.append(input_segments)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [None]:
test_tokens, test_masks, test_segments = convert_lines2(test_df["question_title"],
                                                       test_df["question_body"], 
                                                       test_df["answer"],
                                                       max_sequence_length=512, 
                                                       tokenizer=tokenizer)

In [None]:
class QuestDataset_test(Dataset):
    """Dataset over pre-computed test inputs; every field is indexed in parallel."""

    def __init__(self, token_ids, masks, segments, hosts, categories, handmade_features):
        # Text-derived inputs.
        self.token_ids, self.masks, self.segments = token_ids, masks, segments
        # Categorical ids and the numeric hand-crafted features.
        self.hosts, self.categories = hosts, categories
        self.handmades = handmade_features

    def __len__(self):
        # The token-id matrix defines the number of samples.
        return self.token_ids.shape[0]

    def __getitem__(self, idx):
        """Return ``[token_id, mask, segment, host, category, handmade]`` for ``idx``."""
        per_sample_sources = (
            self.token_ids,
            self.masks,
            self.segments,
            self.hosts,
            self.categories,
            self.handmades,
        )
        return [source[idx] for source in per_sample_sources]

In [None]:
def children(m):
    """Return ``m`` itself for a list/tuple, else a list of its child modules."""
    if isinstance(m, (list, tuple)):
        return m
    return list(m.children())

def set_trainable_attr(m, b):
    """Tag module ``m`` with ``trainable = b`` and set ``requires_grad`` on its parameters."""
    m.trainable = b
    for param in m.parameters():
        param.requires_grad = b


def apply_leaf(m, f):
    """Call ``f`` on ``m`` (when it is a module) and, recursively, on all its descendants.

    ``m`` may also be a list/tuple of modules, in which case only the
    elements are visited.
    """
    # Children are collected before ``f`` runs, as in the original ordering.
    descendants = children(m)
    if isinstance(m, torch.nn.Module):
        f(m)
    for child in descendants:
        apply_leaf(child, f)

def set_trainable(l, b):
    """Freeze (``b=False``) or unfreeze (``b=True``) ``l`` and every module below it."""
    def _mark(module):
        set_trainable_attr(module, b)

    apply_leaf(l, _mark)

In [None]:
class SpatialDropout(nn.Dropout2d):
    """Dropout2d applied to ``(N, T, K)`` input so whole feature channels are dropped."""

    def forward(self, x):
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T): features become channels.
        channels_first = x.unsqueeze(2).permute(0, 3, 2, 1)
        dropped = super().forward(channels_first)
        # Undo the layout change: (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K).
        return dropped.permute(0, 3, 2, 1).squeeze(2)

class Attention(nn.Module):
    """Additive attention pooling over the time axis.

    Scores each time step with a learned projection, normalises the scores
    over the sequence and returns the weighted sum of the inputs.

    Args:
        feature_dim: size of the last axis of the input.
        step_dim: sequence length the layer is built for; the input must have
            exactly this many steps.
        bias: when true, a learned per-step bias is added to the scores.
    """

    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        # Projection from a feature vector to a scalar score.
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        """Pool ``x`` of shape ``(batch, step_dim, feature_dim)`` to ``(batch, feature_dim)``.

        Args:
            x: input tensor.
            mask: optional ``(batch, step_dim)`` tensor multiplied into the
                unnormalised weights (0 removes a step).
        """
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        # One score per (sample, step).
        eij = torch.mm(
            x.contiguous().view(-1, feature_dim),
            self.weight
        ).view(-1, step_dim)

        if self.bias:
            eij = eij + self.b

        eij = torch.tanh(eij)
        a = torch.exp(eij)

        if mask is not None:
            a = a * mask

        # The epsilon belongs in the denominator: it guards against an
        # all-masked row (sum == 0). Previously it was added after the
        # division, which left that case as NaN.
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)
        weighted_input = x * torch.unsqueeze(a, -1)

        return torch.sum(weighted_input, 1)

In [None]:
class QuestModel6(nn.Module):
    """RoBERTa encoder followed by LSTM/GRU layers, pooling and two linear heads.

    The forward pass returns raw (pre-sigmoid) scores of width 30: 21 values
    from ``fc1`` followed by 9 values from ``fc2``. Attribute names and layer
    shapes are what the saved state dicts are loaded into, so they must not
    be renamed or resized.
    """

    def __init__(self, n_cat, cat_emb, n_host, host_emb, num_labels):
        """Build the network.

        Args:
            n_cat: number of category ids (embedding rows).
            cat_emb: category embedding width.
            n_host: number of host ids (embedding rows).
            host_emb: host embedding width.
            num_labels: accepted but not used; the head widths are fixed at
                21 and 9.
        """
        super().__init__()
        BERT_DIMS = 1024
        LSTM_UNITS = 512
        model_path = os.path.join(BERT_MODEL_PATH)
        self.bert_model = transformers.RobertaModel.from_pretrained(model_path,config=bert_config)
        # The encoder is frozen: requires_grad is switched off for all of its
        # parameters.
        set_trainable(self.bert_model, False)

        self.category_embedding = nn.Embedding(n_cat, cat_emb)
        self.host_embedding     = nn.Embedding(n_host, host_emb)
        self.embedding_dropout  = SpatialDropout(0.5)
        # Input width is BERT_DIMS*2 because forward() concatenates two hidden
        # layers along the feature axis.
        self.lstm1              = nn.LSTM(BERT_DIMS*2, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2              = nn.GRU(LSTM_UNITS*2, LSTM_UNITS, bidirectional=True, batch_first=True)
        # step_dim=512: the attention layers reshape to this many steps, so the
        # input sequence length is assumed to be 512 -- TODO confirm.
        self.atten1             = Attention(LSTM_UNITS*2, 512)
        self.atten2             = Attention(LSTM_UNITS*2, 512)
        self.dropout1           = nn.Dropout(0.2)
        # Head input = mean + max + attention pooling (3 x LSTM_UNITS*2) plus
        # both embeddings plus 8 extra values (presumably the 8 hand-made
        # features passed as `handmades` -- verify against the caller).
        self.fc1                = nn.Linear(LSTM_UNITS*6 + int(cat_emb) + int(host_emb) + 8, 21)
        self.fc2                = nn.Linear(LSTM_UNITS*6 + int(cat_emb) + int(host_emb) + 8, 9)
        self._init_weights(self.category_embedding)
        self._init_weights(self.host_embedding)
        self._init_weights(self.fc1)
        self._init_weights(self.fc2)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            print("initailize weight")
            module.weight.data.normal_(mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            print("initailize bias")
            module.bias.data.zero_()

    def forward(self, token_ids, masks, segment, hosts, categories, handmades):
        """Compute the 30 raw output scores for a batch.

        Args:
            token_ids, masks, segment: encoder inputs (input ids, attention
                mask, token type ids).
            hosts, categories: integer ids looked up in the embeddings.
            handmades: extra numeric features concatenated to the embeddings.

        Returns:
            Tensor with ``fc1`` outputs followed by ``fc2`` outputs on axis 1.
        """

        category_embed = self.category_embedding(categories)
        host_embed     = self.host_embedding(hosts)
        external_features = torch.cat((category_embed, host_embed, handmades), 1)
        #print(external_features.shape)
        # The encoder is expected to return a 3-tuple; only the per-layer
        # hidden states (third element) are used below.
        _, seq_output, hidden_states = self.bert_model(input_ids=token_ids,
                                                    attention_mask=masks,
                                                    token_type_ids=segment)

        # Despite its name this concatenates TWO layers: hidden_states[-5] and
        # hidden_states[-6].
        last_four_layer = torch.cat((hidden_states[-5], hidden_states[-6]), 2)
        lstm_input      = self.embedding_dropout(last_four_layer)
        lstm1_output, _ = self.lstm1(lstm_input)
        lstm2_output, _ = self.lstm2(lstm1_output)

        # Three poolings over the time axis share the same recurrent output;
        # each head gets its own attention layer.
        meanpooled_output   = torch.mean(lstm2_output, 1)
        maxpooled_output, _ = torch.max(lstm2_output, 1)
        attention_output_q  = self.atten1(lstm2_output)
        attention_output_a  = self.atten2(lstm2_output)

        pooled_output_q = torch.cat((meanpooled_output, maxpooled_output, attention_output_q, external_features), 1)
        pooled_output_q = self.dropout1(pooled_output_q)

        pooled_output_a = torch.cat((meanpooled_output, maxpooled_output, attention_output_a, external_features), 1)
        pooled_output_a = self.dropout1(pooled_output_a)

        q_results     = self.fc1(pooled_output_q)
        a_results     = self.fc2(pooled_output_a)
        results       = torch.cat((q_results, a_results), 1)

        return results

In [None]:
def predict_result(model, test_loader, batch_size=batch_size):
    """Run ``model`` over ``test_loader`` and return sigmoid probabilities.

    Args:
        model: network taking ``(token_ids, masks, segments, hosts,
            categories, handmades)`` and returning 30 raw scores per row.
        test_loader: non-shuffled DataLoader yielding those six fields.
        batch_size: kept for backward compatibility. Rows are placed with a
            running offset, so the value no longer has to match the loader.

    Returns:
        ``numpy`` array of shape ``(len(test_loader.dataset), 30)``.
    """
    # Size the output from the loader's own dataset instead of relying on a
    # notebook-level ``test_set`` variable defined in a later cell.
    n_samples = len(test_loader.dataset)
    output = np.zeros((n_samples, 30))

    # Send inputs wherever the model lives rather than hard-coding CUDA.
    target_device = next(model.parameters()).device

    model.eval()
    offset = 0  # index of the first row of the current batch
    with torch.no_grad():
        for inputs in test_loader:
            token_ids, masks, segments, hosts, categories, handmades = inputs
            token_ids   = token_ids.long().to(target_device)
            masks       = masks.long().to(target_device)
            segments    = segments.long().to(target_device)
            hosts       = hosts.long().to(target_device)
            categories  = categories.long().to(target_device)
            handmades   = handmades.float().to(target_device)

            predictions = model(token_ids, masks, segments, hosts, categories, handmades)
            predictions = torch.sigmoid(predictions)

            n_rows = predictions.shape[0]
            output[offset:offset + n_rows, :] = predictions.cpu().numpy()
            offset += n_rows

    return output

In [None]:
# Accumulates one prediction array per checkpoint (filled by the inference loop).
results = []
! ls /kaggle/input/

In [None]:
# Gather every *.pt checkpoint from the two weight datasets
# (directory names suggest RoBERTa-large fine-tunes).
pretrain_weighted_1 = glob.glob('../input/d-large-roberta-b32-steplr3-drop-tb-rmf-h56-1/*.pt')
pretrain_weighted_2 = glob.glob('../input/d-large-roberta-b32-steplr3-drop-tb-rmf-h56-2/*.pt')
pretrain_weighted = pretrain_weighted_1 + pretrain_weighted_2

In [None]:
pretrain_weighted

In [None]:
!nvidia-smi # check GPU Memory


In [None]:
#cuda.select_device(0) #clear GPU memory 
#cuda.close()

In [None]:
# NOTE(review): `cuda` is not defined in the visible part of the notebook
# (presumably numba.cuda) — confirm it is imported before this cell runs.
cuda.select_device(0) #restart cuda

In [None]:
!nvidia-smi #Check GPU Memory

In [None]:
# Wrap the tokenised test inputs plus categorical / handmade features.
# NOTE(review): `train_handmade_features` is defined outside this excerpt;
# despite the name it is used here as the test-time feature matrix — confirm.
test_set    = QuestDataset_test(test_tokens, test_masks, test_segments,
                                test_df['host'].values,
                                test_df['category'].values,
                                train_handmade_features)
# shuffle=False keeps batches in dataset order, which predict_result relies on.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [None]:
model = QuestModel6(n_cat, cat_emb, n_host, host_emb, num_labels=30)
model.cuda()

In [None]:
# Reuse a single model instance: load each checkpoint in turn and predict.
for i, weight in tqdm(enumerate(pretrain_weighted)):
    model.load_state_dict(torch.load(weight))
    results.append(predict_result(model, test_loader))

In [None]:
#from scipy.stats import rankdata

#output = np.array([np.array([rankdata(c) for c in p.T]).T for p in results]).mean(axis=0)
#max_val = output.max() + 1
#output = output/max_val + 1e-12

# Plain arithmetic mean of the per-checkpoint predictions.
output_roberta = np.zeros((len(test_set),30))
for result in results:
    output_roberta += result
output_roberta /= len(results)

In [None]:
# Drop references so the next model family can reuse the memory.
# `result` is the loop variable left over from the averaging cell above.
del results, result, test_set, test_loader, model; gc.collect()

In [None]:
torch.cuda.empty_cache() ## Clear_Memory

In [None]:
! ls /kaggle/input/d-large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-2

In [None]:
# Configuration for the XLNet section. The BERT_* names are kept from earlier
# sections but the paths below point at XLNet-large files.
SEED                = 1414
DATA_DIR            = "../input/google-quest-challenge/"
WORK_DIR            = "../working/"
BERT_VOCAB_PATH     = "../input/xlnetlargecased/xlnet_cased_L-24_H-1024_A-16/spiece.model"
BERT_MODEL_PATH     = "../input/xlnetlargecased/xlnet_cased_L-24_H-1024_A-16/xlnet_model.ckpt.index"
BERT_CONFIG_PATH    = "../input/xlnetlargecased/xlnet_cased_L-24_H-1024_A-16/xlnet_config.json"
input_columns       = ['question_title', 'question_body', 'answer']
# `seed_everything` is defined outside this excerpt.
seed_everything(SEED)
batch_size          = 32

In [None]:
# Reload the raw test frame; later cells add features and encode columns on it.
test_df  = pd.read_csv(os.path.join(DATA_DIR,"test.csv"))

In [None]:
from nltk.corpus import stopwords
eng_stopwords = set(stopwords.words("english"))

def add_external_features(df):
    """Add handmade length and word-overlap features to ``df`` in place.

    Expects the columns ``question_title``, ``question_body`` and ``answer``.
    ``question_body`` and ``answer`` are overwritten with their ``str()`` form.

    Added columns: word counts, unique-word counts and character counts for
    the three text fields, the number of distinct non-stopword tokens shared
    between answer and question body / title (``qa_word_overlap``,
    ``qt_word_overlap``), and those overlaps divided by
    ``len(answer_words) + len(other_words) - overlap``
    (``qa_word_overlap_norm``, ``qta_word_overlap_norm``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich; it is mutated.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    eng_stopwords = set(stopwords.words("english"))

    def _content_words(text):
        # str() keeps NaN cells from crashing .split(), matching how the
        # unique-word and char-count features below treat every field.
        return [w for w in str(text).split() if w not in eng_stopwords]

    def _shared_count(left, right):
        # Number of distinct tokens present in both lists.
        return len(set(left) & set(right))

    def _overlap_ratio(shared, left, right):
        # Guard the 0/0 case (both fields contain only stopwords).
        denom = len(left) + len(right) - shared
        return shared / denom if denom else 0.0

    # Longer questions may be clearer; longer answers may carry more detail.
    df['question_body']      = df['question_body'].apply(str)
    df['question_num_words'] = df.question_body.str.count(r'\S+')
    df['question_title_num_words'] = df.question_title.str.count(r'\S+')

    df['answer']            = df['answer'].apply(str)
    df['answer_num_words']  = df.answer.str.count(r'\S+')

    df["question_title_num_unique_words"] = df["question_title"].apply(lambda x: len(set(str(x).split())))
    df["question_body_num_unique_words"]  = df["question_body"].apply(lambda x: len(set(str(x).split())))
    df["answer_num_unique_words"]         = df["answer"].apply(lambda x: len(set(str(x).split())))

    df["question_title_num_chars"] = df["question_title"].apply(lambda x: len(str(x)))
    df["question_body_num_chars"]  = df["question_body"].apply(lambda x: len(str(x)))
    df["answer_num_chars"]         = df["answer"].apply(lambda x: len(str(x)))

    # Stopword-filtered token lists are kept as locals, not temporary columns.
    qt_words = [_content_words(text) for text in df['question_title']]
    q_words  = [_content_words(text) for text in df['question_body']]
    a_words  = [_content_words(text) for text in df['answer']]

    qa_overlap = [_shared_count(q, a) for q, a in zip(q_words, a_words)]
    qt_overlap = [_shared_count(t, a) for t, a in zip(qt_words, a_words)]
    df['qa_word_overlap'] = qa_overlap
    df['qt_word_overlap'] = qt_overlap

    df['qa_word_overlap_norm'] = [
        _overlap_ratio(shared, a, q)
        for shared, a, q in zip(qa_overlap, a_words, q_words)
    ]
    df['qta_word_overlap_norm'] = [
        _overlap_ratio(shared, a, t)
        for shared, a, t in zip(qt_overlap, a_words, qt_words)
    ]

    return df

In [None]:
test_df = add_external_features(test_df)

In [None]:
# The six count features that are passed through the saved scaler.
handmade_cols = ["question_body_num_unique_words", 'question_num_words',
                 "question_title_num_unique_words", "question_title_num_words", 
                 "answer_num_unique_words", "answer_num_words"]

# NOTE(review): pickle.load executes arbitrary code from the file; only safe
# because this dataset is presumably the author's own training artefact.
with open('/kaggle/input/d-large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-2/C_scaler.pickle', mode='rb') as f:
    num_words_scaler = pickle.load(f)

test_df[handmade_cols]=  num_words_scaler.transform(test_df[handmade_cols].values)
# 6 scaled counts + 2 overlap ratios = 8 columns. Despite the `train_` prefix,
# this matrix is built from test_df.
train_handmade_features= test_df[handmade_cols + ['qa_word_overlap_norm', 'qta_word_overlap_norm']].values

In [None]:
def label_encoder(x, dict_reverse):
    """Map a raw label to its integer id, falling back to 0 when unknown.

    Parameters
    ----------
    x : object
        Raw label value (e.g. a category or host string).
    dict_reverse : mapping
        Lookup from raw label to integer id.

    Returns
    -------
    The id stored for ``x``, or 0 if ``x`` is absent from ``dict_reverse``
    or cannot be used as a key (unhashable).
    """
    try:
        return dict_reverse[x]
    except (KeyError, TypeError):
        # Only lookup failures are treated as "unknown"; anything else
        # (e.g. KeyboardInterrupt) now propagates instead of being swallowed.
        return 0

# Encode `category` with the saved lookup; unseen values become 0
# (see label_encoder).
with open('/kaggle/input/d-large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-2/C_category.pickle', mode='rb') as f:
    category_dict_reverse = pickle.load(f)
test_df['category'] = test_df['category'].apply(lambda x: label_encoder(x, category_dict_reverse))

# Same treatment for `host`.
with open('/kaggle/input/d-large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-2/C_host.pickle', mode='rb') as f:
    host_dict_reverse = pickle.load(f)
test_df['host'] = test_df['host'].apply(lambda x: label_encoder(x, host_dict_reverse))

# Embedding table sizes: one extra row beyond the known labels
# (presumably reserved for the 0 / unknown id — confirm against training code).
n_cat    = len(category_dict_reverse) + 1
cat_emb  = 256
n_host   = len(host_dict_reverse) + 1
host_emb = 256

In [None]:
n_host

In [None]:
# XLNet config with hidden states exposed, since the model reads intermediate layers.
bert_config = transformers.XLNetConfig.from_json_file(BERT_CONFIG_PATH)
bert_config.output_hidden_states = True
tokenizer = transformers.XLNetTokenizer(BERT_VOCAB_PATH)

In [None]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_segments2(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        t = t[:t_new_len]
        q_len_head = round(q_new_len/3)
        q_len_tail = -1* (q_new_len -q_len_head)
        a_len_head = round(a_new_len/3)
        a_len_tail = -1* (a_new_len -a_len_head) 
        
        q = q[:q_len_head]+q[q_len_tail:]
        a = a[:a_len_head]+a[a_len_tail:]
    
    return t, q, a


def convert_lines(question, answer, max_sequence_length, tokenizer):
    """Encode question/answer pairs as ``[CLS] q [SEP] a [SEP]`` model inputs.

    Pairs that do not fit are truncated proportionally to their lengths so
    that exactly ``max_sequence_length`` positions are used. The number of
    truncated pairs is printed.

    Parameters
    ----------
    question, answer : iterable of str
        Parallel iterables of raw texts.
    max_sequence_length : int
        Fixed output width (three positions are reserved for markers).
    tokenizer : object
        Must provide ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(token_ids, attention_masks, segment_ids)``, one row per pair,
        each row of width ``max_sequence_length``.
    """
    
    all_tokens   = []
    all_masks    = []
    all_segments = []
    longer = 0
    for q, a in tqdm(zip(question, answer)):
        
        tokens_q = tokenizer.tokenize(q)
        tokens_a = tokenizer.tokenize(a)
        
        q_len = len(tokens_q)
        a_len = len(tokens_a)
        
        if (q_len+a_len+3) > max_sequence_length:
            longer += 1
            budget = max_sequence_length - 3
            # Exact integer ceil of the question's proportional share; the
            # answer takes the remainder, so the two always sum to `budget`
            # (the previous float ceil/floor split could miss by one).
            new_q_len = -(-(q_len * budget) // (q_len + a_len))
            new_a_len = budget - new_q_len
                
            tokens_q = tokens_q[:new_q_len]
            tokens_a = tokens_a[:new_a_len]
            
        stoken = ["[CLS]"] + tokens_q + ["[SEP]"] + tokens_a + ["[SEP]"]
        
        ##############
        #token_ids
        ##############
        # The markers are already part of `stoken`, so map tokens to ids
        # one-to-one (as convert_lines2 does); this keeps ids, masks and
        # segments the same length.
        token_ids = tokenizer.convert_tokens_to_ids(stoken)
        input_ids = token_ids + [0] * (max_sequence_length-len(token_ids))
        
        #############
        #input_masks
        #############
        attention_masks = _get_masks(stoken, max_sequence_length)
        
        ##############
        #input_segments
        ###############
        # NOTE(review): _get_segments2 switches to 1 only after the second
        # "[SEP]", which is the last token here, so every segment id is 0.
        # Confirm whether _get_segments was intended for this layout.
        input_segments = _get_segments2(stoken, max_sequence_length)
        
        all_tokens.append(input_ids)
        all_masks.append(attention_masks)
        all_segments.append(input_segments)
    
    print(longer)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [None]:
def convert_lines2(title, question, answer, max_sequence_length, tokenizer, t_max_len_seq=30, q_max_len_seq=239, a_max_len_seq=239):
    """Encode (title, question, answer) triples as fixed-width model inputs.

    Each triple is trimmed by ``_trim_input`` and laid out as
    ``title [SEP] question [SEP] answer [SEP] [CLS]``, then right-padded
    with 0 to ``max_sequence_length``.

    Parameters
    ----------
    title, question, answer : iterable of str
        Parallel iterables of raw texts.
    max_sequence_length : int
        Fixed output width.
    tokenizer : object
        Must provide ``convert_tokens_to_ids(list)``. Tokenization itself is
        done inside ``_trim_input``, which reads the module-level tokenizer.
    t_max_len_seq, q_max_len_seq, a_max_len_seq : int
        Per-field token budgets, forwarded to ``_trim_input``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(token_ids, attention_masks, segment_ids)``, one row per triple.
    """
    
    all_tokens   = []
    all_masks    = []
    all_segments = []
    
    for t, q, a in tqdm(zip(title, question, answer)):
        
        # Forward the per-field budgets; they used to be accepted here but
        # silently ignored in favour of _trim_input's own defaults.
        tokens_t, tokens_q, tokens_a  = _trim_input(t, q, a,
                                                    max_sequence_length=max_sequence_length,
                                                    t_max_len=t_max_len_seq,
                                                    q_max_len=q_max_len_seq,
                                                    a_max_len=a_max_len_seq)
        
        stoken = tokens_t + ["[SEP]"] + tokens_q + ["[SEP]"] + tokens_a + ["[SEP]"] + ["[CLS]"]
        ##############
        #token_ids
        ##############
        token_ids = tokenizer.convert_tokens_to_ids(stoken)
        input_ids = token_ids + [0] * (max_sequence_length-len(token_ids))
        
        #############
        #input_masks
        #############
        attention_masks = _get_masks(stoken, max_sequence_length)
        
        ##############
        #input_segments
        ###############
        # With three separators, only tokens after the second one (the
        # answer part and trailing markers) receive segment id 1.
        input_segments = _get_segments2(stoken, max_sequence_length)
        
        all_tokens.append(input_ids)
        all_masks.append(attention_masks)
        all_segments.append(input_segments)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [None]:
# Encode the test set as title/question/answer triples padded to 512 positions.
test_tokens, test_masks, test_segments = convert_lines2(test_df["question_title"],
                                                       test_df["question_body"], 
                                                       test_df["answer"],
                                                       max_sequence_length=512, 
                                                       tokenizer=tokenizer)

In [None]:
class QuestDataset_test(Dataset):
    """Index-aligned view over the six per-sample test inputs.

    Item ``i`` is the list
    ``[token_ids[i], masks[i], segments[i], hosts[i], categories[i], handmades[i]]``.
    The length is taken from the first axis of ``token_ids``.
    """

    def __init__(self, token_ids, masks, segments, hosts, categories, handmade_features):
        self.token_ids, self.masks, self.segments = token_ids, masks, segments
        self.hosts, self.categories = hosts, categories
        self.handmades = handmade_features

    def __len__(self):
        n_samples = self.token_ids.shape[0]
        return n_samples

    def __getitem__(self, idx):
        # Order matters: callers unpack the item positionally.
        fields = (self.token_ids, self.masks, self.segments,
                  self.hosts, self.categories, self.handmades)
        return [field[idx] for field in fields]

In [None]:
def children(m):
    """Return ``m`` itself if it is a list/tuple, else a list of ``m.children()``."""
    if isinstance(m, (list, tuple)):
        return m
    return list(m.children())

def set_trainable_attr(m, b):
    """Set ``m.trainable`` and the ``requires_grad`` flag of all its parameters to ``b``."""
    m.trainable = b
    for param in m.parameters():
        param.requires_grad = b


def apply_leaf(m, f):
    """Call ``f`` on ``m`` (when it is a module) and, recursively, on its children.

    ``m`` may also be a list/tuple of modules, in which case only its
    elements are visited.
    """
    sub_modules = children(m)
    if isinstance(m, torch.nn.Module):
        f(m)
    for sub_module in sub_modules:
        apply_leaf(sub_module, f)

def set_trainable(l, b):
    """Apply ``set_trainable_attr(module, b)`` to ``l`` and everything ``apply_leaf`` visits under it."""
    def _toggle(module):
        set_trainable_attr(module, b)

    apply_leaf(l, _toggle)

In [None]:
class SpatialDropout(nn.Dropout2d):
    """Dropout2d applied to (N, T, K) sequences so whole feature channels are dropped.

    The input is reshaped to (N, K, 1, T), passed through ``Dropout2d``, and
    reshaped back, so a dropped feature is zeroed at every time step.
    """

    def forward(self, x):
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T)
        channels_first = x.unsqueeze(2).permute(0, 3, 2, 1)
        dropped = super().forward(channels_first)
        # (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K)
        return dropped.permute(0, 3, 2, 1).squeeze(2)

class Attention(nn.Module):
    """Learned attention pooling over the step axis of a fixed-length sequence.

    Each step is scored with a single learned vector (plus an optional
    per-step bias), squashed with ``tanh`` and exponentiated; the scores are
    normalised over steps and used to form a weighted sum of the inputs.

    Parameters
    ----------
    feature_dim : int
        Size of the last axis of the input.
    step_dim : int
        Number of steps; the input's step axis must have exactly this length.
    bias : bool
        Whether to learn one additive bias per step.
    """

    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        # Parameter names (`weight`, `b`) are state-dict keys of saved
        # checkpoints and must not change.
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        """Pool ``x`` of shape (batch, step_dim, feature_dim) to (batch, feature_dim).

        ``mask``, when given, is multiplied into the unnormalised step
        weights (0 removes a step) and must broadcast to (batch, step_dim).
        """
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        # One scalar score per (batch, step).
        eij = torch.mm(
            x.contiguous().view(-1, feature_dim),
            self.weight
        ).view(-1, step_dim)

        if self.bias:
            eij = eij + self.b

        eij = torch.tanh(eij)
        a = torch.exp(eij)

        if mask is not None:
            a = a * mask

        # Epsilon belongs in the denominator: it protects against a fully
        # masked row (sum == 0). Adding it after the division, as before,
        # still produced NaN in that case.
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)
        weighted_input = x * torch.unsqueeze(a, -1)
        
        return torch.sum(weighted_input, 1)

In [None]:
class QuestModel6(nn.Module):
    """Frozen XLNet encoder followed by a recurrent head with two output branches.

    Two intermediate hidden-state layers of the transformer are concatenated,
    passed through a bidirectional LSTM and a bidirectional GRU, then pooled
    (mean, max, attention). The pooled vector is joined with category/host
    embeddings and the handmade features and fed to two linear layers that
    produce 21 and 9 outputs, concatenated into 30 columns.

    Attribute names are the state-dict keys of the saved checkpoints, so they
    must stay unchanged.
    """

    def __init__(self, n_cat, cat_emb, n_host, host_emb, num_labels):
        # NOTE: `num_labels` is accepted but not used; the output widths
        # (21 and 9) are hard-coded below.
        super().__init__()
        BERT_DIMS = 1024
        LSTM_UNITS = 512
        model_path = os.path.join(BERT_MODEL_PATH)
        self.bert_model = transformers.XLNetModel.from_pretrained(model_path,from_tf=True, config=bert_config)
        # Freeze the transformer: requires_grad is switched off for all of its parameters.
        set_trainable(self.bert_model, False)
        
        self.category_embedding = nn.Embedding(n_cat, cat_emb)
        self.host_embedding     = nn.Embedding(n_host, host_emb)
        self.embedding_dropout  = SpatialDropout(0.5)
        # Input is two hidden-state layers concatenated, hence BERT_DIMS*2.
        self.lstm1              = nn.LSTM(BERT_DIMS*2, LSTM_UNITS, bidirectional=True, batch_first=True)
        # Despite the name, the second recurrent layer is a GRU.
        self.lstm2              = nn.GRU(LSTM_UNITS*2, LSTM_UNITS, bidirectional=True, batch_first=True)
        # step_dim=512 fixes the sequence length the attention layers accept.
        self.atten1             = Attention(LSTM_UNITS*2, 512)
        self.atten2             = Attention(LSTM_UNITS*2, 512)
        self.dropout1           = nn.Dropout(0.2)
        # LSTM_UNITS*6 = mean + max + attention pooling, each 2*LSTM_UNITS wide.
        # The trailing 8 is presumably the number of handmade feature columns — confirm.
        self.fc1                = nn.Linear(LSTM_UNITS*6 + int(cat_emb) + int(host_emb) + 8, 21)
        self.fc2                = nn.Linear(LSTM_UNITS*6 + int(cat_emb) + int(host_emb) + 8, 9)
        self._init_weights(self.category_embedding)
        self._init_weights(self.host_embedding)
        self._init_weights(self.fc1)
        self._init_weights(self.fc2)
    
    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases.

        Prints a message for each weight and bias it initialises.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            print("initailize weight")
            module.weight.data.normal_(mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            print("initailize bias")
            module.bias.data.zero_()
        
    def forward(self, token_ids, masks, segment, hosts, categories, handmades):
        """Return raw (pre-sigmoid) outputs: fc1's 21 columns followed by fc2's 9."""
        
        category_embed = self.category_embedding(categories)
        host_embed     = self.host_embedding(hosts)
        external_features = torch.cat((category_embed, host_embed, handmades), 1)
        #print(external_features.shape)
        # NOTE(review): the two-item unpacking assumes the transformer returns
        # exactly (sequence_output, hidden_states) — verify for the installed
        # transformers version.
        seq_output, hidden_states = self.bert_model(input_ids=token_ids,
                                                    attention_mask=masks,
                                                    token_type_ids=segment)
        
        # Despite the name, only two layers (5th and 6th from the end) are concatenated.
        last_four_layer = torch.cat((hidden_states[-5], hidden_states[-6]), 2)
        lstm_input      = self.embedding_dropout(last_four_layer)
        lstm1_output, _ = self.lstm1(lstm_input)
        lstm2_output, _ = self.lstm2(lstm1_output)
        
        # Three poolings over the step axis; the attention masks are not passed
        # to the pooling layers, so padded positions take part.
        meanpooled_output   = torch.mean(lstm2_output, 1)
        maxpooled_output, _ = torch.max(lstm2_output, 1)
        attention_output_q  = self.atten1(lstm2_output)
        attention_output_a  = self.atten2(lstm2_output)
        
        # The two branches differ only in which attention pooling they use.
        pooled_output_q = torch.cat((meanpooled_output, maxpooled_output, attention_output_q, external_features), 1)
        pooled_output_q = self.dropout1(pooled_output_q)
        
        pooled_output_a = torch.cat((meanpooled_output, maxpooled_output, attention_output_a, external_features), 1)
        pooled_output_a = self.dropout1(pooled_output_a)
        
        q_results     = self.fc1(pooled_output_q)
        a_results     = self.fc2(pooled_output_a)
        results       = torch.cat((q_results, a_results), 1)
        
        return results

In [None]:
def predict_result(model, test_loader, batch_size=batch_size):
    """Run inference over ``test_loader`` and return sigmoid-activated predictions.

    Parameters
    ----------
    model : callable module
        Invoked as ``model(token_ids, masks, segments, hosts, categories,
        handmades)``; its output must have 30 columns.
    test_loader : DataLoader
        Yields 6-item batches in the order unpacked below. Rows of the result
        are filled in iteration order.
    batch_size : int
        Retained for backward compatibility only. Row offsets are now derived
        from the size of each batch actually produced by the loader.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test_loader.dataset), 30)``.
    """
    # Size the buffer from the loader's own dataset instead of the global
    # `test_set`, which the notebook deletes after each model family.
    output = np.zeros((len(test_loader.dataset), 30))
    model.eval()
    start_index = 0
    with torch.no_grad():
        for inputs in test_loader:
            token_ids, masks, segments, hosts, categories, handmades = inputs
            token_ids   = token_ids.long().cuda()
            masks       = masks.long().cuda()
            segments    = segments.long().cuda()
            hosts       = hosts.long().cuda()
            categories  = categories.long().cuda()
            handmades   = handmades.float().cuda()

            predictions = model(token_ids, masks, segments, hosts, categories, handmades)
            predictions = torch.sigmoid(predictions)

            # Advance by the real batch length so a short final batch (or a
            # loader whose batch size differs from `batch_size`) lands correctly.
            end_index = start_index + predictions.shape[0]
            output[start_index:end_index, :] = predictions.detach().cpu().numpy()
            start_index = end_index

    return output

In [None]:
# Fresh accumulator: one prediction array per XLNet checkpoint.
results = []
! ls /kaggle/input/

In [None]:
# Gather every *.pt checkpoint from the two XLNet weight datasets.
pretrain_weighted_1 = glob.glob('../input/d-large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-1/*.pt')
pretrain_weighted_2 = glob.glob('../input/d-large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-2/*.pt')
pretrain_weighted = pretrain_weighted_1 + pretrain_weighted_2

In [None]:
pretrain_weighted

In [None]:
!nvidia-smi # check GPU Memory

In [None]:
#cuda.select_device(0) #clear GPU memory 
#cuda.close()

In [None]:
# NOTE(review): `cuda` is not defined in the visible part of the notebook
# (presumably numba.cuda) — confirm it is imported before this cell runs.
cuda.select_device(0) #restart cuda

In [None]:
!nvidia-smi #Check GPU Memory

In [None]:
# Wrap the tokenised test inputs plus categorical / handmade features.
# `train_handmade_features` holds the test-time feature matrix despite its name.
test_set    = QuestDataset_test(test_tokens, test_masks, test_segments,
                                test_df['host'].values,
                                test_df['category'].values,
                                train_handmade_features)
# shuffle=False keeps batches in dataset order, which predict_result relies on.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [None]:
model = QuestModel6(n_cat, cat_emb, n_host, host_emb, num_labels=30)
model.cuda()

In [None]:
# Reuse a single model instance: load each checkpoint in turn and predict.
for i, weight in tqdm(enumerate(pretrain_weighted)):
    model.load_state_dict(torch.load(weight))
    results.append(predict_result(model, test_loader))

In [None]:
#from scipy.stats import rankdata

#output = np.array([np.array([rankdata(c) for c in p.T]).T for p in results]).mean(axis=0)
#max_val = output.max() + 1
#output = output/max_val + 1e-12

# Plain arithmetic mean of the per-checkpoint predictions.
output_xlnet = np.zeros((len(test_set),30))
for result in results:
    output_xlnet += result
output_xlnet /= len(results)

In [None]:
# Drop references so later sections can reuse the memory.
# `result` is the loop variable left over from the averaging cell above.
del results, result, test_set, test_loader, model; gc.collect()

In [None]:
torch.cuda.empty_cache() ## Clear_Memory

In [None]:
#!pip install ../input/sacremoses/sacremoses-master/ > /dev/null

import os
import sys
import glob
import torch

#sys.path.insert(0, "../input/transformers/transformers-master/")
sys.path.append("../input/mynlpscripts/")
#sys.path.append('/content/gdrive/My Drive/googlequest')
sys.path.extend(['../input/bert-utils/'])
#import transformers
import numpy as np
import pandas as pd
import math

In [None]:
import os

import matplotlib.pylab as plt
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub

print("TF version:", tf.__version__)
print("Hub version:", hub.__version__)
#print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")


In [None]:
#!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo

import re
import os
import sys
import json
import math

import logging
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

from tensorflow import keras
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint

from sklearn.model_selection import train_test_split
#from google.colab import auth, drive

#if not 'bert_repo' in sys.path:
#    sys.path.insert(0, 'bert_repo')

from modeling import BertModel, BertConfig
from tokenization import FullTokenizer, convert_to_unicode
from extract_features import InputExample, convert_examples_to_features


# get TF logger 
log = logging.getLogger('tensorflow')
log.handlers = []

In [None]:
import os
import json
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import re
import gc
import pickle  
import random
from tensorflow import keras
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
import pandas as pd
#import tensorflow as tf
#import tensorflow_hub as hub
#from bert_tokenization import FullTokenizer
#import bert.tokenization as tokenization
import tensorflow.keras.backend as K
from scipy.stats import spearmanr
from math import floor, ceil

os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = "true"

from tqdm import tqdm
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import gc
import json
from sklearn.preprocessing import StandardScaler
import joblib
#import nltk
#from nltk.corpus import stopwords
#eng_stopwords = set(stopwords.words("english"))
import string


from copy import deepcopy

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D, Reshape
from tensorflow.keras.initializers import he_normal, he_uniform, glorot_normal, glorot_uniform, VarianceScaling
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding,GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, Bidirectional, Conv1D, MaxPool1D, Flatten, GRU, Concatenate, SpatialDropout1D, GlobalMaxPool1D
from tensorflow.keras.models import model_from_json
from scipy.stats import spearmanr, rankdata
from os.path import join as path_join
from numpy.random import seed
from urllib.parse import urlparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.linear_model import MultiTaskElasticNet, MultiTaskElasticNetCV
from sklearn.multioutput import MultiOutputRegressor, RegressorChain, ClassifierChain
from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit,
                                     StratifiedKFold, GroupShuffleSplit,
                                     GroupKFold, StratifiedShuffleSplit)

from sklearn.model_selection import BaseCrossValidator

seed(42)
tf.set_random_seed(42)
random.seed(42)

In [None]:
"""This file includes multilabel cross validators based on an implementation of
the Iterative Stratification algorithm described in the following paper:
Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-
Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M. (eds)
Machine Learning and Knowledge Discovery in Databases. ECML PKDD 2011. Lecture
Notes in Computer Science, vol 6913. Springer, Berlin, Heidelberg.
From scikit-learn 0.19.0, StratifiedKFold, RepeatedStratifiedKFold, and
StratifiedShuffleSplit were copied and modified, retaining compatibility
with scikit-learn.
Attribution to authors of scikit-learn/model_selection/_split.py under BSD 3 clause:
    Alexandre Gramfort <alexandre.gramfort@inria.fr>,
    Gael Varoquaux <gael.varoquaux@normalesup.org>,
    Olivier Grisel <olivier.grisel@ensta.org>,
    Raghav RV <rvraghav93@gmail.com>
"""

# Author: Trent J. Bradberry <trentjason@hotmail.com>
# License: BSD 3 clause

import numpy as np

from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples, check_array
from sklearn.utils.multiclass import type_of_target

from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, \
    BaseShuffleSplit, _validate_shuffle_split


def IterativeStratification(labels, r, random_state):
    """This function implements the Iterative Stratification algorithm described
    in the following paper:
    Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of
    Multi-Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M.
    (eds) Machine Learning and Knowledge Discovery in Databases. ECML PKDD
    2011. Lecture Notes in Computer Science, vol 6913. Springer, Berlin,
    Heidelberg.

    Parameters
    ----------
    labels : ndarray, shape (n_samples, n_labels)
        Label indicator matrix. Rows are used as boolean masks over the label
        axis, so a boolean dtype is expected (the caller in this file casts
        to bool).
    r : ndarray, shape (n_folds,)
        Desired proportion of samples for each fold.
    random_state : object with a ``choice`` method
        Random generator used to break ties.

    Returns
    -------
    test_folds : ndarray of int, shape (n_samples,)
        Fold index assigned to each sample.
    """

    n_samples = labels.shape[0]
    test_folds = np.zeros(n_samples, dtype=int)

    # Calculate the desired number of examples at each subset
    c_folds = r * n_samples

    # Calculate the desired number of examples of each label at each subset
    c_folds_labels = np.outer(r, labels.sum(axis=0))

    # True for samples that have not been assigned to a fold yet
    labels_not_processed_mask = np.ones(n_samples, dtype=bool)

    while np.any(labels_not_processed_mask):
        # Find the label with the fewest (but at least one) remaining examples,
        # breaking ties randomly
        num_labels = labels[labels_not_processed_mask].sum(axis=0)

        # Handle case where only all-zero labels are left by distributing
        # across all folds as evenly as possible (not in original algorithm but
        # mentioned in the text). (By handling this case separately, some
        # code redundancy is introduced; however, this approach allows for
        # decreased execution time when there are a relatively large number
        # of all-zero labels.)
        if num_labels.sum() == 0:
            sample_idxs = np.where(labels_not_processed_mask)[0]

            for sample_idx in sample_idxs:
                # Fold(s) that still want the most samples
                fold_idx = np.where(c_folds == c_folds.max())[0]

                if fold_idx.shape[0] > 1:
                    fold_idx = fold_idx[random_state.choice(fold_idx.shape[0])]

                test_folds[sample_idx] = fold_idx
                c_folds[fold_idx] -= 1

            break

        label_idx = np.where(num_labels == num_labels[np.nonzero(num_labels)].min())[0]
        if label_idx.shape[0] > 1:
            label_idx = label_idx[random_state.choice(label_idx.shape[0])]

        # Unassigned samples that carry the chosen label
        sample_idxs = np.where(np.logical_and(labels[:, label_idx].flatten(), labels_not_processed_mask))[0]

        for sample_idx in sample_idxs:
            # Find the subset(s) with the largest number of desired examples
            # for this label, breaking ties by considering the largest number
            # of desired examples, breaking further ties randomly
            label_folds = c_folds_labels[:, label_idx]
            fold_idx = np.where(label_folds == label_folds.max())[0]

            if fold_idx.shape[0] > 1:
                temp_fold_idx = np.where(c_folds[fold_idx] ==
                                         c_folds[fold_idx].max())[0]
                fold_idx = fold_idx[temp_fold_idx]

                if temp_fold_idx.shape[0] > 1:
                    fold_idx = fold_idx[random_state.choice(temp_fold_idx.shape[0])]

            test_folds[sample_idx] = fold_idx
            labels_not_processed_mask[sample_idx] = False

            # Update desired number of examples
            # (labels[sample_idx] selects every label this sample carries)
            c_folds_labels[fold_idx, labels[sample_idx]] -= 1
            c_folds[fold_idx] -= 1

    return test_folds


class MultilabelStratifiedKFold(_BaseKFold):
    """Multilabel stratified K-Folds cross-validator
    Provides train/test indices to split multilabel data into train/test sets.
    This cross-validation object is a variation of KFold that returns
    stratified folds for multilabel data. The folds are made by preserving
    the percentage of samples for each label.
    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.
    shuffle : boolean, optional
        Whether to shuffle each stratification of the data before splitting
        into batches.
    random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Unlike StratifiedKFold that only uses random_state
        when ``shuffle`` == True, this multilabel implementation
        always uses the random_state since the iterative stratification
        algorithm breaks ties randomly.
    Examples
    --------
    >>> from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0)
    >>> mskf.get_n_splits(X, y)
    2
    >>> print(mskf)  # doctest: +NORMALIZE_WHITESPACE
    MultilabelStratifiedKFold(n_splits=2, random_state=0, shuffle=False)
    >>> for train_index, test_index in mskf.split(X, y):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [0 3 4 6] TEST: [1 2 5 7]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    Notes
    -----
    Train and test sizes may be slightly different in each fold.
    See also
    --------
    RepeatedMultilabelStratifiedKFold: Repeats Multilabel Stratified K-Fold
    n times.
    """

    def __init__(self, n_splits=3, shuffle=False, random_state=None):
        # Pass shuffle/random_state by keyword: newer scikit-learn releases
        # make them keyword-only in the base class, while older releases
        # accept keywords just as well as positionals.
        super(MultilabelStratifiedKFold, self).__init__(
            n_splits, shuffle=shuffle, random_state=random_state)

    def _make_test_folds(self, X, y):
        """Return the fold index of every sample, in the original sample order."""
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)

        if type_of_target_y != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(type_of_target_y))

        num_samples = y.shape[0]

        rng = check_random_state(self.random_state)
        indices = np.arange(num_samples)

        if self.shuffle:
            rng.shuffle(indices)
            y = y[indices]

        # Equal target proportion for every fold.
        r = np.asarray([1 / self.n_splits] * self.n_splits)

        test_folds = IterativeStratification(labels=y, r=r, random_state=rng)

        # Undo the shuffle so fold ids line up with the caller's row order.
        return test_folds[np.argsort(indices)]

    def _iter_test_masks(self, X=None, y=None, groups=None):
        """Yield one boolean test mask per fold."""
        test_folds = self._make_test_folds(X, y)
        for i in range(self.n_splits):
            yield test_folds == i

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.
        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.
        groups : object
            Always ignored, exists for compatibility.
        Returns
        -------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(MultilabelStratifiedKFold, self).split(X, y, groups)


class RepeatedMultilabelStratifiedKFold(_RepeatedSplits):
    """Repeated Multilabel Stratified K-Fold cross validator.

    Repeats Multilabel Stratified K-Fold n times with different randomization
    in each repetition.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.
    n_repeats : int, default=10
        Number of times cross-validator needs to be repeated.
    random_state : None, int or RandomState, default=None
        Random state to be used to generate random state for each
        repetition as well as randomly breaking ties within the iterative
        stratification algorithm.

    Examples
    --------
    >>> from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> rmskf = RepeatedMultilabelStratifiedKFold(n_splits=2, n_repeats=2,
    ...     random_state=0)
    >>> for train_index, test_index in rmskf.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    ...
    TRAIN: [0 3 4 6] TEST: [1 2 5 7]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    TRAIN: [0 1 4 5] TEST: [2 3 6 7]
    TRAIN: [2 3 6 7] TEST: [0 1 4 5]

    See also
    --------
    RepeatedStratifiedKFold: Repeats (Non-multilabel) Stratified K-Fold
    n times.
    """

    def __init__(self, n_splits=5, n_repeats=10, random_state=None):
        # Everything beyond the repetition settings is forwarded to the
        # wrapped MultilabelStratifiedKFold as keyword arguments.
        fold_kwargs = {"n_splits": n_splits}
        super().__init__(MultilabelStratifiedKFold, n_repeats, random_state,
                         **fold_kwargs)


class MultilabelStratifiedShuffleSplit(BaseShuffleSplit):
    """Multilabel Stratified ShuffleSplit cross-validator
    Provides train/test indices to split data into train/test sets.
    This cross-validation object is a merge of MultilabelStratifiedKFold and
    ShuffleSplit, which returns stratified randomized folds for multilabel
    data. The folds are made by preserving the percentage of each label.
    Note: like the ShuffleSplit strategy, multilabel stratified random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.
    Parameters
    ----------
    n_splits : int, default 10
        Number of re-shuffling & splitting iterations.
    test_size : float, int, None, optional
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. By default, the value is set to 0.1.
        The default will change in version 0.21. It will remain 0.1 only
        if ``train_size`` is unspecified, otherwise it will complement
        the specified ``train_size``.
    train_size : float, int, or None, default is None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Unlike StratifiedShuffleSplit that only uses
        random_state when ``shuffle`` == True, this multilabel implementation
        always uses the random_state since the iterative stratification
        algorithm breaks ties randomly.
    Examples
    --------
    >>> from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> msss = MultilabelStratifiedShuffleSplit(n_splits=3, test_size=0.5,
    ...    random_state=0)
    >>> msss.get_n_splits(X, y)
    3
    >>> print(msss)       # doctest: +ELLIPSIS
    MultilabelStratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,
                                     train_size=None)
    >>> for train_index, test_index in msss.split(X, y):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    TRAIN: [2 3 6 7] TEST: [0 1 4 5]
    TRAIN: [1 2 5 6] TEST: [0 3 4 7]
    Notes
    -----
    Train and test sizes may be slightly different from desired due to the
    preference of stratification over perfectly sized folds.
    """

    def __init__(self, n_splits=10, test_size="default", train_size=None,
                 random_state=None):
        # All arguments are forwarded positionally to BaseShuffleSplit.
        super(MultilabelStratifiedShuffleSplit, self).__init__(
            n_splits, test_size, train_size, random_state)

    def _iter_indices(self, X, y, groups=None):
        """Yield ``(train, test)`` index arrays, one pair per split.

        ``y`` is cast to a boolean array and must be of target type
        'multilabel-indicator', otherwise ``ValueError`` is raised.
        ``groups`` is accepted but not used.
        """
        n_samples = _num_samples(X)
        y = check_array(y, ensure_2d=False, dtype=None)
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)

        if type_of_target_y != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
                    type_of_target_y))

        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                                  self.train_size)

        # From here on the sample count is taken from y rather than X.
        n_samples = y.shape[0]
        rng = check_random_state(self.random_state)
        y_orig = y.copy()

        # Desired proportions, train first and test second; this is why
        # fold id 1 is treated as the test fold below.
        r = np.array([n_train, n_test]) / (n_train + n_test)

        for _ in range(self.n_splits):
            # The same rng drives both the shuffle and the stratification,
            # so the order of these calls determines the resulting splits.
            indices = np.arange(n_samples)
            rng.shuffle(indices)
            y = y_orig[indices]

            test_folds = IterativeStratification(labels=y, r=r, random_state=rng)

            # argsort(indices) undoes the shuffle so fold ids line up with
            # the original sample order.
            test_idx = test_folds[np.argsort(indices)] == 1
            test = np.where(test_idx)[0]
            train = np.where(~test_idx)[0]

            yield train, test

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.
        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.
        groups : object
            Always ignored, exists for compatibility.
        Returns
        -------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        # Validate y here, then let the parent class drive _iter_indices.
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(MultilabelStratifiedShuffleSplit, self).split(X, y, groups)

In [None]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` (train_indices, test_indices) splits that keep groups intact.

    Groups are assigned greedily, one at a time, to the fold that minimises
    the mean (over labels) standard deviation of each label's share across
    folds. Every sample of a group therefore lands in the same test fold.

    Parameters
    ----------
    X : any
        Unused; kept so the signature mirrors scikit-learn splitters.
    y : sequence of non-negative int
        Class label of each sample; used to index count vectors of length
        ``max(y) + 1``.
    groups : sequence of hashable
        Group key of each sample, aligned with ``y``.
    k : int
        Number of folds.
    seed : optional
        Seed for the shuffle that breaks ties between equally-ranked groups.

    Yields
    ------
    (list of int, list of int)
        Positional train and test indices for each fold.
    """
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # Label ids that never occur (gaps below max(y)) have a total count of 0.
    # Dividing by that count gave NaN, which made every fold score NaN and
    # sent every group to fold 0. Only labels that actually occur are scored.
    present_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Tentatively add the group to `fold`, score the resulting label
        # balance, then roll the tentative assignment back.
        y_counts_per_fold[fold] += y_counts
        std_per_label = [
            np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            for label in present_labels
        ]
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # Groups with the most uneven label counts are placed first; the sort is
    # stable, so the seeded shuffle decides the order among ties.
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for fold in range(k):
        test_groups = groups_per_fold[fold]
        train_groups = all_groups - test_groups

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [None]:
def chunks(l, n):
    """Yield consecutive slices of ``l``, each holding at most ``n`` items."""
    yield from (l[start:start + n] for start in range(0, len(l), n))

In [None]:
def build_module_fn(config_path, vocab_path, do_lower_case=True):
    """Return a module function that builds a BERT graph with two signatures.

    The returned ``bert_module_fn(is_training)`` closes over ``config_path``,
    ``vocab_path`` and ``do_lower_case``. It registers:

    * signature "tokens": inputs ``input_ids`` / ``input_mask`` /
      ``segment_ids``, outputs ``pooled_output`` / ``sequence_output``;
    * signature "tokenization_info": no inputs, outputs ``vocab_file`` and
      ``do_lower_case``.
    """

    def bert_module_fn(is_training):
        """Spec function for a token embedding module."""

        # Rank-2 int32 placeholders; both dimensions are left unspecified.
        input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids")
        input_mask = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_mask")
        token_type = tf.placeholder(shape=[None, None], dtype=tf.int32, name="segment_ids")

        config = BertConfig.from_json_file(config_path)
        model = BertModel(config=config, is_training=is_training,
                          input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type)
          
        # Sequence output is the last entry of the encoder layer list.
        seq_output = model.all_encoder_layers[-1]
        pool_output = model.get_pooled_output()

        config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file")
        vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
        lower_case = tf.constant(do_lower_case)

        # Only the two file-path constants are registered as asset file paths.
        tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
        tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)
        
        input_map = {"input_ids": input_ids,
                     "input_mask": input_mask,
                     "segment_ids": token_type}
        
        output_map = {"pooled_output": pool_output,
                      "sequence_output": seq_output}

        output_info_map = {"vocab_file": vocab_file,
                           "do_lower_case": lower_case}
                
        hub.add_signature(name="tokens", inputs=input_map, outputs=output_map)
        hub.add_signature(name="tokenization_info", inputs={}, outputs=output_info_map)

    return bert_module_fn

In [None]:
# Export the checkpoint stored in MODEL_DIR1 as a TF-Hub module written to
# the local directory "bert-module1".
MODEL_DIR1 = '../input/bertbase' #"uncased_L-12_H-768_A-12" 

config_path1 = "{}/config.json".format(MODEL_DIR1)
vocab_path1 = "{}/vocab.txt".format(MODEL_DIR1)

# Two graph variants are declared: tags {"train"} with is_training=True and
# an empty tag set with is_training=False.
tags_and_args = []
for is_training in (True, False):
    tags = set()
    if is_training:
        tags.add("train")
    tags_and_args.append((tags, dict(is_training=is_training)))

# NOTE: `module_fn`, `spec` and `tags_and_args` are module-level names that
# the next export cell reassigns.
module_fn = build_module_fn(config_path1, vocab_path1)
spec = hub.create_module_spec(module_fn, tags_and_args=tags_and_args)
spec.export("bert-module1", 
            checkpoint_path="{}/bert_model.ckpt".format(MODEL_DIR1))

In [None]:
# Export the checkpoint stored in MODEL_DIR as a TF-Hub module written to
# the local directory "bert-module".
# NOTE(review): the trailing comment names a 12-layer model while the
# directory is called "bertlarge" - confirm which architecture is stored.
MODEL_DIR = '../input/bertlarge' #"uncased_L-12_H-768_A-12" 

# Unlike the base export cell, the config file here is "bert_config.json".
config_path = "{}/bert_config.json".format(MODEL_DIR)
vocab_path = "{}/vocab.txt".format(MODEL_DIR)

# Two graph variants are declared: tags {"train"} with is_training=True and
# an empty tag set with is_training=False.
tags_and_args = []
for is_training in (True, False):
    tags = set()
    if is_training:
        tags.add("train")
    tags_and_args.append((tags, dict(is_training=is_training)))

module_fn = build_module_fn(config_path, vocab_path)
spec = hub.create_module_spec(module_fn, tags_and_args=tags_and_args)
spec.export("bert-module", 
            checkpoint_path="{}/bert_model.ckpt".format(MODEL_DIR))

In [None]:
# Paths, tokenizers, sequence-length limits and competition data.
# NOTE: PATH, the three data frames and the category lists are also assigned
# near the top of the notebook; BERT_PATH is given a different value here.
PATH = '../input/google-quest-challenge/'
BERT_PATH = '../input/bert-tfhub' #/bert_en_uncased_L-12_H-768_A-12
USE_PATH  = '../input/uselarge4/'
# `tokenizer` uses the vocab from the "bertlarge" directory, `tokenizer1`
# the vocab from the "bertbase" directory; both are lower-casing.
tokenizer = FullTokenizer(vocab_path, True)
tokenizer1 = FullTokenizer(vocab_path1, True)
# Maximum lengths: joint title+question+answer input, then separate limits
# for the title (_t), question body (_q) and answer (_a) inputs.
MAX_SEQUENCE_LENGTH = 512
MAX_SEQUENCE_LENGTH_t = 120
MAX_SEQUENCE_LENGTH_q = 416
MAX_SEQUENCE_LENGTH_a =  416

df_train = pd.read_csv(PATH+'train.csv')
df_test = pd.read_csv(PATH+'test.csv')
df_sub = pd.read_csv(PATH+'sample_submission.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Targets are every training column from position 11 onward; inputs are the
# training columns at positions 1, 2 and 5.
output_categories = list(df_train.columns[11:])
input_categories = list(df_train.columns[[1,2,5]])
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

In [None]:
def read_examples(str_list):
    """Yield one `InputExample` per non-empty string in ``str_list``.

    A string of the form ``"<a> ||| <b>"`` is split into ``text_a`` and
    ``text_b``; any other string becomes ``text_a`` with ``text_b=None``.
    Strings that are empty after unicode conversion are skipped and do not
    consume a ``unique_id``.
    """
    pair_pattern = r"^(.*) \|\|\| (.*)$"
    next_id = 0
    for raw in str_list:
        text = convert_to_unicode(raw)
        if not text:
            continue
        text = text.strip()
        pair = re.match(pair_pattern, text)
        if pair is None:
            first, second = text, None
        else:
            first, second = pair.group(1), pair.group(2)
        yield InputExample(unique_id=next_id, text_a=first, text_b=second)
        next_id += 1

In [None]:
def features_to_arrays(features):
    """Stack per-example feature fields into three int32 arrays.

    Returns a tuple ``(input_ids, input_mask, segment_ids)`` built from the
    ``input_ids``, ``input_mask`` and ``input_type_ids`` attributes of each
    feature, in iteration order.
    """
    # Single pass so that one-shot iterables are consumed only once.
    rows = [(feat.input_ids, feat.input_mask, feat.input_type_ids)
            for feat in features]
    return tuple(
        np.array([row[field] for row in rows], dtype='int32')
        for field in range(3)
    )

In [None]:
def build_preprocessor(voc_path, seq_len, lower=True):
    """Create a function that converts raw strings into BERT input arrays.

    A ``FullTokenizer`` is built once from ``voc_path`` / ``lower`` and
    captured by the returned ``strings_to_arrays`` closure.
    """
    vocab_tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower)

    def strings_to_arrays(sents):
        # Accept a scalar or an array of any shape and flatten it to 1-D.
        flat_sents = np.atleast_1d(sents).reshape((-1,))
        examples = list(read_examples(flat_sents))
        features = convert_examples_to_features(examples, seq_len, vocab_tokenizer)
        return features_to_arrays(features)

    return strings_to_arrays

In [None]:
# Names of the 30 target columns: 21 question targets followed by
# 9 answer targets.
targets = [
    # question targets
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    # answer targets
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

# Text columns fed to the models.
input_columns = [
    'question_title',
    'question_body',
    'answer',
]
#y_train = train[targets].values

In [None]:
def _get_masks1(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments1(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids1(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def _trim_input1(title, question, answer,tokenizer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):
  
    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        t = t[:t_new_len]
        q = q[:q_new_len]
        a = a[:a_new_len]
    
    return t, q, a

def _convert_to_bert_inputs1(title, question, answer, tokenizer, max_sequence_length):
    """Build ``[ids, masks, segments]`` for a title/question/answer triple.

    The token sequence is
    ``[CLS] title [SEP] question [SEP] answer [SEP]``.
    """
    sequence = ["[CLS]"]
    for part in (title, question, answer):
        sequence = sequence + part + ["[SEP]"]

    ids = _get_ids1(sequence, tokenizer, max_sequence_length)
    masks = _get_masks1(sequence, max_sequence_length)
    segments = _get_segments1(sequence, max_sequence_length)
    return [ids, masks, segments]

def compute_input_arays1(df, columns, tokenizer, max_sequence_length):
    """Encode every row of ``df[columns]`` as joint title/question/answer input.

    Reads the ``question_title``, ``question_body`` and ``answer`` fields of
    each row and returns ``[input_ids, input_masks, input_segments]`` as
    int32 arrays with one row per data-frame row.
    """
    encoded_rows = []
    for _, row in tqdm(df[columns].iterrows()):
        title, body, answer = row.question_title, row.question_body, row.answer
        title, body, answer = _trim_input1(
            title, body, answer, tokenizer, max_sequence_length)
        ids, masks, segments = _convert_to_bert_inputs1(
            title, body, answer, tokenizer, max_sequence_length)
        encoded_rows.append((ids, masks, segments))

    return [np.asarray([encoded[field] for encoded in encoded_rows], dtype=np.int32)
            for field in range(3)]


def compute_output_arrays1(df, columns):
    """Return the selected ``columns`` of ``df`` as a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)

In [None]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids


def _trim_input(question,tokenizer,  max_sequence_length):
    """Trims tokenized input to max_sequence_length, 
    while keeping the same ratio of Q and A length"""
    q = tokenizer.tokenize(question)
    #a = tokenizer.tokenize(answer)
    
    q_len = len(q)
    #a_len = len(a)
    
    if (q_len+3) > max_sequence_length:
        
        new_q_len = max_sequence_length-3
        #new_a_len = a_len/(q_len+a_len) * (max_sequence_length-3)
        #new_q_len, new_a_len = int(ceil(new_q_len)), int(floor(new_a_len))
            
        if new_q_len+3 != max_sequence_length:
            raise ValueError("too small %s" % str(new_q_len+3))
        
        q = q[:new_q_len]
        #a = a[:new_a_len]
    
    return q

def _convert_to_bert_inputs(question, tokenizer, max_sequence_length):
    """Build ``[ids, masks, segments]`` for a single tokenized text.

    The token sequence is ``[CLS] question [SEP]``.
    """
    sequence = ["[CLS]"] + question
    sequence = sequence + ["[SEP]"]

    ids = _get_ids(sequence, tokenizer, max_sequence_length)
    masks = _get_masks(sequence, max_sequence_length)
    segments = _get_segments(sequence, max_sequence_length)
    return [ids, masks, segments]

def _compute_single_text_input_arrays(df, columns, tokenizer, max_sequence_length, field):
    """Encode one text field of every row of ``df[columns]`` for BERT.

    Shared implementation behind ``compute_input_arays_q``,
    ``compute_input_arays_a`` and ``compute_input_arays_t``; ``field`` is the
    name of the row attribute to read.

    Returns ``[input_ids, input_masks, input_segments]`` as int32 arrays with
    one row per data-frame row.
    """
    input_ids, input_masks, input_segments = [], [], []
    for _, instance in tqdm(df[columns].iterrows()):
        text = getattr(instance, field)

        tokens = _trim_input(text, tokenizer, max_sequence_length)

        ids, masks, segments = _convert_to_bert_inputs(
            tokens, tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    return [np.asarray(input_ids, dtype=np.int32),
            np.asarray(input_masks, dtype=np.int32),
            np.asarray(input_segments, dtype=np.int32)]


def compute_input_arays_q(df, columns, tokenizer, max_sequence_length):
    """Encode the ``question_body`` field of every row; see the shared helper."""
    return _compute_single_text_input_arrays(
        df, columns, tokenizer, max_sequence_length, "question_body")


def compute_input_arays_a(df, columns, tokenizer, max_sequence_length):
    """Encode the ``answer`` field of every row; see the shared helper."""
    return _compute_single_text_input_arrays(
        df, columns, tokenizer, max_sequence_length, "answer")


def compute_input_arays_t(df, columns, tokenizer, max_sequence_length):
    """Encode the ``question_title`` field of every row; see the shared helper."""
    return _compute_single_text_input_arrays(
        df, columns, tokenizer, max_sequence_length, "question_title")

def compute_output_arrays(df, columns):
    """Return ``df[columns]`` converted to a NumPy array."""
    target_frame = df[columns]
    return np.asarray(target_frame)

# Modeling

In [None]:
# NOTE(review): presumably blending weights for three model families
# (nn / mte / lgb) - confirm against the code that consumes them.
coef_nn, coef_mte, coef_lgb = 2, 1.7, 1

In [None]:
class TDense(tf.keras.layers.Layer):
    """Dense layer whose kernel is stored transposed.

    The kernel has shape ``(output_size, input_dim)`` and the layer computes
    ``matmul(x, kernel, transpose_b=True) + bias``.
    """

    def __init__(self,
                 output_size,
                 kernel_initializer=None,
                 bias_initializer="zeros",
                 **kwargs):
        super().__init__(**kwargs)
        self.output_size = output_size
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer

    def build(self, input_shape):
        """Create ``kernel`` and ``bias`` once the input's last dimension is known."""
        dtype = tf.as_dtype(self.dtype or tf.keras.backend.floatx())
        if not dtype.is_floating and not dtype.is_complex:
            raise TypeError("Unable to build `TDense` layer with "
                          "non-floating point (and non-complex) "
                          "dtype %s" % (dtype,))

        input_shape = tf.TensorShape(input_shape)
        last_dim = tf.compat.dimension_value(input_shape[-1])
        if last_dim is None:
            raise ValueError("The last dimension of the inputs to "
                           "`TDense` should be defined. "
                           "Found `None`.")

        ### tf 2.1 rc min_ndim=3 -> min_ndim=2
        self.input_spec = tf.keras.layers.InputSpec(min_ndim=2, axes={-1: last_dim})

        # Weight creation order (kernel, then bias) is kept as in the
        # original definition.
        kernel_shape = [self.output_size, last_dim]
        self.kernel = self.add_weight(
            "kernel",
            shape=kernel_shape,
            initializer=self.kernel_initializer,
            dtype=self.dtype,
            trainable=True)
        bias_shape = [self.output_size]
        self.bias = self.add_weight(
            "bias",
            shape=bias_shape,
            initializer=self.bias_initializer,
            dtype=self.dtype,
            trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Project ``x`` with the transposed kernel and add the bias."""
        projected = tf.matmul(x, self.kernel, transpose_b=True)
        return projected + self.bias

In [None]:
# Compatible with tensorflow backend

def compute_spearmanr(trues, preds):
    """Mean column-wise Spearman correlation, ignoring NaN columns.

    Gaussian noise with standard deviation 1e-7 is added to each prediction
    column before ranking.
    """
    def _column_rho(true_col, pred_col):
        jitter = np.random.normal(0, 1e-7, pred_col.shape[0])
        return spearmanr(true_col, pred_col + jitter).correlation

    return np.nanmean([_column_rho(true_col, pred_col)
                       for true_col, pred_col in zip(trues.T, preds.T)])

def average_spearmanr(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Average of the column-wise Spearman correlations (no noise, no NaN handling)."""
    per_column = []
    for true_col, pred_col in zip(y_true.T, y_pred.T):
        per_column.append(spearmanr(true_col, pred_col).correlation)
    return np.average(per_column)
class SpearmanRhoCallback(Callback):
    """Keras callback that tracks validation Spearman rho after every epoch.

    After each epoch the model predicts on the validation inputs and the mean
    column-wise Spearman correlation (NaN columns ignored, predictions
    jittered with noise of standard deviation 1e-7) is computed. When the
    score is at least as good as the best so far, the weights are saved to
    ``model_name``; otherwise a bad-epoch counter is increased. Training is
    stopped once that counter reaches ``patience``.

    Note: the bad-epoch counter is cumulative; it is not reset when the
    score improves again.

    Parameters
    ----------
    training_data : sequence
        ``(x, y)``; stored on the instance but not used by the hooks.
    validation_data : sequence
        ``(x_val, y_val)`` used for scoring.
    patience : int
        Number of non-improving epochs after which training is stopped.
    model_name : str
        Path passed to ``model.save_weights``.
    """

    def __init__(self, training_data, validation_data, patience, model_name):
        # Let the base class initialise its own attributes first.
        super().__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

        self.patience = patience
        self.value = -1
        self.bad_epochs = 0
        self.model_name = model_name

    def on_train_begin(self, logs=None):
        return

    def on_train_end(self, logs=None):
        return

    def on_epoch_begin(self, epoch, logs=None):
        return

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set, checkpoint on improvement, maybe stop."""
        y_pred_val = self.model.predict(self.x_val)
        n_rows = y_pred_val.shape[0]
        rho_val = np.nanmean([
            spearmanr(self.y_val[:, ind],
                      y_pred_val[:, ind] + np.random.normal(0, 1e-7, n_rows)).correlation
            for ind in range(y_pred_val.shape[1])
        ])
        if rho_val >= self.value:
            self.value = rho_val
            self.model.save_weights(self.model_name)
        else:
            self.bad_epochs += 1
        if self.bad_epochs >= self.patience:
            print("Epoch %05d: early stopping Threshold" % epoch)
            self.model.stop_training = True
        print('\rval_spearman-rho: %s' % (str(round(rho_val, 4))), end=100*' '+'\n')
        return rho_val

    def on_batch_begin(self, batch, logs=None):
        return

    def on_batch_end(self, batch, logs=None):
        return

In [None]:
def create_learning_rate_scheduler(max_learn_rate=1e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=2,
                                   total_epoch_count=5):
    """Build a ``LearningRateScheduler`` with linear warm-up and exponential decay.

    For ``epoch < warmup_epoch_count`` the rate is
    ``max_learn_rate / warmup_epoch_count * (epoch + 1)``. Afterwards it
    decays exponentially from ``max_learn_rate`` so that it would equal
    ``end_learn_rate`` at ``epoch == total_epoch_count``.
    """

    def lr_scheduler(epoch):
        if epoch < warmup_epoch_count:
            per_epoch_step = max_learn_rate/warmup_epoch_count
            return float(per_epoch_step * (epoch + 1))
        # Same evaluation order as a single expression: (log * steps) / span.
        decay_exponent = (math.log(end_learn_rate/max_learn_rate)
                          * (epoch-warmup_epoch_count+1)
                          / (total_epoch_count-warmup_epoch_count+1))
        return float(max_learn_rate*math.exp(decay_exponent))

    return tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)

In [None]:
class BertLayer1(tf.keras.layers.Layer):
    """Keras layer wrapping a TF-Hub BERT module (TF1 graph/session style).

    Parameters
    ----------
    bert_path : str
        Module location. ``https://`` and ``gs://`` paths are used verbatim;
        anything else is converted to an absolute local path.
    seq_len : int
        Sequence length given to the string preprocessor and used as the
        static second dimension of preprocessed inputs.
    n_tune_layers : int
        Number of top encoder layers whose variables are marked trainable.
    pooling : {"cls", "mean", None}
        "cls" makes ``call`` return the module's pooled output; any other
        accepted value makes it return ``(mean_pooled, masked_sequence)``.
    do_preprocessing : bool
        When True, ``call`` expects raw strings and tokenizes them through
        ``tf.numpy_function``; otherwise it expects the three id tensors.
    verbose : bool
        Print the trainable variables while building.
    tune_embeddings : bool
        Also mark variables whose name contains "embeddings" as trainable.
    trainable : bool
        Passed to ``hub.Module``.
    """

    def __init__(self, bert_path, seq_len=64, n_tune_layers=3, 
                 pooling="cls", do_preprocessing=True, verbose=False,
                 tune_embeddings=False, trainable=True, **kwargs):

        # NOTE(review): `trainable` is assigned before the base initializer
        # runs and is not forwarded to it - confirm the base class does not
        # overwrite the value.
        self.trainable = trainable
        self.n_tune_layers = n_tune_layers
        self.tune_embeddings = tune_embeddings
        self.do_preprocessing = do_preprocessing

        self.verbose = verbose
        self.seq_len = seq_len
        self.pooling = pooling
        self.bert_path = bert_path

        # Number of variables assumed per encoder layer; used in build() to
        # derive the encoder layer count from the variable count.
        self.var_per_encoder = 16
        if self.pooling not in ["cls", "mean", None]:
            raise NameError(
                f"Undefined pooling type (must be either 'cls', 'mean', or None, but is {self.pooling}"
            )

        super(BertLayer1, self).__init__(**kwargs)

    def build(self, input_shape):
        """Load the hub module and sort its variables into trainable / frozen."""

        self.bert = hub.Module(self.build_abspath(self.bert_path), 
                               trainable=self.trainable, name=f"{self.name}_module")

        # Name fragments identifying the variables that should be trained.
        trainable_layers = []
        if self.tune_embeddings:
            trainable_layers.append("embeddings")

        if self.n_tune_layers > 0:
            encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name]
            n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder)
            # Select the last `n_tune_layers` encoder layers.
            for i in range(self.n_tune_layers):
                trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/")
        
        # Add module variables to layer's trainable weights
        for var in self.bert.variables:
            if any([l in var.name for l in trainable_layers]):
                self._trainable_weights.append(var)
            else:
                self._non_trainable_weights.append(var)

        if self.verbose:
            print("*** TRAINABLE VARS *** ")
            for var in self._trainable_weights:
                print(var)

        self.build_preprocessor()
        self.initialize_module()

        super(BertLayer1, self).build(input_shape)

    def build_abspath(self, path):
        """Return remote (https:// or gs://) paths unchanged, local ones absolute."""
        if path.startswith("https://") or path.startswith("gs://"):
            return path
        else:
            return os.path.abspath(path)

    def build_preprocessor(self):
        """Read vocab path and casing from the module and build the string preprocessor."""
        sess = tf.keras.backend.get_session()
        tokenization_info = self.bert(signature="tokenization_info", as_dict=True)
        vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                              tokenization_info["do_lower_case"]])
        self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case)

    def initialize_module(self):
        """Initialize only those module variables that are still uninitialized."""
        sess = tf.keras.backend.get_session()
        
        vars_initialized = sess.run([tf.is_variable_initialized(var) 
                                     for var in self.bert.variables])

        uninitialized = []
        for var, is_initialized in zip(self.bert.variables, vars_initialized):
            if not is_initialized:
                uninitialized.append(var)

        if len(uninitialized):
            sess.run(tf.variables_initializer(uninitialized))

    def call(self, input):
        """Run BERT and return the output selected by ``self.pooling``."""

        if self.do_preprocessing:
            input = tf.numpy_function(self.preprocessor, 
                                    [input], [tf.int32, tf.int32, tf.int32], 
                                    name='preprocessor')
            # numpy_function loses static shapes; restore the known width.
            for feature in input:
                feature.set_shape((None, self.seq_len))
        
        input_ids, input_mask, segment_ids = input
        
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)
        
        pooled_cls = output["pooled_output"]
        result = output["sequence_output"]
        
        input_mask = tf.cast(input_mask, tf.float32)
        # Zero out padded positions, and average over unmasked positions; the
        # 1e-10 term guards against an all-zero mask.
        mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
        
        pooled_mean = masked_reduce_mean(result, input_mask)
        sequence_output = mul_mask(result, input_mask)
        if self.pooling == "cls":
            return pooled_cls
        else:
            return pooled_mean, sequence_output

    def get_config(self):
        """Return the layer config: base-layer entries plus this layer's arguments."""
        # Start from the base config instead of discarding it, so the
        # serialized layer keeps the entries the base class provides.
        config = super(BertLayer1, self).get_config()
        config.update({
            "bert_path": self.bert_path, 
            "seq_len": self.seq_len,
            "pooling": self.pooling,
            "n_tune_layers": self.n_tune_layers,
            "tune_embeddings": self.tune_embeddings,
            "do_preprocessing": self.do_preprocessing,
            "verbose": self.verbose
        })
        return config

In [None]:
# Training targets as an array (columns listed in output_categories).
outputs = compute_output_arrays1(df_train, output_categories)
#inputs = compute_input_arays1(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
# Joint title/question/answer test inputs, encoded with `tokenizer`.
test_inputs = compute_input_arays1(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

# Separate title (t), question-body (q) and answer (a) test inputs, each with
# its own length limit and all encoded with `tokenizer1`.
test_inputst = compute_input_arays_t(df_test, input_categories, tokenizer1, MAX_SEQUENCE_LENGTH_t)
test_inputsq = compute_input_arays_q(df_test, input_categories, tokenizer1, MAX_SEQUENCE_LENGTH_q)
test_inputsa = compute_input_arays_a(df_test, input_categories, tokenizer1, MAX_SEQUENCE_LENGTH_a)

In [None]:
# Turn target column 19 into integer class labels (`newArray`), which are passed
# as the label argument to stratified_group_k_fold in the next cell.
# Values are rounded to one decimal and mapped 0.0 -> 0, 0.3 -> 1, 0.7 -> 2.
# NOTE(review): any other rounded value is left unchanged before the int cast,
# so a rounded 1.0 would become 1 and collide with the 0.3 class - confirm the
# column never contains such values.
d= {0.: 0, 0.3: 1, 0.7: 2}
from numpy import copy
theArray = np.round(outputs[:, 19:20], decimals=1)
newArray = copy(theArray)
# Masks are taken from the untouched `theArray`, so earlier replacements cannot
# be re-matched by later keys.
for k, v in d.items(): newArray[theArray==k] = v
newArray = np.squeeze(newArray)
newArray =newArray.astype(int)
# Display the distinct class labels as the cell output.
np.unique(newArray)

In [None]:
%%time
all_predictions = []
cv_sum = []
cv_sum2 = []
cv_sum3 = []
kf = MultilabelStratifiedKFold(n_splits=10, random_state=42, shuffle=True)
#kf = KFold(n_splits=10, random_state=42, shuffle=True)
for i, (tr, val) in enumerate(stratified_group_k_fold(df_train.question_body, newArray, df_train.question_body, k=10,seed=42)):
#kf = KFold(n_splits=5, random_state=42, shuffle=True)
#for i, (tr, val) in enumerate(kf.split(X2.values)):
    if i==1 or i ==2 or i ==3 or i ==6 or i ==7 or i ==9:
        continue
    K.clear_session()
    print('starting fold: {0}'.format(i))
    json_file = open('../input/save-groupbert{}/nn_model{}.json'.format(i, i), 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json, custom_objects={"BertLayer1": BertLayer1})
    # load weights into new model
    bst_model_path = '../input/save-groupbert{}/dl_model{}.h5'.format(i, i)
    print(bst_model_path)
    model.load_weights(bst_model_path)
    model.compile(optimizer=tf.keras.optimizers.Nadam(3e-5),loss=['binary_crossentropy'])
    #model.summary()
    print("Loaded model from disk")

    
    all_predictions.append(model.predict([test_inputs[0],test_inputs[1],test_inputs[2],test_inputst[0],test_inputst[1],test_inputst[2], 
                                          test_inputsq[0],test_inputsq[1],test_inputsq[2], test_inputsa[0],test_inputsa[1],test_inputsa[2]]))
    #model.save(data_dir + 'nn_model%s.h5'%i)
    del model
    #os.remove('best_model_batch.h5')
    gc.collect()

In [None]:
# Release the encoded test inputs; they are rebuilt in a later cell before the next round of predictions.
del test_inputs, test_inputst, test_inputsq,test_inputsa; gc.collect()

In [None]:
# Sanity check: number of fold predictions collected so far
# (presumably 4 - folds 0, 4, 5 and 8 are the ones not skipped by the loop above).
len(all_predictions)


In [None]:
def _get_masks2(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments2(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids2(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids


def _trim_input2(question, answer,tokenizer, max_sequence_length):
    """Trims tokenized input to max_sequence_length, 
    while keeping the same ratio of Q and A length"""
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    q_len = len(q)
    a_len = len(a)
    
    if (q_len+a_len+3) > max_sequence_length:
        
        new_q_len = q_len/(a_len+q_len) * (max_sequence_length-3)
        new_a_len = a_len/(q_len+a_len) * (max_sequence_length-3)
        new_q_len, new_a_len = int(ceil(new_q_len)), int(floor(new_a_len))
            
        if new_a_len+new_q_len+3 != max_sequence_length:
            raise ValueError("too small %s" % str(new_a_len+new_q_len+3))
        
        q = q[:new_q_len]
        a = a[:new_a_len]
    
    return q, a

def _convert_to_bert_inputs2(question, answer, tokenizer, max_sequence_length):
    """Assemble "[CLS] first [SEP] second [SEP]" and encode it for BERT.

    Returns the list [input_ids, input_masks, input_segments], each padded to
    `max_sequence_length`.
    """
    first_part = ["[CLS]"] + question + ["[SEP]"]
    stoken = first_part + answer + ["[SEP]"]

    return [
        _get_ids2(stoken, tokenizer, max_sequence_length),
        _get_masks2(stoken, max_sequence_length),
        _get_segments2(stoken, max_sequence_length),
    ]

def compute_input_arays2(df, columns, tokenizer, max_sequence_length):
    """Encode every row of `df[columns]` as a (question_title, question_body) pair.

    Returns a list of three int32 arrays: input ids, input masks and segment
    ids, with one row per row of `df`.
    """
    encoded_rows = []
    for _, row in tqdm(df[columns].iterrows()):
        title_text, body_text = row.question_title, row.question_body
        title_tokens, body_tokens = _trim_input2(
            title_text, body_text, tokenizer, max_sequence_length)
        ids, masks, segments = _convert_to_bert_inputs2(
            title_tokens, body_tokens, tokenizer, max_sequence_length)
        encoded_rows.append((ids, masks, segments))

    # Regroup the per-row triples into one array per feature kind.
    return [np.asarray([enc[part] for enc in encoded_rows], dtype=np.int32)
            for part in range(3)]

In [None]:
def _get_masks3(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments3(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids3(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids


def _trim_input3(question, answer,tokenizer, max_sequence_length):
    """Trims tokenized input to max_sequence_length, 
    while keeping the same ratio of Q and A length"""
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    q_len = len(q)
    a_len = len(a)
    
    if (q_len+a_len+3) > max_sequence_length:
        
        new_q_len = q_len/(a_len+q_len) * (max_sequence_length-3)
        new_a_len = a_len/(q_len+a_len) * (max_sequence_length-3)
        new_q_len, new_a_len = int(ceil(new_q_len)), int(floor(new_a_len))
            
        if new_a_len+new_q_len+3 != max_sequence_length:
            raise ValueError("too small %s" % str(new_a_len+new_q_len+3))
        
        q = q[:new_q_len]
        a = a[:new_a_len]
    
    return q, a

def _convert_to_bert_inputs3(question, answer, tokenizer, max_sequence_length):
    """Assemble "[CLS] first [SEP] second [SEP]" and encode it for BERT.

    Returns the list [input_ids, input_masks, input_segments], each padded to
    `max_sequence_length`.
    """
    first_part = ["[CLS]"] + question + ["[SEP]"]
    stoken = first_part + answer + ["[SEP]"]

    return [
        _get_ids3(stoken, tokenizer, max_sequence_length),
        _get_masks3(stoken, max_sequence_length),
        _get_segments3(stoken, max_sequence_length),
    ]

def compute_input_arays3(df, columns, tokenizer, max_sequence_length):
    """Encode every row of `df[columns]` as a (question_title, answer) pair.

    Returns a list of three int32 arrays: input ids, input masks and segment
    ids, with one row per row of `df`.
    """
    encoded_rows = []
    for _, row in tqdm(df[columns].iterrows()):
        title_text, answer_text = row.question_title, row.answer
        title_tokens, answer_tokens = _trim_input3(
            title_text, answer_text, tokenizer, max_sequence_length)
        ids, masks, segments = _convert_to_bert_inputs3(
            title_tokens, answer_tokens, tokenizer, max_sequence_length)
        encoded_rows.append((ids, masks, segments))

    # Regroup the per-row triples into one array per feature kind.
    return [np.asarray([enc[part] for enc in encoded_rows], dtype=np.int32)
            for part in range(3)]


In [None]:
def _get_masks1(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments1(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids1(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def _trim_input1(title, question, answer,tokenizer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):
  
    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        t = t[:t_new_len]
        q = q[:q_new_len]
        a = a[:a_new_len]
    
    return t, q, a

def _convert_to_bert_inputs1(title, question, answer, tokenizer, max_sequence_length):
    """Assemble "[CLS] title [SEP] question [SEP] answer [SEP]" and encode it.

    Returns the list [input_ids, input_masks, input_segments], each padded to
    `max_sequence_length`.
    """
    question_side = ["[CLS]"] + title + ["[SEP]"] + question + ["[SEP]"]
    stoken = question_side + answer + ["[SEP]"]

    return [
        _get_ids1(stoken, tokenizer, max_sequence_length),
        _get_masks1(stoken, max_sequence_length),
        _get_segments1(stoken, max_sequence_length),
    ]

def compute_input_arays1(df, columns, tokenizer, max_sequence_length):
    """Encode every row of `df[columns]` as a (title, question body, answer) triple.

    Returns a list of three int32 arrays: input ids, input masks and segment
    ids, with one row per row of `df`.
    """
    encoded_rows = []
    for _, row in tqdm(df[columns].iterrows()):
        title_text, body_text, answer_text = row.question_title, row.question_body, row.answer
        t_tokens, q_tokens, a_tokens = _trim_input1(
            title_text, body_text, answer_text, tokenizer, max_sequence_length)
        ids, masks, segments = _convert_to_bert_inputs1(
            t_tokens, q_tokens, a_tokens, tokenizer, max_sequence_length)
        encoded_rows.append((ids, masks, segments))

    # Regroup the per-row triples into one array per feature kind.
    return [np.asarray([enc[part] for enc in encoded_rows], dtype=np.int32)
            for part in range(3)]


def compute_output_arrays1(df, columns):
    """Return the selected `columns` of `df` as a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)

In [None]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids


def _trim_input(question,tokenizer,  max_sequence_length):
    """Trims tokenized input to max_sequence_length, 
    while keeping the same ratio of Q and A length"""
    q = tokenizer.tokenize(question)
    #a = tokenizer.tokenize(answer)
    
    q_len = len(q)
    #a_len = len(a)
    
    if (q_len+3) > max_sequence_length:
        
        new_q_len = max_sequence_length-3
        #new_a_len = a_len/(q_len+a_len) * (max_sequence_length-3)
        #new_q_len, new_a_len = int(ceil(new_q_len)), int(floor(new_a_len))
            
        if new_q_len+3 != max_sequence_length:
            raise ValueError("too small %s" % str(new_q_len+3))
        
        q = q[:new_q_len]
        #a = a[:new_a_len]
    
    return q

def _convert_to_bert_inputs(question, tokenizer, max_sequence_length):
    """Wrap a single token list as "[CLS] tokens [SEP]" and encode it for BERT.

    Returns the list [input_ids, input_masks, input_segments], each padded to
    `max_sequence_length`.
    """
    with_cls = ["[CLS]"] + question
    stoken = with_cls + ["[SEP]"]

    return [
        _get_ids(stoken, tokenizer, max_sequence_length),
        _get_masks(stoken, max_sequence_length),
        _get_segments(stoken, max_sequence_length),
    ]

def compute_input_arays_q(df, columns, tokenizer, max_sequence_length):
    """Encode the `question_body` field of every row of `df[columns]`.

    Returns a list of three int32 arrays: input ids, input masks and segment
    ids, with one row per row of `df`.
    """
    encoded_rows = []
    for _, row in tqdm(df[columns].iterrows()):
        body_tokens = _trim_input(row.question_body, tokenizer, max_sequence_length)
        ids, masks, segments = _convert_to_bert_inputs(
            body_tokens, tokenizer, max_sequence_length)
        encoded_rows.append((ids, masks, segments))

    # Regroup the per-row triples into one array per feature kind.
    return [np.asarray([enc[part] for enc in encoded_rows], dtype=np.int32)
            for part in range(3)]

def compute_input_arays_a(df, columns, tokenizer, max_sequence_length):
    """Encode the `answer` field of every row of `df[columns]`.

    Returns a list of three int32 arrays: input ids, input masks and segment
    ids, with one row per row of `df`.
    """
    encoded_rows = []
    for _, row in tqdm(df[columns].iterrows()):
        answer_tokens = _trim_input(row.answer, tokenizer, max_sequence_length)
        ids, masks, segments = _convert_to_bert_inputs(
            answer_tokens, tokenizer, max_sequence_length)
        encoded_rows.append((ids, masks, segments))

    # Regroup the per-row triples into one array per feature kind.
    return [np.asarray([enc[part] for enc in encoded_rows], dtype=np.int32)
            for part in range(3)]
def compute_input_arays_t(df, columns, tokenizer, max_sequence_length):
    """Encode the `question_title` field of every row of `df[columns]`.

    Returns a list of three int32 arrays: input ids, input masks and segment
    ids, with one row per row of `df`.
    """
    encoded_rows = []
    for _, row in tqdm(df[columns].iterrows()):
        title_tokens = _trim_input(row.question_title, tokenizer, max_sequence_length)
        ids, masks, segments = _convert_to_bert_inputs(
            title_tokens, tokenizer, max_sequence_length)
        encoded_rows.append((ids, masks, segments))

    # Regroup the per-row triples into one array per feature kind.
    return [np.asarray([enc[part] for enc in encoded_rows], dtype=np.int32)
            for part in range(3)]

def compute_output_arrays(df, columns):
    """Return the selected `columns` of `df` as a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)

In [None]:
# Rebuild the targets and the encoded test inputs (the test encodings were
# deleted after the previous prediction round).
outputs = compute_output_arrays1(df_train, output_categories)
# Joint title + body + answer encoding.
test_inputs = compute_input_arays1(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
# Title + body pair encoding.
test_inputstq = compute_input_arays2(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
# Training-set question encoding; in the next cell only its mask array is passed
# to kf.split to enumerate the folds.
inputsq = compute_input_arays_q(df_train, input_categories, tokenizer1, MAX_SEQUENCE_LENGTH_q)
# Single-field encodings of title, question body and answer for the test set.
test_inputst = compute_input_arays_t(df_test, input_categories, tokenizer1, MAX_SEQUENCE_LENGTH_t)
test_inputsq = compute_input_arays_q(df_test, input_categories, tokenizer1, MAX_SEQUENCE_LENGTH_q)
test_inputsa = compute_input_arays_a(df_test, input_categories, tokenizer1, MAX_SEQUENCE_LENGTH_a)

In [None]:
%%time
kf = MultilabelStratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for i, (tr, val) in enumerate(kf.split(inputsq[1], outputs)):
    if i!=7:
        continue
    
    K.clear_session()
    print('starting fold: {0}'.format(i))
    json_file = open('../input/save-groupbertq-fold{}/nn_modelq{}.json'.format(i, i), 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json, custom_objects={"BertLayer1": BertLayer1})
    # load weights into new model
    bst_model_path = '../input/save-groupbertq-fold{}/dl_modelq{}.h5'.format(i, i)
    print(bst_model_path)
    model.load_weights(bst_model_path)
    model.compile(optimizer=tf.keras.optimizers.Nadam(3e-5),loss=['binary_crossentropy'])
    #model.summary()
    print("Loaded model from disk")
    y_testq =  model.predict([test_inputstq[0],test_inputstq[1],test_inputstq[2],test_inputst[0],test_inputst[1],test_inputst[2], 
                                          test_inputsq[0],test_inputsq[1],test_inputsq[2]])
    print("y_testq shape", y_testq.shape)
    del model
    #os.remove('best_model_batch.h5')
    gc.collect()
    K.clear_session()
    print('starting fold: {0}'.format(i))

    json_file = open('../input/save-groupberta-fold{}/nn_modela{}.json'.format(i, i), 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json, custom_objects={"BertLayer1": BertLayer1})
    # load weights into new model
    bst_model_path = '../input/save-groupberta-fold{}/dl_modela{}.h5'.format(i, i)
    print(bst_model_path)
    model.load_weights(bst_model_path)
    model.compile(optimizer=tf.keras.optimizers.Nadam(3e-5),loss=['binary_crossentropy'])
    #model.summary()
    print("Loaded model from disk")
    
    y_testa =  model.predict([test_inputs[0],test_inputs[1],test_inputs[2],test_inputst[0],test_inputst[1],test_inputst[2], 
                                          test_inputsq[0],test_inputsq[1],test_inputsq[2],test_inputsa[0],test_inputsa[1],test_inputsa[2]])
    print("y_testa shape", y_testa.shape)

    print("Saved model to disk")
    del model
    #os.remove('best_model_batch.h5')
    gc.collect()
    
    y_test = np.concatenate((y_testq, y_testa), axis=1)
    print("y_test shape", y_test.shape)
    all_predictions.append(y_test)
    gc.collect()
  

In [None]:
# Sanity check: the fold-7 question/answer prediction appended above should have
# increased the count by one.
len(all_predictions)


In [None]:
# Average all collected predictions element-wise across models (axis 0).
# NOTE(review): this requires every entry of all_predictions to have the same
# shape - confirm the concatenated q/a output matches the other fold outputs.
test_preds = np.mean(all_predictions, axis=0)

In [None]:
# Weighted blend of four prediction sets; the weights add up to 1.0.
# output_roberta, output_xlnet and final_predictions are not defined in this part
# of the notebook - presumably produced by earlier cells (TODO confirm).
output =0.23*output_roberta + 0.31*output_xlnet +  0.24*test_preds + 0.22*final_predictions

In [None]:
# Index labels of the test rows whose host is one of the two English-language
# StackExchange sites; submssion_trick uses them for output column 19.
culture_category_list = list(
    df_test.index[
        df_test['host'].isin(['english.stackexchange.com', 'ell.stackexchange.com'])
    ]
)

In [None]:
# Per-target number of discretisation steps used by submssion_trick: each
# prediction p is replaced by floor(p * n) / n, with n taken from this table.
# submssion_trick additionally applies a lower bound to targets whose value here
# is 18 or 30.
num_raters_dict = {'question_asker_intent_understanding': 18,
 'question_body_critical': 18,
 'question_conversational': 6,
 'question_expect_short_answer': 6,
 'question_fact_seeking': 6,
 'question_has_commonly_accepted_answer': 6,
 'question_interestingness_others': 18,
 'question_interestingness_self': 18,
 'question_multi_intent': 6,
 'question_not_really_a_question': 6,
 'question_opinion_seeking': 6,
 'question_type_choice': 6,
 'question_type_compare': 6,
 'question_type_consequence': 6,
 'question_type_definition': 6,
 'question_type_entity': 6,
 'question_type_instructions': 6,
 'question_type_procedure': 6,
 'question_type_reason_explanation': 6,
 'question_type_spelling': 3,
 'question_well_written': 18,
 'answer_helpful': 18,
 'answer_level_of_information': 18,
 'answer_plausible': 18,
 'answer_relevance': 18,
 'answer_satisfaction': 30,
 'answer_type_instructions': 6,
 'answer_type_procedure': 6,
 'answer_type_reason_explanation': 6,
 'answer_well_written': 18
 }

In [None]:
def submssion_trick(output):
    """Snap blended predictions onto the per-target rating grid.

    For each column j, with n = num_raters_dict[output_categories[j]], every
    value p becomes floor(p * n) / n. Two special rules apply:

    * column 19 is forced to 0.0 for every row whose index is not in
      `culture_category_list`;
    * columns with n == 18 are floored at 1/3 and columns with n == 30 at 1/5.

    Args:
        output: 2-D array of predictions (rows = samples, columns = targets in
            the order of `output_categories`).

    Returns:
        A new array shaped like `output` (created with np.zeros_like) holding
        the adjusted predictions; `output` itself is not modified.
    """
    predictions = np.zeros_like(output)
    # Set membership replaces a linear scan of the list for every row.
    culture_rows = set(culture_category_list)
    n_rows, n_cols = output.shape[0], output.shape[1]
    for j in range(n_cols):
        # Per-column values are looked up once instead of once per element.
        num_raters = num_raters_dict[output_categories[j]]
        for i in range(n_rows):
            if j == 19 and i not in culture_rows:
                predictions[i][j] = 0.0
            else:
                predictions[i][j] = np.floor(output[i][j] * num_raters) / num_raters
            if num_raters == 18:
                predictions[i][j] = max(predictions[i][j], 1/3)
            if num_raters == 30:
                predictions[i][j] = max(predictions[i][j], 1/5)

    return predictions

In [None]:
# Rescale each of the 30 target columns of the blended output to [0, 1]
# independently; `output` is modified in place.
# NOTE(review): MinMaxScaler is already imported at the top of the notebook, so
# this import is redundant.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for i in range(30):
    # fit_transform expects a 2-D column, hence the reshape there and back.
    output[:, i] = scaler.fit_transform(output[:, i].reshape(-1, 1)).reshape(-1,)

In [None]:
# Snap the rescaled blend onto the per-target rating grid.
predictions = submssion_trick(output)

In [None]:
submission = pd.read_csv(PATH +'sample_submission.csv')
## Submission Checker
# Report every target column whose predictions are all identical.
for i in range(30): 
    if len(np.unique(predictions[:, i])) == 1:
        print(i)
        # +1 skips the first column of the sample submission.
        print(submission.columns[i+1])
        # Fallback to the un-snapped blend is disabled, so the 'before' and
        # 'after' lines below print the same values.
        #predictions[:, i] = output[:, i]
        print('before', np.unique(predictions[:, i]))
        print('after', np.unique(predictions[:, i]))

In [None]:
# Fill every column from 'question_asker_intent_understanding' onwards with the
# final predictions and write the submission file.
submission.loc[:, 'question_asker_intent_understanding':] = predictions
submission.to_csv('submission.csv', index=False)
# Show the first rows as the cell output.
submission.head(10)