In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil
from tensorflow.keras.models import load_model

# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

### About this kernel

This kernel shows an example of feeding the model additional features besides the raw text, building on the kernel "Bert-base TF2.0 (minimalistic) III".

Referenced kernel
Bert-base TF2.0 (minimalistic) III<br>
https://www.kaggle.com/bibek777/bert-base-tf2-0-minimalistic-iii

Thank you.

In [None]:


#os.listdir('../input/')


#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))


In [None]:
# Directory holding the competition CSV files.
PATH = '../input/google-quest-challenge/'

# Directory of the BERT-base (uncased) TF-Hub model.
BERT_PATH = '../input/bert-base-from-tfhub/bert_en_uncased_L-12_H-768_A-12'

# The large model ran out of memory, so it is left disabled:
#BERT_PATH = '../input/bert-base-from-tfhub-l24-h1024-a16/tf_bert_en_cased_L-24_H-1024_A-16'

# Word-piece tokenizer built from the vocabulary shipped with the model.
# NOTE(review): the second positional argument is presumably `do_lower_case` - confirm
# against the bert_tokenization module.
tokenizer = tokenization.FullTokenizer(f'{BERT_PATH}/assets/vocab.txt', True)
MAX_SEQUENCE_LENGTH = 512

# Load the train / test tables and the submission template.
df_train = pd.read_csv(f'{PATH}train.csv')
df_test = pd.read_csv(f'{PATH}test.csv')
df_sub = pd.read_csv(f'{PATH}sample_submission.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Columns are picked by position: everything from column 11 onwards is an output,
# and columns 1, 2 and 5 are the text inputs.
output_categories = df_train.columns[11:].tolist()
input_categories = df_train.columns[[1, 2, 5]].tolist()
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

In [None]:
#test only
#df_train = df_train[0:51]
#df_test = df_train[0:41]
#df_sub = df_sub[0:41]

In [None]:
# Explicit list of the 30 target column names: 21 question_* columns followed by
# 9 answer_* columns.
# NOTE(review): `targets` and `input_columns` are not referenced anywhere else in the
# visible notebook; the later cells use `output_categories` / `input_categories`.
targets = [
        'question_asker_intent_understanding',
        'question_body_critical',
        'question_conversational',
        'question_expect_short_answer',
        'question_fact_seeking',
        'question_has_commonly_accepted_answer',
        'question_interestingness_others',
        'question_interestingness_self',
        'question_multi_intent',
        'question_not_really_a_question',
        'question_opinion_seeking',
        'question_type_choice',
        'question_type_compare',
        'question_type_consequence',
        'question_type_definition',
        'question_type_entity',
        'question_type_instructions',
        'question_type_procedure',
        'question_type_reason_explanation',
        'question_type_spelling',
        'question_well_written',
        'answer_helpful',
        'answer_level_of_information',
        'answer_plausible',
        'answer_relevance',
        'answer_satisfaction',
        'answer_type_instructions',
        'answer_type_procedure',
        'answer_type_reason_explanation',
        'answer_well_written'    
    ]

# Names of the three free-text columns.
input_columns = ['question_title','question_body','answer']

In [None]:
#add features sample

#Sentense count in each comment:
import string
import re
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler

# English stop words as a set; include_window_datas tests word membership against it
# for the count_stopwords feature.
eng_stopwords = set(stopwords.words("english"))


def include_window_datas(df_q):
    """Derive simple count-based features from the text columns of ``df_q``.

    Eight features describe ``question_body`` (lines, words, unique words,
    characters, punctuation characters, upper-case words, title-case words and
    stop words); two more hold the word counts of ``answer`` and
    ``question_title``. Every cell is passed through ``str()`` first, so
    non-string values are counted on their string representation.

    Args:
        df_q: DataFrame with ``question_body``, ``answer`` and
            ``question_title`` columns.

    Returns:
        A new DataFrame with one column per feature, in the order listed
        above, and one row per row of ``df_q``.
    """
    body = df_q["question_body"]

    def _tokens(text):
        # Whitespace-separated words of the stringified cell.
        return str(text).split()

    def _word_total(text):
        return len(_tokens(text))

    features = {
        # Newlines are used as a proxy for the number of sentences.
        'count_sent': body.apply(lambda text: str(text).count("\n") + 1),
        'count_word': body.apply(_word_total),
        'count_unique_word': body.apply(lambda text: len(set(_tokens(text)))),
        'count_letters': body.apply(lambda text: len(str(text))),
        'count_punctuations': body.apply(
            lambda text: sum(1 for ch in str(text) if ch in string.punctuation)),
        'count_words_upper': body.apply(
            lambda text: sum(1 for word in _tokens(text) if word.isupper())),
        'count_words_title': body.apply(
            lambda text: sum(1 for word in _tokens(text) if word.istitle())),
        # Stop words are matched on the lower-cased text.
        'count_stopwords': body.apply(
            lambda text: sum(1 for word in str(text).lower().split()
                             if word in eng_stopwords)),
        # Mean word length is deliberately left out (disabled in the original):
        # 'mean_word_len': body.apply(lambda x: np.mean([len(w) for w in str(x).split()])),
        'answer_count_word': df_q["answer"].apply(_word_total),
        'question_title_count_word': df_q["question_title"].apply(_word_total),
    }

    return pd.DataFrame(features)


# Compute the hand-crafted features for the training and the test set.
df_train_add_features = include_window_datas(df_train)
df_test_add_features = include_window_datas(df_test)

# Number of extra feature columns; bert_model uses it as the width of the
# additional-features input.
ADD_FEATURES_COL_LEN = df_train_add_features.shape[1]


In [None]:
# Show the first five rows of each feature table, separated by a line of asterisks.
display(df_train_add_features.head())
print("*" * 100)
display(df_test_add_features.head())

In [None]:
# Standardise the hand-crafted features.
# The scaler is fitted on the training features only and then applied unchanged to
# the test features. Refitting on the test set (the previous `fit_transform` call)
# would scale the two sets with different means / variances, so the model would see
# test inputs on a different scale than the one it was trained on.
sc = StandardScaler()
train_add_features = sc.fit_transform(df_train_add_features)
test_add_features = sc.transform(df_test_add_features)

train_add_features[:5]

In [None]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):
    """Tokenize title, question and answer and trim them to a shared length budget.

    The texts are tokenized with the module-level ``tokenizer`` (this function
    takes no tokenizer argument). Trimming only happens when the three token
    lists plus 4 extra positions exceed ``max_sequence_length``; the 4 matches
    the [CLS] and three [SEP] tokens that _convert_to_bert_inputs adds.

    Budget redistribution when trimming:
      * a title shorter than ``t_max_len`` gives its unused budget to the
        answer (rounded down half) and the question (rounded up half);
      * an answer shorter than its budget gives the remainder to the question;
        otherwise a question shorter than its budget gives the remainder to
        the answer.

    Args:
        title, question, answer: raw text passed to ``tokenizer.tokenize``.
        max_sequence_length: total sequence length the budgets must add up to.
        t_max_len, q_max_len, a_max_len: default token budgets per part.

    Returns:
        Tuple ``(t, q, a)`` of token lists, truncated from the right if needed.

    Raises:
        ValueError: if trimming is needed and the final budgets plus 4 do not
            equal ``max_sequence_length``. The defaults add up to 512
            (30 + 239 + 239 + 4), so another ``max_sequence_length`` needs
            matching budgets.
    """

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    # Only trim when the full sequence (with the 4 extra positions) does not fit.
    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            # Short title: keep it whole and hand the spare budget to answer / question.
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            # Short answer: the question budget grows by the answer's spare budget.
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            # Short question: the answer budget grows by the question's spare budget.
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        # Sanity check: the budgets must account for exactly max_sequence_length.
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        # Slicing past the end of a list is harmless, so a budget may exceed the
        # actual token count.
        t = t[:t_new_len]
        q = q[:q_new_len]
        a = a[:a_new_len]
    
    return t, q, a

def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Assemble one example and return ``[input_ids, input_masks, input_segments]``.

    The token sequence is ``[CLS] title [SEP] question [SEP] answer [SEP]``;
    each of the three returned lists is padded to ``max_sequence_length``.
    """
    tokens = ["[CLS]", *title, "[SEP]", *question, "[SEP]", *answer, "[SEP]"]

    return [
        _get_ids(tokens, tokenizer, max_sequence_length),
        _get_masks(tokens, max_sequence_length),
        _get_segments(tokens, max_sequence_length),
    ]

def compute_input_arays(df, columns,append_features, tokenizer, max_sequence_length):
    """Convert every row of ``df`` into BERT inputs and attach the extra features.

    Args:
        df: DataFrame containing the text columns.
        columns: column names to iterate over; the rows are read through the
            ``question_title``, ``question_body`` and ``answer`` attributes.
        append_features: additional per-row features, returned as the fourth
            array without modification (only converted with ``np.asarray``).
        tokenizer: tokenizer used to map tokens to ids.
        max_sequence_length: padded sequence length.

    Returns:
        List ``[ids, masks, segments, append_features]``; the first three are
        int32 arrays with one row per example.
    """
    all_ids = []
    all_masks = []
    all_segments = []

    for _, row in tqdm(df[columns].iterrows()):
        title, question, answer = _trim_input(
            row.question_title, row.question_body, row.answer, max_sequence_length)

        row_ids, row_masks, row_segments = _convert_to_bert_inputs(
            title, question, answer, tokenizer, max_sequence_length)

        all_ids.append(row_ids)
        all_masks.append(row_masks)
        all_segments.append(row_segments)

    token_arrays = [np.asarray(values, dtype=np.int32)
                    for values in (all_ids, all_masks, all_segments)]

    # The extra features go last, matching the order of the model inputs.
    return token_arrays + [np.asarray(append_features)]


def compute_output_arrays(df, columns):
    """Return the ``columns`` of ``df`` as a NumPy array (rows x columns)."""
    selected = df[columns]
    return np.asarray(selected)

In [None]:
def compute_spearmanr(trues, preds):
    """Mean column-wise Spearman rank correlation between targets and predictions.

    A tiny Gaussian jitter (std 1e-7) is added to each prediction column before
    ranking, so the score is not exactly reproducible between calls.

    Columns whose correlation is undefined (NaN, e.g. a target column that is
    constant within the evaluated rows) are excluded from the mean instead of
    turning the whole score into NaN.

    Args:
        trues: 2-D array of true values, one column per target.
        preds: 2-D array of predictions with the same layout.

    Returns:
        The mean of the defined per-column correlations (NaN only if every
        column is undefined).
    """
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        jittered = col_pred + np.random.normal(0, 1e-7, col_pred.shape[0])
        rhos.append(spearmanr(col_trues, jittered).correlation)
    # nanmean: one undefined column must not wipe out the score of the others.
    return np.nanmean(rhos)


class CustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that scores the validation set and predicts the test set each epoch.

    After every epoch it
      * predicts the validation inputs and prints the Spearman rho of the
        average of all validation predictions collected so far,
      * optionally saves the model weights (only when ``fold`` is not None),
      * predicts the test inputs and stores the result.

    Attributes:
        valid_predictions: list with one validation prediction array per epoch.
        test_predictions: list with one test prediction array per epoch.
    """

    def __init__(self, valid_data, test_data, batch_size=16, fold=None):
        """
        Args:
            valid_data: pair ``(valid_inputs, valid_outputs)``.
            test_data: test inputs passed to ``model.predict``.
            batch_size: batch size used for both predictions.
            fold: fold identifier used in the weight file name; ``None``
                disables weight saving.
        """
        # Let the Keras base class initialise its own state.
        super().__init__()

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data

        self.batch_size = batch_size
        self.fold = fold

    def on_train_begin(self, logs=None):
        """Reset the per-epoch prediction stores at the start of training."""
        self.valid_predictions = []
        self.test_predictions = []

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on the validation set, optionally save weights, predict the test set."""
        self.valid_predictions.append(
            self.model.predict(self.valid_inputs, batch_size=self.batch_size))

        # Score the running average of all epochs so far, not just this epoch.
        rho_val = compute_spearmanr(
            self.valid_outputs, np.average(self.valid_predictions, axis=0))

        print("\nvalidation rho: %.4f" % rho_val)

        if self.fold is not None:
            # Use the fold given to this callback, not a global loop variable.
            self.model.save_weights(f'bert-base-{self.fold}-{epoch}.h5py')

        self.test_predictions.append(
            self.model.predict(self.test_inputs, batch_size=self.batch_size)
        )

def bert_model():
    """Build the Keras model: BERT plus a dense branch for the extra features.

    Inputs, in order: ``input_word_ids``, ``input_masks``, ``input_segments``
    (int32, length MAX_SEQUENCE_LENGTH each) and ``input_features``
    (length ADD_FEATURES_COL_LEN).

    Returns:
        An uncompiled ``tf.keras`` model with a 30-unit sigmoid output.
    """

    def _token_input(name):
        # One int32 input of length MAX_SEQUENCE_LENGTH.
        return tf.keras.layers.Input(
            (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name=name)

    input_word_ids = _token_input('input_word_ids')
    input_masks = _token_input('input_masks')
    input_segments = _token_input('input_segments')

    # Extra-features branch: dense projection followed by dropout.
    feature_input = tf.keras.layers.Input(
        shape=(ADD_FEATURES_COL_LEN,), name='input_features')
    feature_branch = tf.keras.layers.Dense(
        512, activation='elu', name='features_dense')(feature_input)
    feature_branch = tf.keras.layers.Dropout(
        0.2, name='dropout_dense')(feature_branch)

    # Trainable BERT encoder; only the second of its two outputs is used.
    # NOTE(review): the second output is presumably the per-token sequence output - confirm.
    bert_layer = hub.KerasLayer(BERT_PATH, trainable=True)
    _, encoded = bert_layer([input_word_ids, input_masks, input_segments])

    # Keep the vector at position 0 and apply dropout.
    first_position = tf.keras.layers.Lambda(lambda seq: seq[:, 0])(encoded)
    first_position = tf.keras.layers.Dropout(0.2)(first_position)

    # Join the BERT vector with the extra-features branch.
    combined = tf.keras.layers.concatenate([first_position, feature_branch])

    out = tf.keras.layers.Dense(
        30, activation="sigmoid", name="dense_output")(combined)

    return tf.keras.models.Model(
        inputs=[input_word_ids, input_masks, input_segments, feature_input],
        outputs=out)
        
def train_and_predict(model, train_data, valid_data, test_data, 
                      learning_rate, epochs, batch_size, loss_function, fold):
    """Compile and fit ``model``, collecting validation / test predictions per epoch.

    Args:
        model: uncompiled Keras model.
        train_data: pair ``(inputs, outputs)`` used for fitting.
        valid_data: pair ``(inputs, outputs)`` scored after each epoch.
        test_data: test inputs predicted after each epoch.
        learning_rate: Adam learning rate.
        epochs: number of training epochs.
        batch_size: batch size for fitting and for the callback predictions.
        loss_function: loss passed to ``model.compile``.
        fold: accepted but not forwarded; the callback always receives
            ``fold=None``, so no weights are saved.

    Returns:
        The CustomCallback instance holding the per-epoch predictions.
    """
    train_inputs, train_targets = train_data[0], train_data[1]
    valid_inputs, valid_targets = valid_data[0], valid_data[1]

    tracker = CustomCallback(
        valid_data=(valid_inputs, valid_targets),
        test_data=test_data,
        batch_size=batch_size,
        fold=None)

    model.compile(
        loss=loss_function,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))
    model.fit(
        train_inputs,
        train_targets,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[tracker])

    return tracker


In [None]:
# 5-fold split grouped by question body, so rows sharing the same question text
# never end up on both sides of a split.
# Note: .split() returns a lazy iterator, so `gkf` can be looped over only once.
gkf = GroupKFold(n_splits=5).split(
    X=df_train['question_body'], groups=df_train['question_body'])

# Targets and model inputs for the training set, and model inputs for the test set.
outputs = compute_output_arrays(df=df_train, columns=output_categories)
inputs = compute_input_arays(
    df=df_train,
    columns=input_categories,
    append_features=train_add_features,
    tokenizer=tokenizer,
    max_sequence_length=MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arays(
    df=df_test,
    columns=input_categories,
    append_features=test_add_features,
    tokenizer=tokenizer,
    max_sequence_length=MAX_SEQUENCE_LENGTH)

In [None]:
# One CustomCallback per trained fold; each holds that fold's per-epoch predictions.
histories = []
count = 1
for fold, (train_idx, valid_idx) in enumerate(gkf):

    # Only the first 3 of the 5 folds are trained, to keep the run under ~2 hours.
    if fold < 3:
        # Drop the previous fold's graph / state before building a fresh model.
        K.clear_session()
        model = bert_model()

        # Slice every input array (ids, masks, segments, extra features) by row index.
        train_inputs = [arr[train_idx] for arr in inputs]
        train_outputs = outputs[train_idx]

        # The validation rows come from `inputs`, so iterate over `inputs` itself
        # rather than relying on `test_inputs` having the same number of arrays.
        valid_inputs = [arr[valid_idx] for arr in inputs]
        valid_outputs = outputs[valid_idx]

        # Print the architecture and save a diagram as model_<n>.png.
        model.summary()
        name = "model_" + str(count) + ".png"
        tf.keras.utils.plot_model(model, to_file=name)
        count += 1

        # `history` is the callback holding the per-epoch validation and test
        # predictions of this fold.
        # Tuning notes: with the Large model a batch size of 8 ran out of memory
        # and was lowered to 6; epochs were lowered from 5 to 3 and later restored.
        # The learning rate was changed from 1e-5 to 3e-5.
        history = train_and_predict(model,
                          train_data=(train_inputs, train_outputs),
                          valid_data=(valid_inputs, valid_outputs),
                          test_data=test_inputs,
                          learning_rate=3e-5, epochs=5, batch_size=8,
                          loss_function='binary_crossentropy', fold=fold)

        histories.append(history)

In [None]:
# Average the test predictions over the epochs of each fold, then over the folds.
test_predictions = np.mean(
    [np.average(history.test_predictions, axis=0) for history in histories],
    axis=0)

# Every column after the first one of the submission template receives the predictions.
df_sub.iloc[:, 1:] = test_predictions

df_sub.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission table.
df_sub.head()