# BERT for Spooky Author Identification
_By Nick Brooks_

### **References:**
- Source for `bert_encode` function: https://www.kaggle.com/user123454321/bert-starter-inference
- All pre-trained BERT models on TensorFlow Hub: https://tfhub.dev/s?q=bert
- TF Hub documentation for the BERT model used in this notebook: https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1

In [None]:
# Download the official BERT tokenization script created by the Google team;
# it is imported as the `tokenization` module in the next cell.
# NOTE(review): the URL tracks the `master` branch, so the file can move or
# change upstream — verify it still resolves and consider pinning a commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from tensorflow.keras import callbacks
from keras.utils import to_categorical

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import word_tokenize
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix, log_loss
import pprint

import tokenization

import re
import gc
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling, the wall-clock start time used by the runtime report in the
# last cell, and a wider column limit so long text values are not cut off
# when DataFrames are displayed.
sns.set_style("whitegrid")
notebookstart = time.time()
pd.options.display.max_colwidth = 500

# Report library versions and the execution environment.
print("Tensorflow Version: ", tf.__version__)
print("TF-Hub version: ", hub.__version__)
print("Eager mode enabled: ", tf.executing_eagerly())
# NOTE(review): tf.test.is_gpu_available is deprecated in TF 2.x in favour of
# tf.config.list_physical_devices('GPU') — confirm the target TF version
# before switching.
print("GPU available: ", tf.test.is_gpu_available())

In [None]:
MAX_LEN = 64  # token sequence length passed to bert_encode / build_model
BATCH_SIZE = 16  # intended training mini-batch size
EPOCHS = 10  # maximum epochs per fold (early stopping may end training sooner)
SEED = 42  # random_state of the KFold split
NROWS = None  # row limit when reading the training CSV; None reads every row
TEXTCOL = "text"  # name of the text column
TARGETCOL = "author"  # name of the label column
NCLASS = 3  # number of classes; sizes the OOF / test prediction arrays

# Helper Functions

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Encode texts into the three fixed-length integer arrays fed to BERT.

    Each text is tokenized, truncated so the two special tokens still fit,
    wrapped as ``[CLS] ... [SEP]`` and right-padded with zeros so that every
    row has exactly ``max_len`` positions.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every output row, including ``[CLS]`` and
            ``[SEP]``. Must be at least 2.

    Returns:
        Tuple ``(token_ids, pad_masks, segment_ids)`` of NumPy arrays with
        one row per text: token ids padded with 0, a mask that is 1 on real
        tokens and 0 on padding, and all-zero segment ids.

    Raises:
        ValueError: If ``max_len`` is smaller than 2. Previously the negative
            slice/pad length silently produced rows longer than ``max_len``.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to fit [CLS] and [SEP], got {}".format(max_len))

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the special tokens.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        # Build a new list rather than extending the tokenizer's return value.
        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence)) + [0] * pad_len
        pad_mask = [1] * len(input_sequence) + [0] * pad_len

        all_tokens.append(token_ids)
        all_masks.append(pad_mask)
        # Single-sentence input: every position belongs to segment 0.
        all_segments.append([0] * max_len)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

def text_processing(df, TEXTCOL, sentiment=False):
    """Add word-count features for one text column to ``df`` (in place).

    Three columns are always added, each prefixed with ``TEXTCOL``:
    ``_num_words`` (whitespace-separated word count), ``_num_unique_words``
    (distinct word count) and ``_words_vs_unique`` (unique / total * 100).
    With ``sentiment=True`` a ``_vader_Compound`` column is added as well,
    computed with the module-level ``SIA`` analyzer, which must already exist.

    Args:
        df: DataFrame holding the text column; it is modified and returned.
        TEXTCOL: Name of the text column.
        sentiment: Whether to add the VADER compound score.

    Returns:
        Tuple ``(df, col_names)`` where ``col_names`` lists the added columns
        in creation order.
    """
    words_col = TEXTCOL + '_num_words'
    unique_col = TEXTCOL + '_num_unique_words'
    ratio_col = TEXTCOL + '_words_vs_unique'

    # Split every text once and derive both counts from the same word lists.
    word_lists = df[TEXTCOL].apply(lambda comment: comment.split())
    df[words_col] = word_lists.apply(len)
    df[unique_col] = word_lists.apply(lambda words: len(set(words)))
    # Share of unique words, expressed as a percentage.
    df[ratio_col] = df[unique_col] / df[words_col] * 100

    col_names = [words_col, unique_col, ratio_col]

    if sentiment:
        vader_col = TEXTCOL+"_vader_Compound"
        df[vader_col] = df[TEXTCOL].apply(lambda x: SIA.polarity_scores(x)['compound'])
        col_names.append(vader_col)

    return df, col_names

def build_model(bert_layer, max_len=512, dropout=.2, n_classes=None, n_numeric=None):
    """Build and compile the BERT + numeric-feature classifier.

    The model has four inputs: three int32 BERT inputs of shape ``(max_len,)``
    (word ids, mask, segment ids) and one float32 vector of numeric features.
    It combines three branches before the softmax output:

    * BERT sequence output -> SpatialDropout1D -> bidirectional LSTM(128)
      -> global average pooling,
    * BERT pooled output -> Dense(36, relu),
    * numeric features -> Dense(512, relu) -> Dropout -> Dense(64, relu).

    Args:
        bert_layer: Layer that returns ``(pooled_output, sequence_output)``
            when called on ``[word_ids, mask, segment_ids]``.
        max_len: Token sequence length of the BERT inputs.
        dropout: Rate shared by every dropout layer in the model.
        n_classes: Size of the softmax output. Defaults to the notebook-level
            ``NCLASS`` constant.
        n_numeric: Width of the numeric input. Defaults to ``len(num_cols)``,
            the notebook-level list of numeric feature names.

    Returns:
        A compiled Keras ``Model`` (Adam with learning rate 1e-6, categorical
        cross-entropy loss, accuracy reported as ``acc``).
    """
    # Resolve the defaults lazily so the notebook globals are only needed
    # when the caller does not pass explicit sizes.
    if n_classes is None:
        n_classes = NCLASS
    if n_numeric is None:
        n_numeric = len(num_cols)

    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    numeric_inputs = Input(shape=(n_numeric,), dtype=tf.float32, name="numeric_inputs")

    # Bert Layer
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    # Sequence Output
    sequence_output = SpatialDropout1D(dropout)(sequence_output)
    sequence_output = Bidirectional(LSTM(128, return_sequences=True))(sequence_output)
    sequence_output = GlobalAveragePooling1D()(sequence_output)

    # Pooled Output
    pooled_output = Dense(36, activation='relu')(pooled_output)

    # Dense Inputs
    numeric_x = Dense(512, activation='relu')(numeric_inputs)
    numeric_x = Dropout(dropout)(numeric_x)
    numeric_x = Dense(64, activation='relu')(numeric_x)

    # Concatenate
    cat = concatenate([
        pooled_output,
        sequence_output,
        numeric_x
    ])
    cat = Dropout(dropout)(cat)

    # Output Layer
    out = Dense(n_classes, activation='softmax')(cat)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids, numeric_inputs], outputs=out)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=1e-6), loss='categorical_crossentropy', metrics=['acc'])

    return model

In [None]:
%%time
# TF Hub handle of the pre-trained BERT encoder used by this notebook.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# trainable=True so the encoder weights are updated (fine-tuned) during training.
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Load the competition files. NROWS (None = all rows) limits the training set only.
train = pd.read_csv("../input/spooky-author-identification/train.zip", nrows=NROWS)
test = pd.read_csv("../input/spooky-author-identification/test.zip")
testdex = test.id  # test ids, re-attached to the predictions in the submission
submission = pd.read_csv("../input/spooky-author-identification/sample_submission.zip")
sub_cols = submission.columns  # column order expected in the submission file

print("Train Shape: {} Rows, {} Columns".format(*train.shape))
print("Test Shape: {} Rows, {} Columns".format(*test.shape))

# Character length of every text, over train and test combined. The label
# previously claimed "Train Sequence Length", which described neither the
# data (train + test) nor the unit (characters, not tokens).
length_info = [len(x) for x in np.concatenate([train[TEXTCOL].values, test[TEXTCOL].values])]
print("Train + Test Text Length (chars) - Mean {:.1f} +/- {:.1f}, Max {:.1f}, Min {:.1f}".format(
    np.mean(length_info), np.std(length_info), np.max(length_info), np.min(length_info)))

In [None]:
# Show the first rows of the training data.
train.head()

In [None]:
# Text Processing
# SIA is read as a global inside text_processing when sentiment=True.
SIA = SentimentIntensityAnalyzer()
# Work on copies so the raw `train` / `test` frames are not given the extra
# feature columns. dense_vars holds the names of the engineered columns.
train_df, dense_vars = text_processing(train.copy(), TEXTCOL, sentiment=True)
test_df, _ = text_processing(test.copy(), TEXTCOL, sentiment=True)

In [None]:
# TF-IDF
# Word-level unigram TF-IDF with sublinear term-frequency scaling, limited
# to 500 features and tokenized with NLTK's word_tokenize.
count_vectorizer = TfidfVectorizer(
    analyzer="word",
    tokenizer=word_tokenize,
    preprocessor=None,
    stop_words='english',
    sublinear_tf=True,
    ngram_range=(1, 1),
    max_features=500)

# Fit on the training text only; the test text is transformed with the same vocabulary.
hash_loc_tfidf = count_vectorizer.fit(train_df[TEXTCOL])
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# when available and convert to a list so it can be concatenated below.
if hasattr(hash_loc_tfidf, "get_feature_names_out"):
    tfvocab = list(hash_loc_tfidf.get_feature_names_out())
else:
    tfvocab = hash_loc_tfidf.get_feature_names()
print("Number of TF-IDF Features: {}".format(len(tfvocab)))

train_tfidf = count_vectorizer.transform(train_df[TEXTCOL])
test_tfidf = count_vectorizer.transform(test_df[TEXTCOL])

# Normalisation - Standard Scaler
# Each dense feature is standardised with statistics from the training set.
# The scaled values replace the column as a 1-D float array; writing an
# (n, 1) float array into an integer column through .loc depends on implicit
# dtype coercion that newer pandas versions deprecate.
for d_i in dense_vars:
    scaler = StandardScaler()
    scaler.fit(train_df[d_i].values.reshape(-1, 1))
    train_df[d_i] = scaler.transform(train_df[d_i].values.reshape(-1, 1)).ravel()
    test_df[d_i] = scaler.transform(test_df[d_i].values.reshape(-1, 1)).ravel()

# Sparse Stack
# Dense features first, then the TF-IDF columns; num_cols uses the same order.
train_num = hstack([csr_matrix(train_df.loc[:,dense_vars].values),train_tfidf]).toarray()
test_num = hstack([csr_matrix(test_df.loc[:,dense_vars].values),test_tfidf]).toarray()
num_cols = list(dense_vars) + tfvocab

In [None]:
# Bert Pre-Processing
# Build the tokenizer from the vocabulary and casing settings shipped with the hub module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

# Model inputs: the three BERT arrays plus the numeric feature matrix.
train_input_word_ids, train_input_mask, train_segment_ids = bert_encode(
    train[TEXTCOL].values, tokenizer, max_len=MAX_LEN)
train_numeric_inputs = train_num
test_input = (*bert_encode(test[TEXTCOL].values, tokenizer, max_len=MAX_LEN), test_num)

# Map each author to a class index. The labels are sorted first: iterating a
# set of strings follows hash order, which changes between interpreter runs,
# so the unsorted mapping (and the column order of every prediction file)
# was not reproducible.
label_mapper = {name: i for i, name in enumerate(sorted(set(train[TARGETCOL].values)))}
num_label = np.vectorize(label_mapper.get)(train[TARGETCOL].values)
train_labels = to_categorical(num_label)

# Drop the intermediate frames / names that are no longer needed.
del test, train_num, test_num, train_df, test_df
_ = gc.collect()

In [None]:
# Class balance of the training labels. The values are passed as `x=`
# because newer seaborn releases no longer accept the data vector as the
# first positional argument.
sns.countplot(x=train[TARGETCOL].values)
plt.title("Spooky Authors")
plt.show()

# Model: Build, Train, Predict, Submit

In [None]:
# Build the model once to inspect its architecture; the training cell
# builds its own model for every fold.
model = build_model(bert_layer, max_len=MAX_LEN)
model.summary()

In [None]:
# Out-of-fold (OOF) predictions for the training rows and the running sum of
# the per-fold test predictions (averaged over n_splits later).
oof_preds = np.zeros((train_input_word_ids.shape[0], NCLASS))
test_preds = np.zeros((testdex.shape[0], NCLASS))

n_splits = 3
folds = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
plot_metrics = ['loss','acc']

# bert_layer is created once with trainable=True and shared by every model
# built from it. Without a reset, each fold would start from encoder weights
# already fine-tuned by the previous fold, whose training split contains the
# current fold's validation rows, which leaks into the OOF scores.
# Snapshot the weights here and restore them at the start of every fold.
# NOTE: the snapshot reflects the layer state when this cell runs; reload
# bert_layer before re-running this cell after a completed training run.
pretrained_bert_weights = bert_layer.get_weights()


def _fold_inputs(idx):
    """Return the four model inputs restricted to the rows in ``idx``."""
    return [train_input_word_ids[idx],
            train_input_mask[idx],
            train_segment_ids[idx],
            train_numeric_inputs[idx]]


fold_hist = {}
for i, (trn_idx, val_idx) in enumerate(folds.split(train_input_word_ids)):
    modelstart = time.time()
    bert_layer.set_weights(pretrained_bert_weights)
    model = build_model(bert_layer, max_len=MAX_LEN)

    # Stop once val_loss has not improved for 2 epochs and roll back to the best epoch.
    es = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=2, verbose=1,
                                 mode='min', baseline=None, restore_best_weights=True)

    # Use the BATCH_SIZE constant from the configuration cell rather than a
    # separate hard-coded value.
    history = model.fit(
        x=_fold_inputs(trn_idx),
        y=train_labels[trn_idx],
        validation_data=(_fold_inputs(val_idx), train_labels[val_idx]),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[es]
    )

    best_index = np.argmin(history.history['val_loss'])
    fold_hist[i] = history

    # Each training row is predicted exactly once, by the fold that held it out.
    oof_preds[val_idx] = model.predict(_fold_inputs(val_idx))
    test_preds += model.predict(test_input)
    best_metrics = {metric: scores[best_index] for metric, scores in history.history.items()}
    pprint.pprint(best_metrics)

    # Train / validation curves for this fold, with the best epoch marked.
    f, ax = plt.subplots(1,len(plot_metrics),figsize = [12,4])
    for p_i,metric in enumerate(plot_metrics):
        ax[p_i].plot(history.history[metric], label='Train ' + metric)
        ax[p_i].plot(history.history['val_' + metric], label='Val ' + metric)
        ax[p_i].set_title("{} Fold Loss Curve - {}\nBest Epoch {}".format(i, metric, best_index))
        ax[p_i].legend()
        ax[p_i].axvline(x=best_index, c='black')
    plt.show()

In [None]:
# With one-hot labels and softmax outputs, (label - pred) equals 1 - p_true
# at the true class and -p elsewhere, so its row-wise max is 1 - p_true and
# the stored value is 1 - (1 - p_true) = p_true.
# NOTE(review): despite its name, 'error' is therefore the probability
# assigned to the true class (low = bad prediction), which does not match
# the plot title "Target - Pred Probability" — confirm which was intended.
train['error'] = 1 - np.max(((train_labels) - oof_preds), axis = 1)
# Append one OOF probability column per author, named from label_mapper.
train = pd.concat([train, pd.DataFrame(oof_preds, columns=label_mapper.keys())], axis=1)


# Distribution of the 'error' column over the training rows.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the installed version before replacing it.
f,ax = plt.subplots(1,1,figsize = [6,4])
sns.distplot(train['error'], ax = ax)
ax.set_title("Classification Errors: Target - Pred Probability")
plt.tight_layout(pad=1)
plt.show()

In [None]:
# Out-of-fold evaluation: confusion matrix of true class index vs. argmax
# prediction, and multi-class log loss on the OOF probabilities.
cnf_matrix = confusion_matrix(num_label, np.argmax(oof_preds,axis=1))
print("Logloss: {:.2f}".format(log_loss(train_labels, oof_preds)))

print("\nConfusion Matrix:")
print(cnf_matrix)

In [None]:
# Columns to inspect: id, text, true author, the 'error' column and the
# per-author OOF probabilities.
show_cols = [
    'id',
    TEXTCOL,
    TARGETCOL,
    'error'] + list(label_mapper.keys())

# Show the 20 rows with the smallest 'error' value.
display(train[show_cols].sort_values(by = 'error', ascending=True).iloc[:20])

In [None]:
# test_preds holds the sum of every fold's test predictions, so dividing by
# n_splits gives the fold average.
final_pred = test_preds/n_splits
pd.DataFrame(final_pred).describe()

In [None]:
# Name the probability columns after the authors (label_mapper order matches
# the class indices) and attach the test ids.
submission = pd.DataFrame(final_pred, columns=label_mapper.keys())
submission['id'] = testdex

# Reorder to the column layout of the sample submission before writing.
submission = submission[sub_cols]
submission.to_csv('submission_bert.csv', index=False)
print(submission.shape)

In [None]:
# Print the first lines of the written submission file.
!head submission_bert.csv

In [None]:
# Save the out-of-fold predictions. Every training row is predicted exactly
# once (by the fold that held it out), so oof_preds already contains complete
# probability rows. Unlike the summed test predictions, it must not be
# divided by n_splits, which scaled every row to sum to 1/n_splits.
oof_pd = pd.DataFrame(oof_preds, columns=list(label_mapper.keys()))
oof_pd.to_csv("oof_dense_bert.csv")
print(oof_pd.shape)

In [None]:
# Total wall-clock time since `notebookstart` was set in the setup cell.
print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))