# 1. **Import Libraries**

In [None]:
import os
import sys
import time
import shutil

import random

from tqdm import tqdm
from glob import glob
from typing import Union

import pandas as pd
import numpy as np
from sklearn.utils import shuffle as sk_shuffle

import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import Sequence, plot_model, to_categorical
from tensorflow.keras.metrics import Mean
from tensorflow.keras.losses import (
    SparseCategoricalCrossentropy, CategoricalCrossentropy, 
    sparse_categorical_crossentropy, categorical_crossentropy
)
from tensorflow.keras.layers import (
    Layer, 
    Input, InputLayer, Embedding, 
    Dropout, Dense, 
    Dot, Concatenate, Average, Add,
    Bidirectional, LSTM,
    Lambda, Reshape
)
from tensorflow.keras.callbacks import (
    EarlyStopping, TensorBoard, 
    ModelCheckpoint, ReduceLROnPlateau, 
    LearningRateScheduler, Callback
)
from tensorflow.keras.activations import softmax, sigmoid
from tensorflow.keras.initializers import Identity, GlorotNormal, TruncatedNormal

In [None]:
# Stays False unless every TPU bring-up step below completes.
tpu_available = False
try:
    # Locate the TPU, attach to its cluster and initialise the TPU system.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f"TPU: {tpu}")
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # Distribution strategy whose scope is used when a TPU is present.
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # A ValueError from any step above is treated as "no TPU"; the notebook
    # then continues on the default device.
    print('Cannot use TPU')
else:
    tpu_available = True

# 2. **Concat Datasets**

In [None]:
# Column names that may hold review text in the extra datasets.
columns_for_concat = [
    'review',
    'Negative_Review',
    'Positive_Review',
]
# Accumulates one pandas Series of review text per loaded column.
list_of_review_dfs = list()

In [None]:
extra_data_dir = '../input/hotel-and-restaurant-reviews/'
# os.listdir returns entries in arbitrary order. Sorting keeps the order of
# the collected corpus - and therefore the seeded train/val/test splits built
# from it - reproducible from run to run.
extra_datasets = sorted(os.listdir(extra_data_dir))
for dataset in extra_datasets:
    dataset_path = os.path.join(extra_data_dir, dataset)
    dataset_df = pd.read_csv(dataset_path)

    # Collect every review-text column this file actually provides.
    for column in columns_for_concat:
        if column not in dataset_df.columns:
            continue

        # The Series is appended under its own column name; the combined
        # corpus is labelled 'review' once everything has been concatenated.
        list_of_review_dfs.append(dataset_df[column])

In [None]:
# The main corpus is spread over three CSV files that share one name pattern.
main_dataset_path = '../input/hotel-comment/{}_data.csv'
main_datasets = ['training', 'valuating', 'testing']
for dataset in main_datasets:
    csv_path = main_dataset_path.format(dataset)
    dataset_df = pd.read_csv(csv_path)
    comments = dataset_df['Comment']
    # NOTE(review): given a mapping, Series.rename relabels index entries, not
    # the Series name, so the Series presumably keeps the name 'Comment' here -
    # confirm whether a plain .rename('review') was intended.
    list_of_review_dfs.append(comments.rename({'Comment': 'review'}))

In [None]:
# Stack every collected review column into one Series and label it explicitly,
# so nothing below depends on the names the input Series happened to carry.
sentences_df = pd.concat(list_of_review_dfs, ignore_index=True).rename('review')
n_sentences_raw = len(sentences_df)

# Remove missing and repeated reviews.
sentences_df = sentences_df.dropna().drop_duplicates()

# Keep only reviews whose length in characters is strictly between 32 and 512.
sentences_len = sentences_df.str.len()
sentences_df = sentences_df[(sentences_len > 32) & (sentences_len < 512)]
sentences_df = sentences_df.reset_index(drop=True)
n_sentences = len(sentences_df)

print(f'Filter {n_sentences_raw} sentences down to {n_sentences} sentences')

# Show a few random survivors as a sanity check.
sentences_df.sample(n=7)

# 3. **BERT Word Embeddings**

In [None]:
pip install stellargraph

In [None]:
from stellargraph.utils import plot_history

In [None]:
pip install keras-bert

In [None]:
from keras_bert import (
    PretrainedList, 
    get_pretrained, 
    get_checkpoint_paths,
    load_trained_model_from_checkpoint, 
    load_vocabulary,
    extract_embeddings,
    Tokenizer
)

from keras_bert.bert import get_model
from keras_bert.loader import load_trained_model_from_checkpoint
from keras_bert.optimizers import AdamWarmup


# model_path = get_pretrained(PretrainedList.multi_cased_base)
model_path = '/kaggle/input/bert-pretrained/uncased_L-4_H-512_A-8'
paths = get_checkpoint_paths(model_path)

# Report the three files that make up the pretrained checkpoint.
for label, location in (
    ("Config", paths.config),
    ("Ckpt", paths.checkpoint),
    ("Vocab", paths.vocab),
):
    print(f"{label}: {location}")

In [None]:
# Vocabulary loaded from the checkpoint's vocab file (presumably a
# token -> id mapping, since it is later indexed by token string).
vocabs = load_vocabulary(paths.vocab)
# cased=False matches the "uncased" checkpoint selected in `model_path`.
tokenizer = Tokenizer(vocabs, cased=False)

In [None]:
# Sequence length passed as `seq_len` when the pretrained model is loaded.
MAX_LEN = 512
# Special tokens looked up by DataGenerator; every one except [MASK] is
# protected from being replaced when a batch is masked.
SPECIAL_TOKENS = ['[MASK]', '[PAD]', '[CLS]', '[SEP]', '[UNK]']
# NOTE(review): both lookups read private attributes of the keras-bert
# Tokenizer - presumably token -> id and id -> token; confirm they stay
# available when keras-bert is upgraded.
tokens_dict = tokenizer._token_dict
inverse_tokens_dict = tokenizer._token_dict_inv

In [None]:
# Arguments for keras-bert's checkpoint loader.
pretrained_args = dict(
    config_file=paths.config,
    checkpoint_file=paths.checkpoint,
    training=True,
    seq_len=MAX_LEN,
)

# Optimizer, loss and metric used to compile the masked-language-model head.
warmup_optimizer = AdamWarmup(
    decay_steps=100420,
    warmup_steps=8192,
    learning_rate=1e-4,
    weight_decay=0.01,
    weight_decay_pattern=['embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'],
)
compile_args = dict(
    optimizer=warmup_optimizer,
    metrics=["accuracy"],
    loss=SparseCategoricalCrossentropy(),
)


def _build_masked_language_model():
    """
    Load the pretrained BERT model and wrap its first output in a compiled Model.

    Returns the loaded model, the keyword arguments used for the wrapper, and
    the compiled wrapper itself. Call it inside a distribution-strategy scope
    when the variables should be created under that strategy.
    """
    base_model = load_trained_model_from_checkpoint(**pretrained_args)
    # Only the first output is kept (presumably the masked-LM predictions,
    # given the model name - confirm against `model.outputs`).
    wrapper_args = dict(
        inputs=base_model.inputs,
        outputs=base_model.outputs[0],
        name='Masked-Language-Model',
    )
    masked_lm = Model(**wrapper_args)
    masked_lm.compile(**compile_args)
    return base_model, wrapper_args, masked_lm


# Same construction either way; the TPU path only adds the strategy scope.
if tpu_available:
    with tpu_strategy.scope():
        model, model_args, bert_mlm = _build_masked_language_model()
else:
    model, model_args, bert_mlm = _build_masked_language_model()

# Continue from previously fine-tuned weights rather than the raw checkpoint.
bert_mlm.load_weights('../input/bertforhotelandrestaurant/ep029_acc1.000_val_acc1.000.h5')
bert_mlm.summary()

In [None]:
# Display the input tensors of the loaded (unwrapped) BERT model.
model.inputs

In [None]:
# Display the output tensors of the loaded model; only the first one is used
# by `bert_mlm`.
model.outputs

In [None]:
# NOTE(review): -6 is a position-dependent magic index into the loaded model's
# layer list; presumably it selects the last encoder output ahead of the
# training heads - confirm against the model summary before relying on it.
sequence_embeddings  = model.layers[-6].output
sequence_embeddings

In [None]:
# Render the architecture of the wrapped model, including tensor shapes.
plot_model(bert_mlm, show_shapes=True)

# 4. **Data Generator**

In [None]:
def tokenize_sentences(sentences,
                       seq_len: int=512,
                       use_cased: bool=False):
    """
    Encode sentences into fixed-length token-id and segment-id arrays.

    Uses the module-level `tokenizer`.

    sentences: a single string, or a list/tuple of strings.
    seq_len: length every encoded sentence is brought to; passed to the
        tokenizer as `max_len`.
    use_cased: when False (default) each sentence is lower-cased before
        it is encoded.

    Returns a tuple `(token_ids, segment_ids)` of numpy arrays with one row
    per sentence.

    Raises ValueError when `sentences` is neither a string nor a list/tuple.
    """
    if isinstance(sentences, str):
        sentences = [sentences]
    elif not isinstance(sentences, (list, tuple)):
        raise ValueError(f'Wrong type of argument `sentences`: {type(sentences)}')

    # Looked up once instead of once per sentence.
    pad_token_id = tokenizer._token_dict['[PAD]']

    sentences_tokenized, sentences_segmented = [], []
    for sentence in sentences:
        if not use_cased:
            sentence = sentence.lower()
        tokens, segments = tokenizer.encode(sentence, max_len=seq_len)

        # Right-pad up to `seq_len`. keras-bert's encode presumably pads
        # already when `max_len` is given, in which case n_pads is 0 and this
        # is only a safeguard.
        n_pads = seq_len - len(tokens)
        tokens.extend([pad_token_id] * n_pads)
        # Segment ids are not token ids: padding belongs to segment 0
        # regardless of which id the vocabulary assigns to [PAD].
        segments.extend([0] * n_pads)

        sentences_tokenized.append(tokens)
        sentences_segmented.append(segments)

    return np.array(sentences_tokenized), np.array(sentences_segmented)

In [None]:
class DataGenerator(Sequence):
    """
    Keras `Sequence` producing masked-language-model batches from raw sentences.

    Every batch is `([masked_token_ids, segment_ids, mask_flags], token_ids)`:
    a random share of the non-special tokens is replaced by `[MASK]` in the
    model input, and the untouched token ids are the prediction target.
    """

    # Bounds applied to `mask_ratio`, at construction and after every
    # per-epoch step, so the ratio can never drift to "mask everything".
    MIN_MASK_RATIO = 0.02
    MAX_MASK_RATIO = 0.89

    def __init__(self,
                 sentences: pd.Series,
                 max_samples=None,
                 sentence_len: int=512,
                 batch_size: int=16,
                 mask_ratio: float=0.19,
                 mask_step: float=0.0,
                 shuffle: bool=True):
        """
        sentences: Series of raw sentences; rows are addressed by index label.
        max_samples: cap on the number of sentences used (all when None).
        sentence_len: length every sentence is tokenized/padded to.
        batch_size: number of sentences per batch.
        mask_ratio: initial probability of masking a non-special token,
            clipped to [MIN_MASK_RATIO, MAX_MASK_RATIO].
        mask_step: amount added to `mask_ratio` at the end of every epoch.
        shuffle: whether the sample order is shuffled at each epoch end.
        """
        super().__init__()

        self.sentences = sentences
        self.indices = sentences.index.tolist()
        self.sentence_len = sentence_len
        self.batch_size = batch_size
        self.mask_ratio = self._clip_mask_ratio(mask_ratio)
        self.mask_step = mask_step
        self.max_samples = len(sentences) if max_samples is None else max_samples
        self.shuffle = shuffle
        self.special_tokens = [
            tokens_dict[tok] for tok in SPECIAL_TOKENS
        ]
        self._mask_id = tokens_dict['[MASK]']
        # Ids that must never be replaced by [MASK]: every special token
        # except [MASK] itself.
        self._protected_ids = [
            tokens_dict[tok] for tok in SPECIAL_TOKENS if tok != '[MASK]'
        ]
        # Only pick the samples here; the masking ratio must not advance
        # before the first epoch has run.
        self._resample_indices()

    def __len__(self):
        """
        Denotes the number of batches per epoch
        """
        # Floor division: a trailing partial batch is dropped.
        return int(len(self.indices) // self.batch_size)

    def __getitem__(self, index):
        """
        Generate one batch of data
        """
        # Labels of the sentences belonging to this batch (slicing clamps at
        # the end of the list on its own).
        start_index = self.batch_size * index
        end_index = self.batch_size * (index+1)
        indices = self.indices[start_index:end_index]

        sentences_batch = self.sentences.loc[indices].tolist()

        tokens_batch, segments_batch = tokenize_sentences(
            sentences_batch, seq_len=self.sentence_len)

        # Positions eligible for masking: randomly drawn and not occupied by
        # a protected special token.
        protected = np.isin(tokens_batch, self._protected_ids)
        rand_mask = np.random.rand(*tokens_batch.shape) < self.mask_ratio
        bert_mask = rand_mask & ~protected

        tokens_masked_batch = np.where(bert_mask, self._mask_id, tokens_batch)

        # The model sees the masked sequence and has to recover the original
        # ids; feeding the originals and predicting the masked sequence would
        # reduce the task to emitting [MASK].
        return [tokens_masked_batch, segments_batch, bert_mask.astype(float)], tokens_batch

    def on_epoch_end(self):
        """
        Update indices and masking ratio after each epoch
        """
        self._resample_indices()
        self.mask_ratio = self._clip_mask_ratio(self.mask_ratio + self.mask_step)

    def _resample_indices(self):
        """
        Shuffle the sample order (if enabled) and apply the `max_samples` cap
        """
        if self.shuffle:
            self.indices = sk_shuffle(self.indices)
        # The cap is applied to the stored list, so once it has taken effect
        # later epochs reshuffle the same subset.
        self.indices = self.indices[:self.max_samples]

    @classmethod
    def _clip_mask_ratio(cls, ratio):
        """
        Clamp a masking ratio to [MIN_MASK_RATIO, MAX_MASK_RATIO]
        """
        return min(cls.MAX_MASK_RATIO, max(ratio, cls.MIN_MASK_RATIO))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 41020 sentences, then halve the hold-out into validation and test.
train_set, test_set = train_test_split(
    sentences_df,
    test_size=41020,
    random_state=27,
)
val_set, test_set = train_test_split(
    test_set,
    test_size=0.5,
    random_state=11,
)

# Only the training generator caps its sample count and steps its masking
# ratio between epochs; validation and test keep the defaults.
train_generator = DataGenerator(sentences=train_set, max_samples=41020, mask_step=0.03)
val_generator = DataGenerator(sentences=val_set)
test_generator = DataGenerator(sentences=test_set)

# Number of batches per epoch for each generator.
tuple(len(batches) for batches in (train_generator, val_generator, test_generator))

# 5. **Train**

In [None]:
# Destinations for model checkpoints, TensorBoard logs and saved figures.
output_dir = '/kaggle/working/models'
log_dir = '/kaggle/working/logs'
viz_dir = '/kaggle/working/visualizations'

# exist_ok=True creates each directory if needed without a separate
# check-then-create step.
for directory in (output_dir, log_dir, viz_dir):
    os.makedirs(directory, exist_ok=True)

In [None]:
# TensorBoard callback pointed at `log_dir`.
logger = TensorBoard(log_dir=log_dir)

# Weights are written at the end of every epoch (save_best_only is off), with
# the epoch number and both losses embedded in the file name.
model_format = 'ep={epoch:03d}_loss={loss:.3f}_val_loss={val_loss:.3f}.h5'
checkpoint_kwargs = dict(
    filepath=os.path.join(output_dir, model_format),
    monitor='loss',
    mode='min',
    save_weights_only=True,
    save_best_only=False,
    save_freq='epoch',
)
checkpoint = ModelCheckpoint(**checkpoint_kwargs)

# Cut the learning rate tenfold after 3 epochs without training-loss progress.
lr_reducer = ReduceLROnPlateau(
    monitor='loss',
    factor=0.1,
    patience=3,
    verbose=1,
)

# Stop once the validation loss has not improved for 7 epochs.
early_stopper = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0,
    patience=7,
    verbose=1,
)

In [None]:
# `Model.fit` consumes Keras `Sequence` objects directly; `fit_generator` is
# deprecated in TF 2.x and merely forwards to `fit`.
# NOTE(review): the TensorBoard callback `logger` is created alongside the
# other callbacks but is not passed here - confirm whether that is intended.
train_history = bert_mlm.fit(
    x=train_generator,
    steps_per_epoch=len(train_generator),
    validation_data=val_generator,
    validation_steps=len(val_generator),
    callbacks=[checkpoint, lr_reducer, early_stopper],
    epochs=50,
    # Epoch numbering resumes at 29, presumably to continue from the loaded
    # `ep029` weights - confirm when the checkpoint changes.
    initial_epoch=29
)

# Persist the training curves next to the other run artefacts.
hist_fig = plot_history(train_history, return_figure=True)
hist_fig.savefig(f'{viz_dir}/train_history.png')