# **Import Libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc 
import sys
import time
import shutil

from tqdm import tqdm
from glob import glob
from ast import literal_eval

import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
# NOTE(review): presumably read by keras-bert at import time to select the
# tf.keras backend; if so this line has to stay above the keras_bert import -- confirm.
os.environ['TF_KERAS'] = '1'

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical

In [None]:
# detect and init the TPU; fall back to the default strategy when none is attached
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # The resolver raises ValueError when it cannot locate a TPU.
    tpu = None

if tpu is not None:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # instantiate a distribution strategy
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Keep the notebook runnable on CPU/GPU sessions: later cells only need
    # `tpu_strategy.scope()`, which the default strategy also provides.
    print('No TPU detected; using the default distribution strategy')
    tpu_strategy = tf.distribute.get_strategy()

# **Load Data**

In [None]:
# Root folder of the attached input dataset (CSV files and label_encoder.pkl are read from here).
# Keep the trailing slash: a later cell concatenates a file name directly onto this value.
data_root = '/kaggle/input/hotel-comment/'

In [None]:
# Map each dataset split to its CSV file and report whether that file exists.
sample_files = dict()
for dataset in ['training']:
    # os.path.join avoids a doubled separator when data_root already ends with '/'.
    sample_files[dataset] = os.path.join(data_root, f'{dataset}_data.csv')
    print(f'{sample_files[dataset]} --> {os.path.isfile(sample_files[dataset])}')

In [None]:
# Read every listed CSV into a DataFrame and preview 11 randomly drawn rows of it.
sample_dfs = {}
for dataset, sample_file in sample_files.items():
    loaded_df = pd.read_csv(sample_file)
    sample_dfs[dataset] = loaded_df
    display(loaded_df.sample(n=11))

# **Load BERT pretrained Word Embeddings**

In [None]:
pip install keras-bert

In [None]:
from keras_bert import (
    PretrainedList, 
    get_pretrained, 
    get_checkpoint_paths,
    load_trained_model_from_checkpoint, 
    load_vocabulary,
    extract_embeddings,
    Tokenizer
)

# Alternative (disabled): obtain the model path through get_pretrained.
# model_path = get_pretrained(PretrainedList.multi_cased_base)
# Directory of the BERT checkpoint used throughout this notebook.
model_path = '/kaggle/input/bert-pretrained/uncased_L-4_H-512_A-8'
# Resolve the config / checkpoint / vocab file locations for that directory and print them.
paths = get_checkpoint_paths(model_path)
print(paths.config, paths.checkpoint, paths.vocab)

In [None]:
# Build the BERT model inside the distribution strategy scope from the checkpoint files resolved above.
with tpu_strategy.scope():
    bert_model = load_trained_model_from_checkpoint(
        config_file=paths.config,
        checkpoint_file=paths.checkpoint,
        output_layer_num=1,
    )

# Vocabulary and tokenizer used by process_word_embeddings.
# NOTE(review): the checkpoint directory name says "uncased" while the tokenizer is
# created with cased=True; process_word_embeddings lower-cases text itself by default --
# confirm this combination is intended.
vocabs = load_vocabulary(paths.vocab)
tokenizer = Tokenizer(vocabs, cased=True)

# Take every dimension after the first from the model's first output shape.
seq_len, embed_dim = K.int_shape(bert_model.outputs[0])[1:]
print(f"Sequence Length: {seq_len}")
print(f"Embedding Dim: {embed_dim}")

In [None]:
def process_word_embeddings(sentences: list, seq_len: int=512, n_pads: int=10, use_cased: bool=False):
    """Embed sentences with the notebook-level BERT model and trim trailing positions.

    Relies on the module-level ``tokenizer`` and ``bert_model`` objects.

    Args:
        sentences: Texts to embed.
        seq_len: Length every token/segment id sequence is zero-padded to before
            being handed to ``bert_model.predict``; also passed to the tokenizer
            as ``max_len``.
        n_pads: Number of extra positions kept after the non-zero token ids.
        use_cased: When False, each sentence is lower-cased before tokenizing.

    Returns:
        A list with one array per sentence, holding the first
        ``min(seq_len, nonzero_id_count + n_pads)`` rows of that sentence's
        prediction. An empty ``sentences`` yields an empty list.
    """
    # Nothing to embed: return early instead of handing an empty array to predict().
    if len(sentences) == 0:
        return []

    tokens, segments, n_tokens = [], [], []

    # Tokenize and numberize tokens
    for sentence in sentences:
        if not use_cased:
            sentence = sentence.lower()
        token, segment = tokenizer.encode(sentence, max_len=seq_len)

        # Rows to keep later: the non-zero ids plus n_pads positions, capped at seq_len.
        n_tokens.append(min(seq_len, np.count_nonzero(token) + n_pads))

        # 0-padding up to seq_len (a no-op when the sequence already has that length)
        tokens.append(list(token) + [0] * (seq_len - len(token)))
        segments.append(list(segment) + [0] * (seq_len - len(segment)))

    # Get predictions for the whole batch at once
    predictions = bert_model.predict([np.array(tokens), np.array(segments)])

    # Clip predictions for less memory storage
    return [
        prediction[:len_pred, :]
        for prediction, len_pred in zip(predictions, n_tokens)
    ]

In [None]:
# Number of classes passed to to_categorical, i.e. the length of each multi-hot label vector.
N_LABELS = 43

# **Sentences Embedding**

In [None]:
# Embed every comment with BERT and store one compressed .npz file per sample,
# holding the trimmed embeddings ("emb") and the multi-hot label vector ("mtl").
for dataset, sample_df in sample_dfs.items():
    print(f'\n\n\nProcessing {dataset} dataset')
    dir_path = f'/kaggle/working/{dataset}'
    if not os.path.isdir(dir_path):
        print(f'Creating {dir_path}')
        os.makedirs(dir_path)

    texts = sample_df.Comment.values.tolist()
    labels = sample_df.label_encoder.values.tolist()

    ###########################################
    # Only rows from start_idx onwards are processed in this run.
    # NOTE(review): output files are numbered relative to start_idx (the first
    # processed row becomes sample_0000000), so files from runs with different
    # start_idx values share names -- confirm this is intended before merging outputs.
    start_idx, end_idx = 75_000, len(texts)
    texts = texts[start_idx:end_idx]
    labels = labels[start_idx:end_idx]
    ###########################################

    n_samples = len(labels)
    batch_size = 128
    # Ceiling division, so no empty trailing batch is scheduled when n_samples
    # is a multiple of batch_size (including when there is nothing to process).
    n_batches = (n_samples + batch_size - 1) // batch_size
    for b_idx in tqdm(range(n_batches)):

        # Get samples by batch (slicing clamps at the end, so the last batch may be shorter)
        batch_start = b_idx * batch_size
        b_samples = texts[batch_start:batch_start + batch_size]
        b_labels = labels[batch_start:batch_start + batch_size]

        # Apply BERT for word embeddings
        embeddings = process_word_embeddings(b_samples)

        # Turn each label entry into a multi-hot vector of length N_LABELS.
        # Each entry is parsed with literal_eval; presumably it is the string
        # form of a list of integer label ids -- verify against the CSV.
        labels_multihot = []
        for raw_label in b_labels:
            label_ids = literal_eval(raw_label)
            labels_multihot.append(
                np.sum(to_categorical(label_ids, num_classes=N_LABELS), axis=0)
            )

        # Write one compressed file per sample
        for w_idx, (w_embs, mt_label) in enumerate(zip(embeddings, labels_multihot)):
            np.savez_compressed(f'{dir_path}/sample_{batch_start+w_idx:07d}.npz',
                                emb=w_embs,
                                mtl=mt_label)
            del w_embs, mt_label

        # Release the batch explicitly to keep memory usage flat across batches
        del b_samples, b_labels
        del embeddings, labels_multihot
        _ = gc.collect()

# **Labels Embedding**

In [None]:
import pickle
from sklearn.preprocessing import LabelEncoder

# label_encoder = LabelEncoder()
# label_encoder.fit(labels)

# Store model
filename = data_root + 'label_encoder.pkl'
# pickle.dump(label_encoder, open(filename, 'wb'))

# Load model
# NOTE: unpickling can execute arbitrary code embedded in the file, so only load
# encoders that come from a trusted dataset.
# The context manager closes the file handle as soon as the encoder is loaded.
with open(filename, 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

In [None]:
# Entries of label_encoder.classes_ as a plain list, displayed as the cell output.
# NOTE(review): this rebinds `labels`, which the sentence-embedding loop above
# used for the per-sample label values.
labels = list(label_encoder.classes_)
labels

In [None]:
# Build one vector per label: embed the label text, drop all-zero rows, average the rest.
labels_vector = []
for label in tqdm(labels):

    lab_emb = process_word_embeddings([label], n_pads=3)[0]

    # Keep only rows that contain at least one non-zero value.
    # NOTE(review): this drops rows that are exactly zero; confirm the model really
    # emits zero vectors at padded positions, otherwise the n_pads=3 trailing rows
    # are included in the mean below.
    non_zero_rows = np.any(lab_emb != 0, axis=1)
    label_embedding = lab_emb[non_zero_rows]

    # Average the remaining rows into a single vector for this label
    labels_vector.append(label_embedding.mean(axis=0))

# Stack into a (number of labels) x (embedding width) matrix and persist it
labels_embeddings = np.vstack(labels_vector)
np.save('/kaggle/working/labels_embeddings.npy', labels_embeddings)
labels_embeddings

# **Compress**

In [None]:
# from IPython.display import FileLink
# os.chdir(r'/kaggle/working')

# dir_path = f'/kaggle/working/'
    
# shutil.make_archive(dir_path+"data", 'zip', dir_path)
# FileLink(dir_path+"data.zip")

In [None]:
# shutil.rmtree('/kaggle/working/valuating')
# shutil.rmtree('/kaggle/working/testing')