# **Import Libraries**

In [None]:
import os
import gc
import sys
import time
import shutil

import random
import pickle

from ast import literal_eval
from tqdm import tqdm as print_progress
from glob import glob

import dask.dataframe as dd
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
# Set the TF_KERAS flag before the TensorFlow/Keras imports that follow.
# NOTE(review): nothing in the visible notebook reads TF_KERAS directly; presumably it
# tells a Keras add-on package to use tf.keras instead of standalone Keras - confirm
# which dependency consumes it.
os.environ['TF_KERAS'] = '1'

import tensorflow as tf

from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.metrics import TopKCategoricalAccuracy
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, Callback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [None]:
from tensorflow.keras.layers import (
    Layer, 
    Input, InputLayer, Embedding, 
    Dropout, Dense, 
    Dot, Concatenate, Average, Add,
    Bidirectional, LSTM,
    Lambda, Reshape
)
from tensorflow.keras.activations import softmax, sigmoid
from tensorflow.keras.initializers import Identity, GlorotNormal
from tensorflow.keras.utils import plot_model

In [None]:
pip install stellargraph

In [None]:
pip install gradient-centralization-tf

# **Load Data**

In [None]:
# Read every CSV shard of each split into one in-memory pandas DataFrame,
# keyed by split name in `sample_dfs`.
datasets_path = '../input/hotel-comment'
sample_dfs = {}
for dataset in ('training', 'valuating', 'testing'):
    print(f'\n\n\nProcessing {dataset} ...')
    # A split may be stored as several files matching `<split>_data*.csv`;
    # dask reads the glob and `.compute()` materialises it as a single frame.
    csv_pattern = os.path.join(datasets_path, f'{dataset}_data*.csv')
    split_df = dd.read_csv(csv_pattern).compute()
    sample_dfs[dataset] = split_df
    print(f"{dataset}-set contains {len(split_df)} samples")
    # Quick visual sanity check of three random rows.
    print(split_df.sample(n=3))

In [None]:
# Restore the pickled label encoder that ships with the dataset.
filename = os.path.join(datasets_path, 'label_encoder.pkl')
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
# The context manager guarantees the file handle is closed once the object is loaded.
with open(filename, 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)
# `classes_` supplies the label names that are embedded further down in the notebook.
labels = list(label_encoder.classes_)
len(labels)

# **Pretrained Sentence-Transformer**

In [None]:
pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Path handed to SentenceTransformer.
# NOTE(review): looks like a local Kaggle input directory holding a distilUSE
# checkpoint rather than a hub model name - confirm.
model_version = '../input/sentence-transformers/distilUSE'
# Shared encoder instance: the cells below use it to embed both the label names
# and the comment texts.
embedder = SentenceTransformer(model_version)

In [None]:
import torch

def tensor_to_nparray(tensor: "torch.Tensor | np.ndarray") -> np.ndarray:
    """Return *tensor* as a NumPy array on the host.

    Args:
        tensor: A torch tensor on any device, with or without gradient
            tracking. A value that is already a ``np.ndarray`` is also
            accepted, because callers in this notebook ask the encoder for
            ``convert_to_numpy=True`` output.

    Returns:
        The data as ``np.ndarray``. For a CPU tensor the array shares memory
        with the tensor; an ndarray input is returned unchanged.
    """
    if isinstance(tensor, np.ndarray):
        return tensor
    # `.cpu()` is a no-op for tensors already on the host, so no CUDA
    # availability check is needed; `.detach()` avoids the RuntimeError that
    # `.numpy()` raises on tensors that require grad.
    return tensor.detach().cpu().numpy()

In [None]:
# Embed every label name token by token, then mean-pool the tokens so that
# each label is represented by a single vector.
label_token_embeddings = embedder.encode(
    labels,
    output_value='token_embeddings',
    convert_to_numpy=True,
)
labels_vector = [
    tensor_to_nparray(token_embs).mean(axis=0)
    for token_embs in label_token_embeddings
]
# Stack one row per label and prepend a leading axis of size 1
# (result is (1, n_labels, dim), assuming each token-embedding array is (n_tokens, dim)).
labels_matrix = np.vstack(labels_vector)[np.newaxis, ...]
# Persist the matrix so it can be reloaded without re-running the encoder.
np.save('./labels_embeddings.npy', labels_matrix)
labels_matrix.shape

# **Word Embeddings**

In [None]:
# Encode the comments of the selected split into token embeddings and write one
# compressed .npz file per sample: `emb` holds the token embeddings, `mtl` the
# multi-hot label vector.
for dataset, sample_df in sample_dfs.items():
    # Only the testing split is exported by this cell; change the filter to
    # export another split.
    if dataset != 'testing':
        continue
    print(f'\n\n\nProcessing {dataset}-set ...')
    dir_path = f'/kaggle/working/{dataset}'
    if not os.path.isdir(dir_path):
        print(f'Creating {dir_path}')
        os.makedirs(dir_path)

    texts = sample_df.Comment.values.tolist()
    # NOTE(review): this rebinds the notebook-level `labels` (the class-name list
    # taken from label_encoder.classes_); re-running the label-embedding cell
    # after this one would encode these strings instead of the class names.
    labels = sample_df.label_encoder.values.tolist()
    batch_size = 32

    ###########################################
    # start_idx, end_idx = 0, batch_size*1_000
    # texts = texts[start_idx:end_idx]
    # labels = labels[start_idx:end_idx]
    ###########################################

    # labels_matrix has one row per label on its second-to-last axis.
    n_classes = labels_matrix.shape[-2]

    n_samples = len(labels)
    # Ceiling division: the former `n_samples//batch_size + 1` scheduled an extra,
    # empty batch whenever n_samples was an exact multiple of batch_size.
    n_batches = -(-n_samples // batch_size)
    for b_idx in print_progress(range(n_batches)):

        # Get samples by batch (slicing clamps at the end, so the last,
        # possibly shorter batch needs no special case)
        batch_start = b_idx * batch_size
        b_samples = texts[batch_start:batch_start + batch_size]
        b_labels = labels[batch_start:batch_start + batch_size]

        # Apply sentence-BERT for word embeddings
        embeddings = embedder.encode(b_samples,
                                     batch_size=batch_size,
                                     output_value='token_embeddings',
                                     convert_to_numpy=True,
                                     show_progress_bar=False)
        embeddings = [tensor_to_nparray(e) for e in embeddings]

        # Turn each stored label string into a multi-hot vector. Two textual
        # forms are handled: comma-separated ("[1, 2]") via literal_eval, and
        # whitespace-separated ("[1 2]") via a manual split.
        labels_multiclass = []
        for raw_label in b_labels:
            if ',' in raw_label:
                class_ids = literal_eval(raw_label)
            else:
                class_ids = [int(tok) for tok in raw_label[1:-1].split()]
            one_hot_rows = to_categorical(class_ids, num_classes=n_classes)
            labels_multiclass.append(np.sum(one_hot_rows, axis=0))

        # Write one .npz file per sample, numbered by its global index
        for w_idx, (w_embs, mt_label) in enumerate(zip(embeddings, labels_multiclass)):
            np.savez_compressed(f'{dir_path}/sample_{batch_start+w_idx:07d}.npz',
                                emb=w_embs,
                                mtl=mt_label)

        # Drop the batch buffers before the next encode call to limit peak memory
        del b_samples, b_labels
        del embeddings, labels_multiclass
        _ = gc.collect()

In [None]:
import shutil
from IPython.display import FileLink

# Bundle the exported samples into a single zip next to the working directory,
# then remove the raw per-sample files.
os.chdir(r'/kaggle/working')

dir_path = '/kaggle/working/testing'

# Produces '/kaggle/working/testingdata.zip' containing the contents of dir_path.
shutil.make_archive(base_name=f'{dir_path}data', format='zip', root_dir=dir_path)
# FileLink(dir_path+"data.zip")
# The archive now holds everything, so the source directory can go.
shutil.rmtree(dir_path)