# **Import Libraries**

In [None]:
import os
import gc
import sys
import time
import shutil

import random
import pickle

from ast import literal_eval
from tqdm import tqdm as print_progress
from glob import glob

import dask.dataframe as dd
import pandas as pd
import numpy as np
import torch

import matplotlib.pyplot as plt
from IPython.display import display, HTML

# **Load data**

In [None]:
import spacy

# Download SpaCy models if needed
# Load the small English spaCy pipeline. If `spacy.load` raises OSError,
# download the model and load it again.
# NOTE(review): within this part of the notebook `nlp` is only referenced by
# a commented-out line in `capitalize_documents`.
spacy_model = 'en_core_web_sm'
try:
    nlp = spacy.load(spacy_model)
except OSError:
    # Fallback path: fetch the model package, then retry the load once.
    spacy.cli.download(spacy_model)
    nlp = spacy.load(spacy_model)
    
from nltk.tokenize import sent_tokenize

In [None]:
def capitalize_documents(text: str) -> str:
    """Return `text` with every sentence passed through `str.capitalize`.

    The text is split into sentences with NLTK's `sent_tokenize`; each
    sentence gets its first character upper-cased and the remainder
    lower-cased (that is what `str.capitalize` does), and the sentences are
    joined back together with single spaces.

    Args:
        text: The document to transform.

    Returns:
        The re-assembled document.
    """
    # A spaCy-based split (`nlp(text).sents`) was the earlier alternative.
    return ' '.join(sentence.capitalize() for sentence in sent_tokenize(text))

In [None]:
datasets_path = '../input/hotel-comment'
sample_dfs = {}

# Load the already-augmented CSV of each split into `sample_dfs`, keyed by
# the split name, and print its size plus three random rows as a sanity check.
for dataset in ('training', 'valuating', 'testing'):
    print(f'\n\n\nProcessing {dataset} ...')

    # Read data (plain pandas, so no `.compute()` call is needed)
    print('\tReading data ...')
    fn = os.path.join(datasets_path, f'{dataset}_data_augmented.csv')
    split_df = pd.read_csv(fn)
    sample_dfs[dataset] = split_df

    # Data Augmentation (disabled: the CSVs read above are already augmented)
    # print('\tAugmenting data ...')
    # ds_1, ds_2, ds_3 = ds.copy(), ds.copy(), ds.copy()
    # ds_1.Comment = ds_1.Comment.str.lower()
    # ds_2.Comment = ds_2.Comment.str.upper()
    # ds_3.Comment = ds_3.Comment.apply(lambda x: capitalize_documents(x))
    # sample_dfs[dataset] = pd.concat([ds, ds_1, ds_2, ds_3], ignore_index=True)
    # sample_dfs[dataset].drop_duplicates(subset=['Comment'], inplace=True)

    print(f"{dataset}-set contains {len(split_df)} samples")
    print(split_df.sample(n=3))

In [None]:
# for dataset in ['training', 'valuating', 'testing']:
#     sample_dfs[dataset].to_csv(f'{dataset}_data_augmented.csv', index=False)

In [None]:
# Restore the fitted label encoder shipped with the dataset and expose its
# class names as `labels`.
filename = os.path.join(datasets_path, 'label_encoder.pkl')
# NOTE(security): unpickling executes code embedded in the file, so this must
# only ever be pointed at a trusted `label_encoder.pkl`.
# The `with` block guarantees the file handle is closed after loading.
with open(filename, 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)
labels = list(label_encoder.classes_)
labels

# **Sentences Embedding**

In [None]:
# Notebook command (not Python syntax): installs the `sentence-transformers`
# package that the next cell imports.
pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Build the sentence embedder from a model stored under the local input
# directory (a path is passed, not a model name).
# NOTE(review): the directory name suggests a distilUSE checkpoint - confirm.
model_version = '../input/sentence-transformers/distilUSE'
embedder = SentenceTransformer(model_version)

In [None]:
def tensor_to_nparray(tensor: torch.Tensor) -> np.ndarray:
    """Convert a torch tensor to a NumPy array.

    Works regardless of the device the tensor lives on and of whether it is
    attached to an autograd graph. An input that already is a NumPy array is
    returned unchanged, so the helper is safe to apply to encoder output
    whether it arrives as tensors or as arrays.

    Args:
        tensor: A `torch.Tensor` (any device), or an `np.ndarray`.

    Returns:
        The data as an `np.ndarray`. For a CPU tensor the array shares memory
        with the tensor; for a tensor on another device it is a host copy.
    """
    if isinstance(tensor, np.ndarray):
        return tensor
    # `.detach()` drops the autograd graph (`.numpy()` refuses tensors that
    # require grad) and `.cpu()` is a no-op for tensors already on the CPU,
    # so no `torch.cuda.is_available()` branch is needed.
    return tensor.detach().cpu().numpy()

In [None]:
# Embed every class name, average each name's token embeddings into a single
# vector, stack the vectors into one matrix with a leading singleton axis,
# and persist it to `labels_embeddings.npy`.
labels_vector = embedder.encode(
    labels,
    batch_size=16,
    convert_to_numpy=True,
    output_value='token_embeddings',
)
# Mean over axis 0 of each label's embedding array
# (presumably the token axis - TODO confirm against the encoder output).
labels_vector = [
    tensor_to_nparray(token_embs).mean(axis=0) for token_embs in labels_vector
]
labels_matrix = np.stack(labels_vector, axis=0)[np.newaxis, ...]
np.save('labels_embeddings.npy', labels_matrix)
labels_matrix.shape

In [None]:
from tensorflow.keras.utils import Sequence, to_categorical

# Export loop: for each selected dataset, embed the comments batch by batch
# and write one compressed `.npz` per sample, holding the token embeddings
# (`emb`) and the multi-hot label vector (`mtl`).

# Manual work-partitioning settings. Only the datasets and the batch window
# listed here are embedded by this run.
# NOTE(review): presumably the export is done in several runs; edit these
# values to process other datasets or other batch ranges.
datasets_to_embed = ('testing',)
first_batch, last_batch = 501, 1000  # inclusive window: 500 < b_idx <= 1000
batch_size = 128

# Width of the multi-hot vectors: one slot per class of the label encoder.
# The sample count must not be used here - the per-sample label column used
# to be bound to `labels`, which silently replaced the class list.
n_classes = len(label_encoder.classes_)

for dataset, sample_df in sample_dfs.items():
    print(f'\n\n\nProcessing {dataset} dataset')
    dir_path = f'/kaggle/working/{dataset}'
    if not os.path.isdir(dir_path):
        print(f'Creating {dir_path}')
        os.makedirs(dir_path)

    # The output directory is created for every dataset, but only the
    # selected ones are embedded.
    if dataset not in datasets_to_embed:
        continue

    texts = sample_df.Comment.values.tolist()
    # Kept under its own name so the global `labels` (class names) survives.
    sample_labels = sample_df.label_encoder.values.tolist()

    n_samples = len(sample_labels)
    n_batches = -(-n_samples // batch_size)  # ceiling division
    batch_indices = range(first_batch, min(last_batch + 1, n_batches))
    for b_idx in print_progress(batch_indices):

        # Get samples by batch (slicing clamps at the end of the list, so the
        # final, possibly shorter, batch needs no special case)
        start = b_idx * batch_size
        b_samples = texts[start:start + batch_size]
        b_labels = sample_labels[start:start + batch_size]

        # Apply sentence-BERT for word embeddings
        embeddings = embedder.encode(sentences=b_samples,
                                     batch_size=batch_size,
                                     output_value='token_embeddings',
                                     show_progress_bar=False)
        embeddings = [tensor_to_nparray(e) for e in embeddings]

        # Each label cell is a string holding a Python list of class indices;
        # one-hot encode every index and sum the rows into a multi-hot vector.
        labels_multihot = [
            np.sum(to_categorical(literal_eval(raw_label), num_classes=n_classes), axis=0)
            for raw_label in b_labels
        ]

        # Write one file per sample, numbered by its global position.
        for w_idx, (w_embs, mt_label) in enumerate(zip(embeddings, labels_multihot)):
            np.savez_compressed(f'{dir_path}/sample_{start + w_idx:07d}.npz',
                                emb=w_embs,
                                mtl=mt_label)

        # Free the batch before the next one is embedded.
        del b_samples, b_labels
        del embeddings, labels_multihot
        _ = gc.collect()

    del texts, sample_labels
    _ = gc.collect()

In [None]:
# os.chdir(r'/kaggle/working')
# dir_path = '/kaggle/working/'
# shutil.make_archive(dir_path+"data", 'zip', dir_path)