In [1]:
%%capture
import csv
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import pandas as pd

from scrips.tools import *

  from tqdm.autonotebook import tqdm, trange





# Importing data

In [14]:
# Load the hotel-review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('database/hotel_review_en_vi_with_distance.csv')

  df = pd.read_csv('database/hotel_review_en_vi_with_distance.csv')


# Preprocessing

We suggest preprocessing the full dataframe first, since re-using the saved result is cheaper than running the preprocessing again.

However, we still demonstrate both preprocessing modes: `type = "sample"` and `type = "population"`.

## Sample

In [15]:
# Draw a small random subset for the quick "sample" walkthrough.
# random_state pins the draw so that Restart & Run All shows the same 30 rows every time.
sample = df.sample(n = 30, random_state = 42)

We set `type = 'sample'` as the default setting.

In [16]:
def _add_processed_comment(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``processed_comment`` column added.

    Each value of ``comment`` is cast to ``str`` and passed through
    ``normalization``; the result is then passed to ``word_segmentation``
    together with the row's ``language`` value.
    """
    # Work on a copy so the caller's frame is never modified.
    processed = df.copy()
    normalized = processed['comment'].astype(str).apply(normalization)
    # Pair each text with its language directly instead of a row-wise
    # DataFrame.apply: same per-row calls, and it also works on an empty frame.
    processed['processed_comment'] = [
        word_segmentation(text, language)
        for text, language in zip(normalized, processed['language'])
    ]
    return processed


def preprocess(df: pd.DataFrame,type = 'sample') -> pd.DataFrame:
    """Add a ``processed_comment`` column built from ``comment`` and ``language``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``comment`` column and a ``language`` column.
    type : str, default 'sample'
        ``'sample'``: process ``df`` in memory and return the result.
        ``'population'``: if ``database/processed_hotel_reviews.csv`` exists,
        load and return it (``df`` is ignored in that case); otherwise process
        ``df``, write the result to that path and return it.

    Returns
    -------
    pd.DataFrame
        A new frame containing ``processed_comment``. The input frame is left
        untouched.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'sample'`` nor ``'population'``.
    """
    # Local import: the notebook's import cell does not import ``os`` explicitly.
    import os

    print("Preprocesing...")
    if type == 'sample':
        return _add_processed_comment(df)
    if type == 'population':
        file_path = 'database/processed_hotel_reviews.csv'
        if os.path.isfile(file_path):
            # Re-use the saved result instead of recomputing it.
            return pd.read_csv(file_path, index_col=0)
        processed = _add_processed_comment(df)
        # Make sure the target folder exists before writing the cache.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        processed.to_csv(file_path,index= True,encoding = 'utf-8-sig')
        return processed
    # Fail loudly instead of silently returning None.
    raise ValueError(f"Unknown type {type!r}; expected 'sample' or 'population'")

In [20]:
%%time
# Preprocess the 30-row sample in memory and time the cell.
processed_df = preprocess(sample,type = 'sample')
# Show raw and processed text side by side for a visual check.
processed_df[['comment','processed_comment']]

Preprocesing...
CPU times: total: 250 ms
Wall time: 268 ms


Unnamed: 0,comment,processed_comment
46633,Really nice hotel with great facilities and a ...,really nice hotel with great facilities and a ...
65037,Hotel is located at center position. Easy for ...,hotel is located at center position easy for t...
115320,Đáng giá và là ưu tiên hàng đầu,đáng_giá và là ưu_tiên hàng_đầu
26071,"Thật tuyệt, từ khách sạn có thể đi bộ để tận h...",thật tuyệt từ khách_sạn có_thể đi bộ để tận_hư...
114753,"Near the market, love it",near the market love it
13686,The staff at the reception was very friendly a...,the staff at the reception was very friendly a...
58930,"Balcony is close to the beach, and close to ev...",balcony is close to the beach and close to eve...
27074,"Very pleasant stay. Welcome drink on arrival, ...",very pleasant stay welcome drink on arrival fr...
22801,Front reception make me feel unreliable,front reception make me feel unreliable
64394,Khách sạn ok,khách_sạn ok


We also store the **index** so that it can be re-used later if needed.

In [21]:
def embedding(docs,encoder,type='sample'):
    """Encode ``docs`` with ``encoder``, optionally caching the result on disk.

    Parameters
    ----------
    docs : sequence of str
        Texts to encode.
    encoder : object
        Anything exposing ``encode(docs, show_progress_bar=...)``.
    type : str, default 'sample'
        ``'sample'``: encode in memory, no caching, no progress bar.
        ``'population'``: load ``database/corpus_embeddings.npy`` if it exists
        and has one row per document; otherwise encode (with a progress bar),
        save to that path and return the fresh embeddings.

    Returns
    -------
    Whatever ``encoder.encode`` returns, or the array loaded from the cache.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'sample'`` nor ``'population'``.
    """
    # Local import: the notebook's import cell does not import ``os`` explicitly.
    import os

    print("Embedding...")
    if type == 'sample':
        return encoder.encode(docs, show_progress_bar=False)
    if type == 'population':
        file_path = 'database/corpus_embeddings.npy'
        try:
            # Load pre-computed embeddings if they exist
            cached_embeddings = np.load(file_path)
        except FileNotFoundError:
            cached_embeddings = None
        if cached_embeddings is not None:
            # A cache built from a different corpus would silently misalign
            # rows and documents, so only trust it when the row count matches.
            # A single string has no meaningful row count; keep the cache then.
            if isinstance(docs, str) or cached_embeddings.shape[:1] == (len(docs),):
                return cached_embeddings
        # Compute embeddings when there is no usable cache
        corpus_embeddings = encoder.encode(docs, show_progress_bar=True)
        # Save embeddings for future use (create the folder on first run)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        np.save(file_path, corpus_embeddings)
        return corpus_embeddings
    # Fail loudly instead of silently returning None.
    raise ValueError(f"Unknown type {type!r}; expected 'sample' or 'population'")

We chose this encoder based on the [SBERT pretrained-model overview](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html), which reports the performance of the various SBERT models. Moreover, this model handles Vietnamese well, as I verified manually.

In [27]:
%%capture
# Load the multilingual sentence-transformers model used for every embedding below.
encoder = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

In [37]:
%%time
# Encode the sample's processed comments; 'sample' mode encodes in memory without caching.
docs = processed_df.processed_comment.values
corpus_embeddings = embedding(docs,encoder,type='sample')

Embedding...
CPU times: total: 16.6 s
Wall time: 6.05 s


In [40]:
# Display the embedding matrix together with its shape.
corpus_embeddings,corpus_embeddings.shape

(array([[-0.07522741,  0.04226091, -0.00299444, ..., -0.05836292,
         -0.01389816,  0.13570267],
        [-0.13403764, -0.3085379 , -0.00700287, ..., -0.10716237,
         -0.01595334, -0.00046053],
        [ 0.02331897,  0.06717801, -0.02198911, ...,  0.04537337,
         -0.00477611, -0.08457769],
        ...,
        [-0.08349197, -0.04923766, -0.00464056, ..., -0.00493112,
          0.09841105,  0.08930065],
        [-0.0814629 , -0.31173104, -0.00586778, ...,  0.02137453,
          0.02712864,  0.0228137 ],
        [ 0.01565422, -0.02806365, -0.01363612, ...,  0.04486922,
          0.15557139, -0.00773376]], dtype=float32),
 (30, 768))

## Population

Since we need to run the model again and again, re-using the corpus embeddings is necessary.

In [41]:
# Size of the full dataset that the population run processes.
df.shape

(124603, 30)

In [42]:
%%time
# Preprocess the full dataset; 'population' mode uses the CSV cache under database/ when it exists.
processed_df = preprocess(df,type = 'population')
# Show raw and processed text side by side for a visual check.
processed_df[['comment','processed_comment']]

Preprocesing...
CPU times: total: 14min 44s
Wall time: 14min 47s


Unnamed: 0,comment,processed_comment
0,A tourist class hotel with very basic amenitie...,a tourist class hotel with very basic amenitie...
1,The hotel rooms etc are good but very very sti...,the hotel rooms etc are good but very very sti...
2,Really enjoy my stay here. Will stay again,really enjoy my stay here will stay again
3,excellent,excellent
4,Great and decent place for sightseeing and tours,great and decent place for sightseeing and tours
...,...,...
124598,"Phòng ốc sạch sẽ, tiện nghi, nhân viên thân th...",phòng_ốc sạch_sẽ tiện_nghi nhân_viên thân_thiệ...
124599,"Sạch sẽ, tiện nghi sẽ quay lại, ko có j để chê",sạch_sẽ tiện_nghi sẽ quay lại ko có j để chê
124600,"Không gian sạch sẽ, thoải mái, giá cả phù hợp",không_gian sạch_sẽ thoải_mái giá_cả phù_hợp
124601,"Sạch sẽ , thơm , sẽ trải nghiệm tiêdp",sạch_sẽ thơm sẽ trải nghiệm tiêdp


In [43]:
%%time
# Encode the full corpus; 'population' mode uses database/corpus_embeddings.npy when it exists.
docs = processed_df.processed_comment.values
corpus_embeddings = embedding(docs,encoder,type='population')

Embedding...


Batches:   0%|          | 0/3894 [00:00<?, ?it/s]

CPU times: total: 10h 39min 36s
Wall time: 2h 52min 16s


In [44]:
# Check the embedding matrix shape against the number of rows in df.
corpus_embeddings.shape

(124603, 768)