In [1]:
%%capture
import csv
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import pandas as pd

from scripts.dataframe_selection import *
from scripts.tools import *

# Importing data

In [2]:
df = pd.read_csv('database/hotel_review_en_vi_with_distance.csv')

  df = pd.read_csv('database/hotel_review_en_vi_with_distance.csv')


In [3]:
# df

# Preprocessing

We suggest on doing on big dataframe first since it is better to re-use than training again. 

However, we still simulate two type of preprocessing on `type = "sample"` and `type = "population"`

## Sample

In [4]:
sample = df.sample(n = 30)

We set `type = 'sample'` as default setting

In [5]:
def preprocess(df: pd.DataFrame,type = 'sample'):
    print("Preprocesing...")
    if type == 'sample':
        df['processed_comment'] = df['comment'].astype(str).apply(normalization)
        df['processed_comment'] = df.apply(lambda row: word_segmentation(row['processed_comment'], row['language']), axis=1)
        return df
    elif type == 'population':
        file_path = 'database/processed_hotel_reviews.csv'
        if os.path.isfile(file_path):
            df = pd.read_csv(file_path, index_col=0)
        else:    
            df['processed_comment'] = df['comment'].astype(str).apply(normalization)
            df['processed_comment'] = df.apply(lambda row: word_segmentation(row['processed_comment'], row['language']), axis=1)
            df.to_csv(file_path,index= True,encoding = 'utf-8-sig')
        return df

In [6]:
%%time
processed_df = preprocess(sample,type = 'sample')
processed_df[['comment','processed_comment']]

Preprocesing...
CPU times: total: 109 ms
Wall time: 279 ms


Unnamed: 0,comment,processed_comment
95321,"Wonderful, professional staff that speak Engli...",wonderful professional staff that speak englis...
22495,We really enjoyed beautiful and calm atmospher...,we really enjoyed beautiful and calm atmospher...
52143,The pictures look a million times better than ...,the pictures look a million times better than ...
103876,Pool was nice. Close to beach. Staff were a bi...,pool was nice close to beach staff were a bit ...
57965,"Excellent location, spacious rooms with nice v...",excellent location spacious rooms with nice vi...
80281,May come from door opened to fresh air,may come from door opened to fresh air
50869,"Staff were prompt, location was good. Room was...",staff were prompt location was good room was c...
79987,"Very nice hotel. Friendly staff, very accommod...",very nice hotel friendly staff very accommodat...
109911,Very convenient location and clean and friendly,very convenient location and clean and friendly
86700,Do covid 19 nên Little Beach Hội An chưa mở cử...,do covid 19 nên little beach hội an chưa mở_cử...


We storing the **index** also too use that again if needed

In [7]:
def embedding(docs,encoder,type='sample'):
    print("Embedding...")
    if type == 'sample':
        corpus_embeddings = encoder.encode(docs, show_progress_bar=False)
        return corpus_embeddings
    elif type == 'population':
        file_path = f'database/corpus_embeddings.npy'
        try:
            # Load pre-computed embeddings if they exist
            corpus_embeddings = np.load(file_path)
        except FileNotFoundError:
            # Compute embeddings if they do not exist
            corpus_embeddings = encoder.encode(docs, show_progress_bar=True)
            # Save embeddings for future use
            np.save(file_path, corpus_embeddings)
        return corpus_embeddings

We use this encoder since [SBERT](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html) shows the performance of various SBERT model. Moreover, this model are handle good Vietnamese because I was manualy check it by hand. 

In [8]:
%%capture
encoder = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

In [9]:
%%time
docs = processed_df.processed_comment.values
corpus_embeddings = embedding(docs,encoder,type='sample')

Embedding...
CPU times: total: 4.23 s
Wall time: 2.88 s


In [10]:
corpus_embeddings,corpus_embeddings.shape

(array([[-0.11418362,  0.07794645, -0.00995048, ...,  0.19378589,
          0.19440794, -0.05447968],
        [ 0.13635494, -0.07272205, -0.0089131 , ..., -0.02221631,
         -0.00774254,  0.0802676 ],
        [ 0.0761268 ,  0.24591573, -0.00789728, ..., -0.0491119 ,
         -0.08772048,  0.09289229],
        ...,
        [ 0.01215874,  0.07802645, -0.02126045, ...,  0.08571438,
          0.01309518, -0.01906305],
        [-0.0317916 , -0.02985834, -0.00224115, ...,  0.00880288,
         -0.00788458,  0.11723903],
        [ 0.0534906 , -0.15266998, -0.00647984, ..., -0.0505357 ,
         -0.1616588 ,  0.01065221]], dtype=float32),
 (30, 768))

# Population

Since we need to extract model again and again, the re-use of corpus embedding are needed.

In [11]:
df.shape

(124603, 30)

In [12]:
%%time
processed_df = preprocess(df,type = 'population')
processed_df[['comment','processed_comment']]

Preprocesing...
CPU times: total: 1.11 s
Wall time: 1.79 s


  df = pd.read_csv(file_path, index_col=0)


Unnamed: 0,comment,processed_comment
0,A tourist class hotel with very basic amenitie...,a tourist class hotel with very basic amenitie...
1,The hotel rooms etc are good but very very sti...,the hotel rooms etc are good but very very sti...
2,Really enjoy my stay here. Will stay again,really enjoy my stay here will stay again
3,excellent,excellent
4,Great and decent place for sightseeing and tours,great and decent place for sightseeing and tours
...,...,...
124598,"Phòng ốc sạch sẽ, tiện nghi, nhân viên thân th...",phòng_ốc sạch_sẽ tiện_nghi nhân_viên thân_thiệ...
124599,"Sạch sẽ, tiện nghi sẽ quay lại, ko có j để chê",sạch_sẽ tiện_nghi sẽ quay lại ko có j để chê
124600,"Không gian sạch sẽ, thoải mái, giá cả phù hợp",không_gian sạch_sẽ thoải_mái giá_cả phù_hợp
124601,"Sạch sẽ , thơm , sẽ trải nghiệm tiêdp",sạch_sẽ thơm sẽ trải nghiệm tiêdp


In [13]:
%%time
docs = processed_df.processed_comment.values
corpus_embeddings = embedding(docs,encoder,type='population')

Embedding...
CPU times: total: 78.1 ms
Wall time: 451 ms


In [14]:
corpus_embeddings.shape

(124603, 768)

# Process

In [43]:
sample = df.sample(n = 2000)
sample.head(5)

Unnamed: 0,hotel_id,hotel_name,addressline1,city,state,numberrooms,yearopened,yearrenovated,number_of_reviews,rating_average,...,accommodationtype2,accommodationtype3,accommodationtype4,language,reviewer_continent,longitude,latitude,hotel_coordinate,dis2coast,nearest_coordinate
115379,9412242,Camy Hotel,236 Phan Chu Trinh,Vung Tau,Ba Ria Vung Tau,15.0,2018.0,,253,9.0,...,,,,vi,Asia,107.08362,10.330615,POINT (107.08362 10.330615),0.383,POINT (11920843.302334243 1148465.2649964693)
117333,10889436,BLESSED Hotel,"46-48 Hung Gia 1 st, Tan Phong ward, District ...",Ho Chi Minh City,Ho Chi Minh,18.0,2020.0,2020.0,612,9.0,...,,,,en,Asia,106.708514,10.733383,POINT (106.708514 10.733383),40.738,POINT (11907922.454514029 1165509.8719933222)
94627,3849840,MAY BUNGALOW,246/2 nguyen dinh chieu,Phan Thiet,Binh Thuan,12.0,2018.0,2021.0,817,9.3,...,,,,en,Europe,108.247068,10.953706,POINT (108.247068 10.953706),0.094,POINT (12049988.609741127 1218651.862620137)
82335,1624612,Muong Thanh Luxury Phu Quoc Hotel,"Group 3, Duong Bao Hamlet, Duong To Commune, P...",Phu Quoc Island,Kien Giang,276.0,2016.0,2016.0,1050,8.0,...,,,,en,Asia,103.979855,10.132683,POINT (103.979855 10.132683),0.454,POINT (11574539.250734488 1126288.9418734254)
18475,281592,Villa Pink House,"67/6 (Old Number 7/8) Hai Thuong Street, Ward ...",Dalat,Lam Dong,21.0,2001.0,2019.0,641,7.7,...,,,,en,Europe,108.430097,11.94406,POINT (108.430097 11.94406),73.526,POINT (12134860.5102322 1295168.6063270567)


In [16]:
selection = Selection(sample)

In [17]:
selection.language.value_counts()

language
en    38
vi    12
Name: count, dtype: int64

In [44]:
filtered_hotel = selection.filtering(
    extracting = 'review',
#     province=['Da Nang'],
#     star_rating=[3],
#     region=['Central Vietnam'],
#     dis2coast=[0, 0.5],
#     seaside=0
)
filtered_hotel

Unnamed: 0,hotel_id,star_rating,coordinate,province,region,dis2coast,seaside,comment,sentiment,national,language
26680,294583,4.0,POINT (106.697094 10.773663),TP. Ho Chi Minh,unknown,44.875,0,"Hotel is good, considering the price and locat...",1,KR,en
57281,961865,3.0,POINT (105.853913 21.032068),Ha Noi,North Vietnam,97.641,0,only one comment is breakfast quality.,1,TH,en
64161,1195442,3.0,POINT (108.434546 11.941766),Lam Dong,Central Vietnam,72.967,0,"sạch sẽ, nhân viên lễ phép, phòng đồ đạc ok, c...",1,VN,vi
68969,1206496,3.0,POINT (108.329178 15.886979),Quang Nam,Central Vietnam,3.346,0,"Toilet gets very wet when shower is done, as t...",1,MY,en
107143,6494378,3.0,POINT (109.200022 12.226268),Khanh Hoa,Central Vietnam,0.041,1,Difficult to find the place. No view as showed...,0,VN,en
51552,861998,3.0,POINT (106.692631 10.768471),TP. Ho Chi Minh,unknown,44.789,0,The location was great. Check out the egg coff...,1,IN,en
11180,186800,2.0,POINT (108.326616 15.880324),Quang Nam,Central Vietnam,4.139,0,"This hotel need to be deep clean, maintain bec...",-1,FR,en
110729,6998893,3.0,POINT (105.847697 21.032573),Ha Noi,North Vietnam,98.243,0,A massive shout out to the staff at this hotel...,1,NZ,en
27450,294583,4.0,POINT (106.697094 10.773663),TP. Ho Chi Minh,unknown,44.875,0,too poor,-1,US,en
35975,400379,2.0,POINT (108.231065 16.066114),Da Nang,Central Vietnam,0.34,1,My wife and I strongly suspect that the hotel ...,1,US,en


In [19]:
filtered_hotel.shape

(8, 11)

In [20]:
filtered_hotel = selection.filtering(
    extracting = 'hotel',
#     province=['Da Nang'],
#     star_rating=[3],
    region=['Central Vietnam'],
#     dis2coast=[0, 0.5],
    seaside=0
)
filtered_hotel.shape

(7, 7)

In [21]:
def extract_embedding(df: pd.DataFrame,corpus_embedding):
    return corpus_embedding[df.index]

In [22]:
extracted_embedding = extract_embedding(filtered_hotel,corpus_embeddings)

In [23]:
len(extracted_embedding)

7

In [24]:
sample.index

Index([ 26680,  57281,  64161,  68969, 107143,  51552,  11180, 110729,  27450,
        35975, 104535,  10109,  80840,  21682,  87350,  18343,  34544,  52086,
        84170, 114159,  62812,  55890, 104285, 118611,   8350,  26257,  99366,
         6941,  82818,   3612,  99334,  35173, 106544,  71483,  12636,  93174,
        18167, 120404,  91846,   5565, 104808,  80241,  97863,  41611,  82698,
       104489,  20827,  13482, 111895,  88181],
      dtype='int64')

In [25]:
len(processed_df[processed_df.index.isin(sample.index)].processed_comment.values)

50

# Topic Modeling run

In [41]:
%%capture
from stopwordsiso import stopwords
import pandas as pd
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from stopwordsiso import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic


In [38]:
encoder = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
stop_word_mul = stopwords(['vi','en'])
umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model=HDBSCAN(metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model=CountVectorizer(stop_words=list(stop_word_mul), min_df=2, ngram_range=(1,2))
keybert_model=KeyBERTInspired()
pos_model=PartOfSpeech('en_core_web_sm')
mmr_model=MaximalMarginalRelevance(diversity=0.3)
representation_model={
    'KeyBERT':keybert_model,
    'MMR':mmr_model,
    'POS':pos_model
}

In [57]:
def run(df: pd.DataFrame,top_n_words=100, **kwargs):
    """
    Topic Modeling with BERTopic process
    """
    ### PROCESSED DF
    file_path = 'database/processed_hotel_reviews.csv'
    if os.path.isfile(file_path):
        processed_df = pd.read_csv(file_path, index_col=0)
    else:    
        processed_df = preprocess(df,type = 'sample')
    docs = processed_df[processed_df.index.isin(df.index)].processed_comment.values.astype('str')
    
    ### EMBEDDINGS
    encoder = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

    file_path = 'database/corpus_embeddings.npy'
    if os.path.isfile(file_path):
        corpus_embeddings = np.load(file_path)
        extracted_embedding = extract_embedding(df, corpus_embeddings)
    else:
        corpus_embeddings = embedding(docs,encoder, type='sample')
        df = df.reset_index()
        extracted_embedding = extract_embedding(df, corpus_embeddings)
    
    ### Set default values for parameters
    umap_defaults = {
        'n_neighbors': 15,
        'n_components': 5,
        'min_dist': 0.0,
        'metric': 'cosine',
        'random_state': 42
    }
    
    hdbscan_defaults = {
        'metric': 'euclidean',
        'min_cluster_size': 5,
        'min_samples': 1,
        'prediction_data': True
    }
    
    ### Override defaults with any provided kwargs
    umap_params = {**umap_defaults, **kwargs.get('umap_params', {})}
    hdbscan_params = {**hdbscan_defaults, **kwargs.get('hdbscan_params', {})}
    
    ### DIMENSIONALITY REDUCTION:
    umap_model = UMAP(**umap_params)
    
    ### CLUSTERING:
    hdbscan_model = HDBSCAN(**hdbscan_params)

    ### VECTORIZERS:
    stop_word_mul = stopwords(['vi','en'])
    vectorizer_model = CountVectorizer(stop_words=list(stop_word_mul), min_df=2, ngram_range=(1, 2))

    ### c-TF-IDF:
    ctfidf_model = ClassTfidfTransformer()
    
    ### REPRESENTATION:
    keybert_model = KeyBERTInspired()
    pos_model = PartOfSpeech('en_core_web_sm')
    mmr_model = MaximalMarginalRelevance(diversity=0.3)
    representation_model = {
        'KeyBERT': keybert_model,
        'MMR': mmr_model,
        'POS': pos_model
    }
    
    topic_model = BERTopic(
        embedding_model=encoder,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        # hyperparameters
        top_n_words=top_n_words,
        verbose=True
    )
    topics, probs = topic_model.fit_transform(docs, extracted_embedding)
    return topic_model

    


In [58]:
topic_model = run(filtered_hotel,top_n_words = 20)


  processed_df = pd.read_csv(file_path, index_col=0)


In [59]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,8,-1_night_stayed_window_nt,"[night, stayed, window, nt, staff, dinner, ama...","[hotel, hotels, visit, trip, nice, staff, reco...","[night, window, dinner, vietnam, trip, recomme...","[night, window, staff, close, trip, amazing, b...",[great locationnear to night market but avoid ...
1,0,11,0_bus_ks_service_breakfast,"[bus, ks, service, breakfast, price, hotel, ma...","[hotel hotel, hotel, hotel clean, restaurants,...","[bus, service, breakfast, price, hotel, clean,...","[bus, service, breakfast, price, hotel, market...",[tôi đã lưu_trú ở ks 2 đêm phòng ks tuy không ...
2,1,11,1_location_view_đồ_ko,"[location, view, đồ, ko, breakfast, clean, cus...","[hotel, friendly staff, restaurants, staff, ni...","[location, view, clean, customer, booked, frie...","[location, view, breakfast, clean, river, mone...",[they give us welcome drink and snack when we ...
3,2,8,2_phòng_mùi_khá_ăn,"[phòng, mùi, khá, ăn, bus, hotel, stay, giá_cả...","[hotel hotel, hotel, phòng, visit, trip, giúp ...","[phòng, bus, hotel, stay, giúp thuê, located, ...","[phòng, ăn, bus, hotel, polite, trip, morning,...",[phục_vụ thân_thiện giá_cả phải_chăng gần thun...
4,3,7,3_hotel_buffet_sáng_ăn,"[hotel, buffet, sáng, ăn, khá, view, staff, tầ...","[hotel, hotel hotel, hotels, phòng, visit, sta...","[hotel, buffet, sáng, view, staff, toilet, hot...","[hotel, khá, view, staff, day, time, hotels, t...",[henry greeted us and prepared our room he is ...
5,4,5,4_friendly staff_plans_wet_giá_cả,"[friendly staff, plans, wet, giá_cả, customer,...","[friendly staff, nice, hotel clean, hotel, đẹp...","[friendly staff, plans, wet, customer, hotel c...","[friendly staff, plans, wet, customer, staff, ...","[phòng mới đẹp giá_cả hợp_lý, great view great..."
