In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install umap
!pip install bertopic
!pip install hdbscan

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
#from umap import umap_ as UMAP
#from bertopic import BERTopic
#from sentence_transformers import SentenceTransformer, util
#from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
#from bertopic.vectorizers import ClassTfidfTransformer
%matplotlib inline

In [None]:
# Import data (not cleaned)
df = pd.read_csv("/content/drive/MyDrive/Priceline/JintongYu/all_data.csv")
#df.head()
df_para = df.iloc[:,32:37]
df_para.head()

Unnamed: 0,about_title,about_content,30_attractions_char,trip_clean,paragraph
0,About Rome,"All roads lead to Rome, so they say. Or maybe ...","['Historic Sites • Ancient Ruins', 'Ancient Ru...",historic site ancient ruin ancient ruin archit...,\nCaput Mundi (Latin)The Capital of the world\...
1,"Perfect beaches, world-famous tapas, and Gaudí...",There’s nowhere in the world like Barcelona. F...,['Points of Interest & Landmarks • Architectur...,landmark architectural building landmark archi...,\nBarcelona (/ˌbɑːrsəˈloʊnə/ (listen) BAR-sə-L...
2,A spellbinding city where cultures collide,"Over the centuries, many cultures have added t...","['Historic Sites • Architectural Buildings', '...",historic site architectural building historic ...,"\nIstanbul (/ˌɪstænˈbʊl/ IST-an-BUUL,[7][8] US..."
3,About Milan,If you skip Milan in favor of Italy’s more pop...,"['Religious Sites • Churches & Cathedrals', 'P...",religious site church cathedral landmark archi...,"\nMilan (/mɪˈlæn/ mil-AN, US also /mɪˈlɑːn/ mi..."
4,"A wonderland of art and architecture, all fuel...",Florence makes art-lovers' hearts beat double ...,"['Points of Interest & Landmarks', 'Art Museum...",landmark art museum historic walking area hist...,\nFlorence (/ˈflɒrəns/ FLORR-ənss; Italian: Fi...


# BERTopic Modeling

BERTopic model on all text

In [None]:
# Fill NA values for enabling columns aggregation
df_para["30_attractions_char"] = df_para["30_attractions_char"].fillna("[]")
df_para["about_title"] = df_para["about_title"].fillna("[]")
df_para["about_content"] = df_para["about_content"].fillna("[]")
# Aggregate all raw text
df_raw = df_para[["about_title", "about_content", "30_attractions_char", "paragraph"]].agg(" ".join, axis=1)

In [None]:
docs = df_raw.values

In [None]:
# Initiate models
embedding_model = SentenceTransformer("all-miniLM-L6-v2")
vectorizer_model = CountVectorizer(stop_words="english")

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# BERTopic
topic_model = BERTopic(
    embedding_model=embedding_model, # Step 1 - Extract embeddings
    umap_model=UMAP.UMAP(n_neighbors=8, n_components=1, min_dist=0.0, metric='cosine'), # Step 2 - Reduce dimensionality
    hdbscan_model=HDBSCAN(min_cluster_size=4, metric='euclidean', cluster_selection_method='eom', prediction_data=True), # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model, # Step 4 - Tokenize topics
    #ctfidf_model = ctfidf_model, # Step 5 - Extract topic words
    n_gram_range=(1,3),
    calculate_probabilities=False,
    verbose=True
)

In [None]:
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

2023-04-30 16:49:02,279 - BERTopic - Transformed documents to Embeddings
2023-04-30 16:49:04,478 - BERTopic - Reduced dimensionality
2023-04-30 16:49:04,498 - BERTopic - Clustered reduced embeddings


In [None]:
topic_info = topic_model.get_topic_info()
print(topic_info)

    Topic  Count                                    Name
0      -1     15        -1_city_francisco_orleans_madrid
1       0     58                   0_city_beach_area_san
2       1     14        1_sydney_city_brisbane_melbourne
3       2     14                  2_city_york_london_new
4       3     13        3_singapore_city_tokyo_kathmandu
5       4     11      4_city_detroit_denver_philadelphia
6       5     10             5_india_city_mumbai_chennai
7       6      9        6_lagos_nairobi_charleston_accra
8       7      8  7_berlin_amsterdam_brussels_copenhagen
9       8      8   8_edmonton_toronto_calgary_pittsburgh
10      9      6           9_texas_houston_austin_dallas
11     10      5             10_rome_athens_milan_venice
12     11      5         11_paris_munich_montreal_quebec
13     12      4       12_atlanta_nashville_memphis_city


As the optimal number of clusters is greater than 15, the current amount of clusters generated by the BERTopic model is not sufficient. 

In [None]:
doc_clusters = {}
for doc_index, cluster_number in enumerate(topics):
    if cluster_number != -1:  # ignore unclustered documents
        if cluster_number not in doc_clusters:
            doc_clusters[cluster_number] = [doc_index]
        else:
            doc_clusters[cluster_number].append(doc_index)

In [None]:
topic_model.visualize_topics()

# Feature Engineering

Convert all features into numeric values

In [None]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import gensim
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

In [None]:
# Convert continuous variables
X_cont = df[["int_visitor_2019 (million)", "Dec-Feb Avg Temp (°F)", "Jun-Aug Avg Temp (°F)"]].values
scaler = StandardScaler()
X_cont = scaler.fit_transform(X_cont)

In [None]:
print(X_cont)

[[ 7.94712271e-01 -6.45396142e-02 -2.88033385e-01]
 [-2.23743610e-01 -9.99909516e-03 -1.83715459e-01]
 [ 2.01879743e-01 -2.82701691e-01 -4.96669238e-01]
 [-4.21354452e-01 -7.19025843e-01 -2.88033385e-01]
 [-1.24938188e-01 -4.46323248e-01  1.29238320e-01]
 [-3.52950699e-01 -5.00863767e-01  4.42192099e-01]
 [-4.36555286e-01  4.45414239e-02 -4.96669238e-01]
 [-6.64567797e-01 -8.28106881e-01 -1.83715459e-01]
 [-4.36555286e-01 -6.45396142e-02  5.46510025e-01]
 [-3.33151613e-03  9.90819430e-02 -2.88033385e-01]
 [-3.83352367e-01  1.35351388e+00  4.42192099e-01]
 [-7.70973635e-01  1.02627077e+00  5.46510025e-01]
 [-5.88563627e-01  1.40805440e+00  7.55145878e-01]
 [-3.90952784e-01  9.17189729e-01  6.50827952e-01]
 [-7.93774886e-01 -7.19025843e-01 -2.88033385e-01]
 [-8.62178639e-01  1.46259492e+00  5.46510025e-01]
 [-8.24176554e-01  1.24443284e+00  4.42192099e-01]
 [-7.32971550e-01  1.29897336e+00  6.50827952e-01]
 [-8.46977805e-01  1.35351388e+00  2.33556246e-01]
 [-8.08975720e-01  1.18989232e+

In [None]:
# Convert binary categorical variables
X_cat = df[["mountain","beach","island","lake","forest","river","desert","sea sports","old town","golf","lgbtq+"]].values
#X_cat = scaler.fit_transform(X_cat)

In [None]:
print(X_cat)

[[1 0 0 ... 1 1 1]
 [1 1 0 ... 1 1 1]
 [1 1 1 ... 1 0 0]
 ...
 [1 1 0 ... 1 1 1]
 [1 0 0 ... 1 1 1]
 [1 1 0 ... 1 1 1]]


In [None]:
# Convert text data
corpus = df_raw.tolist()
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(corpus)
X_text = X_text.toarray()
#dictionary = vectorizer.vocabulary_

In [None]:
print(X_text)

[[ 1 56  0 ...  0  0  0]
 [ 0 18  0 ...  0  0  0]
 [ 0 33  0 ...  0  0  0]
 ...
 [ 0 25  0 ...  0  0  0]
 [ 0  5  0 ...  0  0  0]
 [ 0  6  0 ...  0  0  0]]


In [None]:
# Standardize numeric version of text
X_text = scaler.fit_transform(X_text)

In [None]:
print(X_text)

[[ 1.06925997  3.36384864 -0.07474351 ... -0.07474351 -0.07474351
  -0.07474351]
 [-0.24900575  0.31067392 -0.07474351 ... -0.07474351 -0.07474351
  -0.07474351]
 [-0.24900575  1.51587446 -0.07474351 ... -0.07474351 -0.07474351
  -0.07474351]
 ...
 [-0.24900575  0.87310084 -0.07474351 ... -0.07474351 -0.07474351
  -0.07474351]
 [-0.24900575 -0.73383322 -0.07474351 ... -0.07474351 -0.07474351
  -0.07474351]
 [-0.24900575 -0.65348652 -0.07474351 ... -0.07474351 -0.07474351
  -0.07474351]]


In [None]:
X = np.hstack((X_cat, X_cont, X_text))

In [None]:
from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=15, affinity='euclidean', linkage='ward')
cluster.fit_predict(X)

array([ 0,  9,  1,  4,  4,  0,  9,  4,  4,  7,  7, 11, 11,  7,  7,  2,  6,
        7,  7,  7,  1,  9,  4,  2, 11,  8,  2,  8, 11, 11, 12,  1,  1,  1,
        4,  1,  4,  8, 11,  9,  6, 13,  2,  6,  0,  1,  2,  6,  1,  4,  3,
        9,  7,  5,  7,  4, 11, 14,  5,  4,  7,  4,  1,  1,  9,  9,  1,  4,
        1,  2,  4,  2,  9,  6,  8,  6,  8, 11,  4,  4,  5, 11,  4,  2,  9,
        4,  9,  7,  1,  6,  9, 11,  4, 11,  8,  6,  4,  7, 11,  3,  8,  7,
        2,  9, 11,  2,  7,  3,  2,  2,  3,  4,  2,  8,  7, 11,  7,  8,  7,
        7,  7, 11,  7, 11,  7,  1,  2,  9,  9,  8,  1,  4,  8,  2,  8,  4,
        2,  2,  2,  6,  8,  8,  6,  2, 11,  3,  9,  2,  1,  2,  8,  8, 11,
        2, 11, 10,  9,  1,  3,  3,  9,  1,  9,  9,  6,  6,  2,  4,  1,  2,
       11,  4,  2,  2,  2,  8,  6,  8,  6,  8])

In [None]:
# Get the cluster labels
labels = cluster.labels_
# Count the number of occurrences of each label
unique, counts = np.unique(labels, return_counts=True)

# Print the count of members within each cluster
for label, count in zip(unique, counts):
    print(f"Cluster {label}: {count} members")

Cluster 0: 3 members
Cluster 1: 19 members
Cluster 2: 27 members
Cluster 3: 7 members
Cluster 4: 24 members
Cluster 5: 3 members
Cluster 6: 14 members
Cluster 7: 21 members
Cluster 8: 19 members
Cluster 9: 19 members
Cluster 10: 1 members
Cluster 11: 20 members
Cluster 12: 1 members
Cluster 13: 1 members
Cluster 14: 1 members


# Ensemble Model

Combine clustering results of numeric features and text features

In [None]:
X_nontext = np.hstack((X_cat, X_cont))

In [None]:
cluster2 = AgglomerativeClustering(n_clusters=15, affinity='euclidean', linkage='ward')
cluster2.fit_predict(X_cat)

array([ 2,  1,  3,  2,  2,  5,  3,  2,  3,  1, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11,  7, 12, 10,  1,  3,  1, 10,  5, 10,  2,  5, 13,  0,  0,
        0,  6,  2,  1, 13,  2,  5,  9,  9, 10,  0,  0, 10, 10,  0,  0,  6,
        6,  6,  6,  6,  8,  6,  6,  8,  6,  6,  6,  0,  0,  6,  9,  4,  4,
        0,  8,  0,  6,  6,  6,  6,  6,  4,  6,  0,  6,  7,  4,  7,  9,  1,
        0,  0,  7,  4,  9,  0,  9,  4, 10,  4,  6,  8,  4,  2,  2,  2,  0,
        0,  0,  9,  7, 14,  9,  2,  0,  0,  2, 12,  9, 12,  7,  7, 12, 12,
        7, 10,  1,  7,  1,  1,  0,  8,  8, 14,  8,  4,  2, 13,  0,  0,  3,
        2,  2, 13,  1,  5,  7, 14,  5,  0, 14,  9, 14,  1,  3,  1,  1, 14,
        6,  1,  8,  6,  4,  4,  9,  9,  8,  9,  8,  9,  0,  0,  9,  9,  8,
        0,  2, 10, 12,  0, 13, 14,  1,  4,  1])

In [None]:
# get the cluster labels
labels = cluster2.labels_
# count the number of occurrences of each label
unique, counts = np.unique(labels, return_counts=True)

# print the count of members within each cluster
for label, count in zip(unique, counts):
    print(f"Cluster {label}: {count} members")

Cluster 0: 28 members
Cluster 1: 16 members
Cluster 2: 16 members
Cluster 3: 6 members
Cluster 4: 12 members
Cluster 5: 6 members
Cluster 6: 22 members
Cluster 7: 10 members
Cluster 8: 11 members
Cluster 9: 16 members
Cluster 10: 9 members
Cluster 11: 10 members
Cluster 12: 6 members
Cluster 13: 5 members
Cluster 14: 7 members


In [None]:
from sklearn.metrics import silhouette_score
# Combine cluster assignments
clusters_combined = []
for i in range(len(topics)):
    if topics[i] == -1:  # handle unassigned documents
        clusters_combined.append(labels[i])
    else:
        clusters_combined.append(topics[i])

# Evaluate performance
score = silhouette_score(X_nontext, clusters_combined)
print(f"Silhouette score: {score}")

Silhouette score: -0.13858990974223914


In [None]:
cluster_labels = cluster2.labels_
for i in range(len(np.unique(cluster_labels))):
    print(f"\nCluster {i}:")
    members = np.where(cluster_labels == i)[0]
    print(members)


Cluster 0:
[ 32  33  34  44  45  48  49  62  63  68  70  78  85  86  90 101 102 103
 109 110 125 133 134 144 165 166 170 174]

Cluster 1:
[  1   9  23  25  37  84 121 123 124 139 148 150 151 154 177 179]

Cluster 2:
[  0   3   4   7  29  36  39  98  99 100 108 111 131 136 137 171]

Cluster 3:
[  2   6   8  24 135 149]

Cluster 4:
[ 66  67  76  81  88  92  94  97 130 157 158 178]

Cluster 5:
[  5  27  30  40 140 143]

Cluster 6:
[ 35  50  51  52  53  54  56  57  59  60  61  64  71  72  73  74  75  77
  79  95 153 156]

Cluster 7:
[ 20  80  82  87 105 115 116 119 122 141]

Cluster 8:
[ 55  58  69  96 126 127 129 155 161 163 169]

Cluster 9:
[ 41  42  65  83  89  91 104 107 113 146 159 160 162 164 167 168]

Cluster 10:
[ 22  26  28  43  46  47  93 120 172]

Cluster 11:
[10 11 12 13 14 15 16 17 18 19]

Cluster 12:
[ 21 112 114 117 118 173]

Cluster 13:
[ 31  38 132 138 175]

Cluster 14:
[106 128 142 145 147 152 176]


In [None]:
for key, value in doc_clusters.items():
    print(key, ": ", value)

10 :  [0, 3, 4, 7, 8]
5 :  [2, 165, 166, 167, 168, 169, 170, 172, 173, 174]
7 :  [6, 31, 32, 33, 34, 35, 38, 39]
0 :  [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 28, 40, 42, 43, 46, 47, 51, 52, 53, 54, 59, 60, 65, 66, 71, 73, 74, 76, 80, 82, 83, 87, 89, 91, 92, 93, 97, 100, 101, 104, 111, 113, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 150, 152, 154, 162, 164]
1 :  [20, 24, 56, 64, 79, 106, 114, 145, 146, 147, 148, 149, 151, 153]
3 :  [21, 22, 23, 25, 26, 27, 29, 171, 175, 176, 177, 178, 179]
11 :  [30, 36, 37, 127, 129]
2 :  [41, 44, 61, 72, 107, 109, 128, 155, 156, 157, 158, 159, 161, 163]
4 :  [45, 67, 68, 70, 78, 85, 86, 90, 103, 110, 112]
9 :  [48, 58, 62, 63, 102, 105]
12 :  [49, 55, 81, 96]
6 :  [84, 136, 137, 138, 139, 140, 141, 143, 144]
8 :  [88, 125, 130, 131, 132, 133, 134, 160]


In [None]:
from sklearn.metrics.cluster import adjusted_rand_score

# assume clusters1 and clusters2 are the two sets of cluster labels
ari = adjusted_rand_score(cluster_labels, topics)
print(f"Adjusted Rand Index: {ari}")

Adjusted Rand Index: 0.06755166964078407


**Conclusion**  
Not much similarity between these 2 clustering results.