# I. Library Setup

In [62]:
import pandas as pd
import numpy as np
import warnings
# Embedding data
from sentence_transformers import SentenceTransformer

# Data clustering
from sklearn.cluster import KMeans

# Cluster evaluation
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Topic modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Data visualization
import plotly.graph_objects as go

# Reduce data dimensionality
import umap

# II. Data Preparation

## 2.1 Load "post_text"

In [63]:
# load post 
df_PostText = pd.read_csv("../data/df_PostText_groupby.csv", encoding='ISO-8859-1')
df_PostText.head()

Unnamed: 0,User_name,post_text
0,10Ronaldinho,feliz aniversÃ¡rio pra vc minha mÃ£e sua falta...
1,143redangel,rate phenomen tatak angel locsin angel neil â...
2,50cent,sweden gon germani tonightglggreenlightgang wo...
3,5SOS,merch grab love amazon music silver line gol...
4,ABdeVilliers17,flipkart â big billion sale kick wrogn activ...


In [64]:
list_post_per_user = list(df_PostText["post_text"])

## 2.2 Embedding "post_text" using three models: e5-base, e5-small, e5-large

In [65]:
# def get_e5_embeddings(list_post_per_user, model_name):
#     if model_name == 'e5-base':
#         model = SentenceTransformer('intfloat/e5-base-v2')
#     elif model_name == 'e5-small':
#         model = SentenceTransformer('intfloat/e5-small-v2')
#     elif model_name == 'e5-large':
#         model = SentenceTransformer('intfloat/e5-large-v2')
#     else:
#         raise ValueError("model_name phải là 'e5-base', 'e5-small' hoặc 'e5-large'")
    
#     embeddings = model.encode(list_post_per_user)
#     embedding_df = pd.DataFrame(embeddings)
#     return embedding_df

### Embedding "post_text" using e5-base model

In [66]:
# e5_base_embeddings = get_e5_embeddings(list_post_per_user, 'e5-base')
# embedding_df.to_csv("../data/e5-base_embbeding.csv", index=False)

### Embedding "post_text" using e5-small model

In [67]:
# e5_small_embeddings = get_e5_embeddings(list_post_per_user, 'e5-small')
# embedding_df.to_csv("../data/e5-small_embbeding.csv", index=False)

### Embedding "post_text" using e5-large model

In [68]:
# e5_large_embeddings = get_e5_embeddings(list_post_per_user, 'e5-large')
# embedding_df.to_csv("../data/e5-large_embbeding.csv", index=False)

# III. Clustering

In [69]:
def clustering(clustering_df , num_cluters):
    kmeans = KMeans(n_clusters=num_cluters, init="k-means++", max_iter=300, n_init=10, random_state=42)
    clustering_df["cluster"] = kmeans.fit_predict(clustering_df.values)

    silhouette = silhouette_score(clustering_df.iloc[:, :-1], clustering_df["cluster"])
    dbi  = davies_bouldin_score(clustering_df.iloc[:, :-1], clustering_df["cluster"])
    ch_index = calinski_harabasz_score(clustering_df.iloc[:, :-1], clustering_df["cluster"])

    eval_metrics_df = pd.DataFrame({
        f"Davies-Bouldin Index": [dbi],
        f"Silhouette Score": [silhouette],
        f"Calinski-Harabaz Index": [ch_index]
    })

    return clustering_df, eval_metrics_df

## 3.1 Clustering with E5-base model results

In [70]:
df_e5_base = pd.read_csv("../data/e5-base_embbeding.csv", encoding='ISO-8859-1')
df_e5_base_clustering, e5_base_metrics = clustering(df_e5_base, num_cluters=5)
e5_base_metrics

Unnamed: 0,Davies-Bouldin Index,Silhouette Score,Calinski-Harabaz Index
0,4.569991,0.023231,7.45261


## 3.2 Clustering with E5-small model results

In [71]:
df_e5_small = pd.read_csv("../data/e5-small_embbeding.csv", encoding='ISO-8859-1')
df_e5_small_clustering, e5_small_metrics = clustering(df_e5_small, num_cluters=5)
e5_small_metrics

Unnamed: 0,Davies-Bouldin Index,Silhouette Score,Calinski-Harabaz Index
0,5.161076,0.009018,6.368805


## 3.3 Clustering with E5-large model results

In [72]:
df_e5_large = pd.read_csv("../data/e5-large_embbeding.csv", encoding='ISO-8859-1')
df_e5_large_clustering, e5_large_metrics = clustering(df_e5_large, num_cluters=5)
e5_large_metrics

Unnamed: 0,Davies-Bouldin Index,Silhouette Score,Calinski-Harabaz Index
0,5.069198,0.020068,6.490568


# VI. Model Evaluation

In [73]:
df_eval_metrics = pd.concat([e5_base_metrics, e5_small_metrics, e5_large_metrics], axis=0)
df_eval_metrics.index = ["e5-base", "e5-small", "e5-large"]

df_eval_metrics

Unnamed: 0,Davies-Bouldin Index,Silhouette Score,Calinski-Harabaz Index
e5-base,4.569991,0.023231,7.45261
e5-small,5.161076,0.009018,6.368805
e5-large,5.069198,0.020068,6.490568


# V. Topic Modeling

In [74]:
for  i in range(5) :
    list_post_sr = pd.Series(list_post_per_user)
    list_post_sr = list(list_post_sr[df_e5_base_clustering["cluster"] == i])

    vectorizer = TfidfVectorizer()
    embedding_vectors = vectorizer.fit_transform(list_post_sr)

    lda_model = LatentDirichletAllocation(n_components=2, random_state=42)
    lda_model.fit(embedding_vectors)

    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda_model.components_):
        print("Topic ",i,":")
        top_words_idx = topic.argsort()[-50:][::-1]
        top_words = [feature_names[i] for i in top_words_idx]
        print(top_words)

Topic  0 :
['thank', 'kardashian', 'love', 'jenner', 'kyli', 'kim', 'night', 'video', 'new', 'show', 'life', 'kardashianjennernewsupd', 'fifth', 'world', 'friend', 'girl', 'peopl', 'famili', 'birthday', 'everyon', 'music', 'way', 'kendal', 'fan', 'harmoni', 'home', 'collect', 'photo', 'song', 'normani', 'link', 'victoriajustic', 'stori', 'share', 'babi', 'thing', 'beauti', 'kourtney', 'live', 'work', 'look', 'women', 'gaga', 'ebru', 'kany', 'jauregui', 'kati', 'morn', 'snooki', 'season']
Topic  0 :
['victoriajustic', 'ebru', 'karla', 'nina', 'codi', 'serena', 'austin', 'snooki', 'serenawilliam', 'origin', 'kardashianjennernewsupd', 'à¹', 'william', 'kardashiansist', 'thesnookishop', 'keyssoulcar', 'gaga', 'kendal', 'kyli', 'keyssoulcarecom', 'forev', 'alway', 'victoria', 'dobrev', 'jenner', '²ð', 'luci', 'hale', 'britney', '¼ð', 'ig', 'gyrlvers', 'vic', 'simpson', 'instagram', 'dona', 'venus', 'codeï¼', 'que', 'ari', 'new', 'code', 'mav', 'kati', 'cleanskincar', 'miley', 'vampir', 'mei

In [75]:
labels = [
            '1: Pop Culture Icons: Kardashian, Music & Fashion',
            '2: NBA & WWE',
            '3: Music: Pop-Rock & EDM',
            '4: Bollywood & Sports',
            '5: Hollywood & Entertainment Icons'
        ]

# VI. Results Visualization


In [76]:
def compute_umap(clustering_df):
    Umap = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=3, random_state=42)
    umap_result = Umap.fit_transform(clustering_df.iloc[:, :-1])
    df_umap = pd.DataFrame(data=umap_result, columns=['first_dim', 'second_dim', 'third_dim'])
    return df_umap

df_umap = compute_umap(df_e5_base_clustering)
df_umap['cluster'] = df_e5_base_clustering['cluster']


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [77]:
def create_3d_scatter_plot(df_umap, custom_labels):
    # Centers of each cluster
    centers = df_umap.groupby('cluster')[['first_dim', 'second_dim', 'third_dim']].mean().values

    # Custom colors for each cluster
    custom_colors = ['#4535C1', '#FF8225', '#ED3EF7', '#00712D', '#F5004F', '#FFEB55', '#FF8C9E', '#836FFF']

    # Create 3D scatter plot figure
    fig = go.Figure()

    # Loop through each cluster and add scatter points
    for cluster_id, center in enumerate(centers):
        cluster_points = df_umap[df_umap['cluster'] == cluster_id]

        fig.add_trace(go.Scatter3d(
            x=cluster_points['first_dim'],
            y=cluster_points['second_dim'],
            z=cluster_points['third_dim'],
            mode='markers',
            marker=dict(
                size=8,
                color=custom_colors[cluster_id],
                opacity=0.7
            ),
            name=custom_labels[cluster_id]
        ))

        # Add cluster center text
        fig.add_trace(go.Scatter3d(
            x=[center[0]],
            y=[center[1]],
            z=[center[2]],
            mode='text',
            textfont=dict(size=15, color='black', family='Arial'),
            showlegend=False
        ))

    # Customize axis labels and title
    fig.update_layout(
        autosize=True,
        scene=dict(
            xaxis_title='First Dimension',
            yaxis_title='Second Dimension',
            zaxis_title='Third Dimension'
        ),
        title='Users Clustering in 3D',
        legend_title='Choose Topics',
        # Adjust legend to appear below the plot area
        legend=dict(
            orientation='h',
            x=0.5, y=-0.15,  # Position the legend below the plot
            xanchor='center',  # Center the legend horizontally
            font=dict(size=10),  # Reduce font size for better visibility
            itemclick="toggleothers",
            itemdoubleclick="toggle"
        ),
        margin=dict(l=0, r=0, b=0, t=30)  # Reduce margins for responsiveness
    )
    return fig

fig = create_3d_scatter_plot(df_umap, labels)

# Show the plot
fig.show()
