In [92]:
reviews = [
    "Battery life is very good but heats sometimes",
    "Delivery was fast and packaging was nice",
    "Battery drains quickly after update",
    "Customer service was very bad",
    "‡§´‡•ã‡§®‡§ï‡•ã ‡§¨‡•ç‡§Ø‡§æ‡§ü‡•ç‡§∞‡•Ä ‡§∞‡§æ‡§Æ‡•ç‡§∞‡•ã ‡§õ‡•à‡§®",
]


from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from hdbscan import HDBSCAN


In [93]:
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
embeddings = model.encode([
    "‡§Ø‡•ã ‡§Æ‡•ã‡§¨‡§æ‡§á‡§≤ ‡§∞‡§æ‡§Æ‡•ç‡§∞‡•ã ‡§õ",
    "This phone has great battery life",
    "price ‡§Ö‡§®‡•Å‡§∏‡§æ‡§∞ ‡§∞‡§æ‡§Æ‡•ç‡§∞‡•ã ‡§õ"
])

In [94]:
embeddings

array([[ 0.04999362, -0.23125897, -0.01702522, ...,  0.15209763,
         0.07102852, -0.13554004],
       [ 0.0848873 , -0.395048  , -0.01153254, ..., -0.01053692,
         0.03710957, -0.1216694 ],
       [-0.00959331, -0.00097401, -0.01989246, ...,  0.04983946,
         0.01411792, -0.06305858]], shape=(3, 768), dtype=float32)

In [95]:
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Convert sentences to embeddings
sentences = ["First sentence", "Second sentence"]
embeddings = model.encode(sentences)

embeddings.shape

(2, 384)

In [96]:
embedding_review = model.encode(reviews)

In [97]:
reviews = [
    "Battery life is very good but heats sometimes",
    "Delivery was fast and packaging was nice",
    "Battery drains quickly after update",
    "‡§´‡•ã‡§®‡§ï‡•ã ‡§¨‡•ç‡§Ø‡§æ‡§ü‡•ç‡§∞‡•Ä ‡§∞‡§æ‡§Æ‡•ç‡§∞‡•ã ‡§õ‡•à‡§®",
    "Size was perfect but quality is average",
    "Delivery was delayed and box was damaged",
     "Battery life is very good but heats sometimes",
    "Delivery was fast and packaging was nice",
    "Battery drains quickly after update",
    "Customer service was very bad",
    "‡§´‡•ã‡§®‡§ï‡•ã ‡§¨‡•ç‡§Ø‡§æ‡§ü‡•ç‡§∞‡•Ä ‡§∞‡§æ‡§Æ‡•ç‡§∞‡•ã ‡§õ‡•à‡§®",
]

def select_topic_params(n_reviews):
    if n_reviews <= 30:
        return {
            "min_cluster_size": 3,
            "min_samples": 2,
            "n_neighbors": 5
        }

    elif n_reviews <= 100:
        return {
            "min_cluster_size": 5,
            "min_samples": 3,
            "n_neighbors": 8
        }

    elif n_reviews <= 300:
        return {
            "min_cluster_size": 10,
            "min_samples": 5,
            "n_neighbors": 10
        }

    elif n_reviews <= 1000:
        return {
            "min_cluster_size": 20,
            "min_samples": 8,
            "n_neighbors": 15
        }

    else:
        return {
            "min_cluster_size": 40,
            "min_samples": 15,
            "n_neighbors": 20
        }


params = select_topic_params(len(reviews))
print(params)


{'min_cluster_size': 3, 'min_samples': 2, 'n_neighbors': 5}


In [99]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import math

n_reviews = len(reviews)

min_cluster_size = max(5, math.ceil(n_reviews / 20))
min_samples = max(2, min_cluster_size // 2)

print(f"Auto min_cluster_size: {min_cluster_size}")
print(f"Auto min_samples: {min_samples}")


umap_model = UMAP(n_neighbors=params["n_neighbors"], n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(
    min_cluster_size=params["min_cluster_size"],
    min_samples=params["min_samples"],
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True
)

topic_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language="multilingual",
    calculate_probabilities=True,
    verbose=True
)



topics, probs = topic_model.fit_transform(reviews)


2026-01-25 13:40:37,411 - BERTopic - Embedding - Transforming documents to embeddings.


Auto min_cluster_size: 5
Auto min_samples: 2


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 12.10it/s]
2026-01-25 13:40:37,502 - BERTopic - Embedding - Completed ‚úì
2026-01-25 13:40:37,502 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-25 13:40:37,637 - BERTopic - Dimensionality - Completed ‚úì
2026-01-25 13:40:37,638 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-25 13:40:37,644 - BERTopic - Cluster - Completed ‚úì
2026-01-25 13:40:37,649 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-25 13:40:37,659 - BERTopic - Representation - Completed ‚úì


In [100]:
from collections import defaultdict

topic_to_reviews = defaultdict(list)

for review, topic in zip(reviews, topics):
    if topic != -1:   # ignore noise
        topic_to_reviews[topic].append(review)


In [101]:
topic_info = topic_model.get_topic_info()

def get_topic_name(topic_id):
    return topic_info[topic_info.Topic == topic_id].Name.values[0]


In [102]:
final_topics = []

for topic_id, revs in topic_to_reviews.items():
    final_topics.append({
        "topic_id": topic_id,
        "topic_name": get_topic_name(topic_id),
        "count": len(revs),
        "reviews": revs
    })


In [103]:
for t in final_topics:
    print(f"\nüîπ Topic: {t['topic_name']} ({t['count']} reviews)")
    for r in t["reviews"]:
        print(" -", r)



üîπ Topic: 0_battery_life_drains_after (4 reviews)
 - Battery life is very good but heats sometimes
 - Battery drains quickly after update
 - Battery life is very good but heats sometimes
 - Battery drains quickly after update

üîπ Topic: 1_was_delivery_and_nice (4 reviews)
 - Delivery was fast and packaging was nice
 - Size was perfect but quality is average
 - Delivery was delayed and box was damaged
 - Delivery was fast and packaging was nice

üîπ Topic: 2_‡§®‡§ï_customer_service_bad (3 reviews)
 - ‡§´‡•ã‡§®‡§ï‡•ã ‡§¨‡•ç‡§Ø‡§æ‡§ü‡•ç‡§∞‡•Ä ‡§∞‡§æ‡§Æ‡•ç‡§∞‡•ã ‡§õ‡•à‡§®
 - Customer service was very bad
 - ‡§´‡•ã‡§®‡§ï‡•ã ‡§¨‡•ç‡§Ø‡§æ‡§ü‡•ç‡§∞‡•Ä ‡§∞‡§æ‡§Æ‡•ç‡§∞‡•ã ‡§õ‡•à‡§®


In [108]:
from keybert import KeyBERT

# Initialize KeyBERT model
kw_model = KeyBERT()

for t in final_topics:
    # Combine all reviews of the cluster
    reviews_text = " ".join(t["reviews"])
    
    # Extract top 3 keywords/keyphrases
    keywords = kw_model.extract_keywords(reviews_text, top_n=1, keyphrase_ngram_range=(1, 2))
    
    # Create topic name from keywords
    topic_name = ", ".join([k[0] for k in keywords])
    
    print(f"\nüîπ Topic: {topic_name} ({t['count']} reviews)")
    
    print("Reviews:")
    for r in t["reviews"]:
        print(" -", r)



üîπ Topic: battery life (4 reviews)
Reviews:
 - Battery life is very good but heats sometimes
 - Battery drains quickly after update
 - Battery life is very good but heats sometimes
 - Battery drains quickly after update

üîπ Topic: average delivery (4 reviews)
Reviews:
 - Delivery was fast and packaging was nice
 - Size was perfect but quality is average
 - Delivery was delayed and box was damaged
 - Delivery was fast and packaging was nice

üîπ Topic: ‡§®‡§ï customer (3 reviews)
Reviews:
 - ‡§´‡•ã‡§®‡§ï‡•ã ‡§¨‡•ç‡§Ø‡§æ‡§ü‡•ç‡§∞‡•Ä ‡§∞‡§æ‡§Æ‡•ç‡§∞‡•ã ‡§õ‡•à‡§®
 - Customer service was very bad
 - ‡§´‡•ã‡§®‡§ï‡•ã ‡§¨‡•ç‡§Ø‡§æ‡§ü‡•ç‡§∞‡•Ä ‡§∞‡§æ‡§Æ‡•ç‡§∞‡•ã ‡§õ‡•à‡§®
