
# Topic Modeling & Clustering
Train a topic model to group customer survey comments into themes such as wait time, prescription issues, and facility navigation, helping to identify key areas of concern and emerging patterns.

In [2]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [31]:
from bertopic import BERTopic
import pandas as pd
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance


In [32]:
# `error` is the same exception class that `re` exposes publicly; import it from
# `re` because the `sre_constants` module is deprecated since Python 3.11.
# NOTE(review): the name is not used in the cells visible here - confirm whether
# it is needed at all.
from re import error

# Load the survey export and pre-process it. dropna()/drop() return new frames,
# so the result has to be assigned for the clean-up to take effect.
df = (
    pd.read_csv('./hospital.csv')
    # rows without feedback text cannot be embedded or clustered
    .dropna(subset=['Feedback'])
    # columns not needed for topic modeling; ignore any that are absent
    .drop(columns=['Unnamed: 3', 'Ratings', 'Sentiment Label'], errors='ignore')
    # keep a contiguous 0..n-1 index so positional model outputs line up with rows
    .reset_index(drop=True)
)
df


Unnamed: 0,Feedback
0,Good and clean hospital. There is great team o...
1,Had a really bad experience during discharge. ...
2,I have visited to take my second dose and Proc...
3,That person was slightly clueless and offered...
4,There is great team of doctors and good OT fac...
...,...
991,very careful about safety measures every one i...
992,I do not trust in their reports I got same tes...
993,They just want the patients to return to their...
994,I suggest you not to visit this hospital if yo...


In [33]:
# Representation models used to fine-tune the topic keywords. They are handed
# to BERTopic as a list, in the order [KeyBERT-inspired, MMR].
mmr_model = MaximalMarginalRelevance(diversity=0.3)
keybert_model = KeyBERTInspired()
representation_chain = [keybert_model, mmr_model]  # Multiple models

topic_model = BERTopic(
    verbose=True,
    representation_model=representation_chain,
    embedding_model="all-MiniLM-L6-v2",
)

# Fit on the feedback column; yields the per-document topic assignments and
# the accompanying probabilities.
fit_result = topic_model.fit_transform(df['Feedback'])
topics, probs = fit_result


2025-06-25 23:12:16,973 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

2025-06-25 23:12:31,434 - BERTopic - Embedding - Completed ✓
2025-06-25 23:12:31,436 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-25 23:12:33,806 - BERTopic - Dimensionality - Completed ✓
2025-06-25 23:12:33,807 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-25 23:12:33,834 - BERTopic - Cluster - Completed ✓
2025-06-25 23:12:33,837 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-25 23:12:39,486 - BERTopic - Representation - Completed ✓


In [34]:
# Per-topic summary table produced by the fitted model.
topic_overview = topic_model.get_topic_info()
topic_overview

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,301,-1_hospital_patients_appointment_patient,"[hospital, patients, appointment, patient, sur...",[Had an emergency visit to the hospital with m...
1,0,153,0_treatment_doctor_patient_surgery,"[treatment, doctor, patient, surgery, doctors,...",[We have visited the hospital for knee replace...
2,1,90,1_hospital_hospitals_emergency_patients,"[hospital, hospitals, emergency, patients, icu...",[Worst hospital please never go to this hospit...
3,2,80,2_hospital_facilities_facility_rooms,"[hospital, facilities, facility, rooms, staffs...",[This hospital is nice All the sections are we...
4,3,65,3_nursing_nurses_nurse_patients,"[nursing, nurses, nurse, patients, hospital, p...",[She had an effective communication with the p...
5,4,44,4_hospital_inpatient_outpatient_experienced,"[hospital, inpatient, outpatient, experienced,...",[I had an good experience in this Hospital whe...
6,5,35,5_appointment_appointments_patients_patient,"[appointment, appointments, patients, patient,...",[They asked me to come after 2 days to show to...
7,6,34,6_hospital_patients_tests_patient,"[hospital, patients, tests, patient, test, doc...",[They just say wait doctor will come or test ...
8,7,33,7_service_services_staffs_staff,"[service, services, staffs, staff, support, he...",[The support service team was also good in the...
9,8,30,8_hospitals_hospital_patients_nu,"[hospitals, hospital, patients, nu, patient, s...",[The doctors the nurses and every single staff...


In [43]:
# Plain-list copy of the feedback text (not referenced again in this cell).
documents = df['Feedback'].tolist()
# Attach the two outputs of fit_transform to the feedback frame; these
# assignments add the columns to `df` in place.
df['Topics'] = topics
df['Topic Probabilities'] = probs

def generate_readable_label(topic_id, top_n=3, model=None):
    """Build a short, human-readable label for a topic from its top words.

    Args:
        topic_id: Topic number to label. ``-1`` is labelled
            "Other / Uncategorized" without consulting the model.
        top_n: Maximum number of words to include in the label.
        model: Object exposing ``get_topic(topic_id)`` that returns a sequence
            of ``(word, score)`` pairs, or a falsy value for an unknown topic.
            Defaults to the notebook-level ``topic_model``.

    Returns:
        The top words joined with " / ", "Other / Uncategorized" for topic
        ``-1``, or "Unknown" when the topic has no usable words.
    """
    if topic_id == -1:
        return "Other / Uncategorized"
    if model is None:
        # Fall back to the fitted model defined earlier in the notebook.
        model = topic_model
    topic = model.get_topic(topic_id)
    if not topic:
        return "Unknown"
    # Skip empty-string placeholder words so labels never contain blank
    # segments such as "word /  / ".
    words = [word for word, _ in topic if word][:top_n]
    return " / ".join(words) if words else "Unknown"

# Translate each numeric topic id into its readable label.
df['Topic Label'] = df['Topics'].map(generate_readable_label)

# Independent copy holding only the columns needed for reporting.
report_columns = ["Feedback", "Topics", "Topic Label"]
df_topic_only = df.loc[:, report_columns].copy()
df_topic_only.head(5)

Unnamed: 0,Feedback,Topics,Topic Label
0,Good and clean hospital. There is great team o...,2,hospital / facilities / facility
1,Had a really bad experience during discharge. ...,12,discharge / patients / hospital
2,I have visited to take my second dose and Proc...,0,treatment / doctor / patient
3,That person was slightly clueless and offered...,-1,Other / Uncategorized
4,There is great team of doctors and good OT fac...,2,hospital / facilities / facility


In [41]:
# Feedback ordered by topic id, renumbered from 0 (display only).
df_topic_only.sort_values("Topics", ignore_index=True)

Unnamed: 0,Feedback,Topic Label
0,Good and clean hospital. There is great team o...,hospital / facilities / facility
1,Had a really bad experience during discharge. ...,discharge / patients / hospital
2,I have visited to take my second dose and Proc...,treatment / doctor / patient
3,That person was slightly clueless and offered...,Other / Uncategorized
4,There is great team of doctors and good OT fac...,hospital / facilities / facility
...,...,...
991,very careful about safety measures every one i...,Other / Uncategorized
992,I do not trust in their reports I got same tes...,hospital / patients / tests
993,They just want the patients to return to their...,hospital / patients / tests
994,I suggest you not to visit this hospital if yo...,hospital / hospitals / emergency


In [44]:
# Number of feedback rows per topic label, most frequent first.
label_counts = df_topic_only["Topic Label"].value_counts(sort=True, ascending=False)
label_counts

Unnamed: 0_level_0,count
Topic Label,Unnamed: 1_level_1
Other / Uncategorized,301
treatment / doctor / patient,153
hospital / hospitals / emergency,90
hospital / facilities / facility,80
nursing / nurses / nurse,65
hospital / inpatient / outpatient,44
appointment / appointments / patients,35
hospital / patients / tests,34
service / services / staffs,33
hospitals / hospital / patients,30
