<span style="color:blue; font-size:24px;"><b>BERTopic analysis of university student complaints</b></span>

<span style="color:orange; font-size:18px;"><b>0. Loading the data</b></span>

In [2]:
import os
import sys
import pandas as pd

# Project root path: taken as the parent of the current working directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Add the project root to the import path. The membership check keeps this
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

# Raw dataset location inside the project's data folder.
DATA_PATH = os.path.join(BASE_DIR, "data", "Datasetprojpowerbi.csv")
df = pd.read_csv(DATA_PATH)

# One entry per row of the "Reports" column; these are the documents to model.
texts = df["Reports"].tolist()
print(f"Loaded {len(texts)} docs.")

Loaded 1005 docs.


<span style="color:orange; font-size:18px;"><b>1. Cleaning and tokenizing</b></span>

In [3]:
from src.preprocessing.clean_text import normalize_texts, lemmatize_and_tokenize

# Step 1: normalize the raw texts.
cleaned_texts = normalize_texts(texts)

# Step 2: lemmatize and tokenize the normalized texts (one token sequence per document).
tokenized_texts = lemmatize_and_tokenize(cleaned_texts)

# Space-joined form of every token sequence: one plain string per document.
join_texts = list(map(" ".join, tokenized_texts))

print(f"Cleaned and tokenized {len(tokenized_texts)} docs.")

Cleaned and tokenized 1005 docs.


<span style="color:orange; font-size:18px;"><b>2. Training BERTopic models for different numbers of topics and computing c_v and WETC</b></span>

In [4]:
from gensim.models import CoherenceModel
from src.vectorization.vecorize_lsa_lda import build_dictionary
from src.topic_models.bertopic_model import train_bertopic
from src.metrics.wetc import wetc
from sklearn.feature_extraction.text import CountVectorizer
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Candidate values passed to BERTopic as `nr_topics`.
TOPIC_COUNTS = range(5, 17)
# Weights of the combined score printed as "Average": WETC counts more than c_v.
WETC_WEIGHT = 0.7
CV_WEIGHT = 0.3

# Count unigrams and bigrams when building the topic representations.
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Dictionary built from the tokenized documents; used by the coherence model.
dictionary = build_dictionary(tokenized_texts)

# One record per candidate topic count, kept so the scores can be inspected,
# tabulated or plotted later instead of only being printed.
scan_results = []

for num_topics in TOPIC_COUNTS:
    bert_model, topics = train_bertopic(
        join_texts,
        nr_topics=num_topics,
        verbose=False,
        min_topic_size=7,
        vectorizer=vectorizer,
        top_n_words=10
    )

    # Top words of every topic except the outlier topic (id -1).
    topics_list = [[w for w, _ in words] for tid, words in bert_model.get_topics().items() if tid != -1]

    # c_v coherence of those word lists against the tokenized corpus (window_size=10).
    coherence_model = CoherenceModel(
        topics=topics_list,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v',
        window_size=10
    )
    cv_score = coherence_model.get_coherence()
    wetc_score = wetc(topics_list)

    # Weighted combination of the two metrics.
    waver = WETC_WEIGHT * wetc_score + CV_WEIGHT * cv_score
    scan_results.append({
        "num_topics": num_topics,
        "c_v": cv_score,
        "wetc": wetc_score,
        "weighted_average": waver,
    })
    print(f"Num topics: {num_topics}, C_v: {cv_score:.3f},  WETC: {wetc_score:.3f}, Average: {waver:.3f}")

Num topics: 5, C_v: 0.475,  WETC: 0.651, Average: 0.598
Num topics: 6, C_v: 0.556,  WETC: 0.637, Average: 0.613
Num topics: 7, C_v: 0.564,  WETC: 0.638, Average: 0.616
Num topics: 8, C_v: 0.593,  WETC: 0.635, Average: 0.623
Num topics: 9, C_v: 0.570,  WETC: 0.643, Average: 0.621
Num topics: 10, C_v: 0.577,  WETC: 0.642, Average: 0.622
Num topics: 11, C_v: 0.602,  WETC: 0.638, Average: 0.627
Num topics: 12, C_v: 0.626,  WETC: 0.641, Average: 0.636
Num topics: 13, C_v: 0.631,  WETC: 0.637, Average: 0.635
Num topics: 14, C_v: 0.659,  WETC: 0.638, Average: 0.645
Num topics: 15, C_v: 0.664,  WETC: 0.641, Average: 0.648
Num topics: 16, C_v: 0.669,  WETC: 0.641, Average: 0.649


<span style="color:orange; font-size:18px;"><b>3. Topic statistics for num_topics=16 (15 topics + 1 outlier topic)</b></span>

In [7]:
from collections import Counter
import numpy as np

# Fit BERTopic with 16 requested topics and ask for per-topic probabilities.
# NOTE(review): unlike the topic-count scan, no `vectorizer` is passed here -
# confirm that the difference in configuration is intended.
bert_model, topics = train_bertopic(
    join_texts,
    nr_topics=16,
    verbose=False,
    min_topic_size=7,
    top_n_words=10,
    calculate_probabilities=True,
)

# probs has one row per document and one column per topic.
topics_per_doc, probs = bert_model.transform(join_texts)
num_docs, num_topics = probs.shape

# topic id -> list of its top words; the outlier topic (-1) is left out.
topic_words_dict = {
    topic_id: [word for word, _ in scored_words]
    for topic_id, scored_words in bert_model.get_topics().items()
    if topic_id != -1
}

# Most probable topic of every document.
top1_idx = np.argmax(probs, axis=1)

# Second most probable topic: overwrite each row's maximum with -1 on a copy,
# then take the argmax again.
masked_probs = probs.copy()
masked_probs[np.arange(num_docs), top1_idx] = -1
top2_idx = np.argmax(masked_probs, axis=1)

# Number of documents per topic, by first and by second choice.
top1_counts = Counter(top1_idx)
top2_counts = Counter(top2_idx)

# Representative document of a topic: among the documents whose most probable
# topic it is, the one with the highest probability for that topic
# (None when no document has the topic as its first choice).
representative_idx = {}
for topic_id in topic_words_dict:
    member_docs = np.flatnonzero(top1_idx == topic_id)
    if member_docs.size:
        best_position = np.argmax(probs[member_docs, topic_id])
        representative_idx[topic_id] = member_docs[best_position]
    else:
        representative_idx[topic_id] = None

# Assemble one summary row per topic.
rows = []
for topic_id, top_words in topic_words_dict.items():
    rep = representative_idx.get(topic_id)
    rows.append({
        'Topic': topic_id,
        'Top Words': ", ".join(top_words),
        'Documents (max prob)': top1_counts.get(topic_id, 0),
        'Documents (second max prob)': top2_counts.get(topic_id, 0),
        'Representative Document': "" if rep is None else texts[rep],
    })

# NOTE(review): this rebinds `df`, the name that holds the raw dataset in the
# loading cell; consider a dedicated name if the raw frame is needed afterwards.
summary_by_topic = pd.DataFrame(rows).sort_values('Topic')
df = summary_by_topic.reset_index(drop=True)

# Displayed with the most populated topics first.
df.sort_values(by="Documents (max prob)", ascending=False)

Unnamed: 0,Topic,Top Words,Documents (max prob),Documents (second max prob),Representative Document
3,3,"course, online, class, material, availability,...",174,479,I'm having trouble finding the course material...
0,0,"food, option, cafeteria, campus, offer, cantin...",141,226,"6. ""The pizza available in the cantine is terr..."
2,2,"academic, balance, stress, workload, time, wor...",100,2,The academic workload is becoming overwhelming...
1,1,"job, opportunity, internship, field, career, p...",97,41,There seems to be a scarcity of available inte...
4,4,"international, language, cultural, sometimes, ...",92,2,"10. ""I've had some negative experiences with o..."
5,5,"sport, athletic, athlete, team, eligibility, g...",86,0,I'm disappointed with the gender inequality in...
6,6,"financial, aid, loan, scholarship, pay, tuitio...",74,28,"The financial aid process is so slow, and I n..."
7,7,"research, access, database, software, technolo...",49,140,It's frustrating to have limited access to res...
10,10,"housing, rent, affordable, live, near, apartme...",37,68,"9. ""The housing options near campus are very l..."
8,8,"medical, expense, insurance, treatment, pay, c...",34,3,It's frustrating that my health insurance doe...
