In [10]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [11]:
# Load and filter data
df = pd.read_csv("../Data/ucr_submissions.csv")

# Keep only posts whose body is present and is not one of the placeholder strings.
# NOTE(review): the placeholder match is exact, so a body that merely *starts* with
# "[deleted]" (e.g. followed by a poll link) is still kept — confirm that is intended.
has_body = df['selftext'].notna() & ~df['selftext'].isin(['[deleted]', '[removed]'])
df_filtered = df[has_body]

# Clamp the sample size to the rows available: DataFrame.sample draws without
# replacement by default and raises ValueError when n exceeds the population.
sample_size = min(1000, len(df_filtered))
df_sample = df_filtered.sample(n=sample_size, random_state=42)

# 1: Extract 'selftext' column as a list for clustering
docs = df_sample['selftext'].tolist()

# 2: Generate sentence embeddings, one vector per post
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# 3: Dimensionality reduction with UMAP (5 components, cosine metric, fixed seed)
umap_model = umap.UMAP(n_neighbors=30, n_components=5, metric='cosine', random_state=42)
reduced_embeddings = umap_model.fit_transform(embeddings)

# 4: Clustering with HDBSCAN on the reduced embeddings.
# Label -1 is treated as noise by the later cells of this notebook.
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
clusters = hdbscan_model.fit_predict(reduced_embeddings)

# 5: Add clusters as a new column to the DataFrame
# (labels are positional: clusters[i] belongs to docs[i], i.e. row i of df_sample)
df_sample['cluster'] = clusters
df_clusters = df_sample[['selftext', 'cluster']]
# Inspect the results
print(df_clusters.head())        # Check the first few rows
print(df_clusters['cluster'].value_counts())  # Count documents in each cluster

  df = pd.read_csv("../Data/ucr_submissions.csv")


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

                                                selftext  cluster
52028  I found Abigail Guadalupe Castillo Hernandez's...       37
28955  Does anyone have any recommendations for apart...       16
27070  Hey is anyone by the name of Alexander missing...       37
2400   I went to the rec center and there are barely ...       30
39011  Lost my keys on campus yesterday. If anybody h...       37
cluster
-1     362
 16    101
 32     31
 13     29
 36     28
 12     23
 37     18
 44     18
 26     17
 54     15
 6      14
 56     14
 14     14
 52     13
 51     12
 39     12
 45     11
 50     11
 25     11
 27     10
 8      10
 15     10
 41     10
 35      9
 34      9
 29      9
 21      8
 58      8
 20      7
 43      7
 18      7
 42      7
 24      7
 33      7
 11      6
 5       6
 48      6
 23      6
 1       6
 7       6
 2       6
 0       6
 49      5
 19      5
 46      5
 30      5
 57      5
 4       4
 22      4
 38      4
 3       4
 40      4
 55      4
 9       4
 1

In [12]:
# 6: Build one long text per cluster by joining its posts with single spaces.
# reset_index(name=...) turns the grouped Series into a two-column frame
# ('cluster', 'concatenated_text') in one step.
clustered_documents = (
    df_clusters
    .groupby('cluster')['selftext']
    .apply(' '.join)
    .reset_index(name='concatenated_text')
)

# Preview the first few per-cluster texts
print(clustered_documents.head())

   cluster                                  concatenated_text
0       -1  It says the PSYC012 lecture with John Franchak...
1        0  Does that mean it’s over zoom? Anyone else reg...
2        1  Title\n\nAny help is appreciated The title has...
3        2  [deleted]\n\n[View Poll](https://www.reddit.co...
4        3  &#x200B;\n\nhttps://preview.redd.it/sryzjn29aw...


In [13]:
# 7: Calculate TF-IDF with each cluster's concatenated text treated as one document
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,    # keep terms found in at least two cluster texts
    max_df=0.8,  # drop terms found in more than 80% of cluster texts
)

# Learn the vocabulary and weight every cluster text in one pass
cluster_texts = clustered_documents['concatenated_text']
tfidf_matrix = vectorizer.fit_transform(cluster_texts)

# Dense, labelled view of the scores: one row per cluster id, one column per term
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=clustered_documents['cluster'],
)


In [14]:
# Drop the row for cluster -1 (noise) so it gets no top-word list
tfidf_df_no_noise = tfidf_df[tfidf_df.index != -1]

# Number of highest-scoring words to report per cluster
top_n = 5
top_words = {}

# Rank the terms of every remaining cluster by TF-IDF score
for cluster in tfidf_df_no_noise.index:
    ranked = tfidf_df_no_noise.loc[cluster].sort_values(ascending=False).head(top_n)
    # A zero score means the term does not occur in this cluster at all; without
    # this filter, clusters with fewer than top_n scored terms would be padded
    # with arbitrary vocabulary words that look like genuine top words.
    words = list(ranked[ranked > 0].index)
    # Pad with None so every row still has exactly top_n entries
    top_words[cluster] = words + [None] * (top_n - len(words))

# Convert the dictionary to a DataFrame for easy viewing (one row per cluster)
top_words_df = pd.DataFrame.from_dict(top_words, orient='index', columns=[f'top_word_{i+1}' for i in range(top_n)])
top_words_df.index.name = 'cluster'

# Display the top TF-IDF words for each cluster (excluding -1)
print(top_words_df)


        top_word_1     top_word_2    top_word_3              top_word_4  \
cluster                                                                   
0             zoom         canvas         email                 checked   
1            title           yeah           pls                 advance   
2             poll           view           www                     com   
3              png          https       preview                    redd   
4            right           girl       hearing                 careful   
5           honors        program         apply                   worth   
6            tired         school        online                    life   
7             wifi        connect    connecting                 printer   
8        decisions         portal        gotten                    know   
9           shower        lecture    understand                    warn   
10           media            abt       context                     ppl   
11          stolen       

In [15]:
# 8: Apply NMF

In [16]:
# Re-vectorize the per-cluster texts using bigrams only (ngram_range=(2, 2)).
# NOTE: this rebinds `vectorizer`, `tfidf_matrix` and `tfidf_df`, replacing the
# single-word versions created earlier in the notebook.
vectorizer = TfidfVectorizer(
    ngram_range=(2, 2),
    stop_words='english',
    min_df=2,
    max_df=0.8,
)
cluster_texts = clustered_documents['concatenated_text']
tfidf_matrix = vectorizer.fit_transform(cluster_texts)

# Optional dense view: one row per cluster id, one column per bigram
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=clustered_documents['cluster'],
)
print("TF-IDF matrix created with n-grams 2.")

TF-IDF matrix created with n-grams 2.


In [17]:
# How many topics NMF should factor the TF-IDF matrix into
n_topics = 4

# Configure NMF: multiplicative-update solver, Frobenius loss, random but seeded init
nmf_model = NMF(
    n_components=n_topics,
    init='random',
    solver='mu',
    beta_loss='frobenius',
    max_iter=200,
    random_state=42,
)

# W: one row per input text (here each row of tfidf_matrix is a cluster's
#    concatenated text), one column per topic.
# H: one row per topic, one column per term.
W = nmf_model.fit_transform(tfidf_matrix)
H = nmf_model.components_

print("NMF applied with Frobenius norm objective function.")

NMF applied with Frobenius norm objective function.


In [18]:
# 9: Look up the term behind each column of the topic-term matrix H
terms = vectorizer.get_feature_names_out()

# Print the highest-weighted terms of every topic, strongest first
n_top_words = 5
for topic_idx, topic in enumerate(H):
    # argsort is ascending, so reverse it and keep the leading n_top_words indices
    ranked_term_ids = topic.argsort()[::-1][:n_top_words]
    top_terms = [terms[term_id] for term_id in ranked_term_ids]
    print(f"Topic #{topic_idx+1}: {' | '.join(top_terms)}")

Topic #1: financial aid | work study | campus housing | cal grant | summer classes
Topic #2: don want | don know | does know | help appreciated | computer science
Topic #3: reddit com | www reddit | https www | com poll | view poll
Topic #4: https preview | preview redd | auto webp | png width | format png
