In [205]:
from pathlib import Path

import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Read the filtered Quora questions and keep the question text, in row order,
# as the corpus used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer='./dataset/quora_questions_filtered.csv')
documents = [question for question in df['Question']]



In [113]:
# !pip install textblob

In [114]:
import nltk
from nltk.corpus import stopwords
import re

# Fetch the NLTK stopword corpus, then keep the English entries as a set so
# membership checks during preprocessing are cheap.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jadha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [116]:
def preprocess_text(doc, stopword_set=None):
    """Lowercase a document and drop its stopwords.

    Args:
        doc: Raw document text. Non-string values (for example the float NaN
            pandas uses for a missing cell) are treated as empty text.
        stopword_set: Optional collection of lowercase stopwords to remove.
            When omitted, the module-level ``stop_words`` is used.

    Returns:
        The lowercased text with stopword tokens removed; the remaining
        whitespace-separated tokens are joined by single spaces.
    """
    # Missing values would otherwise crash on .lower(); returning '' keeps the
    # output list aligned one-to-one with the input rows.
    if not isinstance(doc, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    kept_tokens = []
    for token in doc.lower().split():
        # Compare without leading/trailing punctuation so tokens such as
        # "the," or "what?" are still recognised as stopwords. Tokens that
        # are kept retain their original punctuation.
        bare_token = re.sub(r'^\W+|\W+$', '', token)
        if bare_token not in stopword_set:
            kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [117]:
# Run the stopword-stripping preprocessor over every question, preserving order.
documents_preprocessed = list(map(preprocess_text, documents))

In [118]:
# Initialize the BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Or choose another model

# Create embeddings for the documents
# NOTE(review): this encodes the raw `documents`, not `documents_preprocessed`
# built in the previous cell — confirm that is intended.
embeddings = model.encode(documents, show_progress_bar=True)


Batches: 100%|██████████| 961/961 [04:13<00:00,  3.79it/s]


In [240]:
# Value passed to BERTopic as `nr_topics`.
num_topics = 8
# NOTE(review): with n_gram_range=(15,40) the topic terms printed below are
# 15+ word phrases (near-complete questions) rather than keywords — confirm
# this range is intended.
# NOTE(review): no random seed is configured here; presumably topic ids can
# differ between runs, which would invalidate the hard-coded
# `topic_id_to_label` mapping — verify.
topic_model = BERTopic(nr_topics=num_topics,n_gram_range=(15,40), min_topic_size= 20,top_n_words=20)

# Fit the model on the embeddings
# The raw `documents` are passed together with their precomputed `embeddings`.
topics, probs = topic_model.fit_transform(documents, embeddings)


In [241]:
# Dump every topic's list of (term, score) pairs as plain text.
topic_terms_by_id = topic_model.get_topics()
print(topic_terms_by_id)

# Optional: leave the topic visualization as the cell's displayed result.
topic_model.visualize_topics()


{-1: [('would be safe for unmarried couples without the harassment of police hotel staff and moral police', 0.00017065340376948578), ('be safe for unmarried couples without the harassment of police hotel staff and moral police', 0.00017065340376948578), ('would be safe for unmarried couples without the harassment of police hotel staff and moral', 0.00017065340376948578), ('hillstation would be safe for unmarried couples without the harassment of police hotel staff and moral', 0.00014364996949599784), ('hillstation would be safe for unmarried couples without the harassment of police hotel staff and moral police', 0.00014364996949599784), ('hillstation would be safe for unmarried couples without the harassment of police hotel staff and', 0.00014364996949599784), ('safe for unmarried couples without the harassment of police hotel staff and moral police what', 5.0221127793967085e-05), ('be safe for unmarried couples without the harassment of police hotel staff and moral police what', 5.022

In [None]:
# For every non-outlier topic: print each term with its score, then record
# [space-joined terms, topic id] in topicsDocs.
topicsDocs = []
for topic_id in set(topics):
    if topic_id != -1:  # -1 represents outliers, so it is left out
        ranked_terms = topic_model.get_topic(topic_id)
        print(f"Topic {topic_id}:")
        terms_only = []
        for term, weight in ranked_terms:
            print(f"  {term}: {weight:.4f}")
            terms_only.append(term)

        topicsDocs.append([" ".join(terms_only), topic_id])
        print("\n")


Topic 0:
  what were the major effects of the cambodia earthquake and how do these effects compare: 0.0009
  what were the major effects of the cambodia earthquake and how do these effects compare to the: 0.0009
  were the major effects of the cambodia earthquake and how do these effects compare to: 0.0009
  what were the major effects of the cambodia earthquake and how do these effects compare to: 0.0009
  were the major effects of the cambodia earthquake and how do these effects compare to the: 0.0009
  the major effects of the cambodia earthquake and how do these effects compare to the: 0.0009
  be safe for unmarried couples without the harassment of police hotel staff and moral police: 0.0007
  would be safe for unmarried couples without the harassment of police hotel staff and moral: 0.0007
  would be safe for unmarried couples without the harassment of police hotel staff and moral police: 0.0007
  was the significance of the battle of somme and how did this battle compare and con

In [196]:
# Show the collected [joined terms, topic id] pairs built by the loop above.
topicsDocs

[['would be safe for unmarried couples without the harassment of police hotel staff and moral police would be safe for unmarried couples without the harassment of police hotel staff and moral be safe for unmarried couples without the harassment of police hotel staff and moral police were the major effects of the cambodia earthquake and how do these effects compare to the were the major effects of the cambodia earthquake and how do these effects compare to what were the major effects of the cambodia earthquake and how do these effects compare the major effects of the cambodia earthquake and how do these effects compare to the what were the major effects of the cambodia earthquake and how do these effects compare to what were the major effects of the cambodia earthquake and how do these effects compare to the significance of the battle of somme and how did this battle compare and contrast to the battle battle of somme and how did this battle compare and contrast to the battle of signific

In [244]:
# Hand-written labels for topic ids 0..6, listed in topic-id order.
# The outlier topic (-1) deliberately has no entry.
topic_id_to_label = dict(enumerate([
    "Cambodia Earthquake and Historical Battles",
    "Job Prospects by Major",
    "Drug Testing and Probability",
    "Universe and Energy",
    "Dating and Communication",
    "Snapchat Account Recovery",
    "Gun Safety Regulations",
]))


In [245]:
def SaveTopicsToFile(topic_model, n_words, file_name, title, labels=None):
    """Write the top terms of every non-outlier topic to a text file.

    The file starts with ``title`` followed by a blank line. Each topic then
    gets a ``The Topic <label>:`` header, one ``<term>:`` line per term, and
    two trailing blank lines.

    Args:
        topic_model: Fitted model exposing ``get_topics()`` (mapping of topic
            id to its terms) and ``get_topic(topic_id)`` (list of
            ``(term, score)`` pairs).
        n_words: Maximum number of terms to write per topic.
        file_name: Path of the text file to create or overwrite.
        title: Heading written on the first line.
        labels: Optional mapping of topic id to a readable label. Defaults to
            the module-level ``topic_id_to_label``. Topics missing from the
            mapping are written with their numeric id instead of raising.
    """
    if labels is None:
        labels = topic_id_to_label

    # Explicit UTF-8 so non-ASCII terms do not fail on platforms whose default
    # text encoding cannot represent them. The `with` block closes the file.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(title + '\n\n')
        # Take the topic ids from the model that was passed in rather than
        # from a notebook global; sorting keeps the file order stable.
        for topic_num in sorted(topic_model.get_topics()):
            if topic_num == -1:
                continue  # Skip the -1 topic as it represents outliers
            f.write(f"The Topic {labels.get(topic_num, topic_num)}:\n")
            for word, _score in topic_model.get_topic(topic_num)[:n_words]:
                f.write(f"{word}:\n")
            f.write("\n\n")

In [246]:
# Export the five leading terms of each labelled topic to BerTopic.txt.
SaveTopicsToFile(
    topic_model,
    n_words=5,
    file_name='BerTopic.txt',
    title='topics With BerTopic and ngrams',
)

In [247]:
# Attach each question's topic id. `topics` came from fit_transform on
# `documents`, which was built from df['Question'] in row order, so the
# assignment is positional and relies on df's rows not having been reordered
# or filtered since then.
df['Topic']=topics

In [248]:
# Number of questions assigned to each topic id (-1 is the outlier topic that
# the reporting cells skip).
df['Topic'].value_counts()

Topic
-1    13445
 0     4468
 1     4077
 2     2371
 3     2177
 4     1798
 5     1751
 6      648
Name: count, dtype: int64

In [249]:
# Translate numeric topic ids into readable labels; ids with no entry in
# topic_id_to_label (such as the -1 outliers) fall back to "General".
mapped_labels = df["Topic"].map(topic_id_to_label)
df["Topic_Label"] = mapped_labels.fillna("General")


In [250]:
# Preview the first ten questions next to their topic id and label.
df[['Question', 'Topic', 'Topic_Label']].head(10)

Unnamed: 0,Question,Topic,Topic_Label
0,"Like everyone else (here in U.S), I work with ...",-1,General
1,Hello dear's people. i have a fictief research...,6,Gun Safety Regulations
2,A lady buys goods worth 200 bucks from a shop ...,2,Drug Testing and Probability
3,I am turning 25 in about a month and am curren...,1,Job Prospects by Major
4,I'm a 34 years old married to a woman. I had a...,4,Dating and Communication
5,My employer has told me that we can not accept...,-1,General
6,I have been using my girlfriend for a sexual r...,4,Dating and Communication
7,"You have 100 coins laying flat on a table, eac...",2,Drug Testing and Probability
8,I broke up with him. I love him so much but we...,4,Dating and Communication
9,I broke up with him. I love him so much but we...,4,Dating and Communication


In [251]:
# Persist the labelled questions. The output directory is created first
# because to_csv does not create missing parent directories and would
# otherwise fail at the very end of the run.
results_path = Path('./results/bertopic/results.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(results_path, index=False)