In [205]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# Read the filtered Quora questions and pull the question text out as a plain list
df = pd.read_csv('./dataset/quora_questions_filtered.csv')
documents = df['Question'].tolist()



In [113]:
# !pip install textblob

In [114]:
import nltk
from nltk.corpus import stopwords
import re

# Fetch the NLTK stop-word corpus, then keep the English entries as a set
# so that membership checks during preprocessing are cheap.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jadha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [116]:
def preprocess_text(doc):
    """Lower-case a document and drop every whitespace-separated stop word.

    Args:
        doc: The text to clean (a string).

    Returns:
        The lower-cased text with tokens found in the module-level
        ``stop_words`` set removed; the remaining tokens are joined by
        single spaces.
    """
    kept_tokens = []
    for token in doc.lower().split():
        # Tokens are compared as-is, so punctuation attached to a word
        # (e.g. "the,") prevents it from matching a stop word.
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [117]:
# Apply preprocess_text to every question, keeping the original order.
# NOTE(review): the later cells visible here embed and fit on `documents`,
# not on this list — confirm whether it is still needed.
documents_preprocessed = list(map(preprocess_text, documents))

In [118]:
# Sentence-embedding checkpoint to load; swap the name to try another model
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)

# Encode the raw questions (not documents_preprocessed), showing a progress bar
embeddings = model.encode(documents, show_progress_bar=True)


Batches: 100%|██████████| 961/961 [04:13<00:00,  3.79it/s]


In [191]:
# Value handed to BERTopic's nr_topics. Previously this variable was defined
# but ignored, and a second hard-coded 7 was passed to the constructor.
num_topics = 7

# NOTE(review): n_gram_range=(15, 20) makes every topic term a 15-20 word
# phrase (see the printed topics, which are near-complete questions) —
# confirm this is intended rather than something like (1, 3).
# NOTE(review): no random seed is fixed anywhere in this notebook; if the fit
# is stochastic, topic ids can change between runs and the hand-written labels
# in topic_id_to_label would need re-checking — confirm.
topic_model = BERTopic(
    nr_topics=num_topics,
    n_gram_range=(15, 20),
    min_topic_size=10,
    top_n_words=20,
)

# Fit on the raw documents, reusing the precomputed sentence embeddings
# instead of letting BERTopic embed the text again.
topics, probs = topic_model.fit_transform(documents, embeddings)


In [192]:
# Print topics with their top words
print(topic_model.get_topics())

# Optional: Visualize topics
topic_model.visualize_topics()


{-1: [('what are good gifts for foreign visitor to bring when theyre invited to someones home', 0.00016365657806210522), ('what are good gifts for foreign visitor to bring when theyre invited to someones home in', 0.00016365657806210522), ('are good gifts for foreign visitor to bring when theyre invited to someones home in', 0.00016365657806210522), ('be safe for unmarried couples without the harassment of police hotel staff and moral police', 0.0001580327990375373), ('would be safe for unmarried couples without the harassment of police hotel staff and moral', 0.0001580327990375373), ('would be safe for unmarried couples without the harassment of police hotel staff and moral police', 0.0001580327990375373), ('hillstation would be safe for unmarried couples without the harassment of police hotel staff and', 7.428848254230938e-05), ('hillstation would be safe for unmarried couples without the harassment of police hotel staff and moral police', 7.428848254230938e-05), ('hillstation would 

In [222]:
topicsDocs = []
# Print the leading terms of every topic except the outlier topic (-1) and
# collect them as [space_joined_terms, topic_id] pairs.
# sorted() makes both the print order and the order of topicsDocs
# deterministic; iterating a bare set has no guaranteed order.
for topic_num in sorted(set(topics)):
    if topic_num == -1:
        continue  # Skip the -1 topic as it represents outliers

    # The slice allows up to 30 terms, but the model was created with
    # top_n_words=20, so presumably at most 20 come back — confirm.
    top_words = topic_model.get_topic(topic_num)[:30]
    print(f"Topic {topic_num}:")
    for word, score in top_words:
        print(f"  {word}: {score:.4f}")

    # One string holding all of the topic's terms, paired with its id.
    doc = " ".join(term for term, _weight in top_words)
    topicsDocs.append([doc, topic_num])
    print("\n")


Topic 0:
  would be safe for unmarried couples without the harassment of police hotel staff and moral police: 0.0029
  would be safe for unmarried couples without the harassment of police hotel staff and moral: 0.0029
  be safe for unmarried couples without the harassment of police hotel staff and moral police: 0.0029
  were the major effects of the cambodia earthquake and how do these effects compare to the: 0.0023
  were the major effects of the cambodia earthquake and how do these effects compare to: 0.0023
  what were the major effects of the cambodia earthquake and how do these effects compare: 0.0023
  the major effects of the cambodia earthquake and how do these effects compare to the: 0.0023
  what were the major effects of the cambodia earthquake and how do these effects compare to: 0.0023
  what were the major effects of the cambodia earthquake and how do these effects compare to the: 0.0023
  significance of the battle of somme and how did this battle compare and contrast to

In [196]:
# Show the [joined_terms, topic_id] pairs collected for each non-outlier topic.
topicsDocs

[['would be safe for unmarried couples without the harassment of police hotel staff and moral police would be safe for unmarried couples without the harassment of police hotel staff and moral be safe for unmarried couples without the harassment of police hotel staff and moral police were the major effects of the cambodia earthquake and how do these effects compare to the were the major effects of the cambodia earthquake and how do these effects compare to what were the major effects of the cambodia earthquake and how do these effects compare the major effects of the cambodia earthquake and how do these effects compare to the what were the major effects of the cambodia earthquake and how do these effects compare to what were the major effects of the cambodia earthquake and how do these effects compare to the significance of the battle of somme and how did this battle compare and contrast to the battle battle of somme and how did this battle compare and contrast to the battle of signific

In [None]:
# Display names for topic ids 0-5: the position of a name in the list is its
# topic id. Topic -1 has no entry here.
topic_id_to_label = dict(enumerate([
    'Social Issues and Historical Events',
    'Education and Online Learning Platforms',
    'Mental Health and Substance Use',
    'Account Recovery and Security',
    'Probability and Mathematics',
    'Cosmology and Physics',
]))


In [None]:
def SaveTopicsToFile(topic_model, n_words, file_name, title, labels=None):
    """Write the leading terms of every non-outlier topic to a text file.

    File layout: the title, a blank line, then for each topic a
    ``The Topic <label>:`` header, one ``<term>:`` line per term, and two
    blank lines.

    Args:
        topic_model: Fitted topic model exposing ``get_topics()`` (mapping of
            topic id to terms) and ``get_topic(topic_id)`` (list of
            ``(term, score)`` pairs).
        n_words: Maximum number of terms written per topic.
        file_name: Path of the output file; an existing file is overwritten.
        title: Text written on the first line of the file.
        labels: Optional mapping of topic id to display name. Defaults to the
            module-level ``topic_id_to_label``. A topic without an entry is
            written under its numeric id instead of raising ``KeyError``.
    """
    if labels is None:
        labels = topic_id_to_label

    # Take the topic ids from the model that was passed in rather than from a
    # notebook-global `topics` list, and sort them for a stable file layout.
    # Topic -1 is skipped because it represents outliers.
    topic_ids = sorted(
        topic_num for topic_num in topic_model.get_topics() if topic_num != -1
    )

    # Explicit UTF-8 so non-ASCII terms do not depend on the platform's
    # default encoding. The with-block closes the file; no close() is needed.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(title + '\n\n')
        for topic_num in topic_ids:
            label = labels.get(topic_num, topic_num)
            f.write(f"The Topic {label}:\n")
            # Only the terms are written; the scores are dropped.
            for word, _score in topic_model.get_topic(topic_num)[:n_words]:
                f.write(f"{word}:\n")
            f.write("\n\n")

In [225]:
# Write up to 5 terms per non-outlier topic to 'BerTopic.txt', headed by the given title.
SaveTopicsToFile(topic_model, 5, 'BerTopic.txt','topics With BerTopic and ngrams')

In [218]:
# Attach each question's assigned topic id as a new column.
# assumes `topics` is still in df's row order (it was fitted on
# list(df['Question'])) — TODO confirm df was not reloaded or filtered in between.
df['Topic']=topics

In [219]:
# Number of questions assigned to each topic id (-1 is the outlier topic).
df['Topic'].value_counts()

Topic
-1    11448
 0     6437
 1     4919
 2     4029
 3     2007
 4     1395
 5      500
Name: count, dtype: int64

In [220]:
# Translate topic ids into display names. Ids with no entry in
# topic_id_to_label (such as -1) come out of .map() as NaN and are then
# filled with "Outlier".
# NOTE(review): the saved preview of this column shows labels such as
# 'Career' and 'Social Media' that are not in topic_id_to_label as defined in
# this notebook — the output looks stale; re-run the cells in order to refresh.
df["Topic_Label"] = df["Topic"].map(topic_id_to_label).fillna("Outlier")


In [221]:
# Preview the first ten rows of question text, topic id and topic label
df[['Question', 'Topic', 'Topic_Label']].head(10)

Unnamed: 0,Question,Topic,Topic_Label
0,"Like everyone else (here in U.S), I work with ...",2,Career
1,Hello dear's people. i have a fictief research...,4,Social Media
2,A lady buys goods worth 200 bucks from a shop ...,4,Social Media
3,I am turning 25 in about a month and am curren...,2,Career
4,I'm a 34 years old married to a woman. I had a...,2,Career
5,My employer has told me that we can not accept...,-1,Outlier
6,I have been using my girlfriend for a sexual r...,2,Career
7,"You have 100 coins laying flat on a table, eac...",4,Social Media
8,I broke up with him. I love him so much but we...,2,Career
9,I broke up with him. I love him so much but we...,2,Career


In [211]:
# NOTE(review): this displays the entire `documents` list; consider
# documents[:5] to keep the saved output small.
documents

['Like everyone else (here in U.S), I work with a group of people I see and interact with every day. As a human rights advocate and Lesbian, my views are liberal and I by no means preach my beliefs in my work place. However, I want to know, when is it OK to state your opinions on an issue. For example, if someone mentions that their next door neighbor, who happens to be a 12 year old boy, was playing with dolls and that "this is going down the wrong road and he is strange/weird", is it OK for me to say that we shouldn\'t judge him for who he is? So confused... This just happened at lunch and I am very upset about it...',
 "Hello dear's people. i have a fictief research quest,  If someone standing from 13 feet away from me with a 50AE Desert Eagle with ammunition 300grain hollow point hornady xtp 1475fps at the muzzle he's shooting one bullet between my eyes? I say it with emphasis if there any smallest chance that i could survive? So yes how much % chance can u that analyse in detail. 