In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

# Load dataset
df = pd.read_csv('./dataset/quora_questions_filtered.csv')
documents = list(df['Question'])

# Preprocess text (you can add more preprocessing steps as needed)
def preprocess_text(text):
    return text.lower()  # Convert to lowercase

documents = [preprocess_text(doc) for doc in documents]

# Create Document-Term Matrix (DTM)
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = cv.fit_transform(documents)

# Fit NMF Model
n_topics = 10  # Adjust the number of topics as needed
nmf = NMF(n_components=n_topics, random_state=42)
nmf.fit(dtm)

# Function to get top words per topic
def get_top_words_per_topic(model, feature_names, n_top_words=20):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        top_words_indices = topic.argsort()[-n_top_words:][::-1]
        top_words = [feature_names[i] for i in top_words_indices]
        topics.append(top_words)
    return topics

# Get feature names (words)
feature_names = cv.get_feature_names_out()

# Get top words for each topic
nmf_topics = get_top_words_per_topic(nmf, feature_names)
for i, topic in enumerate(nmf_topics):
    print(f"NMF - The top 20 words for topic #{i}: {topic}")

# Optional: Calculate Coherence Score
# Note: You can calculate coherence scores for NMF similarly if desired, but it might require additional setup.


NMF - The top 20 words for topic #0: ['police', 'hotel', 'safe', 'staff', 'unmarried', 'moral', 'couples', 'harassment', 'station', 'hill', 'hills', 'nallamala', 'pune', 'ananthagiri', 'shimla', 'kolkata', 'manali', 'srinagar', 'ooty', 'deolo']
NMF - The top 20 words for topic #1: ['does', 'mean', 'compare', 'lake', 'lakes', 'make', 'person', 'come', 'water', 'long', 'work', 'says', 'great', 'matter', 'convictions', 'wildlife', 'feel', 'say', 'differ', 'know']
NMF - The top 20 words for topic #2: ['battle', 'did', 'compare', 'contrast', 'significance', 'somme', 'desert', 'calabria', 'arthur', 'port', 'ones', 'temperatures', 'average', 'bataan', 'rostov', 'borodino', 'cold', 'gobi', 'leningrad', 'nanshan']
NMF - The top 20 words for topic #3: ['want', 'year', 'india', 'years', 'old', 'don', 'know', 'best', 'job', 'start', 'make', 'work', 'engineering', 'life', 'need', 'account', 'just', 'student', 'number', 'way']
NMF - The top 20 words for topic #4: ['notes', '500', '1000', 'money', 'b

In [5]:
labels=[
 "Government",
    "Online Business",
    "Social Harassment",
    "Relationships",
    "cosmology and physics",
    "Exam and university",
    "carrer",
    "Education",
    "Mathematics",
    "Accommodation",
    "Historical Battles",
    "Geological and Environmental Effects",
    " Water Comparisons"
    ]

In [6]:
from transformers import pipeline

# Load a text classification model (you can choose a specific model for your needs)
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def generate_label(words,candidate_labels):
    input_text = f"Words: {', '.join(words)}"
    
    # Candidate labels to guide classification, this can be modified based on common topics in your data

    generated = classifier(input_text, candidate_labels=candidate_labels)
    
    # Return the top label
    return generated['labels'][0]

In [15]:
def SaveTopicsToFile(model, cv, n_words, file_name,title):
    with open(file_name, 'w') as f:
        f.write(title + '\n\n')
        for index, topic in enumerate(model.components_):
            f.write(f"The top {n_words} words for topic #{index}:\n")
            top_words = [cv.get_feature_names_out()[i] for i in topic.argsort()[-n_words:]]
            f.write(', '.join(top_words) + '\n\n')
    print(f"Topics have been successfully saved to {file_name}")


In [16]:
SaveTopicsToFile(nmf, cv, 20, 'topics_NMF.txt','topic modeling NMF used 10 topics')

Topics have been successfully saved to topics_NMF.txt


In [7]:
topic_to_label = {}
for index, topic in enumerate(nmf_topics):
    top_words=topic[:30]
    print(f"Topic {index}",top_words)
    label = generate_label(top_words, labels)
    print(f"Topic {index} is about {label}")
    topic_to_label[index] = label

Topic 0 ['police', 'hotel', 'safe', 'staff', 'unmarried', 'moral', 'couples', 'harassment', 'station', 'hill', 'hills', 'nallamala', 'pune', 'ananthagiri', 'shimla', 'kolkata', 'manali', 'srinagar', 'ooty', 'deolo']
Topic 0 is about Accommodation
Topic 1 ['does', 'mean', 'compare', 'lake', 'lakes', 'make', 'person', 'come', 'water', 'long', 'work', 'says', 'great', 'matter', 'convictions', 'wildlife', 'feel', 'say', 'differ', 'know']
Topic 1 is about  Water Comparisons
Topic 2 ['battle', 'did', 'compare', 'contrast', 'significance', 'somme', 'desert', 'calabria', 'arthur', 'port', 'ones', 'temperatures', 'average', 'bataan', 'rostov', 'borodino', 'cold', 'gobi', 'leningrad', 'nanshan']
Topic 2 is about Historical Battles
Topic 3 ['want', 'year', 'india', 'years', 'old', 'don', 'know', 'best', 'job', 'start', 'make', 'work', 'engineering', 'life', 'need', 'account', 'just', 'student', 'number', 'way']
Topic 3 is about carrer
Topic 4 ['notes', '500', '1000', 'money', 'black', 'rupee', 'r

In [8]:
topic_to_label

{0: 'Accommodation',
 1: ' Water Comparisons',
 2: 'Historical Battles',
 3: 'carrer',
 4: 'Government',
 5: 'Geological and Environmental Effects',
 6: 'cosmology and physics',
 7: 'Accommodation',
 8: 'carrer',
 9: 'Mathematics'}

In [9]:
topic_results=nmf.transform(dtm)


In [10]:
topic_results[0]

array([0.        , 0.00605198, 0.        , 0.20229743, 0.        ,
       0.        , 0.00614091, 0.        , 0.18683598, 0.        ])

In [11]:
topic_results[0].round(2)

array([0.  , 0.01, 0.  , 0.2 , 0.  , 0.  , 0.01, 0.  , 0.19, 0.  ])

In [12]:
topic_to_label[topic_results[0].argmax()]

'carrer'

In [13]:
# adding Topic label for the dataframe
df['TopicId']=topic_results.argmax(axis=1)
df['Topic'] = [topic_to_label[topic.argmax()] for topic in topic_results]

In [14]:
df.head(20)

Unnamed: 0,Question,doclen,TopicId,Topic
0,"Like everyone else (here in U.S), I work with ...",125,3,carrer
1,Hello dear's people. i have a fictief research...,80,7,Accommodation
2,A lady buys goods worth 200 bucks from a shop ...,77,4,Government
3,I am turning 25 in about a month and am curren...,73,3,carrer
4,I'm a 34 years old married to a woman. I had a...,73,3,carrer
5,My employer has told me that we can not accept...,72,8,carrer
6,I have been using my girlfriend for a sexual r...,71,3,carrer
7,"You have 100 coins laying flat on a table, eac...",68,2,Historical Battles
8,I broke up with him. I love him so much but we...,68,3,carrer
9,I broke up with him. I love him so much but we...,68,3,carrer
