In [20]:
#Import necessary libraries
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim import corpora, models

In [21]:
# Read the preprocessed dataset from disk into a DataFrame.
csv_path = '../Datasets/preprocessed_df.csv'
df = pd.read_csv(csv_path)

In [22]:
# Remove every row that has a missing value in any column (df is modified in place).
df.dropna(axis=0, how='any', inplace=True)

In [23]:
# Build the TF-IDF document-term matrix from the preprocessed text.
#   max_df=0.95       -> ignore terms that occur in more than 95% of the documents
#   min_df=2          -> ignore terms that occur in fewer than 2 documents
#   max_features=1000 -> cap the vocabulary at 1000 terms
tfidf_options = dict(max_df=0.95, min_df=2, max_features=1000)
vec = TfidfVectorizer(**tfidf_options)
tfidf = vec.fit_transform(df['preprocessed_text'])

In [26]:
# Vocabulary terms of the fitted vectorizer. The tuning loop indexes this array
# by column position to turn NMF component weights back into words.
feature_names = vec.get_feature_names_out()

In [24]:
# Fixed seed, passed as `random_state` to every NMF fit in the tuning loop.
random_state = 696

In [25]:
# Get the features ready for the coherence model: `texts` holds one token list
# per document and `dictionary` is the gensim Dictionary built from those
# tokens. Both are handed to CoherenceModel in the tuning loop.
#
# The column is tokenized in place, so guard on the value type. Without the
# isinstance check, running this cell a second time would call .split() on a
# list and raise AttributeError.
df['preprocessed_text'] = df['preprocessed_text'].apply(
    lambda doc: doc.split() if isinstance(doc, str) else doc
)
texts = df['preprocessed_text'].tolist()
dictionary = corpora.Dictionary(texts)

### Do hyperparameter tuning over different numbers of topics to determine which model has the highest coherence score

In [None]:
# Sweep the number of NMF topics, score every fit with gensim's c_v coherence,
# and keep the fit with the highest score.
num_top_words = 50                   # highest-weighted words taken from each topic
coherence_scores = []                # one score per candidate, in sweep order (2..10 topics)
best_avg_coherence = float('-inf')   # -inf (not 0) so a best model is kept even if no score exceeds 0
best_model = None
best_W = None
best_H = None

for num_topics in range(2, 11):
    nmf = NMF(n_components=num_topics,
              max_iter=200, init='nndsvd', random_state=random_state)
    W = nmf.fit_transform(tfidf)
    H = nmf.components_

    # For each row of H (one topic), take the indices of the num_top_words
    # largest weights, highest first, and map them to vocabulary terms.
    topics = [
        [feature_names[i] for i in topic.argsort()[-num_top_words:][::-1]]
        for topic in H
    ]

    # CoherenceModel's `topn` defaults to 20; pass it explicitly so the number
    # of words scored per topic matches num_top_words instead of the default.
    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary,
                                     coherence='c_v', topn=num_top_words)
    avg_c = coherence_model.get_coherence()
    coherence_scores.append(avg_c)

    # A NaN score compares False here, so it can never replace the current best.
    if avg_c > best_avg_coherence:
        best_avg_coherence = avg_c
        best_model = nmf
        best_W = W
        best_H = H

In [None]:
# Display the NMF estimator selected by the coherence sweep.
best_model