<a href="https://colab.research.google.com/github/rkp74/Topic_Modelling/blob/main/Topic_Modeling_NMF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os

In [None]:
def nmf(X, n_components, max_iter=1000, tol=1e-6, random_state=None):
    """Factorize a non-negative matrix ``X`` as ``W @ H`` with multiplicative updates.

    ``W`` and ``H`` start from uniform random values and are refined by
    alternating multiplicative updates; the reconstruction error is the
    Frobenius norm of ``X - W @ H``.

    Parameters
    ----------
    X : array of shape (n_rows, n_cols)
        Matrix to factorize. Must not contain negative entries.
    n_components : int
        Inner dimension of the factorization.
    max_iter : int, default=1000
        Upper bound on the number of update sweeps.
    tol : float, default=1e-6
        Iteration stops as soon as the reconstruction error is below ``tol``
        or its relative change between two consecutive sweeps is at most
        ``tol``. (An absolute threshold alone is practically unreachable for
        data that is not exactly factorizable, which made every run use all
        ``max_iter`` sweeps.)
    random_state : int or None, default=None
        Seed for the random initialization. ``None`` draws from NumPy's
        global random state, exactly as the function did before this
        parameter existed.

    Returns
    -------
    W : ndarray of shape (n_rows, n_components)
    H : ndarray of shape (n_components, n_cols)

    Raises
    ------
    ValueError
        If ``X`` contains a negative entry.
    """
    # The multiplicative rules only keep the factors non-negative (and
    # meaningful) when the input itself is non-negative.
    if X.size and X.min() < 0:
        raise ValueError("nmf requires a non-negative input matrix.")

    # Random initialization of W and H
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    W = rng.rand(X.shape[0], n_components)
    H = rng.rand(n_components, X.shape[1])

    eps = 1e-9  # keeps the element-wise divisions away from 0/0
    prev_error = None

    for i in range(max_iter):
        # Update H; (W.T @ W) is only n_components x n_components, so this
        # avoids materializing the full W @ H product inside the update.
        H *= (W.T @ X) / ((W.T @ W) @ H + eps)

        # Update W, using the small Gram matrix H @ H.T for the same reason
        W *= (X @ H.T) / (W @ (H @ H.T) + eps)

        # Compute the Frobenius norm as the error
        error = np.linalg.norm(X - W @ H, 'fro')

        stalled = prev_error is not None and abs(prev_error - error) <= tol * prev_error
        if error < tol or stalled:
            print(f"Converged after {i + 1} iterations.")
            break
        prev_error = error

    return W, H


In [None]:
# Experiment configuration shared by the cells of this notebook.
n_samples = 2_000    # how many documents are sliced off the loaded corpus
n_features = 1_000   # passed as max_features to the tf-idf vectorizer
n_components = 15    # number of components requested from nmf
n_top_words = 5      # words reported per topic
batch_size = 128     # not referenced by any cell visible in this notebook


In [None]:

# Load the 20 Newsgroups corpus with headers, footers and quoted replies
# removed. The result is unpacked into two values (return_X_y=True); only the
# first one, `data`, is kept and the second is discarded.
# The shuffle is pinned with random_state=1.
# NOTE(review): presumably this downloads the dataset on first use and reads a
# local cache afterwards — confirm against the scikit-learn documentation.
print("Loading dataset...")
t0 = time()  # timer start; the elapsed time is printed by the next cell
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)


Loading dataset...


In [None]:
# Keep only the first n_samples documents; later cells work on this subset.
data_samples = data[:n_samples]
# t0 was set in the loading cell, so this reports the time elapsed since that
# cell started (including any delay between running the two cells).
print("done in %0.3fs." % (time() - t0))

done in 2.981s.


In [None]:
# Use tf-idf features for training the NMF
# The vectorizer is configured with max_df=0.95, min_df=2, a vocabulary cap of
# n_features and the built-in English stop-word list.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words="english")
t0 = time()
# The vectorizer output is densified and then transposed before it is stored.
# NOTE(review): presumably the result is (n_terms, n_documents), which is why
# the rows of W are later matched against the feature names — confirm.
tfidf = tfidf_vectorizer.fit_transform(data_samples).toarray().T
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 0.403s.


In [None]:
# Perform NMF
# Factorizes the tf-idf matrix into W and H with n_components components.
# No seed is set in this cell and nmf initializes its factors randomly, so the
# factors (and the topics printed below) can differ from run to run.
print("Performing NMF...")
W, H = nmf(tfidf, n_components, max_iter=1000)

Performing NMF...


In [None]:
def extract_topics(W, H, feature_names, n_top_words):
    """Print the highest-weighted features of every topic.

    For each column of ``W`` the rows are ranked by weight, highest first,
    and the leading ``n_top_words`` are printed as ``"<feature>: <weight>"``
    with four decimals, under a ``"Topic <k>:"`` heading.

    Parameters
    ----------
    W : ndarray of shape (n_rows, n_topics)
        Weight of each row (feature) in each topic.
    H : any
        Accepted so existing calls keep working; it is not used.
    feature_names : sequence of str
        Name of the feature stored in each row of ``W``. Any sequence is
        accepted (list, tuple, ndarray); it is converted to an array so that
        it can be indexed with an index array.
    n_top_words : int
        Number of features to print per topic.

    Returns
    -------
    None
    """
    names = np.asarray(feature_names)
    n_names = len(names)

    for topic_idx in range(W.shape[1]):
        topic_weights = W[:, topic_idx]

        # Rank rows by descending weight and keep the leaders; row indices
        # that have no corresponding feature name are dropped.
        top_features_ind = np.argsort(topic_weights)[::-1][:n_top_words]
        top_features_ind = top_features_ind[top_features_ind < n_names]

        print(f"Topic {topic_idx + 1}:")
        for feature, weight in zip(names[top_features_ind], topic_weights[top_features_ind]):
            print(f"{feature}: {weight:.4f}")
        print("\n")

In [None]:
# Extract topics using the learned NMF-like components
# The feature names are wrapped in an ndarray because the functions below
# index them with integer index arrays.
tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
extract_topics(W, H, tfidf_feature_names, n_top_words)

Topic 1:
people: 30.6011
law: 9.1338
government: 9.0207
did: 8.9991
israel: 8.4803


Topic 2:
thanks: 34.2172
know: 21.5730
does: 19.8200
mail: 15.0363
advance: 14.5057


Topic 3:
god: 46.2602
jesus: 12.5687
bible: 10.7995
faith: 7.9381
does: 7.2693


Topic 4:
car: 37.1527
bike: 17.7268
good: 12.1376
cars: 10.9247
engine: 7.4437


Topic 5:
space: 28.8470
nasa: 10.0659
data: 8.4348
earth: 8.1463
book: 7.9802


Topic 6:
game: 31.6287
team: 16.6582
year: 14.0987
games: 13.3489
play: 9.7267


Topic 7:
drive: 37.2247
drives: 14.7257
hard: 12.5856
disk: 11.7736
software: 9.5968


Topic 8:
windows: 33.6943
file: 22.8666
dos: 12.1325
using: 10.9816
use: 10.6158


Topic 9:
edu: 48.8971
soon: 10.9706
com: 9.4081
send: 6.0648
university: 5.4261


Topic 10:
00: 32.9821
10: 15.2739
sale: 13.7677
card: 8.7801
price: 8.4137


Topic 11:
key: 27.6103
chip: 20.9733
clipper: 15.4195
keys: 13.5475
encryption: 13.2521


Topic 12:
like: 46.6235
don: 13.9964
sounds: 11.1396
know: 9.3137
look: 7.0069


Topic 

In [None]:
def visualize_word_clouds(W, feature_names, n_top_words, output_dir="word_clouds"):
    """Save one word-cloud image per topic.

    For every column of ``W`` the ``n_top_words`` highest-weighted rows are
    looked up in ``feature_names`` and drawn as a word cloud sized by weight.
    Each figure is written to
    ``<output_dir>/topic_<k>_wordcloud.png`` (``k`` starting at 1) and then
    closed; ``output_dir`` is created when missing. Nothing is returned.
    """
    os.makedirs(output_dir, exist_ok=True)  # harmless if the folder is already there
    vocab_size = len(feature_names)

    for topic_idx in range(W.shape[1]):
        # Rows ranked by descending weight for this topic, truncated to the leaders
        ranked_rows = np.argsort(W[:, topic_idx])[::-1][:n_top_words]
        # Discard row indices that have no entry in feature_names
        usable_rows = ranked_rows[ranked_rows < vocab_size]

        # word -> weight mapping that drives the size of each word
        frequencies = {feature_names[row]: W[row, topic_idx] for row in usable_rows}

        cloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate_from_frequencies(frequencies)

        # Draw the cloud on a fresh figure without axes
        plt.figure(figsize=(10, 5))
        plt.imshow(cloud, interpolation='bilinear')
        plt.title(f'Topic {topic_idx + 1}')
        plt.axis('off')

        # Write the figure to disk, then release it
        target_path = os.path.join(output_dir, f"topic_{topic_idx + 1}_wordcloud.png")
        plt.savefig(target_path)
        plt.close()

In [None]:
# Call the visualize_word_clouds function
# output_dir is left at its default, so the PNG files go to "word_clouds".
visualize_word_clouds(W, tfidf_feature_names, n_top_words)

In [None]:
def calculate_coherence(W, feature_names, texts, n_top_words):
    """Mean PMI-based coherence of the top words of every topic.

    For each topic (column of ``W``) the ``n_top_words`` highest-weighted
    words are selected. Every unordered pair of those words is scored with
    pointwise mutual information computed from document co-occurrence in
    ``texts``::

        PMI(i, j) = log(co_docs(i, j) * n_docs / (docs(i) * docs(j)) + 1e-10)

    A topic's score is the mean PMI over its word pairs, and the function
    returns the mean of the topic scores.

    Parameters
    ----------
    W : ndarray of shape (n_words, n_topics)
        Weight of each word in each topic.
    feature_names : sequence of str
        Word stored in each row of ``W``; also used as the fixed vocabulary
        of the bag-of-words model.
    texts : sequence of str
        Documents used to count occurrences and co-occurrences.
    n_top_words : int
        Number of top words per topic; capped at ``len(feature_names)``.

    Returns
    -------
    float
        Average coherence over all topics. Pairs containing a word that
        occurs in no document are skipped (their PMI is undefined), and a
        topic without any scorable pair contributes 0.
    """
    num_topics = W.shape[1]
    vocab_size = len(feature_names)
    n_docs = len(texts)

    # Ensure valid indices and adjust n_top_words if needed
    n_top_words = min(n_top_words, vocab_size)

    # Binary bag of words restricted to the given vocabulary:
    # entry (d, w) is 1 when word w occurs in document d.
    vectorizer = CountVectorizer(vocabulary=feature_names, binary=True)
    bow_matrix = vectorizer.fit_transform(texts).toarray()

    # Number of documents containing each word, and each pair of words
    doc_freq = bow_matrix.sum(axis=0)
    co_occurrence_matrix = bow_matrix.T @ bow_matrix

    # The running sum starts at zero so that only topic scores contribute
    coherence = 0.0

    for topic_idx in range(num_topics):
        # Row indices of the top words, highest weight first
        top_features_ind = np.argsort(W[:, topic_idx])[::-1][:n_top_words]
        top_features_ind = top_features_ind[top_features_ind < vocab_size]

        pair_scores = []
        for pos, word_i_idx in enumerate(top_features_ind):
            for word_j_idx in top_features_ind[pos + 1:]:
                freq_product = doc_freq[word_i_idx] * doc_freq[word_j_idx]
                if freq_product == 0:
                    # A word that never occurs has no defined PMI
                    continue

                co_occurrences = co_occurrence_matrix[word_i_idx, word_j_idx]
                # Compute Pointwise Mutual Information (PMI); the small
                # constant keeps log() finite when the pair never co-occurs.
                pair_scores.append(np.log(co_occurrences * n_docs / freq_product + 1e-10))

        # Average over word pairs (not over words)
        if pair_scores:
            coherence += np.mean(pair_scores)

    # Average over topics
    return coherence / num_topics



In [None]:
# Call the function to calculate coherence
# The score is computed on data_samples, the same documents the tf-idf matrix
# was built from, using the same n_top_words as the topic listing.
coherence = calculate_coherence(W, tfidf_feature_names, data_samples, n_top_words)
print(f"Coherence: {coherence}")

Coherence: 2.818793743908425
