<a href="https://colab.research.google.com/github/rkp74/Topic_Modelling/blob/main/Topic_Modeling_NMF_Neural.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and Initialization

In [None]:
import numpy as np
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.metrics import pairwise_distances
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Define the custom NMF-like neural network layer
def custom_nmf_layer(input_layer, n_components):
    W = Dense(n_components, activation='relu')(input_layer)
    H = Dense(n_components, activation='relu', use_bias=False)(input_layer)
    return W, H

In [None]:
def random_initialization(A, rank):
    number_of_documents = A.shape[0]
    number_of_terms = A.shape[1]
    W = np.random.uniform(1, 2, (number_of_documents, rank))
    H = np.random.uniform(1, 2, (rank, number_of_terms))
    return W, H

In [None]:
def nndsvd_initialization(A, rank):
    u, s, v = np.linalg.svd(A, full_matrices=False)
    v = v.T
    w = np.zeros((A.shape[0], rank))
    h = np.zeros((rank, A.shape[1]))

    w[:, 0] = np.sqrt(s[0]) * np.abs(u[:, 0])
    h[0, :] = np.sqrt(s[0]) * np.abs(v[:, 0].T)

    for i in range(1, rank):
        ui = u[:, i]
        vi = v[:, i]
        ui_pos = (ui >= 0) * ui
        ui_neg = (ui < 0) * -ui
        vi_pos = (vi >= 0) * vi
        vi_neg = (vi < 0) * -vi

        ui_pos_norm = np.linalg.norm(ui_pos, 2)
        ui_neg_norm = np.linalg.norm(ui_neg, 2)
        vi_pos_norm = np.linalg.norm(vi_pos, 2)
        vi_neg_norm = np.linalg.norm(vi_neg, 2)

        norm_pos = ui_pos_norm * vi_pos_norm
        norm_neg = ui_neg_norm * vi_neg_norm

        if norm_pos >= norm_neg:
            w[:, i] = np.sqrt(s[i] * norm_pos) / ui_pos_norm * ui_pos
            h[i, :] = np.sqrt(s[i] * norm_pos) / vi_pos_norm * vi_pos.T
        else:
            w[:, i] = np.sqrt(s[i] * norm_neg) / ui_neg_norm * ui_neg
            h[i, :] = np.sqrt(s[i] * norm_neg) / vi_neg_norm * vi_neg.T

    return w, h

# Using Random Initialization

In [None]:
def mu_method(A, k, max_iter, init_mode='random'):
    # Initialize W and H
    W, H = random_initialization(A, k)

    norms = []
    e = 1.0e-10

    for n in range(max_iter):
        # Update H
        W_TA = W.T @ A
        W_TWH = W.T @ W @ H + e

        H = np.multiply(H, (W_TA / W_TWH))

        # Update W
        AH_T = A @ H.T
        WHH_T =  W @ H @ H.T + e

        W = np.multiply(W, (AH_T / WHH_T))

        norm = np.linalg.norm(A - W @ H, 'fro')
        norms.append(norm)

    return W, H, norms


In [None]:
n_samples = 2000
n_features = 1000
n_components = 5
n_top_words = 20
batch_size = 128
init = "nndsvda"

In [None]:
# Load and preprocess the data
print("Loading dataset...")
t0 = time()
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)

Loading dataset...


In [None]:
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))

done in 1.449s.


In [None]:
# Use tf-idf features for training the NMF-like neural network
print("Extracting tf-idf features for NMF-like neural network training...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words="english")
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples).toarray().T
print("done in %0.3fs." % (time() - t0))


Extracting tf-idf features for NMF-like neural network training...
done in 0.410s.


In [None]:
# Call the mu_method for NMF
A = tfidf.T  # Transpose the tfidf matrix to match the input shape
W, H, norms = mu_method(A, n_components, max_iter=200, init_mode=init)

In [None]:
def extract_topics(W, H, feature_names, n_top_words):
    num_topics = W.shape[1]

    for topic_idx in range(num_topics):
        # Ensure indices are within the valid range
        top_features_ind = np.argsort(W[:, topic_idx])[::-1][:n_top_words]

        # Ensure top_features_ind is within the valid range of feature_names
        valid_top_features_ind = top_features_ind[top_features_ind < len(feature_names)]
        top_features = feature_names[valid_top_features_ind]
        weights = W[valid_top_features_ind, topic_idx]

        print(f"Topic {topic_idx + 1}:")
        for feature, weight in zip(top_features, weights):
            print(f"{feature}: {weight:.4f}")
        print("\n")

In [None]:
# Extract topics using the learned NMF-like components
tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
extract_topics(W, H, tfidf_feature_names, n_top_words = 20)

Topic 1:
mac: 23.3556
difference: 21.3883
ll: 16.4255
serial: 16.4236
oil: 15.9419
project: 14.9202


Topic 2:
information: 16.0106
shot: 13.5583
book: 13.4951
gov: 12.4691
systems: 11.2476
3d: 11.2385
tv: 10.8619
board: 10.3814


Topic 3:
size: 11.3294
limited: 10.6152
nasa: 10.2993
changes: 9.9262
armenia: 9.3184
won: 9.0258
research: 8.7738
suppose: 8.6806


Topic 4:
43: 14.1276
energy: 11.9571
head: 11.2984
jobs: 11.2523
recent: 11.2057
clear: 10.9800
women: 10.7775


Topic 5:
years: 16.3171
medical: 15.9220
reply: 14.2778
date: 14.2589
output: 12.7142
hard: 12.4516
gives: 12.0153
author: 11.9079
defense: 11.8827




In [None]:
def visualize_word_clouds(W, feature_names, n_top_words, output_dir="word_clouds"):
    # Create an output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    num_topics = W.shape[1]

    for topic_idx in range(num_topics):
        # Get the top words for the current topic
        top_features_ind = np.argsort(W[:, topic_idx])[::-1][:n_top_words]

        # Ensure the indices are within the valid range of feature_names
        valid_top_features_ind = top_features_ind[top_features_ind < len(feature_names)]
        top_features = [feature_names[i] for i in valid_top_features_ind]
        weights = [W[i, topic_idx] for i in valid_top_features_ind]

        # Create a dictionary of words and their weights for the word cloud
        wordcloud_data = {top_features[i]: weights[i] for i in range(len(top_features))}

        # Create and save the word cloud for the current topic
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(wordcloud_data)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f'Topic {topic_idx + 1}')
        plt.axis('off')

        # Save the word cloud as an image
        image_file = os.path.join(output_dir, f"topic_{topic_idx + 1}_wordcloud.png")
        plt.savefig(image_file)
        plt.close()

In [None]:
# Call the visualize_word_clouds function
visualize_word_clouds(W, tfidf_feature_names, n_top_words)

In [None]:
def calculate_coherence(W, feature_names, texts, n_top_words):
    num_topics = W.shape[1]

    # Ensure valid indices and adjust n_top_words if needed
    n_top_words = min(n_top_words, len(feature_names))

    # Extract top words for each topic
    top_words_per_topic = []
    for topic_idx in range(num_topics):
        top_features_ind = np.argsort(W[:, topic_idx])[::-1][:n_top_words]
        top_words = [feature_names[i] for i in top_features_ind if i < len(feature_names)]
        top_words_per_topic.append(top_words)

    # Create a CountVectorizer to convert text to a bag of words
    vectorizer = CountVectorizer(vocabulary=feature_names, binary=True)
    bow_matrix = vectorizer.fit_transform(texts).toarray()

    # Calculate co-occurrence matrix (word by word)
    co_occurrence_matrix = np.dot(bow_matrix.T, bow_matrix)
    np.fill_diagonal(co_occurrence_matrix, 0)

    # Initialize coherence
    coherence = 0.0

    # Calculate coherence for each topic
    for topic_words in top_words_per_topic:
        topic_coherence = 0.0
        for i in range(len(topic_words)):
            for j in range(i + 1, len(topic_words)):
                word_i, word_j = topic_words[i], topic_words[j]
                if word_i in feature_names and word_j in feature_names:
                    word_i_idx, word_j_idx = np.where(feature_names == word_i)[0], np.where(feature_names == word_j)[0]
                    if len(word_i_idx) > 0 and len(word_j_idx) > 0:
                        co_occurrences = co_occurrence_matrix[word_i_idx[0], word_j_idx[0]]
                        word_i_freq = np.sum(bow_matrix[:, word_i_idx])
                        word_j_freq = np.sum(bow_matrix[:, word_j_idx])

                        # Compute Pointwise Mutual Information (PMI)
                        pmi = np.log((co_occurrences * len(texts)) / (word_i_freq * word_j_freq) + 1e-10)
                        topic_coherence += pmi

        # Average over word pairs
        topic_coherence /= len(topic_words)
        coherence += topic_coherence

    # Average over topics
    coherence /= num_topics

    return coherence


In [None]:
# Call the function to calculate coherence
coherence = calculate_coherence(W, tfidf_feature_names, data_samples, n_top_words)
print(f"Coherence: {coherence}")

Coherence: -12.981491238355256


# Using Nonnegative Double Singular Value Decomposition

In [None]:
def mu_method(A, k, max_iter, init_mode='random'):
    # Initialize W and H
    W, H = nndsvd_initialization(A, k)

    norms = []
    e = 1.0e-10

    for n in range(max_iter):
        # Update H
        W_TA = W.T @ A
        W_TWH = W.T @ W @ H + e

        H = np.multiply(H, (W_TA / W_TWH))

        # Update W
        AH_T = A @ H.T
        WHH_T =  W @ H @ H.T + e

        W = np.multiply(W, (AH_T / WHH_T))

        norm = np.linalg.norm(A - W @ H, 'fro')
        norms.append(norm)

    return W, H, norms


In [None]:
# Call the mu_method for NMF
A = tfidf.T  # Transpose the tfidf matrix to match the input shape
W, H, norms = mu_method(A, n_components, max_iter=100, init_mode=init)

In [None]:
def extract_topics(W, H, feature_names, n_top_words):
    num_topics = W.shape[1]

    for topic_idx in range(num_topics):
        # Ensure indices are within the valid range
        top_features_ind = np.argsort(W[:, topic_idx])[::-1][:n_top_words]

        # Ensure top_features_ind is within the valid range of feature_names
        valid_top_features_ind = top_features_ind[top_features_ind < len(feature_names)]
        top_features = feature_names[valid_top_features_ind]
        weights = W[valid_top_features_ind, topic_idx]

        print(f"Topic {topic_idx + 1}:")
        for feature, weight in zip(top_features, weights):
            print(f"{feature}: {weight:.4f}")
        print("\n")

In [None]:
# Extract topics using the learned NMF-like components
tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
extract_topics(W, H, tfidf_feature_names, n_top_words = 20)

Topic 1:
size: 0.2262
limited: 0.2051
nasa: 0.1835
armenia: 0.1792
changes: 0.1722
30: 0.1674
looking: 0.1667
suppose: 0.1652


Topic 2:
years: 0.2325
medical: 0.2208
date: 0.2157
author: 0.2087
output: 0.2061
board: 0.1984
hard: 0.1935
basically: 0.1932
given: 0.1847
reply: 0.1825


Topic 3:
mac: 0.4521
difference: 0.3767
ll: 0.2897
oil: 0.2881
serial: 0.2859
project: 0.2599


Topic 4:
recent: 0.2965
clear: 0.2411
head: 0.2204
jobs: 0.2139
connector: 0.2116
43: 0.2038
areas: 0.1942
war: 0.1940
food: 0.1865
women: 0.1825
17: 0.1823


Topic 5:
science: 0.2775
hours: 0.2755
best: 0.2579
results: 0.2532
scsi: 0.2496
cost: 0.2448
president: 0.2402
wouldn: 0.2300
jesus: 0.2197
access: 0.2164




In [None]:
def visualize_word_clouds(W, feature_names, n_top_words, output_dir="word_clouds_svd"):
    # Create an output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    num_topics = W.shape[1]

    for topic_idx in range(num_topics):
        # Get the top words for the current topic
        top_features_ind = np.argsort(W[:, topic_idx])[::-1][:n_top_words]

        # Ensure the indices are within the valid range of feature_names
        valid_top_features_ind = top_features_ind[top_features_ind < len(feature_names)]
        top_features = [feature_names[i] for i in valid_top_features_ind]
        weights = [W[i, topic_idx] for i in valid_top_features_ind]

        # Create a dictionary of words and their weights for the word cloud
        wordcloud_data = {top_features[i]: weights[i] for i in range(len(top_features))}

        # Create and save the word cloud for the current topic
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(wordcloud_data)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f'Topic {topic_idx + 1}')
        plt.axis('off')

        # Save the word cloud as an image
        image_file = os.path.join(output_dir, f"topic_{topic_idx + 1}_wordcloud.png")
        plt.savefig(image_file)
        plt.close()

In [None]:
# Call the visualize_word_clouds function
visualize_word_clouds(W, tfidf_feature_names, n_top_words)

In [None]:
def calculate_coherence(W, feature_names, texts, n_top_words):
    num_topics = W.shape[1]

    # Ensure valid indices and adjust n_top_words if needed
    n_top_words = min(n_top_words, len(feature_names))

    # Extract top words for each topic
    top_words_per_topic = []
    for topic_idx in range(num_topics):
        top_features_ind = np.argsort(W[:, topic_idx])[::-1][:n_top_words]
        top_words = [feature_names[i] for i in top_features_ind if i < len(feature_names)]
        top_words_per_topic.append(top_words)

    # Create a CountVectorizer to convert text to a bag of words
    vectorizer = CountVectorizer(vocabulary=feature_names, binary=True)
    bow_matrix = vectorizer.fit_transform(texts).toarray()

    # Calculate co-occurrence matrix (word by word)
    co_occurrence_matrix = np.dot(bow_matrix.T, bow_matrix)
    np.fill_diagonal(co_occurrence_matrix, 0)

    # Initialize coherence
    coherence = 0.0

    # Calculate coherence for each topic
    for topic_words in top_words_per_topic:
        topic_coherence = 0.0
        for i in range(len(topic_words)):
            for j in range(i + 1, len(topic_words)):
                word_i, word_j = topic_words[i], topic_words[j]
                if word_i in feature_names and word_j in feature_names:
                    word_i_idx, word_j_idx = np.where(feature_names == word_i)[0], np.where(feature_names == word_j)[0]
                    if len(word_i_idx) > 0 and len(word_j_idx) > 0:
                        co_occurrences = co_occurrence_matrix[word_i_idx[0], word_j_idx[0]]
                        word_i_freq = np.sum(bow_matrix[:, word_i_idx])
                        word_j_freq = np.sum(bow_matrix[:, word_j_idx])

                        # Compute Pointwise Mutual Information (PMI)
                        pmi = np.log((co_occurrences * len(texts)) / (word_i_freq * word_j_freq) + 1e-10)
                        topic_coherence += pmi

        # Average over word pairs
        topic_coherence /= len(topic_words)
        coherence += topic_coherence

    # Average over topics
    coherence /= num_topics

    return coherence



In [None]:
# Call the function to calculate coherence
coherence = calculate_coherence(W, tfidf_feature_names, data_samples, n_top_words)
print(f"Coherence: {coherence}")

Coherence: -13.802921864496625
