# Gibbs Sampling on Toy Example

In [8]:
"""
    File name: gibbs_sampler.py
    Description: a re-implementation of the Gibbs sampler for http://www.gatsby.ucl.ac.uk/teaching/courses/ml1
    Author: python: Roman Pogodin, MATLAB (original): Yee Whye Teh and Maneesh Sahani
    Date created: October 2018
    Python version: 3.6
"""
# gibbs_sampler.py file provided. Complete sections marked todo

import numpy as np
import pandas as pd
from scipy.special import gammaln
import matplotlib.pyplot as plt
import seaborn as sns

# todo: sample everything from self.rand_gen to control the random seed (works as numpy.random)
class GibbsSampler:
    """
    Gibbs sampler for a topic model that keeps theta and phi explicit.

    Every sweep draws theta[d] ~ Dirichlet(A_dk[d] + alpha) and
    phi[k] ~ Dirichlet(B_kw[k] + beta), recomputes the distribution of the
    topic assignments z_id | x_id, theta, phi, and re-samples the counts
    A_dk and B_kw from it. All random draws go through self.rand_gen so that
    random_seed fully determines a run.
    """

    def __init__(self, n_docs, n_topics, n_words, alpha, beta, random_seed=None):
        """
        :param n_docs:          number of documents
        :param n_topics:        number of topics
        :param n_words:         number of words in vocabulary
        :param alpha:           dirichlet parameter on topic mixing proportions
        :param beta:            dirichlet parameter on topic word distributions
        :param random_seed:     random seed of the sampler
        """
        self.n_docs = n_docs
        self.n_topics = n_topics
        self.n_words = n_words
        self.alpha = alpha
        self.beta = beta
        self.rand_gen = np.random.RandomState(random_seed)

        self.docs_words = np.zeros((self.n_docs, self.n_words))
        self.docs_words_test = None
        self.loglike = None
        self.loglike_test = None
        self.do_test = False

        self.A_dk = np.zeros((self.n_docs, self.n_topics))  # number of words in document d assigned to topic k
        self.B_kw = np.zeros((self.n_topics, self.n_words))  # number of occurrences of word w assigned to topic k
        self.A_dk_test = np.zeros((self.n_docs, self.n_topics))
        self.B_kw_test = np.zeros((self.n_topics, self.n_words))

        # theta[d] is the distribution over topics in document d
        self.theta = np.ones((self.n_docs, self.n_topics)) / self.n_topics
        # phi[k] is the distribution over words in topic k
        self.phi = np.ones((self.n_topics, self.n_words)) / self.n_words

        self.topics_space = np.arange(self.n_topics)
        self.topic_doc_words_distr = np.zeros((self.n_docs, self.n_words, self.n_topics))  # z_id|x_id, theta, phi

    def init_sampling(self, docs_words, docs_words_test=None,
                      theta=None, phi=None, n_iter=0, save_loglike=False):
        """
        Stores the data, allocates all per-run state and initialises the chain.

        :param docs_words:       [n_docs, n_words] array of training counts
        :param docs_words_test:  optional array of held-out counts, same shape
        :param theta:            optional initial doc-topic distributions
        :param phi:              optional initial topic-word distributions
        :param n_iter:           number of sweeps (length of the log-like traces)
        :param save_loglike:     whether to allocate the log-likelihood traces
        """
        # The data, not the constructor arguments, decides the dimensions
        self.n_docs = docs_words.shape[0]
        self.n_words = docs_words.shape[1]

        self.docs_words = docs_words
        self.docs_words_test = docs_words_test
        self.do_test = (docs_words_test is not None)

        if save_loglike:
            self.loglike = np.zeros(n_iter)
            if self.do_test:
                self.loglike_test = np.zeros(n_iter)

        self.A_dk = np.zeros((self.n_docs, self.n_topics))
        self.B_kw = np.zeros((self.n_topics, self.n_words))
        self.A_dk_test = np.zeros((self.n_docs, self.n_topics))
        self.B_kw_test = np.zeros((self.n_topics, self.n_words))
        self.topic_doc_words_distr = np.zeros((self.n_docs, self.n_words, self.n_topics))

        self.init_params(theta, phi)

    def init_params(self, theta=None, phi=None):
        """
        Sets theta and phi (uniform unless given) and samples initial counts.
        """
        if theta is None:
            self.theta = np.ones((self.n_docs, self.n_topics)) / self.n_topics
        else:
            self.theta = theta.copy()

        if phi is None:
            self.phi = np.ones((self.n_topics, self.n_words)) / self.n_words
        else:
            self.phi = phi.copy()

        self.update_topic_doc_words()
        self.sample_counts()

    def run(self, docs_words, docs_words_test=None,
            n_iter=100, theta=None, phi=None, save_loglike=False):
        """
        docs_words is a matrix n_docs * n_words; each entry
        is a number of occurrences of a word in a document
        docs_words_test does not influence the updates and is used
        for validation

        :return: whatever to_return_from_run() returns after n_iter sweeps
        """
        self.init_sampling(docs_words, docs_words_test,
                           theta, phi, n_iter, save_loglike)

        for iteration in range(n_iter):
            self.update_params()
            if save_loglike:
                self.update_loglike(iteration)

        return self.to_return_from_run()

    def to_return_from_run(self):
        """Returns (topic_doc_words_distr, theta, phi) of the current state."""
        return self.topic_doc_words_distr, self.theta, self.phi

    def update_params(self):
        """
        Samples theta and phi, then computes the distribution of
        z_id and samples counts A_dk, B_kw from it
        """
        # The counts of the previous sweep are the sufficient statistics of the
        # Dirichlet posteriors, so they must NOT be cleared before these draws;
        # sample_counts() resets them itself before re-sampling.
        for d in range(self.n_docs):
            self.theta[d, :] = self.rand_gen.dirichlet(self.A_dk[d, :] + self.alpha)

        for k in range(self.n_topics):
            self.phi[k, :] = self.rand_gen.dirichlet(self.B_kw[k, :] + self.beta)

        self.update_topic_doc_words()
        self.sample_counts()

    def update_topic_doc_words(self):
        """
        Computes the distribution of z_id|x_id, theta, phi
        """
        # theta[:, None, :] is (D, 1, K) and phi.T[None, :, :] is (1, W, K), so the
        # product is topic_probs[d, w, k] = theta[d, k] * phi[k, w] with shape (D, W, K)
        topic_probs = self.theta[:, None, :] * self.phi.T[None, :, :]
        # Normalise over topics for every (doc, word) pair
        self.topic_doc_words_distr = topic_probs / topic_probs.sum(axis=2, keepdims=True)

    def sample_counts(self):
        """
        For each document and each word, samples from z_id|x_id, theta, phi
        and adds the results to the counts A_dk and B_kw
        """
        self.A_dk.fill(0)
        self.B_kw.fill(0)

        for d in range(self.n_docs):
            for w in range(self.n_words):
                n_occurrences = int(self.docs_words[d, w])
                if n_occurrences <= 0:
                    continue
                # The occurrences of word w in document d are exchangeable, so their
                # topic assignments are summarised by one multinomial draw.
                counts = self.rand_gen.multinomial(
                    n_occurrences, self.topic_doc_words_distr[d, w, :])
                self.A_dk[d, :] += counts  # doc-topic assignments
                self.B_kw[:, w] += counts  # topic-word assignments

    @staticmethod
    def _weighted_log_sum(weights, probs):
        """Returns sum(weights * log(probs)) over the entries with non-zero weight."""
        weights = np.asarray(weights, dtype=float)
        active = weights != 0  # skipping zero weights avoids 0 * log(0) = nan
        return float(np.sum(weights[active] * np.log(probs[active])))

    def update_loglike(self, iteration):
        """
        Updates loglike of the data, omitting the constant additive term
        with Gamma functions of hyperparameters

        The joint is log p(x, z, theta, phi) up to that constant:
        sum_dk (A_dk + alpha - 1) log theta_dk + sum_kw (B_kw + beta - 1) log phi_kw.
        With test data, the predictive log-likelihood
        sum_dw n_test[d, w] * log(sum_k theta_dk phi_kw) is stored as well.
        """
        self.loglike[iteration] = (
            self._weighted_log_sum(self.A_dk + self.alpha - 1, self.theta)
            + self._weighted_log_sum(self.B_kw + self.beta - 1, self.phi)
        )

        if self.do_test and self.loglike_test is not None:
            # Marginal word probabilities p(w | d) = sum_k theta[d, k] * phi[k, w]
            word_probs = self.theta.dot(self.phi)
            self.loglike_test[iteration] = self._weighted_log_sum(
                self.docs_words_test, word_probs)

    def get_loglike(self):
        """
        Returns log-likelihood at each iteration: (loglike, loglike_test)
        when test data was supplied, otherwise loglike alone.
        """
        if self.do_test:
            return self.loglike, self.loglike_test
        else:
            return self.loglike

class GibbsSamplerCollapsed(GibbsSampler):
    """
    Collapsed Gibbs sampler: theta and phi are integrated out and the chain
    state is the topic assignment of every word token together with the
    counts A_dk and B_kw derived from those assignments.
    """

    def __init__(self, n_docs, n_topics, n_words, alpha, beta, random_seed=None):
        """
        :param n_docs:          number of documents
        :param n_topics:        number of topics
        :param n_words:         number of words in vocabulary
        :param alpha:           dirichlet parameter on topic mixing proportions
        :param beta:            dirichlet parameter on topic word distributions
        :param random_seed:     random seed of the sampler
        """
        super().__init__(n_docs, n_topics, n_words, alpha, beta, random_seed)

        # topics assigned to each (doc, word): one array of topic ids per cell
        self.doc_word_samples = np.ndarray((self.n_docs, self.n_words), dtype=object)
        self.doc_word_samples_test = self.doc_word_samples.copy()

    def _init_assignments(self, n_tokens, doc, word, samples, doc_topic, topic_word):
        """Draws uniform topics for n_tokens tokens of (doc, word) and counts them."""
        if n_tokens > 0:
            topics = self.rand_gen.choice(self.topics_space, size=n_tokens)
            counts = np.bincount(topics, minlength=self.n_topics)
            doc_topic[doc, :] += counts
            topic_word[:, word] += counts
            samples[doc, word] = topics
        else:
            samples[doc, word] = np.array([], dtype=int)

    def init_params(self, theta=None, phi=None):
        """
        Initialises every z_id uniformly at random and builds the counts.

        theta and phi are accepted for interface compatibility with the base
        class but are not used: the collapsed chain has no explicit parameters.
        """
        # init_sampling() may have changed n_docs/n_words since __init__, so the
        # assignment arrays are re-allocated to the current data dimensions.
        self.doc_word_samples = np.empty((self.n_docs, self.n_words), dtype=object)
        self.doc_word_samples_test = np.empty((self.n_docs, self.n_words), dtype=object)

        for doc in range(self.n_docs):
            for word in range(self.n_words):
                # int() lets float-typed count matrices be used as sample sizes
                self._init_assignments(int(self.docs_words[doc, word]), doc, word,
                                       self.doc_word_samples, self.A_dk, self.B_kw)
                if self.do_test:
                    # Kept so the *_test attributes stay populated; they are not
                    # resampled and do not enter the predictive log-likelihood.
                    self._init_assignments(int(self.docs_words_test[doc, word]), doc, word,
                                           self.doc_word_samples_test,
                                           self.A_dk_test, self.B_kw_test)

    def update_params(self):
        """
        Computes the distribution of z_id.
        Sampling of A_dk, B_kw is done automatically as
        each new z_id updates these counters

        Tokens are resampled one at a time: a token is removed from the counts,
        its topic is drawn from
        p(k) ∝ (A_dk + alpha) * (B_kw + beta) / (M_k + W * beta),
        and the counts are restored with the new topic, so every draw
        conditions on all other current assignments.
        """
        smoothing = self.n_words * self.beta
        topic_totals = self.B_kw.sum(axis=1)  # M_k, maintained incrementally below

        for d in range(self.n_docs):
            for w in range(self.n_words):
                samples = self.doc_word_samples[d, w]
                if samples is None or len(samples) == 0:
                    continue

                for i in range(len(samples)):
                    old_topic = samples[i]
                    self.A_dk[d, old_topic] -= 1
                    self.B_kw[old_topic, w] -= 1
                    topic_totals[old_topic] -= 1

                    probs = ((self.A_dk[d, :] + self.alpha)
                             * (self.B_kw[:, w] + self.beta)
                             / (topic_totals + smoothing))
                    probs = probs / probs.sum()

                    new_topic = self.rand_gen.choice(self.topics_space, p=probs)
                    samples[i] = new_topic
                    self.A_dk[d, new_topic] += 1
                    self.B_kw[new_topic, w] += 1
                    topic_totals[new_topic] += 1

    def _posterior_means(self):
        """Returns the posterior-mean (theta, phi) implied by the current counts."""
        theta = self.A_dk + self.alpha
        theta = theta / theta.sum(axis=1, keepdims=True)
        phi = self.B_kw + self.beta
        phi = phi / phi.sum(axis=1, keepdims=True)
        return theta, phi

    def update_loglike(self, iteration):
        """
        Updates loglike of the data, omitting the constant additive term
        with Gamma functions of hyperparameters

        The joint is log p(x, z) up to that constant. With test data, the
        predictive log-likelihood of the held-out counts under the current
        posterior-mean theta and phi is stored as well.
        """
        doc_topic = self.A_dk + self.alpha
        topic_word = self.B_kw + self.beta
        self.loglike[iteration] = (
            np.sum(gammaln(doc_topic)) - np.sum(gammaln(doc_topic.sum(axis=1)))
            + np.sum(gammaln(topic_word)) - np.sum(gammaln(topic_word.sum(axis=1)))
        )

        if self.do_test and self.loglike_test is not None:
            # The test-side counts are never resampled, so a score built on them
            # would be constant across iterations; use p(w | d) from the training
            # counts instead.
            theta, phi = self._posterior_means()
            held_out = np.asarray(self.docs_words_test)
            seen = held_out > 0
            word_probs = theta.dot(phi)
            self.loglike_test[iteration] = np.sum(held_out[seen] * np.log(word_probs[seen]))

    def to_return_from_run(self):
        """
        Returns (doc_word_samples, theta, phi), where theta and phi are the
        posterior means given the final counts. They are also stored on
        self.theta / self.phi so the fitted sampler exposes them afterwards.
        """
        self.theta, self.phi = self._posterior_means()
        return self.doc_word_samples, self.theta, self.phi

def display_topic_word_table(phi, word_labels=None, topic_labels=None, precision=2):
    """
    Prints the topic-word matrix as a table (rows: topics, columns: words).

    :param phi:           [n_topics, n_words] array-like of probabilities
    :param word_labels:   optional column labels (list, numpy array, Index, ...)
    :param topic_labels:  optional row labels
    :param precision:     number of decimals kept when rounding
    """
    df = pd.DataFrame(np.round(phi, precision))
    # Test against None/length rather than truthiness: `if labels:` raises
    # for numpy arrays and pandas Index objects with more than one element.
    if word_labels is not None and len(word_labels) > 0:
        df.columns = word_labels
    if topic_labels is not None and len(topic_labels) > 0:
        df.index = topic_labels
    print(df)

def display_doc_topic_table(theta, doc_labels=None, topic_labels=None, precision=2):
    """
    Prints the doc-topic matrix as a table (rows: documents, columns: topics).

    :param theta:         [n_docs, n_topics] array-like of probabilities
    :param doc_labels:    optional row labels (list, numpy array, Index, ...)
    :param topic_labels:  optional column labels
    :param precision:     number of decimals kept when rounding
    """
    df = pd.DataFrame(np.round(theta, precision))
    # Test against None/length rather than truthiness: `if labels:` raises
    # for numpy arrays and pandas Index objects with more than one element.
    if doc_labels is not None and len(doc_labels) > 0:
        df.index = doc_labels
    if topic_labels is not None and len(topic_labels) > 0:
        df.columns = topic_labels
    print(df)

def read_data(filename):
    """
    Loads a space-separated count file and returns dense train/test matrices.

    Every line of the file holds four integers: ``doc word train test``, with
    1-based document and word ids.
    Examples:
    docs_words_train, docs_words_test = read_data('./code/toyexample.data')
    nips_train, nips_test = read_data('./code/nips.data')
    :param filename:    path to the file
    :return:
    docs_words_train:   training data, [n_docs, n_words] numpy array
    docs_words_test:    test data, [n_docs, n_words] numpy array
    """
    columns = ['doc', 'word', 'train', 'test']
    table = pd.read_csv(filename, dtype=int, sep=' ', names=columns)

    # The largest 1-based id in each column gives the matrix dimensions
    shape = (np.amax(table['doc']), np.amax(table['word']))

    # Convert the 1-based ids into 0-based array positions
    row_idx = table['doc'].values - 1
    col_idx = table['word'].values - 1

    docs_words_train = np.zeros(shape, dtype=int)
    docs_words_train[row_idx, col_idx] = table['train'].values

    docs_words_test = np.zeros(shape, dtype=int)
    docs_words_test[row_idx, col_idx] = table['test'].values

    return docs_words_train, docs_words_test

def _plot_loglike(like_train, like_test, title):
    """Plots the joint and predictive log-likelihood traces on one figure."""
    plt.subplots(figsize=(15, 6))
    plt.plot(like_train, label='Joint')
    plt.plot(like_test, label='Predictive')
    plt.title(title)
    plt.ylabel('loglike')
    plt.xlabel('iteration')
    plt.legend()
    plt.show()


def _plot_heatmap(matrix, title, ylabel, xlabel, cmap, figsize):
    """Shows an annotated seaborn heatmap of matrix with two-decimal labels."""
    plt.figure(figsize=figsize)
    sns.heatmap(matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={'size': 7})
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.show()


def main():
    """
    Runs the standard and the collapsed Gibbs sampler on the toy example and
    prints/plots the fitted distributions and log-likelihood traces.

    :return: (sampler, sampler_collapsed), the two fitted sampler objects
    """
    # run both standard and collapsed gibbs on the toy example consisting of
    # 6 documents, 6 words, and 3 topics.  The true word distribution for each
    # topic should be:
    # [.5 .5 0 0 0 0], [0 0 .5 .5 0 0], [0 0 0 0 .5 .5]

    # NOTE(review): the toy data is described above as having 3 topics, while
    # the samplers below are fitted with 10; confirm this is intentional.
    n_topics = 10     # number of topics
    alpha = 1         # dirichlet prior over topics
    beta = 1          # dirichlet prior over words
    n_iter = 1000     # number of iterations
    random_seed = 0   # random seed

    # Run the standard sampler
    print('Running toyexample.data with the standard sampler')
    docs_words_train, docs_words_test = read_data('/datasets/t1cw-data/toyexample.data')
    n_docs, n_words = docs_words_train.shape

    sampler = GibbsSampler(n_docs=n_docs, n_topics=n_topics, n_words=n_words,
                           alpha=alpha, beta=beta, random_seed=random_seed)

    topic_doc_words_distr, theta, phi = sampler.run(docs_words_train, docs_words_test,
                                                    n_iter=n_iter, save_loglike=True)

    # Zero out entries <= 1e-2 to highlight the "active" probabilities. The mask
    # is a boolean array; wrapping it in a list would add a spurious leading
    # axis to the printed result.
    print(phi * (phi > 1e-2))      # topic-word probabilities
    print(theta * (theta > 1e-2))  # document-topic probabilities
    print(topic_doc_words_distr)   # distribution over topics for each (doc, word) pair

    print("Topic-word distribution (phi) in Standard Gibbs:")
    display_topic_word_table(phi)
    print("\nDoc-topic distribution (theta) in Standard Gibbs:")
    display_doc_topic_table(theta)

    # Test data was passed to run(), so get_loglike() returns a pair
    like_train, like_test = sampler.get_loglike()
    _plot_loglike(like_train, like_test, 'Standard Gibbs Log-likelihood')

    _plot_heatmap(phi, "Standard Topic-Word Distribution (phi)",
                  "Topic", "Word", "Blues", (8, 3))
    _plot_heatmap(theta, "Standard Doc-Topic Distribution (theta)",
                  "Document", "Topic", "Greens", (6, 4))

    # Run the collapsed sampler
    print('Running toyexample.data with the collapsed sampler')

    sampler_collapsed = GibbsSamplerCollapsed(n_docs=n_docs, n_topics=n_topics, n_words=n_words,
                                              alpha=alpha, beta=beta, random_seed=random_seed)

    doc_word_samples, theta, phi = sampler_collapsed.run(docs_words_train, docs_words_test,
                                                         n_iter=n_iter, save_loglike=True)

    for d in range(n_docs):
        for w in range(n_words):
            print(f"doc {d}, word {w}: {sampler_collapsed.doc_word_samples[d, w]}")

    print(phi * (phi > 1e-2))      # topic-word probabilities
    print(theta * (theta > 1e-2))  # document-topic probabilities
    print(doc_word_samples)        # raw topic assignments for each (doc, word) pair

    # Count how many tokens of each word ended up in each topic
    topic_counts = np.zeros((n_topics, n_words))
    for doc in range(doc_word_samples.shape[0]):
        for word in range(doc_word_samples.shape[1]):
            for topic in doc_word_samples[doc, word]:
                topic_counts[topic, word] += 1
    _plot_heatmap(topic_counts, "Collapsed Gibbs: Assignment Counts",
                  "Topic", "Word", "Reds", (8, 3))

    print("\nTopic-word distribution (phi) in Collapsed Gibbs:")
    display_topic_word_table(phi)
    print("\nDoc-topic distribution (theta) in Collapsed Gibbs:")
    display_doc_topic_table(theta)

    like_train_collapsed, like_test_collapsed = sampler_collapsed.get_loglike()
    _plot_loglike(like_train_collapsed, like_test_collapsed, 'Collapsed Gibbs Log-likelihood')

    _plot_heatmap(phi, "Collapsed Topic-Word Distribution (phi)",
                  "Topic", "Word", "Blues", (8, 3))
    _plot_heatmap(theta, "Collapsed Doc-Topic Distribution (theta)",
                  "Document", "Topic", "Greens", (6, 4))

    return sampler, sampler_collapsed


if __name__ == '__main__':
    # Run the toy-example experiment and keep both fitted samplers as
    # module-level names: the statements further down in this file read
    # sampler.loglike, sampler.loglike_test, sampler.phi, etc.
    sampler, sampler_collapsed = main()


## Autocorrelation after Burn-in

In [14]:
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import acf

burnin = 200   # number of initial iterations discarded from every trace
max_lag = 50   # largest lag of the autocorrelation function

# Keep only the part of each log-likelihood trace that follows the burn-in
after_burnin = slice(burnin, None)
predstd = sampler.loglike_test[after_burnin]             # standard sampler, predictive
predcol = sampler_collapsed.loglike_test[after_burnin]   # collapsed sampler, predictive
jointstd = sampler.loglike[after_burnin]                 # standard sampler, joint
jointcol = sampler_collapsed.loglike[after_burnin]       # collapsed sampler, joint


# Autocorrelation of a trace after dropping its NaN / inf entries
def fin(arr, nlags, fft=True):
    """
    Returns the autocorrelation of the finite entries of arr for lags 0..nlags.

    :param arr:    sequence of values; non-finite entries are discarded
    :param nlags:  largest lag to compute
    :param fft:    forwarded to statsmodels' acf
    :return:       array of nlags + 1 values; all zeros (after printing a
                   warning) when the finite values have zero variance
    :raises ValueError: if fewer than nlags + 1 finite values remain
    """
    values = np.array(arr)
    values = values[np.isfinite(values)]
    n_required = nlags + 1

    if values.size < n_required:
        raise ValueError("Not enough finite values to compute autocorrelation.")

    if np.nanstd(values) != 0:
        return acf(values, nlags=nlags, fft=fft)

    # A constant series has no defined autocorrelation
    print("Warning: input array has zero variance after burn-in. Autocorrelation is undefined.")
    return np.zeros(n_required)

# Autocorrelation of every post-burn-in trace (statsmodels acf, via fin)
acf_predstd, acf_predcol, acf_jointstd, acf_jointcol = [
    fin(trace, nlags=max_lag) for trace in (predstd, predcol, jointstd, jointcol)
]

# 2x2 grid of stem plots: top row standard sampler, bottom row collapsed sampler
plt.figure(figsize=(10, 8))
panels = [
    (acf_predstd, 'Standard Gibbs Log Predictive'),
    (acf_jointstd, 'Standard Gibbs Log Joint'),
    (acf_predcol, 'Collapsed Gibbs Log Predictive'),
    (acf_jointcol, 'Collapsed Gibbs Log Joint'),
]
for position, (values, title) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.stem(values)
    plt.title(title)
plt.tight_layout()
plt.show()

## Recover Posteriors

In [None]:
# Inspect Posterior Recovery
print('Estimated topic-word distributions (standard sampler):')
print(np.round(sampler.phi, 2))
print('Estimated doc-topic distributions (standard sampler):')
print(np.round(sampler.theta, 2))

# The collapsed sampler never writes its estimates to .theta / .phi (those
# attributes keep their uniform initial values); its estimates are the
# posterior means that to_return_from_run() computes from the final counts.
_, theta_collapsed, phi_collapsed = sampler_collapsed.to_return_from_run()

print('Estimated topic-word distributions (collapsed sampler):')
print(np.round(phi_collapsed, 2))
print('Estimated doc-topic distributions (collapsed sampler):')
print(np.round(theta_collapsed, 2))


# Gibbs Sampling on NeurIPS data


## Prune data using tf-idf

In [None]:
import pandas as pd
import numpy as np

# Step 1: read the raw count file; each row is (doc, word, train, test)
COLUMNS = ['doc', 'word', 'train', 'test']
data = pd.read_csv('/datasets/t1cw-data/nips.data', sep=' ', header=None, names=COLUMNS)

# Shift the ids so they can index arrays directly (zero-based)
data['doc'] = data['doc'] - 1
data['word'] = data['word'] - 1

# Step 2: read the vocabulary, one entry per line
with open('/datasets/t1cw-data/nips.vocab') as f:
    vocab = list(map(str.strip, f))

# Step 3: score every word with a tf-idf style informativeness metric
N = int(data['doc'].max()) + 1   # Number of docs
W = int(data['word'].max()) + 1  # Number of words
# tf: total training count of each word over all documents
tf = data.groupby('word')['train'].sum()
# df: number of distinct documents whose training count of the word is positive
seen_in_training = data[data['train'] > 0]
df = seen_in_training.groupby('word')['doc'].nunique()
# Words with no positive training count get an idf of 0
idf = np.log(N / (1 + df)).reindex(tf.index, fill_value=0)
tfidf = tf * idf
top = tfidf.nlargest(500).index.to_list()   # ids of the 500 highest-scoring words

# Step 4: keep only rows of the selected words and renumber the words 0..len(top)-1
keep_rows = data['word'].isin(top)
filtered = data[keep_rows].copy()
top = sorted(top)
mapping = dict(zip(top, range(len(top))))
filtered['word'] = filtered['word'].map(mapping)

# Step 5: dense train/test count matrices of the pruned data
N = int(filtered['doc'].max()) + 1
W = int(filtered['word'].max()) + 1
doc_positions = filtered['doc'].values
word_positions = filtered['word'].values
nips_train = np.zeros((N, W), dtype=float)
nips_train[doc_positions, word_positions] = filtered['train'].values
nips_test = np.zeros((N, W), dtype=float)
nips_test[doc_positions, word_positions] = filtered['test'].values

# Step 6: write the pruned counts back out with 1-based doc and word ids
pruned = filtered.assign(doc=filtered['doc'] + 1, word=filtered['word'] + 1)
pruned[COLUMNS].to_csv(
    '/datasets/t1cw-data/prunednips.data', sep=' ', header=False, index=False)

# Step 7: write the matching vocabulary, ordered like the new word ids
keywords = [vocab[idx] for idx in top]
with open('/datasets/t1cw-data/prunednips.vocab', 'w') as f:
    f.writelines(word + '\n' for word in keywords)


## Run Collapsed Sampler on pruned data

In [None]:
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from scipy.special import gammaln
import matplotlib.pyplot as plt
import seaborn as sns

class GibbsSamplerCollapsed:
    """
    Collapsed Gibbs sampler for a topic model with Dirichlet priors.

    The chain state is the topic assignment of every word occurrence
    (nips_samples) together with the counts A_dk (document-topic) and B_kw
    (topic-word). theta and phi are the posterior means implied by the
    current counts. All random draws go through self.rand_gen.
    """

    def __init__(self, n_docs, n_topics, n_words, alpha, beta, random_seed=None):
        """
        Initialize sampler with shapes.

        :param n_docs:       number of documents (replaced by the data shape in init_sampling)
        :param n_topics:     number of topics
        :param n_words:      vocabulary size (replaced by the data shape in init_sampling)
        :param alpha:        dirichlet parameter on topic mixing proportions
        :param beta:         dirichlet parameter on topic word distributions
        :param random_seed:  seed of the sampler's RandomState
        """
        self.n_docs = n_docs
        self.n_topics = n_topics
        self.n_words = n_words
        self.alpha = alpha
        self.beta = beta
        self.rand_gen = np.random.RandomState(random_seed)

        # Placeholders (allocated with the data's shape in init_sampling)
        self.nips_samples = None  # topic assignments for train
        self.nips_samples_test = None  # topic assignments for test
        self.nips = None
        self.nips_test = None
        self.A_dk = None  # document-topic count
        self.B_kw = None  # topic-word count
        self.A_dk_test = None
        self.B_kw_test = None
        self.theta = None
        self.phi = None
        self.topics_space = np.arange(self.n_topics)
        self.topic_nips_distr = None
        self.loglike = None
        self.loglike_test = None
        self.do_test = False

    def init_sampling(self, nips, nips_test=None,
                      theta=None, phi=None, n_iter=0, save_loglike=False):
        """
        Stores the data, allocates all per-run state and initialises the chain.

        :param nips:          [n_docs, n_words] array of training counts
        :param nips_test:     optional array of held-out counts, same shape
        :param theta:         forwarded to init_params (unused there)
        :param phi:           forwarded to init_params (unused there)
        :param n_iter:        number of sweeps (length of the log-like traces)
        :param save_loglike:  whether to allocate the log-likelihood traces
        """
        self.nips = nips
        self.n_docs, self.n_words = nips.shape
        self.do_test = nips_test is not None
        self.nips_test = nips_test

        self.nips_samples = np.empty((self.n_docs, self.n_words), dtype=object)
        self.nips_samples_test = np.empty((self.n_docs, self.n_words), dtype=object)
        self.A_dk = np.zeros((self.n_docs, self.n_topics), dtype=int)
        self.B_kw = np.zeros((self.n_topics, self.n_words), dtype=int)
        self.A_dk_test = np.zeros((self.n_docs, self.n_topics), dtype=int)
        self.B_kw_test = np.zeros((self.n_topics, self.n_words), dtype=int)
        self.theta = np.ones((self.n_docs, self.n_topics)) / self.n_topics
        self.phi = np.ones((self.n_topics, self.n_words)) / self.n_words
        self.topics_space = np.arange(self.n_topics)
        self.topic_nips_distr = np.zeros((self.n_docs, self.n_topics, self.n_words))

        if save_loglike:
            self.loglike = np.zeros(n_iter)
            self.loglike_test = np.zeros(n_iter) if self.do_test else None

        self.init_params(theta, phi)

    def _init_assignments(self, n_tokens, d, w, samples, doc_topic, topic_word):
        """Draws uniform topics for n_tokens occurrences of (d, w) and counts them."""
        if n_tokens > 0:
            topics = self.rand_gen.choice(self.topics_space, size=n_tokens)
            counts = np.bincount(topics, minlength=self.n_topics)
            doc_topic[d, :] += counts
            topic_word[:, w] += counts
            samples[d, w] = topics
        else:
            samples[d, w] = np.array([], dtype=int)

    def init_params(self, theta=None, phi=None):
        """
        Assigns a uniformly random topic to every word occurrence and builds
        the counts. theta and phi are accepted but not used.
        """
        for d in range(self.n_docs):
            for w in range(self.n_words):
                self._init_assignments(int(self.nips[d, w]), d, w,
                                       self.nips_samples, self.A_dk, self.B_kw)
                if self.do_test:
                    # Kept so the *_test attributes stay populated; they are not
                    # resampled and do not enter the predictive log-likelihood.
                    self._init_assignments(int(self.nips_test[d, w]), d, w,
                                           self.nips_samples_test,
                                           self.A_dk_test, self.B_kw_test)

    def sample_counts(self):
        """
        One sweep over all (doc, word) pairs: removes the pair's current
        assignments from the counts, redraws them from
        p(k) ∝ (A_dk + alpha) * (B_kw + beta) / (M_k + W * beta)
        and adds the new assignments back.

        NOTE(review): all occurrences of a pair are redrawn together from a
        single conditional; an exact collapsed sweep would resample one
        occurrence at a time. Left unchanged to preserve the sampler's
        behaviour and cost.
        """
        n_topics = self.n_topics
        smoothing = self.n_words * self.beta
        # M_k (tokens per topic) is maintained incrementally instead of being
        # re-summed from B_kw for every pair.
        topic_totals = self.B_kw.sum(axis=1)

        for d in range(self.n_docs):
            for w in range(self.n_words):
                n_occurrences = int(self.nips[d, w])

                old_samples = self.nips_samples[d, w]
                if len(old_samples) > 0:
                    removed = np.bincount(old_samples, minlength=n_topics)
                    self.A_dk[d, :] -= removed
                    self.B_kw[:, w] -= removed
                    topic_totals -= removed

                if n_occurrences > 0:
                    p = ((self.A_dk[d, :] + self.alpha)
                         * (self.B_kw[:, w] + self.beta)
                         / (topic_totals + smoothing))
                    p = p / p.sum()
                    sampled_topics = self.rand_gen.choice(
                        self.topics_space, size=n_occurrences, p=p)
                    self.nips_samples[d, w] = sampled_topics

                    added = np.bincount(sampled_topics, minlength=n_topics)
                    self.A_dk[d, :] += added
                    self.B_kw[:, w] += added
                    topic_totals += added
                else:
                    self.nips_samples[d, w] = np.array([], dtype=int)

    def update_pd(self):
        """
        Recomputes topic_nips_distr[d, k, w] ∝ theta[d, k] * phi[k, w],
        normalised over topics. Not used by the collapsed updates themselves;
        kept so the attribute reflects the current theta and phi.
        """
        self.topic_nips_distr = self.theta[:, :, None] * self.phi[None, :, :]  # shape (N, K, W)
        self.topic_nips_distr /= np.sum(self.topic_nips_distr, axis=1, keepdims=True)  # normalize over K

    def _posterior_means(self):
        """Returns the posterior-mean (theta, phi) implied by the current counts."""
        theta = (self.A_dk + self.alpha).astype(float)
        theta /= np.sum(theta, axis=1, keepdims=True)
        phi = (self.B_kw + self.beta).astype(float)
        phi /= np.sum(phi, axis=1, keepdims=True)
        return theta, phi

    def update_params(self):
        """
        Runs one sweep and refreshes theta, phi and topic_nips_distr.

        The sweep runs first so that theta and phi are the posterior means of
        the counts it produced rather than of the previous iteration's counts.

        :return: (nips_samples, theta, phi)
        """
        self.sample_counts()
        self.theta, self.phi = self._posterior_means()
        self.update_pd()
        return self.nips_samples, self.theta, self.phi

    def update_loglike(self, iteration):
        """
        Collapsed log-likelihood per iteration.

        The joint is log p(x, z) up to the additive constant in the
        hyperparameters:
        sum_d [sum_k lgamma(A_dk + alpha) - lgamma(sum_k A_dk + K * alpha)]
        + sum_k [sum_w lgamma(B_kw + beta) - lgamma(sum_w B_kw + W * beta)].
        With test data, the predictive log-likelihood of the held-out counts
        under the posterior-mean theta and phi is stored as well.
        """
        doc_topic = self.A_dk + self.alpha
        topic_word = self.B_kw + self.beta
        # Summing (count + prior) over an axis gives sum(count) + K*alpha (or
        # W*beta); adding K*alpha to every entry before summing would count
        # the prior mass K (or W) times over.
        self.loglike[iteration] = (
            np.sum(gammaln(doc_topic)) - np.sum(gammaln(doc_topic.sum(axis=1)))
            + np.sum(gammaln(topic_word)) - np.sum(gammaln(topic_word.sum(axis=1)))
        )

        if self.do_test and self.loglike_test is not None:
            # The test-side counts are never resampled, so a score built on them
            # would be constant; use p(w | d) from the training counts instead.
            theta, phi = self._posterior_means()
            held_out = np.asarray(self.nips_test)
            seen = held_out > 0
            word_probs = theta.dot(phi)
            self.loglike_test[iteration] = np.sum(held_out[seen] * np.log(word_probs[seen]))

    def get_loglike(self):
        """Returns (loglike, loglike_test); loglike_test is None without test data."""
        if self.do_test:
            return self.loglike, self.loglike_test
        else:
            return self.loglike, None

    def run(self, nips, nips_test=None, n_iter=100, theta=None, phi=None, save_loglike=False):
        """
        Initialises the chain on nips and runs n_iter sweeps.

        :return: (nips_samples, theta, phi) after the last sweep
        """
        self.init_sampling(nips, nips_test, theta, phi, n_iter, save_loglike)
        for iteration in range(n_iter):
            self.update_params()
            if save_loglike:
                self.update_loglike(iteration)
        return self.nips_samples, self.theta, self.phi

    @staticmethod
    def display_topic_word_table(phi, word_labels=None, topic_labels=None, precision=2):
        """Prints phi rounded to precision decimals (rows: topics, columns: words)."""
        df = pd.DataFrame(np.round(phi, precision))
        if word_labels is not None:
            df.columns = word_labels
        if topic_labels is not None:
            df.index = topic_labels
        print(df)

    @staticmethod
    def display_doc_topic_table(theta, doc_labels=None, topic_labels=None, precision=2):
        """Prints theta rounded to precision decimals (rows: documents, columns: topics)."""
        df = pd.DataFrame(np.round(theta, precision))
        if doc_labels is not None:
            df.index = doc_labels
        if topic_labels is not None:
            df.columns = topic_labels
        print(df)

def read_data(filename):
    """
    Loads word counts from a space-separated file and returns them as dense train/test matrices.
    Every row of the file is read as four integers: doc, word, train, test. The doc and word
    values are treated as 1-based ids (1 is subtracted to obtain array positions).
    Examples:
    docs_words_train, docs_words_test = read_data('./code/toyexample.data')
    nips_train, nips_test = read_data('./code/nips.data')
    :param filename:    path to the file (handed to pandas.read_csv)
    :return:
    nips_train:   training counts, [n_docs, n_words] numpy int array
    nips_test:    test counts, [n_docs, n_words] numpy int array
    where n_docs and n_words are the largest doc and word ids found in the file
    """
    column_names = ['doc', 'word', 'train', 'test']
    table = pd.read_csv(filename, dtype=int, sep=' ', names=column_names)

    doc_ids = table.loc[:, 'doc']
    word_ids = table.loc[:, 'word']

    # The matrix is sized by the largest ids seen; (doc, word) pairs absent from the file stay 0.
    shape = (np.amax(doc_ids), np.amax(word_ids))
    rows = doc_ids - 1
    cols = word_ids - 1

    nips_train = np.zeros(shape, dtype=int)
    nips_train[rows, cols] = table.loc[:, 'train']

    nips_test = np.zeros(shape, dtype=int)
    nips_test[rows, cols] = table.loc[:, 'test']

    return nips_train, nips_test

def main():
    """
    Fits the collapsed Gibbs sampler to the pruned NIPS data and reports the results.
    The data is read from a fixed path, the sampler is run with K=3 topics, alpha=beta=1,
    200 iterations and seed 0, and the outcome is printed and plotted (topic-assignment
    heatmap, phi/theta tables and heatmaps, log-likelihood traces).
    :return: the GibbsSamplerCollapsed instance after sampling
    """
    nips_train, nips_test = read_data('/datasets/t1cw-data/prunednips.data')
    # Take the problem size from the loaded data: N and W were previously used without being
    # defined here, which fails with NameError (or picks up unrelated globals of another size).
    N, W = nips_train.shape
    K = 3
    alpha = 1
    beta = 1
    n_iter = 200
    seed = 0

    sampler = GibbsSamplerCollapsed(
        n_docs=N, n_topics=K, n_words=W, alpha=alpha, beta=beta, random_seed=seed
    )
    nips_samples, theta, phi = sampler.run(
        nips_train, nips_test, n_iter=n_iter, save_loglike=True
    )
    like_train, like_test = sampler.get_loglike()

    # Print topic assignments for first few doc/word pairs (truncate printing for readability)
    for d in range(min(3, N)):
        for w in range(min(3, W)):
            print(f"doc {d}, word {w}: {sampler.nips_samples[d, w]}")

    # Entries at or below 1e-2 are zeroed out so only the "active" probabilities stand out.
    print("\nActive topic-word probabilities (>1e-2):\n", np.round(phi * (phi > 1e-2), 2))
    print("\nActive doc-topic probabilities (>1e-2):\n", np.round(theta * (theta > 1e-2), 2))

    # Topic assignment heatmap
    # Each entry of nips_samples[doc, word] is iterated as a collection of topic indices.
    topic_counts = np.zeros((K, N))
    for doc in range(N):
        for word in range(W):
            for topic in sampler.nips_samples[doc, word]:
                topic_counts[topic, doc] += 1

    plt.figure(figsize=(8, 3))
    sns.heatmap(topic_counts, annot=True, cmap="Reds", fmt=".0f", annot_kws={'size': 7})
    plt.title("Topic Assignments (Counts per Topic, Document)")
    plt.ylabel("Topic")
    plt.xlabel("Document")
    plt.show()

    print("\nTopic-word distribution (phi):")
    GibbsSamplerCollapsed.display_topic_word_table(phi)
    print("\nDoc-topic distribution (theta):")
    GibbsSamplerCollapsed.display_doc_topic_table(theta)

    # Plot log-likelihood
    plt.figure(figsize=(15, 6))
    plt.plot(like_train, label='Joint log-like')
    if like_test is not None:
        plt.plot(like_test, label='Predictive log-like')
    plt.title('Collapsed Gibbs Log-likelihood')
    plt.ylabel('loglikelihood')
    plt.xlabel('Iteration')
    plt.legend()
    plt.show()

    plt.figure(figsize=(8, 3))
    sns.heatmap(phi, annot=True, cmap="Blues", fmt=".2f", annot_kws={'size': 7})
    plt.title("Collapsed Topic-Word Distribution (phi)")
    plt.ylabel("Topic")
    plt.xlabel("Word")
    plt.show()

    plt.figure(figsize=(6, 4))
    sns.heatmap(theta, annot=True, cmap="Greens", fmt=".2f", annot_kws={'size': 7})
    plt.title("Collapsed Doc-Topic Distribution (theta)")
    plt.ylabel("Document")
    plt.xlabel("Topic")
    plt.show()

    return sampler

# Run the NIPS experiment only when this code is executed directly; the sampler returned by
# main() is bound to a module-level name so that code further down can read its fitted phi.
if __name__ == '__main__':
    nips_sampler = main()


## Get topics

In [None]:
# Summarise every topic by the labels of its highest-probability words.
# NOTE(review): `keywords` is not defined in this cell; presumably a sequence of vocabulary
# labels indexed by word id that an earlier cell provides - confirm before running.
phi = nips_sampler.phi
K = phi.shape[0]
top_n = 10  # Number of top words per topic

# argsort is ascending, so the order is reversed before the first top_n indices are kept.
summary = [
    [keywords[word_idx] for word_idx in phi[topic, :].argsort()[::-1][:top_n]]
    for topic in range(K)
]

# Print summary as a markdown-style table:
print("| Topic | Top 10 Words                |")
print("|-------|-----------------------------|")
for topic_number, words in enumerate(summary, start=1):
    print(f"| {topic_number:5d} | {' '.join(words)} |")


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cb182644-878e-48cb-992b-68a78a5afe3d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>