# Lab 3.2 - Topic Modeling (Тематическое моделирование)

This notebook demonstrates the complete topic modeling pipeline:
1. Loading term-document matrix from Lab 2
2. LDA experiments with different numbers of topics
3. Perplexity analysis and polynomial approximation
4. Finding optimal number of topics
5. Document-topic probability analysis
6. Iteration count experiments



In [1]:
# Import necessary libraries
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Add source directory to path
sys.path.append('source')

from source.data_loader import load_vocabulary_from_lab2, split_train_test
from source.topic_modeling import TopicModeler
from source.experiments import run_lda_experiments, run_iteration_experiments
from source.analysis import plot_perplexity_vs_topics, find_optimal_topics, polynomial_approximation

# Set up plotting style.
# Newer matplotlib releases name the bundled seaborn style sheet
# 'seaborn-v0_8', older ones 'seaborn'. Try them in that order and keep the
# matplotlib defaults when neither is available.
for style_name in ('seaborn-v0_8', 'seaborn'):
    try:
        plt.style.use(style_name)
    except OSError:
        # matplotlib raises OSError for an unknown style name. Catching only
        # that (instead of a bare `except:`) lets real problems, including
        # KeyboardInterrupt, surface instead of being silently discarded.
        continue
    break
sns.set_palette("husl")

print("Libraries imported successfully!")


Libraries imported successfully!


## Step 1: Load Term-Document Matrix from Lab 2

Load the term-document matrix and vocabulary that were created in Lab 2 Task 1.


In [2]:
# --- Settings for this step ---
# Directory of the Lab 2 project, relative to this notebook.
lab2_dir = "../lab2"
# Forwarded to the loader; set to False to keep all words.
filter_stopwords = True

# --- Load the term-document matrix and vocabulary ---
print("Loading term-document matrix and vocabulary from lab2...")
print("Stop word filtering: ENABLED" if filter_stopwords else "Stop word filtering: DISABLED")

dict_data, term_doc_matrix, vocabulary = load_vocabulary_from_lab2(lab2_dir, filter_stopwords=filter_stopwords)

# Summary of what was loaded. The document count is read from the second
# axis of the matrix shape.
print(f"\nTerm-document matrix shape: {term_doc_matrix.shape}")
print(f"Vocabulary size: {len(vocabulary)}")
print(f"Number of documents: {term_doc_matrix.shape[1]}")
# NOTE(review): slicing assumes `vocabulary` is a sequence (e.g. a list) - confirm in source.data_loader.
print(f"\nSample vocabulary words (first 20): {vocabulary[:20]}")


Loading term-document matrix and vocabulary from lab2...
Stop word filtering: ENABLED


FileNotFoundError: Token dictionary not found at ../lab2\assets\token_dictionary.pkl. Please run Lab 2 Task 1 first.

## Step 2: Split into Train and Test Sets

Split the term-document matrix for training and evaluation.


In [None]:
# Partition the term-document matrix into a training part and a held-out part.
# NOTE(review): test_size=0.2 presumably holds out 20% of the documents and
# random_state=42 fixes the shuffle - confirm against source.data_loader.
train_matrix, test_matrix = split_train_test(
    term_doc_matrix,
    test_size=0.2,
    random_state=42,
)

# Report each part's full shape, then its document count (second axis).
train_shape, test_shape = train_matrix.shape, test_matrix.shape
print(f"Train matrix: {train_shape}")
print(f"Test matrix: {test_shape}")
print(f"Train documents: {train_shape[1]}")
print(f"Test documents: {test_shape[1]}")


## Step 3: Task 1 - LDA Experiments with Different Numbers of Topics

Run LDA experiments with varying numbers of topics. The dataset has 4 classes, so we'll test: 2, 4, 5, 10, 20, 40 topics.


In [None]:
# Number of classes in dataset (AG News has 4 classes)
num_classes = 4

# Candidate topic counts: a fixed grid plus the class count, de-duplicated
# and put in ascending order in one step.
n_topics_list = sorted({2, 5, 10, 20, 40, num_classes})

print(f"Testing number of topics: {n_topics_list}")

# Run the LDA experiments over every candidate topic count.
results = run_lda_experiments(
    train_matrix,
    test_matrix,
    vocabulary,
    n_topics_list,
    n_iter=10,
)

print(f"\nCompleted {len(results)} experiments")


## Step 4: Display Top Words for Each Topic

Show the top 10 keywords for each topic from the 10-topic experiment.


In [None]:
# Display top words for the experiment with `display_n_topics` topics.
# The topic count lives in one constant so the lookup and the heading cannot
# drift apart.
display_n_topics = 10
result_10 = next((r for r in results if r['n_topics'] == display_n_topics), None)

if result_10 is None:
    # Without this branch the cell produced no output at all when the requested
    # topic count was not among the experiments, which looks like a hang or an
    # empty result rather than a configuration mismatch.
    tested_topic_counts = sorted(r['n_topics'] for r in results)
    print(f"No experiment with n_topics={display_n_topics} found; "
          f"available topic counts: {tested_topic_counts}")
else:
    print(f"Top 10 words for each topic (n_topics={display_n_topics}):")
    print("=" * 60)
    # Each 'top_words' entry maps a topic id to a sequence of (word, probability) pairs.
    for topic_id, words in result_10['top_words'].items():
        print(f"\nTopic {topic_id}:")
        for i, (word, prob) in enumerate(words, 1):
            print(f"  {i:2d}. {word:15s} (prob: {prob:.6f})")


## Step 5: Task 2 - Perplexity Analysis with Polynomial Approximation

Plot perplexity vs. number of topics and fit a polynomial curve using the R-squared metric.


In [None]:
# Draw perplexity against topic count and collect the polynomial fit details.
# best_reg and best_poly_features are returned by the helper but are not used
# in this cell.
(
    coefs,
    r2,
    degree,
    best_reg,
    best_poly_features,
) = plot_perplexity_vs_topics(results)

print("\nPolynomial Approximation Results:")
print(f"  Best degree: {degree}")
print(f"  R-squared: {r2:.4f}")
print(f"  Coefficients: {coefs}")


## Step 6: Task 3 - Find Optimal Number of Topics

Determine the optimal number of topics using different methods.


In [None]:
# Ask the helper for the optimal topic count twice, once per selection method
# ('elbow' first, then 'min_perplexity').
optimal_topics_elbow, optimal_topics_min = (
    find_optimal_topics(results, method=selection_method)
    for selection_method in ('elbow', 'min_perplexity')
)

heavy_rule = "=" * 60
print(heavy_rule)
print("Optimal Number of Topics Analysis")
print(heavy_rule)
print(f"\nElbow method: {optimal_topics_elbow} topics")
print(f"Minimum perplexity: {optimal_topics_min} topics")

# List the perplexity of every configuration in ascending topic count.
print("\nPerplexity by number of topics:")
print("-" * 60)
results_by_topic_count = sorted(results, key=lambda entry: entry['n_topics'])
for entry in results_by_topic_count:
    print(f"  {entry['n_topics']:2d} topics: {entry['perplexity']:8.2f}")


## Step 7: Document-Topic Probability Analysis

Examine document-topic probability distributions for the optimal model.


In [None]:
# Pick the experiment whose topic count equals the elbow optimum. When no
# experiment matches, the first entry of `results` is used instead.
optimal_result = next(
    (candidate for candidate in results if candidate['n_topics'] == optimal_topics_elbow),
    results[0],
)
doc_topic_dist = np.array(optimal_result['doc_topic_distribution'])

# Rows are reported as documents and columns as topics.
print(f"Document-topic distribution shape: {doc_topic_dist.shape}")
print(f"Number of documents: {doc_topic_dist.shape[0]}")
print(f"Number of topics: {doc_topic_dist.shape[1]}")

# Per-topic summary over all documents: walking the transpose yields one
# column (topic) of the distribution at a time.
print("\nTopic probability statistics:")
print("-" * 60)
for topic_idx, topic_probs in enumerate(doc_topic_dist.T):
    print(f"Topic {topic_idx}: mean={topic_probs.mean():.4f}, "
          f"std={topic_probs.std():.4f}, max={topic_probs.max():.4f}")


## Step 8: Top Documents per Topic

Show the documents with the highest probability for each topic.


In [None]:
# List the leading documents of every topic for the model selected above.
print(f"Top 5 documents for each topic (n_topics={optimal_result['n_topics']}):")
print("=" * 60)

top_documents_by_topic = optimal_result['top_documents']
for topic_id, ranked_docs in top_documents_by_topic.items():
    print(f"\nTopic {topic_id}:")
    # Only the first five (document index, probability) pairs are shown.
    leading_docs = ranked_docs[:5]
    for rank, (doc_idx, prob) in enumerate(leading_docs, start=1):
        print(f"  {rank}. Document {doc_idx:5d} (probability: {prob:.4f})")


## Step 9: Optional - Iteration Count Experiments

Test different numbers of training iterations to find the optimal value.


In [None]:
# Iteration counts to compare: half of, equal to, and double the baseline.
# The baseline of 10 is the same value passed as n_iter to
# run_lda_experiments earlier in this notebook.
base_iter = 10
n_iters_list = [base_iter // 2, base_iter, 2 * base_iter]
# Topic count held fixed while the iteration count varies.
test_n_topics = optimal_topics_elbow

print(f"Testing iteration counts: {n_iters_list}")
print(f"Using {test_n_topics} topics (optimal from previous experiments)")

iter_results = run_iteration_experiments(
    train_matrix,
    test_matrix,
    vocabulary,
    test_n_topics,
    n_iters_list,
)


In [None]:
# Tabulate the iteration experiments so columns can be plotted directly.
iter_df = pd.DataFrame(iter_results)

# Two side-by-side panels: perplexity on the left, training time on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
perplexity_ax, time_ax = axes
iteration_counts = iter_df['n_iter']

# Left panel: perplexity vs iterations
perplexity_ax.plot(iteration_counts, iter_df['perplexity'], marker='o', linewidth=2, markersize=10)
perplexity_ax.set_xlabel('Number of Iterations', fontsize=12)
perplexity_ax.set_ylabel('Perplexity', fontsize=12)
perplexity_ax.set_title('Perplexity vs Number of Iterations', fontsize=14, fontweight='bold')
perplexity_ax.grid(True, alpha=0.3)

# Right panel: training time vs iterations
time_ax.plot(iteration_counts, iter_df['training_time'], marker='s', linewidth=2, markersize=10, color='orange')
time_ax.set_xlabel('Number of Iterations', fontsize=12)
time_ax.set_ylabel('Training Time (seconds)', fontsize=12)
time_ax.set_title('Training Time vs Number of Iterations', fontsize=14, fontweight='bold')
time_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# The run with the lowest perplexity is reported as the optimal iteration count.
best_iter_result = min(iter_results, key=lambda run: run['perplexity'])
print(f"\nOptimal number of iterations: {best_iter_result['n_iter']}")
print(f"  Perplexity: {best_iter_result['perplexity']:.2f}")
print(f"  Training time: {best_iter_result['training_time']:.2f}s")


## Step 10: Save Results

Save experiment results and document-topic distributions.


In [None]:
# Save results: one JSON file with every experiment, plus one TSV file of
# document-topic probabilities per experiment.
for output_dir in ('assets/results', 'assets/results/distributions'):
    os.makedirs(output_dir, exist_ok=True)

# Save experiment results
# NOTE(review): assumes `results` holds only JSON-serializable values (plain
# lists/dicts/numbers, no NumPy objects) - confirm in source.experiments.
results_file = 'assets/results/lda_experiments.json'
with open(results_file, 'w', encoding='utf-8') as json_file:
    json.dump(results, json_file, indent=2)
print(f"Saved experiment results to {results_file}")

# Save document-topic distributions in TSV format: one row per document,
# holding the document index followed by one probability per topic.
# The loop uses its own variable names on purpose: rebinding the notebook-level
# `doc_topic_dist` here would leave it pointing at the last experiment's
# distribution instead of the optimal model's, which silently changes what
# earlier analysis cells see if they are re-run after this one.
for result in results:
    n_topics = result['n_topics']
    experiment_dist = np.array(result['doc_topic_distribution'])

    output_file = f'assets/results/distributions/doc_topic_dist_n_topics_{n_topics}.tsv'
    with open(output_file, 'w', encoding='utf-8') as tsv_file:
        for doc_idx, probs in enumerate(experiment_dist):
            prob_str = '\t'.join(f"{p:.6f}" for p in probs)
            tsv_file.write(f"{doc_idx}\t{prob_str}\n")

    print(f"Saved document-topic distribution to {output_file}")

print("\nAll results saved successfully!")
