# Topic Modeling Demo

This notebook demonstrates topic modeling with LDA (Latent Dirichlet Allocation): topic extraction, word-cloud visualization, topic-sentiment correlation analysis, per-domain topic distributions, a simple coherence check, and an interactive Plotly chart.

## Setup and Data Loading

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.analysis import (
    load_data, preprocess_df, lda_topics, topic_sentiment_correlation,
    compute_vader_sentiment
)

# Create the synthetic corpus on first use so the notebook runs from a clean checkout.
# NOTE(review): presumably generate() writes the CSV that DATA points at - confirm in generate_data.
DATA = Path('data', 'synthetic_texts.csv')
if not DATA.exists():
    from generate_data import generate
    generate(800)

# Read the CSV and run the project's preprocessing over it in one step
df = preprocess_df(load_data(DATA))
print(f"Dataset shape: {df.shape}")
print("Label distribution:", df['label'].value_counts(), sep="\n")

## Basic LDA Topic Modeling

In [None]:
# Ask the project helper for 5 topics, keeping the 10 top words of each
topics = lda_topics(df['joined_tokens'], n_topics=5, n_top_words=10)

# One line per topic: its index followed by its comma-separated top words
print("LDA Topics discovered:")
for topic_id, words in enumerate(topics):
    joined_words = ', '.join(words)
    print(f"Topic {topic_id}: {joined_words}")

## Topic Visualization

In [None]:
# Visualize topics using word clouds
from wordcloud import WordCloud

# Size the subplot grid from the number of topics instead of assuming a fixed
# 2x3 layout, so changing n_topics upstream does not overflow the grid.
n_cols = 3
n_rows = max(1, -(-len(topics) // n_cols))  # ceiling division without importing math

# 5 inches of height per row keeps the original 15x10 canvas for a 2-row grid
plt.figure(figsize=(15, 5 * n_rows))
for i, topic_words in enumerate(topics):
    plt.subplot(n_rows, n_cols, i + 1)
    # Every top word appears once in the text, so the cloud shows membership only
    text = ' '.join(topic_words)
    wordcloud = WordCloud(width=400, height=300, background_color='white').generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Topic {i}')
    plt.axis('off')
plt.tight_layout()
plt.show()

## Topic-Sentiment Correlation Analysis

In [None]:
# Run the project's sentiment helper over the preprocessed frame
df_sentiment = compute_vader_sentiment(df)

# Correlate topics with sentiment and show the resulting table
topic_corr = topic_sentiment_correlation(df_sentiment, n_topics=5)
print("Topic-Sentiment Correlation:")
print(topic_corr)

# Bar chart with one bar per row of topic_corr: red below zero, green otherwise
corr_values = topic_corr['corr_with_sentiment']
positions = list(range(len(topic_corr)))
colors = ['red' if value < 0 else 'green' for value in corr_values]

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(positions, corr_values, color=colors)
ax.set_xlabel('Topic')
ax.set_ylabel('Correlation with Sentiment')
ax.set_title('Topic-Sentiment Correlation')
# NOTE(review): labels assume row k of topic_corr describes topic k - confirm the helper's row order
ax.set_xticks(positions)
ax.set_xticklabels([f'Topic {k}' for k in positions])
ax.axhline(y=0, color='black', linestyle='--')
fig.tight_layout()
plt.show()

## Topic Distribution by Domain

In [None]:
# Fit LDA and get document-topic distributions
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts over the preprocessed text: vocabulary capped at 2000
# terms, English stop words removed
vectorizer = CountVectorizer(max_features=2000, stop_words='english')
X = vectorizer.fit_transform(df['joined_tokens'])

# A fixed random_state keeps the fitted topics the same from run to run
lda = LatentDirichletAllocation(n_components=5, random_state=42)
doc_topic_dist = lda.fit_transform(X)  # indexed below as [document, topic]

# Add topic distributions to dataframe. The column list is derived from the
# fitted matrix so it cannot drift from n_components.
topic_cols = [f'topic_{i}' for i in range(doc_topic_dist.shape[1])]
for i, col in enumerate(topic_cols):
    df[col] = doc_topic_dist[:, i]

# Analyze topic distribution by domain: mean topic weight per domain
topic_by_domain = df.groupby('domain')[topic_cols].mean()
print("Topic distribution by domain:")
print(topic_by_domain)

# Visualize topic distribution on an explicit Axes. Calling plt.figure() and then
# DataFrame.plot() without ax= makes pandas open a second figure, leaving a blank
# one behind in the output.
fig, ax = plt.subplots(figsize=(12, 6))
topic_by_domain.T.plot(kind='bar', ax=ax)
ax.set_title('Topic Distribution by Domain')
ax.set_ylabel('Mean Topic Probability')
ax.set_xlabel('Topic')
ax.legend(title='Domain')
fig.tight_layout()
plt.show()

## Topic Quality Assessment

In [None]:
# Assess topic coherence
def calculate_topic_coherence(topics, vectorizer, X, top_n=10):
    """Score each topic by how strongly its top words co-occur in ``X``.

    For a topic, the first ``top_n`` words that exist in the vectorizer's
    vocabulary select a column subset ``S`` of ``X``. With ``C = S.T @ S`` and
    ``c`` the column sums of ``S``, every pair ``i < j`` with ``C[i, j] > 0``
    contributes ``log((C[i, j] + 1) / (c[i] + 1))``. The topic score is the
    mean of those contributions. Values are computed from the entries of
    ``X`` as given (no binarization is applied).

    Parameters
    ----------
    topics : iterable of sequence of str
        Top words of each topic.
    vectorizer : object
        Anything exposing a ``vocabulary_`` mapping from word to column index.
    X : numpy.ndarray or scipy sparse matrix
        Document-term matrix whose columns match ``vectorizer.vocabulary_``.
    top_n : int, default 10
        How many leading words of each topic to consider.

    Returns
    -------
    list of float
        One score per topic. A topic gets ``0.0`` when it has fewer than two
        words, fewer than two of its words are in the vocabulary, or none of
        its word pairs co-occur.
    """
    vocabulary = vectorizer.vocabulary_
    coherence_scores = []

    for topic_words in topics:
        if len(topic_words) < 2:
            coherence_scores.append(0.0)
            continue

        # Keep only words the vectorizer knows; unknown words are skipped
        word_indices = [vocabulary[word] for word in topic_words[:top_n] if word in vocabulary]
        if len(word_indices) < 2:
            coherence_scores.append(0.0)
            continue

        columns = X[:, word_indices]

        # A sparse matrix's .sum(axis=0) returns a 1 x k np.matrix; flatten it so
        # term_totals[i] is a scalar for every i rather than a row (or IndexError).
        term_totals = np.asarray(columns.sum(axis=0)).ravel()

        # k x k is tiny (k <= top_n), so a dense copy is cheap and gives plain
        # scalar indexing regardless of the input matrix type.
        pair_counts = columns.T @ columns
        if hasattr(pair_counts, 'toarray'):
            pair_counts = pair_counts.toarray()
        pair_counts = np.asarray(pair_counts)

        n_words = len(word_indices)
        log_ratios = [
            np.log((pair_counts[i, j] + 1) / (term_totals[i] + 1))
            for i in range(n_words)
            for j in range(i + 1, n_words)
            if pair_counts[i, j] > 0
        ]

        if log_ratios:
            coherence_scores.append(float(sum(log_ratios) / len(log_ratios)))
        else:
            coherence_scores.append(0.0)

    return coherence_scores

# Score every discovered topic against the document-term matrix.
# NOTE(review): `topics` comes from lda_topics while `vectorizer`/`X` were fitted
# separately in this notebook; topic words missing from this vocabulary are
# skipped by the scorer - confirm the two vocabularies are expected to match.
coherence_scores = calculate_topic_coherence(topics, vectorizer, X)

# Report one score per topic, rounded to three decimals
print("Topic Coherence Scores:")
for topic_id, topic_score in enumerate(coherence_scores):
    print(f"Topic {topic_id}: {topic_score:.3f}")

## Interactive Topic Visualization (Plotly)

In [None]:
# Interactive grouped bar chart; falls back to a message when Plotly is not installed
try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    # Reshape to long format: one row per (domain, topic) pair with its mean probability
    domain_means = topic_by_domain.reset_index()
    topic_melt = pd.melt(
        domain_means,
        id_vars=['domain'],
        value_vars=topic_cols,
        var_name='topic',
        value_name='probability',
    )

    # One bar group per topic, one colored bar per domain
    fig = px.bar(
        topic_melt,
        x='topic',
        y='probability',
        color='domain',
        title='Topic Distribution by Domain',
        barmode='group',
    )
    fig.show()

except ImportError:
    print("Plotly not available for interactive visualization")

## Topic Modeling Summary

This notebook demonstrates:
1. Basic LDA topic extraction
2. Topic visualization with word clouds
3. Topic-sentiment correlation analysis
4. Topic distribution across domains
5. Topic coherence assessment
6. Interactive visualization

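Key takeaways:
- Comparing mean topic probabilities per domain shows whether domains have distinct topic distributions
- Topics can be ranked by their correlation with the sentiment score
- Coherence scores help assess topic quality by measuring how often a topic's top words co-occur
- The interactive chart makes it easier to inspect individual topic/domain values than the static plot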