## Setup and Imports

In [None]:
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, '../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

from preprocessing.text_cleaner import TextPreprocessor
from reference_detection.detect_references import ReferenceDetector
from analysis.similarity_analysis import SimilarityAnalyzer
from analysis.concept_mapping import ConceptMapper

# Configure plotting: set the global Matplotlib style and seaborn colour
# palette before any figure is drawn.
# NOTE(review): 'seaborn-v0_8-darkgrid' is a version-specific style name -
# confirm it exists in the Matplotlib release this project pins.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

## 1. Reference Detection

Detect how often Vedanta concepts, texts, and figures are mentioned in Anthroposophy texts.

In [None]:
# Build the detector and scan the Anthroposophy corpus for Vedanta references.
# `analysis` is only bound when the corpus directory exists; later cells check
# for the name before using it.
detector = ReferenceDetector()
anthro_dir = Path("../data/anthroposophy")

if not anthro_dir.exists():
    print("No data found. Please run data collection first.")
else:
    analysis = detector.analyze_corpus(anthro_dir)

    # Headline totals
    print(f"Total Vedanta references found: {analysis['total_references']}")
    print(f"Unique terms found: {analysis['unique_terms_across_corpus']}")

    # Ten most frequent terms, one aligned row each
    print("\nMost frequently mentioned terms:")
    leading_terms = analysis['most_common_terms'][:10]
    for term, count in leading_terms:
        print(f"  {term:25s}: {count:3d}")

In [None]:
# Visualize reference frequencies for the 15 most common terms.
if 'analysis' in locals():
    # Slice once so labels and values always come from the same entries.
    top_terms = analysis['most_common_terms'][:15]

    if top_terms:
        terms = [entry[0] for entry in top_terms]
        counts = [entry[1] for entry in top_terms]

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.barh(terms, counts, color='steelblue')
        # barh draws the first entry at the bottom; flip the axis so the
        # entries read top-down in list order (first entry on top).
        ax.invert_yaxis()
        ax.set_xlabel('Frequency')
        ax.set_title('Most Frequently Mentioned Vedanta Terms in Anthroposophy Texts')
        fig.tight_layout()
        plt.show()
    else:
        # Nothing to rank: skip the chart instead of rendering empty axes.
        print("No Vedanta references found to plot.")

In [None]:
# Visualize references by text file
if 'analysis' in locals() and 'file_analyses' in analysis:
    # (filename, reference count) pairs, busiest text first
    file_refs = sorted(
        ((entry['filename'], entry['total_references'])
         for entry in analysis['file_analyses']),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Human-readable labels and their matching counts
    files = []
    refs = []
    for raw_name, ref_count in file_refs:
        files.append(raw_name.replace('steiner_', '').replace('.txt', '').replace('_', ' ').title())
        refs.append(ref_count)

    positions = range(len(files))

    plt.figure(figsize=(12, 6))
    bars = plt.bar(positions, refs, color='teal', alpha=0.7)
    plt.xticks(positions, files, rotation=45, ha='right')
    plt.ylabel('Number of Vedanta References')
    plt.title('Vedanta References by Anthroposophy Text')
    plt.grid(axis='y', alpha=0.3)

    # Annotate every non-empty bar with its count, centred just above the bar
    for bar, val in zip(bars, refs):
        if val > 0:
            label_x = bar.get_x() + bar.get_width()/2
            plt.text(label_x, val + 0.3, str(val),
                     ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

### Context of References

Let's look at where these references appear in context.

In [None]:
# Display sample contexts for most common terms
if 'analysis' in locals() and 'file_analyses' in analysis:
    max_contexts = 5   # overall cap on printed examples
    per_file_cap = 2   # at most this many examples taken from one text

    print("Sample contexts for Vedanta references:\n")

    contexts_shown = 0
    for file_data in analysis['file_analyses']:
        if contexts_shown >= max_contexts:
            break

        # Texts without any reference contribute nothing
        if not file_data['total_references'] > 0:
            continue

        filename = file_data['filename'].replace('steiner_', '').replace('.txt', '')
        print(f"\n{'='*80}")
        print(f"From: {filename.replace('_', ' ').title()}")
        print(f"{'='*80}")

        for match in file_data['matches'][:per_file_cap]:
            # The overall cap can be reached in the middle of a file
            if contexts_shown >= max_contexts:
                break
            print(f"\nTerm: '{match['term']}'")
            print(f"Context: ...{match['context']}...")
            contexts_shown += 1

## 2. Text Similarity Analysis

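Compute TF-IDF cosine similarity between Anthroposophy and Vedanta texts. This is a lexical measure of shared vocabulary; the embedding-based semantic comparison follows in Section 5.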

### Corpus Statistics

Compare the size and characteristics of both text corpora.

In [None]:
# Analyze corpus statistics
# NOTE(review): `os` is not referenced in this cell - confirm whether any other
# cell relies on it before removing the import.
import os

# Raw-text directories of the two corpora; the statistics code in this cell and
# the word cloud cell read these names.
anthro_dir = Path("../data/anthroposophy")
vedanta_dir = Path("../data/vedanta")

def get_text_stats(directory, exclude=('steiner_sample.txt', 'bhagavad_gita_sample.txt')):
    """Collect basic size statistics for every ``*.txt`` file in a directory.

    Args:
        directory: ``pathlib.Path`` of the folder to scan (not recursive).
        exclude: File names to skip. Defaults to the two ``*_sample.txt``
            files that were previously hard-coded.

    Returns:
        A list of dicts, one per file in file-name order, with the keys
        ``filename`` (display title derived from the file name), ``words``
        (whitespace-separated tokens), ``chars`` and ``lines``. The list is
        empty when the directory is missing or holds no matching files.
    """
    if not directory.is_dir():
        return []

    skipped = set(exclude)
    stats = []
    # Sort so the result order does not depend on the file system.
    for filepath in sorted(directory.glob("*.txt")):
        if filepath.name in skipped:
            continue

        content = filepath.read_text(encoding='utf-8')

        # A trailing newline ends the last line rather than starting a new
        # one, and an empty file has no lines at all.
        line_count = content.count('\n')
        if content and not content.endswith('\n'):
            line_count += 1

        stats.append({
            'filename': filepath.name.replace('.txt', '').replace('_', ' ').title(),
            'words': len(content.split()),
            'chars': len(content),
            'lines': line_count
        })
    return stats

# Column layout of the per-text statistics. Naming it explicitly keeps the
# DataFrames well-formed (just empty) when a corpus directory has no texts.
STAT_COLUMNS = ['filename', 'words', 'chars', 'lines']


def print_corpus_summary(title, frame):
    """Print headline size figures for one corpus.

    Args:
        title: Heading line printed above the figures.
        frame: DataFrame with ``filename`` and ``words`` columns, one row
            per text.
    """
    print(title)
    print("="*80)
    print(f"Total texts: {len(frame)}")
    if frame.empty:
        # idxmax raises on an empty column and a mean of nothing is NaN.
        print("No texts found. Please run data collection first.")
        return
    print(f"Total words: {frame['words'].sum():,}")
    print(f"Average words per text: {frame['words'].mean():.0f}")
    print(f"\nLargest text: {frame.loc[frame['words'].idxmax(), 'filename']}")
    print(f"             ({frame['words'].max():,} words)")


anthro_stats = get_text_stats(anthro_dir)
vedanta_stats = get_text_stats(vedanta_dir)

# Create comparison DataFrames
df_anthro = pd.DataFrame(anthro_stats, columns=STAT_COLUMNS)
df_vedanta = pd.DataFrame(vedanta_stats, columns=STAT_COLUMNS)

print_corpus_summary("ANTHROPOSOPHY CORPUS", df_anthro)
print_corpus_summary("\n\nVEDANTA CORPUS", df_vedanta)

In [None]:
# Visualize text sizes: one horizontal bar panel per corpus
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Ascending sort puts the largest text at the top of each horizontal chart.
df_anthro_sorted = df_anthro.sort_values('words', ascending=True)
df_vedanta_sorted = df_vedanta.sort_values('words', ascending=True)

# (axes, data, bar colour, panel title) for each corpus
size_panels = (
    (axes[0], df_anthro_sorted, 'cornflowerblue', 'Anthroposophy Text Sizes'),
    (axes[1], df_vedanta_sorted, 'coral', 'Vedanta Text Sizes'),
)

for panel, frame, bar_color, heading in size_panels:
    rows = range(len(frame))
    panel.barh(rows, frame['words']/1000, color=bar_color, alpha=0.8)
    panel.set_yticks(rows)
    # Long titles are cut at 35 characters to keep the labels readable
    panel.set_yticklabels([name[:35] for name in frame['filename']], fontsize=9)
    panel.set_xlabel('Thousands of Words')
    panel.set_title(heading, fontsize=12, fontweight='bold')
    panel.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Point the analyzer at the preprocessed corpus files
processed_dir = Path("../data/processed")
analyzer = SimilarityAnalyzer(processed_dir=processed_dir)

# Load both corpora; later cells check that neither is empty before analysing
loaded_corpora = analyzer.load_texts()
anthro_texts, vedanta_texts = loaded_corpora

print(f"Loaded {len(anthro_texts)} Anthroposophy texts")
print(f"Loaded {len(vedanta_texts)} Vedanta texts")

In [None]:
# Compute TF-IDF similarity (skipped unless both corpora were loaded)
both_corpora_loaded = len(anthro_texts) > 0 and len(vedanta_texts) > 0

if both_corpora_loaded:
    similarity_results = analyzer.compute_tfidf_similarity()

    # Summary statistics over all cross-corpus pairs
    print(f"Average cross-corpus similarity: {similarity_results['average_similarity']:.4f}")
    print(f"Maximum similarity: {similarity_results['max_similarity']:.4f}")
    print(f"Minimum similarity: {similarity_results['min_similarity']:.4f}")

    # First five entries of the most-similar list, names padded/cut to 40 chars
    print("\nMost similar text pairs:")
    closest_pairs = similarity_results['most_similar_pairs'][:5]
    for pair in closest_pairs:
        print(f"  {pair['anthroposophy_text'][:40]:40s} <-> {pair['vedanta_text'][:40]:40s}: {pair['similarity']:.4f}")

In [None]:
# Visualize similarity matrix as a heatmap
if 'similarity_results' in locals():
    similarity_matrix = np.array(similarity_results['similarity_matrix'])

    # Explicit figure/axes instead of the implicit pyplot state
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        similarity_matrix,
        cmap='YlOrRd',
        cbar_kws={'label': 'Cosine Similarity'},
        ax=ax,
    )
    ax.set_title('Anthroposophy vs Vedanta Text Similarity')
    ax.set_xlabel('Vedanta Texts')
    ax.set_ylabel('Anthroposophy Texts')
    fig.tight_layout()
    plt.show()

## 3. Distinctive Terms Analysis

Identify which terms are most characteristic of each tradition.

In [None]:
# Extract distinctive terms (needs the TF-IDF matrix from the similarity step)
if getattr(analyzer, 'tfidf_matrix', None) is not None:
    distinctive_terms = analyzer.extract_distinctive_terms(top_n=20)

    # (heading, result key) for each tradition, printed in this order
    term_sections = (
        ("Top Anthroposophy Terms:", 'anthroposophy_top_terms'),
        ("\nTop Vedanta Terms:", 'vedanta_top_terms'),
    )
    for heading, result_key in term_sections:
        print(heading)
        for term_dict in distinctive_terms[result_key][:10]:
            print(f"  {term_dict['term']:30s}: {term_dict['tfidf']:.4f}")

In [None]:
# Visualize distinctive terms: one horizontal bar panel per tradition.
if 'distinctive_terms' in locals():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Slice each list once so labels and scores always stay aligned.
    anthro_top = distinctive_terms['anthroposophy_top_terms'][:15]
    vedanta_top = distinctive_terms['vedanta_top_terms'][:15]

    anthro_terms = [t['term'] for t in anthro_top]
    anthro_scores = [t['tfidf'] for t in anthro_top]
    vedanta_terms = [t['term'] for t in vedanta_top]
    vedanta_scores = [t['tfidf'] for t in vedanta_top]

    term_panels = (
        (ax1, anthro_terms, anthro_scores, 'cornflowerblue', 'Top Anthroposophy Terms'),
        (ax2, vedanta_terms, vedanta_scores, 'coral', 'Top Vedanta Terms'),
    )
    for panel, labels, scores, bar_color, heading in term_panels:
        panel.barh(labels, scores, color=bar_color)
        # barh draws the first entry at the bottom; flip the axis so the
        # entries read top-down in list order.
        # NOTE(review): assumes the "top terms" lists arrive strongest-first.
        panel.invert_yaxis()
        panel.set_xlabel('TF-IDF Score')
        panel.set_title(heading)

    plt.tight_layout()
    plt.show()

## 4. Topic Modeling

Discover latent topics across both corpora.

In [None]:
# Perform topic modeling (skipped unless both corpora were loaded)
have_both_corpora = len(anthro_texts) > 0 and len(vedanta_texts) > 0

if have_both_corpora:
    topic_results = analyzer.perform_topic_modeling(n_topics=8)

    # One line per topic: its id followed by its first eight top words
    print("Discovered Topics:\n")
    discovered_topics = topic_results['topics']
    for topic in discovered_topics:
        print(f"Topic {topic['topic_id']}: {', '.join(topic['top_words'][:8])}")

In [None]:
# Compare topic distributions of the two traditions side by side.
if 'topic_results' in locals():
    # Label bars with each topic's own id so the chart matches the printed
    # "Topic <id>: words" listing even if ids do not start at 0.
    topics = [f"Topic {topic['topic_id']}" for topic in topic_results['topics']]
    anthro_dist = topic_results['anthroposophy_topic_distribution']
    vedanta_dist = topic_results['vedanta_topic_distribution']

    x = np.arange(len(topics))
    width = 0.35  # bar width; the two bars of a topic sit either side of its tick

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(x - width/2, anthro_dist, width, label='Anthroposophy', color='cornflowerblue')
    ax.bar(x + width/2, vedanta_dist, width, label='Vedanta', color='coral')

    ax.set_xlabel('Topics')
    ax.set_ylabel('Average Proportion')
    ax.set_title('Topic Distribution by Tradition')
    ax.set_xticks(x)
    ax.set_xticklabels(topics, rotation=45)
    ax.legend()

    plt.tight_layout()
    plt.show()

### Word Clouds

Generate word clouds to visualize the most prominent vocabulary in each tradition.

In [None]:
# Word clouds for both corpora. `wordcloud` is an optional dependency: only the
# import is guarded, so unrelated errors further down are not misreported as a
# missing package.
try:
    from wordcloud import WordCloud
except ImportError:
    print("WordCloud library not installed. Run: pip install wordcloud")
    print("Skipping word cloud visualization.")
else:
    # Single stopword set shared by both clouds so they can never drift apart.
    WORDCLOUD_STOPWORDS = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these',
        'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which',
        'who', 'when', 'where', 'why', 'how', 'not', 'no', 'nor', 'so', 'than',
        'too', 'very', 'just', 'as', 'from', 'up', 'about', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'between', 'under',
        'again', 'further', 'then', 'once',
    }

    def load_corpus_text(directory):
        """Return the lower-cased text of every non-sample ``*.txt`` file, space-joined.

        Files are read in sorted name order so the result is reproducible.
        """
        all_text = []
        for filepath in sorted(directory.glob("*.txt")):
            if filepath.name not in ['steiner_sample.txt', 'bhagavad_gita_sample.txt']:
                all_text.append(filepath.read_text(encoding='utf-8').lower())
        return ' '.join(all_text)

    def make_wordcloud(text, colormap):
        """Build one word cloud with the shared layout settings and a fixed seed."""
        return WordCloud(
            width=800, height=400,
            background_color='white',
            colormap=colormap,
            max_words=100,
            stopwords=WORDCLOUD_STOPWORDS,
            random_state=42,  # fixed seed: identical layout on every run
        ).generate(text)

    anthro_text = load_corpus_text(anthro_dir)
    vedanta_text = load_corpus_text(vedanta_dir)

    if not anthro_text.strip() or not vedanta_text.strip():
        # A cloud cannot be generated from empty text; report it instead of
        # failing inside the library.
        print("No corpus text found. Please run data collection first.")
    else:
        wordcloud_anthro = make_wordcloud(anthro_text, 'Blues')
        wordcloud_vedanta = make_wordcloud(vedanta_text, 'Oranges')

        fig, axes = plt.subplots(1, 2, figsize=(18, 8))
        cloud_panels = (
            (axes[0], wordcloud_anthro, 'Anthroposophy Vocabulary'),
            (axes[1], wordcloud_vedanta, 'Vedanta Vocabulary'),
        )
        for panel, cloud, heading in cloud_panels:
            panel.imshow(cloud, interpolation='bilinear')
            panel.axis('off')
            panel.set_title(heading, fontsize=14, fontweight='bold', pad=20)

        plt.tight_layout()
        plt.show()

        print("✓ Word clouds generated successfully")

### Term Co-occurrence Network

Visualize which Vedanta terms appear together within the same Anthroposophy text.

In [None]:
# Build term co-occurrence network
from collections import Counter
from itertools import combinations


def count_term_cooccurrences(file_analyses):
    """Count in how many texts each pair of distinct terms appears together.

    Each text contributes at most 1 to a pair, however often the terms are
    repeated inside it, and a term is never paired with itself.

    Args:
        file_analyses: Iterable of dicts, each with a ``matches`` list whose
            items carry a ``term`` key.

    Returns:
        ``collections.Counter`` mapping alphabetically ordered
        ``(term_a, term_b)`` tuples to the number of texts containing both.
    """
    pair_counts = Counter()
    for file_data in file_analyses:
        # De-duplicate first: repeated matches of one term must not create
        # self-pairs or multiply the weight of a pair within a single text.
        unique_terms = sorted({m['term'] for m in file_data['matches']})
        pair_counts.update(combinations(unique_terms, 2))
    return pair_counts


if 'analysis' in locals() and 'file_analyses' in analysis:
    try:
        import networkx as nx
    except ImportError:
        print("NetworkX library not installed. Run: pip install networkx")
        print("Skipping network visualization.")
    else:
        cooccurrence = count_term_cooccurrences(analysis['file_analyses'])

        # Create network graph
        G = nx.Graph()

        # Add edges with weights
        for (term1, term2), weight in cooccurrence.items():
            if weight >= 2:  # Only show terms that share at least two texts
                G.add_edge(term1, term2, weight=weight)

        if len(G.nodes()) > 0:
            # Calculate node sizes based on degree
            node_sizes = [300 + G.degree(node) * 200 for node in G.nodes()]

            # Draw network (fixed seed keeps the layout stable between runs)
            plt.figure(figsize=(14, 10))
            pos = nx.spring_layout(G, k=2, iterations=50, seed=42)

            # Draw edges with thickness proportional to the shared-text count
            edges = G.edges()
            weights = [G[u][v]['weight'] for u, v in edges]

            nx.draw_networkx_edges(G, pos, alpha=0.3, width=[w*0.5 for w in weights])
            nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
                                   node_color='lightblue', alpha=0.8,
                                   edgecolors='navy', linewidths=2)
            nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')

            plt.title('Vedanta Term Co-occurrence Network in Anthroposophy Texts',
                      fontsize=14, fontweight='bold', pad=20)
            plt.axis('off')
            plt.tight_layout()
            plt.show()

            print(f"Network contains {len(G.nodes())} terms and {len(G.edges())} co-occurrence relationships")
        else:
            print("Not enough co-occurrences to build network (need at least 2 shared appearances)")
else:
    print("No reference analysis data available")

## 5. Concept Mapping

Map parallel concepts between the two traditions using semantic embeddings.

In [None]:
# Initialize concept mapper
mapper = ConceptMapper()

# Find parallel concepts; without a loaded model only the install hint is shown
if not mapper.model:
    print("Sentence transformers not available. Install with: pip install sentence-transformers")
else:
    mappings = mapper.find_parallel_concepts(top_n=3)

    print("Concept Mappings:\n")
    shown_mappings = mappings[:5]
    for mapping in shown_mappings:
        # Concept name and description, then its closest Vedanta matches
        print(f"\n{mapping['anthroposophy_concept'].upper()}")
        print(f"  ({mapping['anthroposophy_description']})")
        print("  Most similar Vedanta concepts:")
        closest_matches = mapping['top_matches'][:3]
        for match in closest_matches:
            print(f"    • {match['vedanta_concept']} (similarity: {match['similarity']:.3f})")

## 6. Key Findings Summary

In [None]:
# Recap the headline numbers of every analysis that actually ran
banner = "="*80

print(banner)
print("KEY FINDINGS SUMMARY")
print(banner)

if 'analysis' in locals():
    print("\n1. REFERENCE DETECTION")
    print(f"   Total Vedanta references in Anthroposophy texts: {analysis.get('total_references', 0)}")
    print(f"   Unique Vedanta terms found: {analysis.get('unique_terms_across_corpus', 0)}")

if 'similarity_results' in locals():
    print("\n2. SEMANTIC SIMILARITY")
    print(f"   Average text similarity: {similarity_results['average_similarity']:.4f}")
    print(f"   Maximum similarity: {similarity_results['max_similarity']:.4f}")

if 'topic_results' in locals():
    print("\n3. TOPIC MODELING")
    print(f"   Number of topics discovered: {topic_results['n_topics']}")

print("\n" + banner)

### Advanced Visualizations

Additional comparative charts and insights.

In [None]:
# Comparative pie chart of term categories
if 'analysis' in locals() and 'most_common_terms' in analysis:
    # Keyword lists per category. A term is assigned to the first category (in
    # this order) that has a keyword occurring inside the lower-cased term;
    # terms matching no keyword are left out.
    categories = {
        'Texts & Scriptures': ['gita', 'vedas', 'upanishads', 'bhagavad'],
        'Concepts & Philosophy': ['maya', 'karma', 'dharma', 'brahman', 'atman', 'moksha'],
        'Practices': ['yoga', 'meditation', 'self-knowledge'],
        'Figures': ['krishna', 'vivekananda', 'shankara', 'vedanta']
    }

    category_counts = dict.fromkeys(categories, 0)

    for term, count in analysis['most_common_terms']:
        lowered = term.lower()
        first_match = next(
            (name for name, keywords in categories.items()
             if any(keyword in lowered for keyword in keywords)),
            None,
        )
        if first_match is not None:
            category_counts[first_match] += count

    # Remove empty categories
    category_counts = {name: total for name, total in category_counts.items() if total > 0}

    if not category_counts:
        print("Not enough data to categorize terms")
    else:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
        slots = range(len(category_counts))

        # Pie chart
        colors = plt.cm.Set3(slots)
        wedges, texts, autotexts = ax1.pie(
            category_counts.values(),
            labels=category_counts.keys(),
            autopct='%1.1f%%',
            startangle=90,
            colors=colors
        )
        ax1.set_title('Vedanta References by Category', fontsize=12, fontweight='bold')

        # Bar chart with category breakdown
        ax2.bar(slots, category_counts.values(),
                color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
        ax2.set_xticks(slots)
        ax2.set_xticklabels(category_counts.keys(), rotation=45, ha='right')
        ax2.set_ylabel('Number of References')
        ax2.set_title('Reference Count by Category', fontsize=12, fontweight='bold')
        ax2.grid(axis='y', alpha=0.3)

        # Add value labels just above each bar
        for slot, total in enumerate(category_counts.values()):
            ax2.text(slot, total + 0.3, str(total), ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()
        plt.show()

## Export Results

In [None]:
# Save results
results_dir = Path("../results")
results_dir.mkdir(exist_ok=True)

# (notebook variable, output file, confirmation message) per exportable result;
# a result is written only if its variable was produced by an earlier cell.
exports = (
    ('similarity_results', "similarity_results.json", "Saved similarity results"),
    ('distinctive_terms', "distinctive_terms.json", "Saved distinctive terms"),
    ('topic_results', "topic_results.json", "Saved topic modeling results"),
)

for variable, out_name, message in exports:
    if variable not in locals():
        continue
    with open(results_dir / out_name, 'w') as f:
        json.dump(locals()[variable], f, indent=2)
    print(message)

print("\nAll results exported to:", results_dir)

### Save Visualizations

Export key visualizations as image files for reports and presentations.

In [None]:
# Create figures directory
figures_dir = Path("../results/figures")
figures_dir.mkdir(parents=True, exist_ok=True)

saved_figures = []


def save_current_figure(filename, dpi=300, fig=None):
    """Save a Matplotlib figure as an image inside ``figures_dir``.

    Call this in the cell that draws the figure, before ``plt.show()``.
    NOTE(review): with the inline backend a figure is normally closed once it
    has been shown, so saving "the current figure" from a later cell would
    write a blank image - confirm against the backend configuration in use.

    Args:
        filename: File name (with extension) to create inside ``figures_dir``.
        dpi: Output resolution in dots per inch.
        fig: Figure to save. Defaults to the current pyplot figure.
    """
    if fig is None:
        if not plt.get_fignums():
            # plt.savefig would silently create and save a new empty figure.
            print(f"✗ No open figure to save as {filename}; call this before plt.show()")
            return
        fig = plt.gcf()

    filepath = figures_dir / filename
    fig.savefig(filepath, dpi=dpi, bbox_inches='tight', facecolor='white')
    if filename not in saved_figures:  # re-saving must not add a duplicate entry
        saved_figures.append(filename)
    print(f"✓ Saved: {filename}")


print("To save a visualization, add the matching call to the end of its cell,")
print("just before plt.show(), and re-run that cell:")
print()
print("# One call per visualization cell:")
print("save_current_figure('reference_frequency.png')")
print("save_current_figure('references_by_text.png')")
print("save_current_figure('text_sizes_comparison.png')")
print("save_current_figure('similarity_heatmap.png')")
print("save_current_figure('distinctive_terms.png')")
print("save_current_figure('topic_distributions.png')")
print("save_current_figure('wordclouds.png')")
print("save_current_figure('cooccurrence_network.png')")
print("save_current_figure('category_breakdown.png')")
print()
print(f"Figures will be saved to: {figures_dir.absolute()}")