## Setup and Imports

In [None]:
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, '../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

from preprocessing.text_cleaner import TextPreprocessor
from reference_detection.detect_references import ReferenceDetector
from analysis.similarity_analysis import SimilarityAnalyzer
from analysis.concept_mapping import ConceptMapper

# Configure plotting for every figure drawn in the cells below:
#   - select the matplotlib style sheet named 'seaborn-v0_8-darkgrid'
#     (NOTE(review): this style name presumably requires matplotlib >= 3.6 — confirm
#     against the project's pinned version)
#   - make seaborn's "husl" palette the default colour cycle
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

## 1. Reference Detection

Detect how often Vedanta concepts, texts, and figures are mentioned in Anthroposophy texts.

In [None]:
# Run the Vedanta-reference detector over the Anthroposophy corpus, if it exists.
detector = ReferenceDetector()
anthro_dir = Path("../data/anthroposophy")

if not anthro_dir.exists():
    # No corpus on disk: `analysis` is left undefined, which is what the
    # `'analysis' in locals()` guards in later cells check for.
    print("No data found. Please run data collection first.")
else:
    analysis = detector.analyze_corpus(anthro_dir)

    # Corpus-wide totals reported by the detector
    print(f"Total Vedanta references found: {analysis['total_references']}")
    print(f"Unique terms found: {analysis['unique_terms_across_corpus']}")

    # First ten (term, count) entries of the detector's term ranking
    print("\nMost frequently mentioned terms:")
    for term, count in analysis['most_common_terms'][:10]:
        print(f"  {term:25s}: {count:3d}")

In [None]:
# Visualize reference frequencies
# Horizontal bar chart of the first 15 (term, count) entries of the detector's
# ranking. The cell is a no-op when the detection cell did not produce `analysis`.
if 'analysis' in locals():
    top_terms = analysis['most_common_terms'][:15]

    if top_terms:
        terms = [term for term, _count in top_terms]
        counts = [count for _term, count in top_terms]

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.barh(terms, counts, color='steelblue')
        # barh places the first category at the bottom of the axis. Flip it so
        # the first entry of the ranking (presumably the most frequent term)
        # is drawn at the top and the chart reads in ranking order.
        ax.invert_yaxis()
        ax.set_xlabel('Frequency')
        ax.set_ylabel('Vedanta term')
        ax.set_title('Most Frequently Mentioned Vedanta Terms in Anthroposophy Texts')
        fig.tight_layout()
        plt.show()
    else:
        # Avoid rendering an empty, misleading figure when nothing was detected.
        print("No Vedanta references detected; nothing to plot.")

## 2. Text Similarity Analysis

Compute semantic similarity between Anthroposophy and Vedanta texts.

In [None]:
# Build the similarity analyzer on top of the preprocessed-text directory
processed_dir = Path("../data/processed")
analyzer = SimilarityAnalyzer(processed_dir=processed_dir)

# load_texts() hands back the two corpora as a pair:
# Anthroposophy first, Vedanta second
anthro_texts, vedanta_texts = analyzer.load_texts()

# Report how many texts were read for each tradition
print(f"Loaded {len(anthro_texts)} Anthroposophy texts")
print(f"Loaded {len(vedanta_texts)} Vedanta texts")

In [None]:
# TF-IDF similarity between the two corpora; only attempted when both
# corpora contain at least one text.
have_both_corpora = len(anthro_texts) > 0 and len(vedanta_texts) > 0

if have_both_corpora:
    similarity_results = analyzer.compute_tfidf_similarity()

    # Summary statistics over all cross-corpus pairs
    print(f"Average cross-corpus similarity: {similarity_results['average_similarity']:.4f}")
    print(f"Maximum similarity: {similarity_results['max_similarity']:.4f}")
    print(f"Minimum similarity: {similarity_results['min_similarity']:.4f}")

    # First five entries of the pair ranking; text names are cut and padded
    # to 40 characters so the columns line up.
    print("\nMost similar text pairs:")
    top_pairs = similarity_results['most_similar_pairs'][:5]
    for pair in top_pairs:
        print(f"  {pair['anthroposophy_text'][:40]:40s} <-> {pair['vedanta_text'][:40]:40s}: {pair['similarity']:.4f}")

In [None]:
# Heatmap of the cross-corpus similarity matrix
# (rows are labelled as Anthroposophy texts, columns as Vedanta texts).
if 'similarity_results' in locals():
    similarity_matrix = np.array(similarity_results['similarity_matrix'])

    fig, ax = plt.subplots(figsize=(12, 10))
    colorbar_options = {'label': 'Cosine Similarity'}
    sns.heatmap(similarity_matrix, cmap='YlOrRd', cbar_kws=colorbar_options, ax=ax)

    ax.set_title('Anthroposophy vs Vedanta Text Similarity')
    ax.set_xlabel('Vedanta Texts')
    ax.set_ylabel('Anthroposophy Texts')

    fig.tight_layout()
    plt.show()

## 3. Distinctive Terms Analysis

Identify which terms are most characteristic of each tradition.

In [None]:
# Distinctive terms per tradition. Only runs once the analyzer holds a TF-IDF
# matrix (the attribute exists and is not None).
tfidf_ready = getattr(analyzer, 'tfidf_matrix', None) is not None

if tfidf_ready:
    distinctive_terms = analyzer.extract_distinctive_terms(top_n=20)

    # (heading, result key) for each tradition, printed in this order
    sections = (
        ("Top Anthroposophy Terms:", 'anthroposophy_top_terms'),
        ("\nTop Vedanta Terms:", 'vedanta_top_terms'),
    )
    for heading, key in sections:
        print(heading)
        # Show the first ten entries of each list
        for term_dict in distinctive_terms[key][:10]:
            print(f"  {term_dict['term']:30s}: {term_dict['tfidf']:.4f}")

In [None]:
# Visualize distinctive terms
# Side-by-side horizontal bar charts of the first 15 distinctive terms of each
# tradition. The cell is a no-op when `distinctive_terms` was never computed.
if 'distinctive_terms' in locals():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Anthroposophy terms
    anthro_top = distinctive_terms['anthroposophy_top_terms'][:15]
    anthro_terms = [t['term'] for t in anthro_top]
    anthro_scores = [t['tfidf'] for t in anthro_top]
    ax1.barh(anthro_terms, anthro_scores, color='cornflowerblue')
    ax1.set_xlabel('TF-IDF Score')
    ax1.set_title('Top Anthroposophy Terms')

    # Vedanta terms
    vedanta_top = distinctive_terms['vedanta_top_terms'][:15]
    vedanta_terms = [t['term'] for t in vedanta_top]
    vedanta_scores = [t['tfidf'] for t in vedanta_top]
    ax2.barh(vedanta_terms, vedanta_scores, color='coral')
    ax2.set_xlabel('TF-IDF Score')
    ax2.set_title('Top Vedanta Terms')

    # barh places the first category at the bottom of the axis. Flip both
    # panels so the first entry of each list (presumably the top-scoring term)
    # appears at the top and the charts read in list order.
    for ax in (ax1, ax2):
        ax.invert_yaxis()

    fig.tight_layout()
    plt.show()

## 4. Topic Modeling

Discover latent topics across both corpora.

In [None]:
# Fit an 8-topic model over both corpora; skipped when either corpus is empty.
if min(len(anthro_texts), len(vedanta_texts)) > 0:
    topic_results = analyzer.perform_topic_modeling(n_topics=8)

    # One line per topic: its id followed by its first eight top words
    print("Discovered Topics:\n")
    discovered = topic_results['topics']
    for topic in discovered:
        print(f"Topic {topic['topic_id']}: {', '.join(topic['top_words'][:8])}")

In [None]:
# Compare topic distributions
# Grouped bar chart: for every topic, one bar per tradition showing the value
# stored in that tradition's topic distribution.
if 'topic_results' in locals():
    # Label bars with each topic's own `topic_id` so the chart matches the
    # "Topic <id>: ..." listing printed by the topic-modeling cell, whatever
    # numbering scheme the analyzer uses (a positional range() could disagree).
    topics = [f"Topic {topic['topic_id']}" for topic in topic_results['topics']]
    anthro_dist = topic_results['anthroposophy_topic_distribution']
    vedanta_dist = topic_results['vedanta_topic_distribution']

    x = np.arange(len(topics))
    width = 0.35  # bar width; the two bars of a group sit either side of the tick

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(x - width/2, anthro_dist, width, label='Anthroposophy', color='cornflowerblue')
    ax.bar(x + width/2, vedanta_dist, width, label='Vedanta', color='coral')

    ax.set_xlabel('Topics')
    ax.set_ylabel('Average Proportion')
    ax.set_title('Topic Distribution by Tradition')
    ax.set_xticks(x)
    # Right-align the rotated labels so each one ends under its own tick
    # instead of being centred on it.
    ax.set_xticklabels(topics, rotation=45, ha='right')
    ax.legend()

    fig.tight_layout()
    plt.show()

## 5. Concept Mapping

Map parallel concepts between the two traditions using semantic embeddings.

In [None]:
# Initialize concept mapper
mapper = ConceptMapper()

# Find parallel concepts
# `mapper.model` is presumably falsy when the sentence-transformers model could
# not be loaded (see the message in the else branch) — confirm in ConceptMapper.
if mapper.model:
    mappings = mapper.find_parallel_concepts(top_n=3)

    # Show the first five Anthroposophy concepts, each with up to three
    # Vedanta matches and their similarity scores.
    print("Concept Mappings:\n")
    for mapping in mappings[:5]:
        print(f"\n{mapping['anthroposophy_concept'].upper()}")
        print(f"  ({mapping['anthroposophy_description']})")
        print("  Most similar Vedanta concepts:")
        for match in mapping['top_matches'][:3]:
            # The bullet was stored as mojibake ("â€¢": UTF-8 bytes of U+2022
            # decoded as cp1252). Written as an escape so it survives future
            # re-encoding of the notebook file.
            print(f"    \u2022 {match['vedanta_concept']} (similarity: {match['similarity']:.3f})")
else:
    print("Sentence transformers not available. Install with: pip install sentence-transformers")

## 6. Key Findings Summary

In [None]:
# Print a recap of whichever analyses actually ran. Each section is emitted
# only if its result variable exists in the notebook namespace.
banner = "=" * 80
print(banner)
print("KEY FINDINGS SUMMARY")
print(banner)

if 'analysis' in locals():
    print("\n1. REFERENCE DETECTION")
    # .get() with a 0 fallback tolerates a missing key in the detector output
    print(f"   Total Vedanta references in Anthroposophy texts: {analysis.get('total_references', 0)}")
    print(f"   Unique Vedanta terms found: {analysis.get('unique_terms_across_corpus', 0)}")

if 'similarity_results' in locals():
    print("\n2. SEMANTIC SIMILARITY")
    print(f"   Average text similarity: {similarity_results['average_similarity']:.4f}")
    print(f"   Maximum similarity: {similarity_results['max_similarity']:.4f}")

if 'topic_results' in locals():
    print("\n3. TOPIC MODELING")
    print(f"   Number of topics discovered: {topic_results['n_topics']}")

print("\n" + banner)

## Export Results

In [None]:
# Save results
def json_default(value):
    """Convert values the ``json`` encoder rejects into plain Python types.

    Intended for the ``default=`` hook of ``json.dump``: the encoder calls it
    only for objects it cannot serialize itself.

    Args:
        value: The object the encoder could not handle.

    Returns:
        A Python scalar for NumPy scalars, a (nested) list for NumPy arrays,
        or a string for ``Path`` objects.

    Raises:
        TypeError: For any other type, matching the encoder's own behaviour.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, Path):
        return str(value)
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


results_dir = Path("../results")
# parents=True so the cell also works when intermediate directories are missing
results_dir.mkdir(parents=True, exist_ok=True)

# (notebook variable, output file name, confirmation message)
exports = (
    ("similarity_results", "similarity_results.json", "Saved similarity results"),
    ("distinctive_terms", "distinctive_terms.json", "Saved distinctive terms"),
    ("topic_results", "topic_results.json", "Saved topic modeling results"),
)

for var_name, file_name, message in exports:
    # Only export analyses that actually ran in this session
    if var_name not in globals():
        continue
    # Explicit encoding keeps the output independent of the platform default
    with open(results_dir / file_name, 'w', encoding='utf-8') as f:
        json.dump(globals()[var_name], f, indent=2, default=json_default)
    print(message)

print("\nAll results exported to:", results_dir)