In [None]:
# Enhanced imports for advanced analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import networkx as nx
from itertools import combinations
import json
from functools import lru_cache
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
# Apply the 'seaborn-v0_8' matplotlib style sheet to every figure drawn later.
# NOTE(review): this style name presumably needs matplotlib >= 3.6 — confirm
# against the environment the notebook is run in.
plt.style.use('seaborn-v0_8')
# Make seaborn's "husl" palette the default colour cycle.
sns.set_palette("husl")

print("📚 Enhanced libraries imported successfully!")


In [None]:
# Load the processed data from previous analysis
print("📥 Loading processed Quranic data...")
try:
    quran = pd.read_csv('quran-morphology-final.csv')
    print(f"✅ Loaded {len(quran):,} entries")
except FileNotFoundError:
    print("❌ Base data file not found. Please run quran_corrected.ipynb first.")
    raise

# Load table of contents (same missing-file handling as the main dataset)
try:
    toc = pd.read_csv('toc.csv')
    print(f"✅ Loaded TOC with {len(toc)} suras")
except FileNotFoundError:
    print("❌ Table of contents file 'toc.csv' not found.")
    raise

# Fail fast, with a readable message, if a column that later cells rely on is
# absent: the frequency cells read Root/Place/sura and the co-occurrence cell
# groups by sura/aya; this cell itself reads toc.Place.
for _label, _frame, _required in (
    ('quran-morphology-final.csv', quran, ('Root', 'Place', 'sura', 'aya')),
    ('toc.csv', toc, ('Place',)),
):
    _missing = [column for column in _required if column not in _frame.columns]
    if _missing:
        raise KeyError(f"{_label} is missing required column(s): {_missing}")

# Display basic info
print("\n📊 Dataset Overview:")
print(f"Total entries: {len(quran):,}")
print(f"Entries with roots: {quran.Root.notna().sum():,}")
print(f"Unique roots: {quran.Root.nunique():,}")
print(f"Meccan suras: {len(toc[toc.Place == 'Meccan'])}")
print(f"Medinan suras: {len(toc[toc.Place == 'Medinan'])}")


In [None]:
# Set up Buckwalter conversion functions (from previous notebook)
# abjad maps one Arabic code point to one Buckwalter ASCII character.
abjad = {
    "\u0627": 'A', "\u0628": 'b', "\u062A": 't', "\u062B": 'v', "\u062C": 'j',
    "\u062D": 'H', "\u062E": 'x', "\u062F": 'd', "\u0630": '*', "\u0631": 'r',
    "\u0632": 'z', "\u0633": 's', "\u0634": '$', "\u0635": 'S', "\u0636": 'D',
    "\u0637": 'T', "\u0638": 'Z', "\u0639": 'E', "\u063A": 'g', "\u0641": 'f',
    "\u0642": 'q', "\u0643": 'k', "\u0644": 'l', "\u0645": 'm', "\u0646": 'n',
    "\u0647": 'h', "\u0648": 'w', "\u0649": 'Y', "\u064A": 'y',
    ' ': ' ', "\u0621": "'", "\u0623": '>', "\u0625": '<', "\u0624": '&',
    "\u0626": '}', "\u0622": '|', "\u064E": 'a', "\u064F": 'u', "\u0650": 'i',
    "\u0651": '~', "\u0652": 'o', "\u064B": 'F', "\u064C": 'N', "\u064D": 'K',
    "\u0640": '_', "\u0670": '`', "\u0629": 'p'
}

# Inverse table (Buckwalter -> Arabic). The values of abjad are all distinct,
# so inverting the dict loses nothing.
alphabet = {v: k for k, v in abjad.items()}


def _is_missing_text(value):
    """Return True for None and float NaN (how pandas marks an absent string)."""
    return value is None or (isinstance(value, float) and value != value)


def buck_to_arabic(buc):
    """Convert Buckwalter to Arabic.

    Args:
        buc: Buckwalter-transliterated text. Non-string values are passed
            through ``str()`` first.

    Returns:
        The text with every character found in ``alphabet`` replaced by its
        Arabic counterpart; characters without a mapping are kept unchanged.
        ``None`` and NaN yield ``''`` instead of a transliteration of the
        words "None"/"nan".
    """
    if _is_missing_text(buc):
        return ''
    return ''.join(alphabet.get(ch, ch) for ch in str(buc))


def arabic_to_buck(ara):
    """Convert Arabic to Buckwalter.

    Args:
        ara: Arabic text. Non-string values are passed through ``str()`` first.

    Returns:
        The text with every character found in ``abjad`` replaced by its
        Buckwalter counterpart; characters without a mapping are kept
        unchanged. ``None`` and NaN yield ``''``.
    """
    if _is_missing_text(ara):
        return ''
    return ''.join(abjad.get(ch, ch) for ch in str(ara))


print("🔤 Text conversion functions ready")


In [None]:
# Define semantic categories for root classification
# Keys are category names; values are the roots assigned to that category.
# A root may deliberately appear under more than one category.
semantic_categories = {
    # Divine and Religious
    'divine_attributes': ['رحم', 'غفر', 'علم', 'قدر', 'حكم', 'عزز', 'حمد', 'سبح'],
    'worship_ritual': ['صلو', 'صوم', 'حجج', 'زكو', 'سجد', 'ركع', 'دعو', 'ذكر'],
    'faith_belief': ['امن', 'كفر', 'شرك', 'وحد', 'ايمن', 'يقن', 'شكك', 'ظنن'],
    
    # Human Relations and Society
    'family_relations': ['ابو', 'امم', 'زوج', 'ولد', 'اخو', 'عشر', 'قرب', 'رحم'],
    'social_justice': ['عدل', 'ظلم', 'قسط', 'حقق', 'انصف', 'فسد', 'صلح'],
    'commerce_economics': ['بيع', 'شرو', 'ربو', 'دين', 'قرض', 'تجر', 'كسب', 'انفق'],
    
    # Knowledge and Communication
    'knowledge_wisdom': ['علم', 'حكم', 'فهم', 'عقل', 'فكر', 'تدبر', 'ذكر', 'فقه'],
    'communication': ['قول', 'كلم', 'نطق', 'صوت', 'نداء', 'بشر', 'انذر', 'بلغ'],
    'books_revelation': ['كتب', 'قرا', 'نزل', 'وحي', 'ايت', 'تلو', 'حفظ'],
    
    # Natural World
    'creation_nature': ['خلق', 'برا', 'فطر', 'انشا', 'جعل', 'كون', 'وجد'],
    'time_temporal': ['يوم', 'ليل', 'صبح', 'مسا', 'وقت', 'زمن', 'دهر', 'عصر'],
    'natural_elements': ['سمو', 'ارض', 'ماء', 'نار', 'هوا', 'شمس', 'قمر', 'نجم'],
    
    # Actions and States
    'movement_direction': ['ذهب', 'جيا', 'رجع', 'خرج', 'دخل', 'صعد', 'نزل', 'سير'],
    'emotions_states': ['خوف', 'امن', 'حزن', 'فرح', 'غضب', 'رضو', 'حبب', 'بغض'],
    'moral_conduct': ['صبر', 'شكر', 'تقو', 'برر', 'فجر', 'كذب', 'صدق', 'امن']
}

# Summing the list lengths would count a root once per category it is listed
# under, so the "roots" figure is taken from the set of distinct roots and the
# raw number of (root, category) assignments is reported alongside it.
unique_categorized_roots = {
    listed_root for listed_roots in semantic_categories.values() for listed_root in listed_roots
}
category_assignment_count = sum(len(listed_roots) for listed_roots in semantic_categories.values())

print(f"📋 Defined {len(semantic_categories)} semantic categories")
print(f"Total categorized roots: {len(unique_categorized_roots)} ({category_assignment_count} category assignments)")


In [None]:
# Create reverse mapping: root -> category
# A root listed under a single category maps to that category name (a str);
# a root listed under several categories maps to a list of the names, in the
# order the categories are defined.
root_to_category = {}
for category, roots in semantic_categories.items():
    for root in roots:
        known = root_to_category.get(root)
        if known is None:
            # First time this root is seen.
            root_to_category[root] = category
        elif isinstance(known, list):
            # Third or later category for this root: extend the existing list.
            known.append(category)
        else:
            # Second category for this root: promote the str to a list.
            root_to_category[root] = [known, category]

def get_semantic_category(root):
    """Get semantic category for a root.

    Args:
        root: Root string to look up (any hashable value is accepted, so NaN
            from a missing root is fine).

    Returns:
        A single category name (str). Roots listed under several categories
        resolve to the first category they were listed under; roots that are
        not listed resolve to ``'uncategorized'``.

    The result is always a plain string because callers use it as a groupby
    key and sort pairs of categories — a list value breaks both.
    """
    # NOTE(review): the category lists are written in Arabic script; if the
    # Root column is stored in Buckwalter, no root will match — confirm.
    category = root_to_category.get(root, 'uncategorized')
    if isinstance(category, (list, tuple)):
        # Multi-category root: use the first-listed category as the primary one.
        return category[0]
    return category

# Test the function
# Print the lookup result for a few sample roots as a quick visual check.
test_roots = ['رحم', 'علم', 'كتب']
for sample_root in test_roots:
    sample_category = get_semantic_category(sample_root)
    print(f"Root {buck_to_arabic(sample_root)} ({sample_root}) -> {sample_category}")

print("\n✅ Root categorization system ready")


In [None]:
# Calculate comprehensive frequency statistics
print("📊 Calculating frequency analysis...")

# Row mask shared by every count below: entries that carry a root.
has_root = quran.Root.notna()

# Overall root frequencies
root_frequencies = quran.loc[has_root, 'Root'].value_counts()
top_root, top_count = root_frequencies.index[0], root_frequencies.iloc[0]
print(f"Most frequent root: {buck_to_arabic(top_root)} ({top_count} occurrences)")

# Frequency by revelation type
meccan_freq = quran.loc[(quran.Place == 'Meccan') & has_root, 'Root'].value_counts()
medinan_freq = quran.loc[(quran.Place == 'Medinan') & has_root, 'Root'].value_counts()

print(f"Meccan most frequent: {buck_to_arabic(meccan_freq.index[0])} ({meccan_freq.iloc[0]} occurrences)")
print(f"Medinan most frequent: {buck_to_arabic(medinan_freq.index[0])} ({medinan_freq.iloc[0]} occurrences)")

# Frequency by sura
sura_root_counts = (
    quran[has_root]
    .groupby('sura')
    .Root.nunique()
    .sort_values(ascending=False)
)
print(f"\nSura with most unique roots: {sura_root_counts.index[0]} ({sura_root_counts.iloc[0]} unique roots)")

# Create frequency dataframe: one row per root, in descending total frequency
# (the order value_counts returns). Roots absent from a revelation type get 0.
ordered_roots = list(root_frequencies.index)
frequency_stats = pd.DataFrame({
    'root': root_frequencies.index,
    'total_frequency': root_frequencies.values,
    'meccan_frequency': [meccan_freq.get(ranked_root, 0) for ranked_root in ordered_roots],
    'medinan_frequency': [medinan_freq.get(ranked_root, 0) for ranked_root in ordered_roots]
})

# Add relative frequencies (share of a root's occurrences per revelation type)
for place_prefix in ('meccan', 'medinan'):
    frequency_stats[f'{place_prefix}_ratio'] = (
        frequency_stats[f'{place_prefix}_frequency'] / frequency_stats['total_frequency']
    )

# Add semantic categories
frequency_stats['semantic_category'] = frequency_stats['root'].apply(get_semantic_category)

# Add Arabic forms
frequency_stats['root_arabic'] = frequency_stats['root'].apply(buck_to_arabic)

print(f"\n✅ Frequency analysis complete for {len(frequency_stats)} roots")
frequency_stats.head(10)


In [None]:
# Advanced frequency analysis
print("🔍 Advanced frequency patterns...")

# Only roots seen often enough for a ratio to be meaningful are considered.
has_min_occurrences = frequency_stats['total_frequency'] >= 5  # At least 5 occurrences

# Roots with strong Meccan preference (>80% Meccan)
is_mostly_meccan = frequency_stats['meccan_ratio'] > 0.8
meccan_preferred = (
    frequency_stats
    .loc[is_mostly_meccan & has_min_occurrences]
    .sort_values('meccan_ratio', ascending=False)
)

print(f"\n🕌 Strongly Meccan-preferred roots (>80%, ≥5 occurrences): {len(meccan_preferred)}")
print("Top 10:")
for entry in meccan_preferred.head(10).to_dict('records'):
    print(f"  {entry['root_arabic']:>8} ({entry['root']:>6}) - {entry['meccan_ratio']:.1%} Meccan ({entry['total_frequency']} total)")

# Roots with strong Medinan preference (>80% Medinan)
is_mostly_medinan = frequency_stats['medinan_ratio'] > 0.8
medinan_preferred = (
    frequency_stats
    .loc[is_mostly_medinan & has_min_occurrences]
    .sort_values('medinan_ratio', ascending=False)
)

print(f"\n🏛️ Strongly Medinan-preferred roots (>80%, ≥5 occurrences): {len(medinan_preferred)}")
print("Top 10:")
for entry in medinan_preferred.head(10).to_dict('records'):
    print(f"  {entry['root_arabic']:>8} ({entry['root']:>6}) - {entry['medinan_ratio']:.1%} Medinan ({entry['total_frequency']} total)")

# Rare roots (appear only 1-2 times)
is_rare = frequency_stats['total_frequency'] <= 2
rare_roots = frequency_stats.loc[is_rare]
rare_share = len(rare_roots) / len(frequency_stats)
print(f"\n🔹 Rare roots (≤2 occurrences): {len(rare_roots)} ({rare_share:.1%} of all roots)")


In [None]:
# Analyze frequency patterns by semantic category
print("🎯 Thematic categorization analysis...")

# Category frequency distribution: named aggregation yields the flat column
# names directly, one row per semantic category.
category_stats = (
    frequency_stats
    .groupby('semantic_category')
    .agg(
        root_count=('total_frequency', 'count'),
        total_occurrences=('total_frequency', 'sum'),
        avg_frequency=('total_frequency', 'mean'),
        meccan_total=('meccan_frequency', 'sum'),
        medinan_total=('medinan_frequency', 'sum'),
    )
    .round(2)
)

# Meccan share among occurrences that have a Meccan/Medinan attribution.
attributed_total = category_stats['meccan_total'] + category_stats['medinan_total']
category_stats['meccan_ratio'] = category_stats['meccan_total'] / attributed_total
category_stats = category_stats.sort_values('total_occurrences', ascending=False)

print("\n📊 Semantic category statistics:")
print(category_stats)

# Find categories with strong revelation type preferences
print("\n🎭 Categories with revelation preferences:")
for category, meccan_share in category_stats['meccan_ratio'].items():
    if meccan_share > 0.7:
        print(f"  🕌 {category}: {meccan_share:.1%} Meccan")
    elif meccan_share < 0.3:
        print(f"  🏛️ {category}: {1-meccan_share:.1%} Medinan")


In [None]:
# Create thematic distribution by sura
print("📚 Thematic distribution by sura...")

# Add semantic categories to main dataset (on a copy; `quran` stays untouched)
quran_enhanced = quran.copy()
quran_enhanced['semantic_category'] = quran_enhanced['Root'].apply(get_semantic_category)

# Calculate category distribution by sura: rows are suras, columns are
# categories, cells count the root-bearing entries.
rooted_entries = quran_enhanced[quran_enhanced.Root.notna()]
theme_counts = rooted_entries.groupby(['sura', 'semantic_category']).size()
sura_theme_distribution = theme_counts.unstack(fill_value=0)

# Calculate relative distributions (each sura's row sums to 1)
entries_per_sura = sura_theme_distribution.sum(axis=1)
sura_theme_relative = sura_theme_distribution.div(entries_per_sura, axis=0)

print(f"✅ Thematic analysis complete for {len(sura_theme_distribution)} suras")
print(f"Categories tracked: {list(sura_theme_distribution.columns)}")

# Show example for Al-Fatiha
print("\n📖 Example - Al-Fatiha (Sura 1) thematic breakdown:")
if 1 in sura_theme_relative.index:
    fatiha_themes = sura_theme_relative.loc[1]
    present_themes = fatiha_themes[fatiha_themes > 0].sort_values(ascending=False)
    for theme, ratio in present_themes.items():
        print(f"  {theme}: {ratio:.1%}")


In [None]:
# Calculate root co-occurrences within verses
print("🔗 Calculating root co-occurrence matrices...")

# Group by verse to find roots that appear together
verse_roots = quran_enhanced[quran_enhanced.Root.notna()].groupby(['sura', 'aya'])['Root'].apply(list).reset_index()
verse_roots['root_count'] = verse_roots['Root'].apply(len)

print(f"Analyzed {len(verse_roots)} verses with roots")
print(f"Average roots per verse: {verse_roots['root_count'].mean():.1f}")
print(f"Max roots in a verse: {verse_roots['root_count'].max()}")

# Calculate co-occurrence matrix
# cooccurrence_counts maps an alphabetically ordered (root1, root2) pair to the
# number of verses containing both; total_pairs is the sum of those counts.
cooccurrence_counts = defaultdict(int)
total_pairs = 0

for roots_in_verse in verse_roots['Root']:
    # Deduplicate within the verse, then sort: iterating a raw set of strings
    # follows hash order, which changes between interpreter runs, whereas
    # combinations() over a sorted list always yields pairs in the same order
    # and already as (smaller, larger).
    distinct_roots = sorted(set(roots_in_verse))
    for pair in combinations(distinct_roots, 2):
        cooccurrence_counts[pair] += 1
        total_pairs += 1

print(f"\n🔢 Found {len(cooccurrence_counts)} unique root pairs")
print(f"Total co-occurrence instances: {total_pairs}")

# Convert to DataFrame for analysis
# Explicit columns keep the frame well-formed when no pair exists; the root
# columns act as tie-breakers so equal counts always come out in the same order.
cooccurrence_df = pd.DataFrame(
    [(pair[0], pair[1], count) for pair, count in cooccurrence_counts.items()],
    columns=['root1', 'root2', 'cooccurrence_count'],
).sort_values(['cooccurrence_count', 'root1', 'root2'], ascending=[False, True, True])

print("\n🔝 Top 10 most co-occurring root pairs:")
for _, row in cooccurrence_df.head(10).iterrows():
    r1_ar = buck_to_arabic(row['root1'])
    r2_ar = buck_to_arabic(row['root2'])
    print(f"  {r1_ar} + {r2_ar}: {row['cooccurrence_count']} times")


In [None]:
# Calculate semantic co-occurrence patterns
print("🎭 Semantic category co-occurrence analysis...")

# Add semantic categories to co-occurrence data
for side in ('1', '2'):
    cooccurrence_df[f'category{side}'] = cooccurrence_df[f'root{side}'].apply(get_semantic_category)

# Calculate category-level co-occurrences: sum the root-pair counts per
# alphabetically ordered category pair.
category_cooccurrence = defaultdict(int)
pair_columns = zip(
    cooccurrence_df['category1'],
    cooccurrence_df['category2'],
    cooccurrence_df['cooccurrence_count'],
)
for first_category, second_category, pair_count in pair_columns:
    cat_pair = tuple(sorted([first_category, second_category]))
    category_cooccurrence[cat_pair] += pair_count

category_cooccurrence_df = pd.DataFrame([
    {'category1': pair[0], 'category2': pair[1], 'total_cooccurrence': count}
    for pair, count in category_cooccurrence.items()
    if 'uncategorized' not in pair  # Filter out uncategorized
]).sort_values('total_cooccurrence', ascending=False)

print("\n🎯 Top semantic category co-occurrences:")
for entry in category_cooccurrence_df.head(15).to_dict('records'):
    if entry['category1'] == entry['category2']:  # Same category (internal consistency)
        print(f"  {entry['category1']} (internal): {entry['total_cooccurrence']} co-occurrences")
    else:  # Different categories
        print(f"  {entry['category1']} + {entry['category2']}: {entry['total_cooccurrence']} co-occurrences")


In [None]:
# Create comprehensive enhanced dataset
print("💎 Creating enhanced dataset with all metadata...")

# Merge frequency statistics with root data
frequency_lookup = frequency_stats.set_index('root').to_dict('index')

# Rank of each root = its row label in frequency_stats + 1. Built once here so
# add_frequency_info does a dict lookup instead of scanning the whole frame
# for every corpus entry. setdefault keeps the first row for a root, matching
# a "first match" lookup.
frequency_rank_lookup = {}
for _row_label, _ranked_root in zip(frequency_stats.index, frequency_stats['root']):
    frequency_rank_lookup.setdefault(_ranked_root, _row_label + 1)


def add_frequency_info(root):
    """Return the frequency metadata for one root.

    Args:
        root: Root value from the corpus; may be NaN for entries without a root.

    Returns:
        dict with keys ``total_freq``, ``meccan_freq``, ``medinan_freq``,
        ``meccan_ratio`` and ``frequency_rank``. All values are 0 when the
        root is missing or not present in ``frequency_lookup``.
    """
    if pd.isna(root) or root not in frequency_lookup:
        return {'total_freq': 0, 'meccan_freq': 0, 'medinan_freq': 0, 'meccan_ratio': 0, 'frequency_rank': 0}
    info = frequency_lookup[root]
    return {
        'total_freq': info['total_frequency'],
        'meccan_freq': info['meccan_frequency'],
        'medinan_freq': info['medinan_frequency'],
        'meccan_ratio': info['meccan_ratio'],
        'frequency_rank': frequency_rank_lookup[root]
    }

# Add all enhancements to the dataset
enhanced_quran = quran_enhanced.copy()

# Add frequency information: one metadata dict per entry, then one
# `root_<field>` column per dict key.
freq_info = enhanced_quran['Root'].apply(add_frequency_info)
frequency_fields = ('total_freq', 'meccan_freq', 'medinan_freq', 'meccan_ratio', 'frequency_rank')
for field in frequency_fields:
    field_values = [entry_info[field] for entry_info in freq_info]
    enhanced_quran['root_' + field] = field_values

# Add Arabic root form (empty string for entries without a root)
enhanced_quran['root_arabic'] = enhanced_quran['Root'].apply(
    lambda value: '' if pd.isna(value) else buck_to_arabic(value)
)

# Add rarity classification
def classify_rarity(freq):
    """Map a root's total frequency to a rarity label.

    0 -> 'no_root', 1 -> 'hapax_legomena', up to 5 -> 'very_rare',
    up to 20 -> 'rare', up to 100 -> 'common', anything else -> 'very_common'.
    """
    if freq == 0:
        return 'no_root'
    if freq == 1:
        return 'hapax_legomena'
    # Inclusive upper bounds, checked from the smallest band upwards.
    for upper_bound, label in ((5, 'very_rare'), (20, 'rare'), (100, 'common')):
        if freq <= upper_bound:
            return label
    return 'very_common'

# Label every entry with the rarity band of its root's total frequency.
root_totals = enhanced_quran['root_total_freq']
enhanced_quran['root_rarity'] = root_totals.apply(classify_rarity)

# Add revelation preference classification
def classify_revelation_preference(ratio, total_freq):
    """Label a root by its Meccan share.

    Args:
        ratio: Fraction of the root's occurrences that are Meccan.
        total_freq: Total occurrences of the root.

    Returns:
        'insufficient_data' when total_freq is below 3; otherwise
        'strongly_meccan' (> 0.8), 'meccan_leaning' (> 0.6),
        'strongly_medinan' (< 0.2), 'medinan_leaning' (< 0.4) or 'balanced'.
    """
    if total_freq < 3:
        return 'insufficient_data'
    if ratio > 0.8:
        return 'strongly_meccan'
    if ratio > 0.6:
        return 'meccan_leaning'
    if ratio < 0.2:
        return 'strongly_medinan'
    if ratio < 0.4:
        return 'medinan_leaning'
    return 'balanced'

def _preference_for_entry(entry):
    """Classify one corpus row from its root's Meccan ratio and total frequency."""
    return classify_revelation_preference(entry['root_meccan_ratio'], entry['root_total_freq'])

enhanced_quran['revelation_preference'] = enhanced_quran.apply(_preference_for_entry, axis=1)

# Columns present in the enhanced frame but not in the originally loaded one.
added_columns = [name for name in enhanced_quran.columns if name not in quran.columns]
print(f"✅ Enhanced dataset created with {len(enhanced_quran)} entries")
print(f"New columns added: {added_columns}")


In [None]:
# Save enhanced datasets
print("💾 Saving enhanced datasets...")

# Each step writes one or more frames and then prints its confirmation.
# A table entry is (frame, file name, write_index): the index is written only
# for the frames whose index carries information (category / sura labels).
export_steps = [
    # Main enhanced dataset
    ([(enhanced_quran, 'quran-enhanced-phase1.csv', False)],
     "✅ Saved main enhanced dataset: quran-enhanced-phase1.csv"),
    # Frequency statistics
    ([(frequency_stats, 'root-frequency-analysis.csv', False)],
     "✅ Saved frequency analysis: root-frequency-analysis.csv"),
    # Semantic category analysis
    ([(category_stats, 'semantic-category-analysis.csv', True)],
     "✅ Saved category analysis: semantic-category-analysis.csv"),
    # Co-occurrence matrices
    ([(cooccurrence_df, 'root-cooccurrence-matrix.csv', False),
      (category_cooccurrence_df, 'category-cooccurrence-matrix.csv', False)],
     "✅ Saved co-occurrence matrices"),
    # Thematic distribution by sura
    ([(sura_theme_distribution, 'sura-thematic-distribution.csv', True),
      (sura_theme_relative, 'sura-thematic-relative.csv', True)],
     "✅ Saved thematic distributions"),
]

for tables, confirmation in export_steps:
    for frame, file_name, write_index in tables:
        frame.to_csv(file_name, index=write_index)
    print(confirmation)

# Create metadata summary
# Every count is cast to a built-in int: pandas reductions such as
# `.notna().sum()` return numpy integers, which the json module cannot encode.
metadata_summary = {
    'dataset_info': {
        'total_entries': len(enhanced_quran),
        'entries_with_roots': int(enhanced_quran['Root'].notna().sum()),
        'unique_roots': int(enhanced_quran['Root'].nunique()),
        'enhancement_date': pd.Timestamp.now().isoformat()
    },
    'semantic_categories': list(semantic_categories.keys()),
    'frequency_statistics': {
        'most_frequent_root': frequency_stats.iloc[0]['root'],
        'most_frequent_count': int(frequency_stats.iloc[0]['total_frequency']),
        'hapax_legomena_count': len(frequency_stats[frequency_stats['total_frequency'] == 1]),
        'rare_roots_count': len(frequency_stats[frequency_stats['total_frequency'] <= 5])
    },
    'co_occurrence_stats': {
        'unique_pairs': len(cooccurrence_df),
        'total_co_occurrences': int(total_pairs),
        'top_pair': (cooccurrence_df.iloc[0]['root1'], cooccurrence_df.iloc[0]['root2']),
        'top_pair_count': int(cooccurrence_df.iloc[0]['cooccurrence_count'])
    }
}

# Serialize before opening the file so an encoding error cannot leave a
# truncated enhancement-metadata.json behind.
metadata_json = json.dumps(metadata_summary, indent=2, ensure_ascii=False)
with open('enhancement-metadata.json', 'w', encoding='utf-8') as f:
    f.write(metadata_json)
print("✅ Saved metadata summary: enhancement-metadata.json")


In [None]:
# Final comprehensive summary
print("📋 PHASE 1 ENHANCEMENT SUMMARY")
print("=" * 50)

print("\n📊 Dataset Enhancements:")
print(f"  Original entries: {len(quran):,}")
print(f"  Enhanced entries: {len(enhanced_quran):,}")
print(f"  New metadata columns: {len(enhanced_quran.columns) - len(quran.columns)}")

# Count distinct roots: several roots are listed under more than one category,
# so summing the per-category list lengths would overstate this figure.
distinct_categorized_roots = {
    listed_root for listed_roots in semantic_categories.values() for listed_root in listed_roots
}
# Share of ALL entries (including those without a root) that received a category.
categorized_entries = (enhanced_quran['semantic_category'] != 'uncategorized').sum()

print("\n🎯 Semantic Analysis:")
print(f"  Categories defined: {len(semantic_categories)}")
print(f"  Roots categorized: {len(distinct_categorized_roots)}")
print(f"  Category coverage: {categorized_entries / len(enhanced_quran) * 100:.1f}%")

print("\n📈 Frequency Analysis:")
print(f"  Unique roots analyzed: {len(frequency_stats)}")
print(f"  Hapax legomena: {len(frequency_stats[frequency_stats['total_frequency'] == 1])}")
print(f"  Strongly Meccan roots: {len(meccan_preferred)}")
print(f"  Strongly Medinan roots: {len(medinan_preferred)}")

print("\n🔗 Co-occurrence Analysis:")
print(f"  Verses analyzed: {len(verse_roots)}")
print(f"  Root pairs found: {len(cooccurrence_df)}")
print(f"  Category pairs: {len(category_cooccurrence_df)}")

print("\n💾 Files Created:")
files_created = [
    'quran-enhanced-phase1.csv',
    'root-frequency-analysis.csv', 
    'semantic-category-analysis.csv',
    'root-cooccurrence-matrix.csv',
    'category-cooccurrence-matrix.csv',
    'sura-thematic-distribution.csv',
    'sura-thematic-relative.csv',
    'enhancement-metadata.json'
]
for file in files_created:
    print(f"  ✅ {file}")

print("\n🚀 Ready for Phase 2: Backend API Development")
print("\n✨ Phase 1 Enhancement Complete! ✨")
