In [1]:
# Enhanced imports for advanced analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict

# Optional imports with fallbacks
try:
    import networkx as nx
    HAS_NETWORKX = True
    print("✅ NetworkX available for network analysis")
except ImportError:
    print("⚠️ NetworkX not installed. Network analysis features will be skipped.")
    print("💡 To install: pip install networkx")
    HAS_NETWORKX = False

from itertools import combinations
import json
from functools import lru_cache
import warnings
warnings.filterwarnings('ignore')

# Set plotting style with fallback
try:
    plt.style.use('seaborn-v0_8')
    print("✅ Using seaborn-v0_8 style")
except OSError:
    try:
        plt.style.use('seaborn')
        print("✅ Using seaborn style (fallback)")
    except:
        print("⚠️ Using default matplotlib style")

sns.set_palette("husl")
print("📚 Enhanced libraries imported successfully!")


⚠️ NetworkX not installed. Network analysis features will be skipped.
💡 To install: pip install networkx
✅ Using seaborn-v0_8 style
📚 Enhanced libraries imported successfully!


In [2]:
# Load the processed data from previous analysis
print("📥 Loading processed Quranic data...")
try:
    quran = pd.read_csv('quran-morphology-final.csv')
    print(f"✅ Loaded {len(quran):,} entries")
except FileNotFoundError:
    print("❌ Base data file not found. Please run quran_corrected.ipynb first.")
    raise

# Load table of contents
toc = pd.read_csv('toc.csv')
print(f"✅ Loaded TOC with {len(toc)} suras")

# Display basic info
print(f"\n📊 Dataset Overview:")
print(f"Total entries: {len(quran):,}")
print(f"Entries with roots: {quran.Root.notna().sum():,}")
print(f"Unique roots: {quran.Root.nunique():,}")
print(f"Meccan suras: {len(toc[toc.Place == 'Meccan'])}")
print(f"Medinan suras: {len(toc[toc.Place == 'Medinan'])}")


📥 Loading processed Quranic data...
✅ Loaded 128,219 entries
✅ Loaded TOC with 114 suras

📊 Dataset Overview:
Total entries: 128,219
Entries with roots: 49,968
Unique roots: 1,642
Meccan suras: 86
Medinan suras: 28


In [3]:
# Set up Buckwalter conversion functions (from previous notebook)
abjad = {
    "\u0627": 'A', "\u0628": 'b', "\u062A": 't', "\u062B": 'v', "\u062C": 'j',
    "\u062D": 'H', "\u062E": 'x', "\u062F": 'd', "\u0630": '*', "\u0631": 'r',
    "\u0632": 'z', "\u0633": 's', "\u0634": '$', "\u0635": 'S', "\u0636": 'D',
    "\u0637": 'T', "\u0638": 'Z', "\u0639": 'E', "\u063A": 'g', "\u0641": 'f',
    "\u0642": 'q', "\u0643": 'k', "\u0644": 'l', "\u0645": 'm', "\u0646": 'n',
    "\u0647": 'h', "\u0648": 'w', "\u0649": 'Y', "\u064A": 'y',
    ' ': ' ', "\u0621": "'", "\u0623": '>', "\u0625": '<', "\u0624": '&',
    "\u0626": '}', "\u0622": '|', "\u064E": 'a', "\u064F": 'u', "\u0650": 'i',
    "\u0651": '~', "\u0652": 'o', "\u064B": 'F', "\u064C": 'N', "\u064D": 'K',
    "\u0640": '_', "\u0670": '`', "\u0629": 'p'
}

alphabet = {v: k for k, v in abjad.items()}

def buck_to_arabic(buc):
    """Convert Buckwalter to Arabic"""
    try:
        return ''.join(alphabet.get(x, x) for x in str(buc))
    except:
        return str(buc)

def arabic_to_buck(ara):
    """Convert Arabic to Buckwalter"""
    try:
        return ''.join(abjad.get(x, x) for x in str(ara))
    except:
        return str(ara)

print("🔤 Text conversion functions ready")


🔤 Text conversion functions ready


In [4]:
# Define semantic categories for root classification (in Buckwalter transliteration)
semantic_categories = {
    # Divine and Religious
    'divine_attributes': ['rHm', 'gfr', 'Elm', 'qdr', 'Hkm', 'Ezz', 'Hmd', 'sbH', 'qds', 'wHd'],
    'worship_ritual': ['Slw', 'Swm', 'Hjj', 'zkw', 'sjd', 'rkE', 'dEw', '*kr', 'Ebd'],
    'faith_belief': ['>mn', 'kfr', '$rk', 'wHd', '<mn', 'yqn', '$kk', 'Znn', 'Slm'],
    
    # Human Relations and Society
    'family_relations': ['>bw', '>mm', 'zwj', 'wld', '>xw', 'E$r', 'qrb', 'rHm', 'Ahl'],
    'social_justice': ['Edl', 'Zlm', 'qsT', 'Hqq', 'nSf', 'fsd', 'SlH', '>mn'],
    'commerce_economics': ['byE', '$rw', 'rbw', 'dyn', 'qrD', 'tjr', 'ksb', 'nfq', 'ml'],
    
    # Knowledge and Communication
    'knowledge_wisdom': ['Elm', 'Hkm', 'fhm', 'Eql', 'fkr', 'dbr', '*kr', 'fqh', 'E*r'],
    'communication': ['qwl', 'klm', 'nTq', 'Swt', 'ndw', 'b$r', 'n*r', 'blg', 'xbr'],
    'books_revelation': ['ktb', 'qr>', 'nzl', 'wHy', '>yt', 'tlw', 'HfZ', 'ktp'],
    
    # Natural World
    'creation_nature': ['xlq', 'br>', 'fTr', 'n$>', 'jEl', 'kwn', 'wjd', 'xrj'],
    'time_temporal': ['ywm', 'lyl', 'SbH', 'ms>', 'wqt', 'zmn', 'dhr', 'ESr', 'sEp'],
    'natural_elements': ['smw', '>rD', 'm>', 'nwr', 'hw>', '$ms', 'qmr', 'njm', 'jbl'],
    
    # Actions and States
    'movement_direction': ['*hb', 'jy>', 'rjE', 'xrj', 'dxl', 'SEd', 'nzl', 'syr', 'Sbr'],
    'emotions_states': ['xwf', '>mn', 'Hzn', 'frH', 'gDb', 'rDw', 'Hbb', 'bgD', 'fzE'],
    'moral_conduct': ['Sbr', '$kr', 'tqw', 'brr', 'fjr', 'k*b', 'Sdq', '>mn', 'HSn']
}

print(f"📋 Defined {len(semantic_categories)} semantic categories")
print(f"Total categorized roots: {sum(len(roots) for roots in semantic_categories.values())}")

# Show some examples of the categories
print(f"\n🔍 Sample categorization (Buckwalter → Arabic):")
sample_roots = ['rHm', 'Elm', 'ktb', 'Slw', 'smw']
for root in sample_roots:
    if any(root in category_roots for category_roots in semantic_categories.values()):
        category = next(cat for cat, roots in semantic_categories.items() if root in roots)
        print(f"  {root} → {buck_to_arabic(root)} ({category})")
    else:
        print(f"  {root} → {buck_to_arabic(root)} (uncategorized)")


📋 Defined 15 semantic categories
Total categorized roots: 133

🔍 Sample categorization (Buckwalter → Arabic):
  rHm → رحم (divine_attributes)
  Elm → علم (divine_attributes)
  ktb → كتب (books_revelation)
  Slw → صلو (worship_ritual)
  smw → سمو (natural_elements)


In [5]:
# Create reverse mapping: root -> category (using first category found)
root_to_category = {}
for category, roots in semantic_categories.items():
    for root in roots:
        if root not in root_to_category:  # Only assign if not already assigned
            root_to_category[root] = category

def get_semantic_category(root):
    """Get semantic category for a root"""
    category = root_to_category.get(root, 'uncategorized')
    # Ensure we always return a string, not a list (for pandas compatibility)
    if isinstance(category, list):
        return category[0]  # Take first category if multiple
    return category

# Test the function with Buckwalter roots
test_roots = ['rHm', 'Elm', 'ktb', 'Slw', 'smw']
print(f"\n🔍 Testing categorization (Buckwalter → Arabic):")
for root in test_roots:
    category = get_semantic_category(root)
    print(f"  {root} → {buck_to_arabic(root)} ({category})")

print(f"\n📊 Categorization stats:")
categorized_count = len(root_to_category)
total_categories = len(semantic_categories)
print(f"  Categories defined: {total_categories}")
print(f"  Roots categorized: {categorized_count}")
print(f"\n✅ Root categorization system ready")



🔍 Testing categorization (Buckwalter → Arabic):
  rHm → رحم (divine_attributes)
  Elm → علم (divine_attributes)
  ktb → كتب (books_revelation)
  Slw → صلو (worship_ritual)
  smw → سمو (natural_elements)

📊 Categorization stats:
  Categories defined: 15
  Roots categorized: 122

✅ Root categorization system ready


In [6]:
# Calculate comprehensive frequency statistics
print("📊 Calculating frequency analysis...")

# Overall root frequencies
root_frequencies = quran[quran.Root.notna()].Root.value_counts()
print(f"Most frequent root: {buck_to_arabic(root_frequencies.index[0])} ({root_frequencies.iloc[0]} occurrences)")

# Frequency by revelation type
meccan_freq = quran[(quran.Place == 'Meccan') & (quran.Root.notna())].Root.value_counts()
medinan_freq = quran[(quran.Place == 'Medinan') & (quran.Root.notna())].Root.value_counts()

print(f"Meccan most frequent: {buck_to_arabic(meccan_freq.index[0])} ({meccan_freq.iloc[0]} occurrences)")
print(f"Medinan most frequent: {buck_to_arabic(medinan_freq.index[0])} ({medinan_freq.iloc[0]} occurrences)")

# Frequency by sura
sura_root_counts = quran[quran.Root.notna()].groupby('sura').Root.nunique().sort_values(ascending=False)
print(f"\nSura with most unique roots: {sura_root_counts.index[0]} ({sura_root_counts.iloc[0]} unique roots)")

# Create frequency dataframe
frequency_stats = pd.DataFrame({
    'root': root_frequencies.index,
    'total_frequency': root_frequencies.values,
    'meccan_frequency': [meccan_freq.get(root, 0) for root in root_frequencies.index],
    'medinan_frequency': [medinan_freq.get(root, 0) for root in root_frequencies.index]
})

# Add relative frequencies
frequency_stats['meccan_ratio'] = frequency_stats['meccan_frequency'] / frequency_stats['total_frequency']
frequency_stats['medinan_ratio'] = frequency_stats['medinan_frequency'] / frequency_stats['total_frequency']

# Add semantic categories
frequency_stats['semantic_category'] = frequency_stats['root'].apply(get_semantic_category)

# Add Arabic forms
frequency_stats['root_arabic'] = frequency_stats['root'].apply(buck_to_arabic)

print(f"\n✅ Frequency analysis complete for {len(frequency_stats)} roots")
frequency_stats.head(10)


📊 Calculating frequency analysis...
Most frequent root: اله (2851 occurrences)
Meccan most frequent: قول (1248 occurrences)
Medinan most frequent: اله (1759 occurrences)

Sura with most unique roots: 2 (585 unique roots)

✅ Frequency analysis complete for 1642 roots


Unnamed: 0,root,total_frequency,meccan_frequency,medinan_frequency,meccan_ratio,medinan_ratio,semantic_category,root_arabic
0,Alh,2851,1092,1759,0.383024,0.616976,uncategorized,اله
1,qwl,1722,1248,474,0.724739,0.275261,communication,قول
2,kwn,1390,920,470,0.661871,0.338129,creation_nature,كون
3,rbb,980,762,218,0.777551,0.222449,uncategorized,ربب
4,Amn,879,379,500,0.431172,0.568828,uncategorized,امن
5,Elm,854,495,359,0.579625,0.420375,divine_attributes,علم
6,qwm,660,454,206,0.687879,0.312121,uncategorized,قوم
7,Aty,549,339,210,0.617486,0.382514,uncategorized,اتي
8,kfr,525,233,292,0.44381,0.55619,faith_belief,كفر
9,byn,523,304,219,0.581262,0.418738,uncategorized,بين


In [7]:
# Advanced frequency analysis
print("🔍 Advanced frequency patterns...")

# Roots with strong Meccan preference (>80% Meccan)
meccan_preferred = frequency_stats[
    (frequency_stats['meccan_ratio'] > 0.8) & 
    (frequency_stats['total_frequency'] >= 5)  # At least 5 occurrences
].sort_values('meccan_ratio', ascending=False)

print(f"\n🕌 Strongly Meccan-preferred roots (>80%, ≥5 occurrences): {len(meccan_preferred)}")
print("Top 10:")
for _, row in meccan_preferred.head(10).iterrows():
    print(f"  {row['root_arabic']:>8} ({row['root']:>6}) - {row['meccan_ratio']:.1%} Meccan ({row['total_frequency']} total)")

# Roots with strong Medinan preference (>80% Medinan)
medinan_preferred = frequency_stats[
    (frequency_stats['medinan_ratio'] > 0.8) & 
    (frequency_stats['total_frequency'] >= 5)
].sort_values('medinan_ratio', ascending=False)

print(f"\n🏛️ Strongly Medinan-preferred roots (>80%, ≥5 occurrences): {len(medinan_preferred)}")
print("Top 10:")
for _, row in medinan_preferred.head(10).iterrows():
    print(f"  {row['root_arabic']:>8} ({row['root']:>6}) - {row['medinan_ratio']:.1%} Medinan ({row['total_frequency']} total)")

# Rare roots (appear only 1-2 times)
rare_roots = frequency_stats[frequency_stats['total_frequency'] <= 2]
print(f"\n🔹 Rare roots (≤2 occurrences): {len(rare_roots)} ({len(rare_roots)/len(frequency_stats):.1%} of all roots)")


🔍 Advanced frequency patterns...

🕌 Strongly Meccan-preferred roots (>80%, ≥5 occurrences): 179
Top 10:
       فطر (   fTr) - 100.0% Meccan (20 total)
       كشف (   k$f) - 100.0% Meccan (20 total)
       ضحو (   DHw) - 100.0% Meccan (7 total)
       فرط (   frT) - 100.0% Meccan (8 total)
       سري (   sry) - 100.0% Meccan (8 total)
       خسف (   xsf) - 100.0% Meccan (8 total)
       مهل (   mhl) - 100.0% Meccan (6 total)
       اسف (   Asf) - 100.0% Meccan (5 total)
       طرد (   Trd) - 100.0% Meccan (5 total)
       صنم (   Snm) - 100.0% Meccan (5 total)

🏛️ Strongly Medinan-preferred roots (>80%, ≥5 occurrences): 50
Top 10:
       لوي (   lwy) - 100.0% Medinan (5 total)
       حرف (   Hrf) - 100.0% Medinan (6 total)
       شطر (   $Tr) - 100.0% Medinan (5 total)
       شحح (   $HH) - 100.0% Medinan (5 total)
       اسر (   Asr) - 100.0% Medinan (6 total)
       ميل (   myl) - 100.0% Medinan (6 total)
       ثقف (   vqf) - 100.0% Medinan (6 total)
      زلزل (  zlzl) - 100.0% Medi

In [8]:
# Analyze frequency patterns by semantic category
print("🎯 Thematic categorization analysis...")

# Category frequency distribution
category_stats = frequency_stats.groupby('semantic_category').agg({
    'total_frequency': ['count', 'sum', 'mean'],
    'meccan_frequency': 'sum',
    'medinan_frequency': 'sum'
}).round(2)

category_stats.columns = ['root_count', 'total_occurrences', 'avg_frequency', 'meccan_total', 'medinan_total']
category_stats['meccan_ratio'] = category_stats['meccan_total'] / (category_stats['meccan_total'] + category_stats['medinan_total'])
category_stats = category_stats.sort_values('total_occurrences', ascending=False)

print("\n📊 Semantic category statistics:")
print(category_stats)

# Find categories with strong revelation type preferences
print("\n🎭 Categories with revelation preferences:")
for category, row in category_stats.iterrows():
    if row['meccan_ratio'] > 0.7:
        print(f"  🕌 {category}: {row['meccan_ratio']:.1%} Meccan")
    elif row['meccan_ratio'] < 0.3:
        print(f"  🏛️ {category}: {1-row['meccan_ratio']:.1%} Medinan")


🎯 Thematic categorization analysis...

📊 Semantic category statistics:
                    root_count  total_occurrences  avg_frequency  \
semantic_category                                                  
uncategorized             1541              36140          23.45   
creation_nature              6               2306         384.33   
communication                9               2252         250.22   
divine_attributes           10               2121         212.10   
worship_ritual               9               1089         121.00   
social_justice               7                892         127.43   
faith_belief                 5                805         161.00   
books_revelation             5                797         159.40   
natural_elements             6                689         114.83   
moral_conduct                6                586          97.67   
time_temporal                6                562          93.67   
family_relations             5               

In [9]:
# Create thematic distribution by sura
print("📚 Thematic distribution by sura...")

# Add semantic categories to main dataset
quran_enhanced = quran.copy()
quran_enhanced['semantic_category'] = quran_enhanced['Root'].apply(get_semantic_category)

# Calculate category distribution by sura
sura_theme_distribution = quran_enhanced[quran_enhanced.Root.notna()].groupby(['sura', 'semantic_category']).size().unstack(fill_value=0)

# Calculate relative distributions
sura_theme_relative = sura_theme_distribution.div(sura_theme_distribution.sum(axis=1), axis=0)

print(f"✅ Thematic analysis complete for {len(sura_theme_distribution)} suras")
print(f"Categories tracked: {list(sura_theme_distribution.columns)}")

# Show example for Al-Fatiha
print("\n📖 Example - Al-Fatiha (Sura 1) thematic breakdown:")
if 1 in sura_theme_relative.index:
    fatiha_themes = sura_theme_relative.loc[1]
    for theme, ratio in fatiha_themes[fatiha_themes > 0].sort_values(ascending=False).items():
        print(f"  {theme}: {ratio:.1%}")


📚 Thematic distribution by sura...
✅ Thematic analysis complete for 114 suras
Categories tracked: ['books_revelation', 'commerce_economics', 'communication', 'creation_nature', 'divine_attributes', 'emotions_states', 'faith_belief', 'family_relations', 'knowledge_wisdom', 'moral_conduct', 'movement_direction', 'natural_elements', 'social_justice', 'time_temporal', 'uncategorized', 'worship_ritual']

📖 Example - Al-Fatiha (Sura 1) thematic breakdown:
  uncategorized: 52.2%
  divine_attributes: 26.1%
  commerce_economics: 4.3%
  emotions_states: 4.3%
  natural_elements: 4.3%
  time_temporal: 4.3%
  worship_ritual: 4.3%


In [10]:
# Calculate root co-occurrences within verses
print("🔗 Calculating root co-occurrence matrices...")

# Group by verse to find roots that appear together
verse_roots = quran_enhanced[quran_enhanced.Root.notna()].groupby(['sura', 'aya'])['Root'].apply(list).reset_index()
verse_roots['root_count'] = verse_roots['Root'].apply(len)

print(f"Analyzed {len(verse_roots)} verses with roots")
print(f"Average roots per verse: {verse_roots['root_count'].mean():.1f}")
print(f"Max roots in a verse: {verse_roots['root_count'].max()}")

# Calculate co-occurrence matrix
cooccurrence_counts = defaultdict(int)
total_pairs = 0

for _, row in verse_roots.iterrows():
    roots_in_verse = list(set(row['Root']))  # Remove duplicates within verse
    if len(roots_in_verse) > 1:
        for root1, root2 in combinations(roots_in_verse, 2):
            # Sort pair to ensure consistent ordering
            pair = tuple(sorted([root1, root2]))
            cooccurrence_counts[pair] += 1
            total_pairs += 1

print(f"\n🔢 Found {len(cooccurrence_counts)} unique root pairs")
print(f"Total co-occurrence instances: {total_pairs}")

# Convert to DataFrame for analysis
cooccurrence_df = pd.DataFrame([
    {'root1': pair[0], 'root2': pair[1], 'cooccurrence_count': count}
    for pair, count in cooccurrence_counts.items()
]).sort_values('cooccurrence_count', ascending=False)

print("\n🔝 Top 10 most co-occurring root pairs:")
for _, row in cooccurrence_df.head(10).iterrows():
    r1_ar = buck_to_arabic(row['root1'])
    r2_ar = buck_to_arabic(row['root2'])
    print(f"  {r1_ar} + {r2_ar}: {row['cooccurrence_count']} times")


🔗 Calculating root co-occurrence matrices...
Analyzed 6214 verses with roots
Average roots per verse: 8.0
Max roots in a verse: 84

🔢 Found 74185 unique root pairs
Total co-occurrence instances: 211298

🔝 Top 10 most co-occurring root pairs:
  اله + قول: 514 times
  اله + كون: 441 times
  اله + علم: 408 times
  اله + امن: 372 times
  كون + قول: 369 times
  قول + ربب: 329 times
  شيا + اله: 282 times
  اله + قوم: 242 times
  علم + قول: 230 times
  اله + كفر: 225 times


In [11]:
# Calculate semantic co-occurrence patterns
print("🎭 Semantic category co-occurrence analysis...")

# Add semantic categories to co-occurrence data
cooccurrence_df['category1'] = cooccurrence_df['root1'].apply(get_semantic_category)
cooccurrence_df['category2'] = cooccurrence_df['root2'].apply(get_semantic_category)

# Check categorization success
categorized_pairs = cooccurrence_df[(cooccurrence_df['category1'] != 'uncategorized') | 
                                    (cooccurrence_df['category2'] != 'uncategorized')]
print(f"📊 Categorization results:")
print(f"  Total root pairs: {len(cooccurrence_df)}")
print(f"  Pairs with at least one categorized root: {len(categorized_pairs)}")
print(f"  Categorization coverage: {len(categorized_pairs)/len(cooccurrence_df)*100:.1f}%")

# Calculate category-level co-occurrences
category_cooccurrence = defaultdict(int)
for _, row in cooccurrence_df.iterrows():
    cat_pair = tuple(sorted([row['category1'], row['category2']]))
    category_cooccurrence[cat_pair] += row['cooccurrence_count']

# Create DataFrame with error handling
categorized_pairs_data = [
    {'category1': pair[0], 'category2': pair[1], 'total_cooccurrence': count}
    for pair, count in category_cooccurrence.items()
    if pair[0] != 'uncategorized' and pair[1] != 'uncategorized'  # Filter out uncategorized
]

if categorized_pairs_data:
    category_cooccurrence_df = pd.DataFrame(categorized_pairs_data).sort_values('total_cooccurrence', ascending=False)
    
    print(f"\n🎯 Top semantic category co-occurrences:")
    for _, row in category_cooccurrence_df.head(15).iterrows():
        if row['category1'] != row['category2']:  # Different categories
            print(f"  {row['category1']} + {row['category2']}: {row['total_cooccurrence']} co-occurrences")
        else:  # Same category (internal consistency)
            print(f"  {row['category1']} (internal): {row['total_cooccurrence']} co-occurrences")
else:
    print("⚠️  No categorized pairs found for co-occurrence analysis")
    print("💡 This suggests the semantic categories need adjustment to match the actual root data")
    # Create empty DataFrame to prevent errors downstream
    category_cooccurrence_df = pd.DataFrame(columns=['category1', 'category2', 'total_cooccurrence'])
    
    # Show some examples of actual roots for debugging
    print(f"\n🔍 Sample actual roots in data:")
    sample_roots = cooccurrence_df['root1'].head(10).tolist()
    for root in sample_roots[:5]:
        category = get_semantic_category(root)
        print(f"  {root} → {buck_to_arabic(root)} ({category})")


🎭 Semantic category co-occurrence analysis...
📊 Categorization results:
  Total root pairs: 74185
  Pairs with at least one categorized root: 26198
  Categorization coverage: 35.3%

🎯 Top semantic category co-occurrences:
  communication + creation_nature: 746 co-occurrences
  creation_nature + divine_attributes: 713 co-occurrences
  communication + divine_attributes: 680 co-occurrences
  divine_attributes (internal): 494 co-occurrences
  creation_nature + worship_ritual: 381 co-occurrences
  creation_nature + social_justice: 356 co-occurrences
  divine_attributes + worship_ritual: 356 co-occurrences
  communication + worship_ritual: 354 co-occurrences
  books_revelation + divine_attributes: 338 co-occurrences
  creation_nature + natural_elements: 329 co-occurrences
  divine_attributes + social_justice: 306 co-occurrences
  creation_nature + faith_belief: 304 co-occurrences
  communication + social_justice: 300 co-occurrences
  communication + faith_belief: 291 co-occurrences
  books_r

In [12]:
# Create comprehensive enhanced dataset
print("💎 Creating enhanced dataset with all metadata...")

# Merge frequency statistics with root data
frequency_lookup = frequency_stats.set_index('root').to_dict('index')

def add_frequency_info(root):
    if pd.isna(root) or root not in frequency_lookup:
        return {'total_freq': 0, 'meccan_freq': 0, 'medinan_freq': 0, 'meccan_ratio': 0, 'frequency_rank': 0}
    info = frequency_lookup[root]
    return {
        'total_freq': info['total_frequency'],
        'meccan_freq': info['meccan_frequency'], 
        'medinan_freq': info['medinan_frequency'],
        'meccan_ratio': info['meccan_ratio'],
        'frequency_rank': frequency_stats[frequency_stats['root'] == root].index[0] + 1
    }

# Add all enhancements to the dataset
enhanced_quran = quran_enhanced.copy()

# Add frequency information
freq_info = enhanced_quran['Root'].apply(add_frequency_info)
for key in ['total_freq', 'meccan_freq', 'medinan_freq', 'meccan_ratio', 'frequency_rank']:
    enhanced_quran[f'root_{key}'] = [info[key] for info in freq_info]

# Add Arabic root form
enhanced_quran['root_arabic'] = enhanced_quran['Root'].apply(lambda x: buck_to_arabic(x) if pd.notna(x) else '')

# Add rarity classification
def classify_rarity(freq):
    if freq == 0: return 'no_root'
    elif freq == 1: return 'hapax_legomena'
    elif freq <= 5: return 'very_rare'
    elif freq <= 20: return 'rare'
    elif freq <= 100: return 'common'
    else: return 'very_common'

enhanced_quran['root_rarity'] = enhanced_quran['root_total_freq'].apply(classify_rarity)

# Add revelation preference classification
def classify_revelation_preference(ratio, total_freq):
    if total_freq < 3: return 'insufficient_data'
    elif ratio > 0.8: return 'strongly_meccan'
    elif ratio > 0.6: return 'meccan_leaning'
    elif ratio < 0.2: return 'strongly_medinan'
    elif ratio < 0.4: return 'medinan_leaning'
    else: return 'balanced'

enhanced_quran['revelation_preference'] = enhanced_quran.apply(
    lambda row: classify_revelation_preference(row['root_meccan_ratio'], row['root_total_freq']), axis=1
)

print(f"✅ Enhanced dataset created with {len(enhanced_quran)} entries")
print(f"New columns added: {[col for col in enhanced_quran.columns if col not in quran.columns]}")


💎 Creating enhanced dataset with all metadata...
✅ Enhanced dataset created with 128219 entries
New columns added: ['semantic_category', 'root_total_freq', 'root_meccan_freq', 'root_medinan_freq', 'root_meccan_ratio', 'root_frequency_rank', 'root_arabic', 'root_rarity', 'revelation_preference']


In [13]:
# Save enhanced datasets
print("💾 Saving enhanced datasets...")

# Main enhanced dataset
enhanced_quran.to_csv('quran-enhanced-phase1.csv', index=False)
print(f"✅ Saved main enhanced dataset: quran-enhanced-phase1.csv")

# Frequency statistics
frequency_stats.to_csv('root-frequency-analysis.csv', index=False)
print(f"✅ Saved frequency analysis: root-frequency-analysis.csv")

# Semantic category analysis
category_stats.to_csv('semantic-category-analysis.csv')
print(f"✅ Saved category analysis: semantic-category-analysis.csv")

# Co-occurrence matrices
cooccurrence_df.to_csv('root-cooccurrence-matrix.csv', index=False)
if not category_cooccurrence_df.empty:
    category_cooccurrence_df.to_csv('category-cooccurrence-matrix.csv', index=False)
    print(f"✅ Saved co-occurrence matrices")
else:
    # Save empty file with headers for consistency
    pd.DataFrame(columns=['category1', 'category2', 'total_cooccurrence']).to_csv('category-cooccurrence-matrix.csv', index=False)
    print(f"✅ Saved root co-occurrence matrix (category matrix empty due to categorization issues)")

# Thematic distribution by sura
sura_theme_distribution.to_csv('sura-thematic-distribution.csv')
sura_theme_relative.to_csv('sura-thematic-relative.csv')
print(f"✅ Saved thematic distributions")

# Create metadata summary
metadata_summary = {
    'dataset_info': {
        'total_entries': int(len(enhanced_quran)),
        'entries_with_roots': int(enhanced_quran['Root'].notna().sum()),
        'unique_roots': int(enhanced_quran['Root'].nunique()),
        'enhancement_date': pd.Timestamp.now().isoformat()
    },
    'semantic_categories': list(semantic_categories.keys()),
    'frequency_statistics': {
        'most_frequent_root': str(frequency_stats.iloc[0]['root']),
        'most_frequent_count': int(frequency_stats.iloc[0]['total_frequency']),
        'hapax_legomena_count': int(len(frequency_stats[frequency_stats['total_frequency'] == 1])),
        'rare_roots_count': int(len(frequency_stats[frequency_stats['total_frequency'] <= 5]))
    },
    'co_occurrence_stats': {
        'unique_pairs': int(len(cooccurrence_df)),
        'total_co_occurrences': int(total_pairs),
        'top_pair': [str(cooccurrence_df.iloc[0]['root1']), str(cooccurrence_df.iloc[0]['root2'])],
        'top_pair_count': int(cooccurrence_df.iloc[0]['cooccurrence_count'])
    }
}

with open('enhancement-metadata.json', 'w', encoding='utf-8') as f:
    json.dump(metadata_summary, f, indent=2, ensure_ascii=False)
print(f"✅ Saved metadata summary: enhancement-metadata.json")


💾 Saving enhanced datasets...
✅ Saved main enhanced dataset: quran-enhanced-phase1.csv
✅ Saved frequency analysis: root-frequency-analysis.csv
✅ Saved category analysis: semantic-category-analysis.csv
✅ Saved co-occurrence matrices
✅ Saved thematic distributions
✅ Saved metadata summary: enhancement-metadata.json


In [14]:
# Final comprehensive summary
print("📋 PHASE 1 ENHANCEMENT SUMMARY")
print("=" * 50)

print(f"\n📊 Dataset Enhancements:")
print(f"  Original entries: {len(quran):,}")
print(f"  Enhanced entries: {len(enhanced_quran):,}")
print(f"  New metadata columns: {len(enhanced_quran.columns) - len(quran.columns)}")

print(f"\n🎯 Semantic Analysis:")
print(f"  Categories defined: {len(semantic_categories)}")
print(f"  Roots categorized: {sum(len(roots) for roots in semantic_categories.values())}")
print(f"  Category coverage: {(enhanced_quran['semantic_category'] != 'uncategorized').sum() / len(enhanced_quran) * 100:.1f}%")

print(f"\n📈 Frequency Analysis:")
print(f"  Unique roots analyzed: {len(frequency_stats)}")
print(f"  Hapax legomena: {len(frequency_stats[frequency_stats['total_frequency'] == 1])}")
print(f"  Strongly Meccan roots: {len(meccan_preferred)}")
print(f"  Strongly Medinan roots: {len(medinan_preferred)}")

print(f"\n🔗 Co-occurrence Analysis:")
print(f"  Verses analyzed: {len(verse_roots)}")
print(f"  Root pairs found: {len(cooccurrence_df)}")
print(f"  Category pairs: {len(category_cooccurrence_df)}")

print(f"\n💾 Files Created:")
files_created = [
    'quran-enhanced-phase1.csv',
    'root-frequency-analysis.csv', 
    'semantic-category-analysis.csv',
    'root-cooccurrence-matrix.csv',
    'category-cooccurrence-matrix.csv',
    'sura-thematic-distribution.csv',
    'sura-thematic-relative.csv',
    'enhancement-metadata.json'
]
for file in files_created:
    print(f"  ✅ {file}")

print(f"\n🚀 Ready for Phase 2: Backend API Development")
print(f"\n✨ Phase 1 Enhancement Complete! ✨")


📋 PHASE 1 ENHANCEMENT SUMMARY

📊 Dataset Enhancements:
  Original entries: 128,219
  Enhanced entries: 128,219
  New metadata columns: 9

🎯 Semantic Analysis:
  Categories defined: 15
  Roots categorized: 133
  Category coverage: 10.8%

📈 Frequency Analysis:
  Unique roots analyzed: 1642
  Hapax legomena: 395
  Strongly Meccan roots: 179
  Strongly Medinan roots: 50

🔗 Co-occurrence Analysis:
  Verses analyzed: 6214
  Root pairs found: 74185
  Category pairs: 120

💾 Files Created:
  ✅ quran-enhanced-phase1.csv
  ✅ root-frequency-analysis.csv
  ✅ semantic-category-analysis.csv
  ✅ root-cooccurrence-matrix.csv
  ✅ category-cooccurrence-matrix.csv
  ✅ sura-thematic-distribution.csv
  ✅ sura-thematic-relative.csv
  ✅ enhancement-metadata.json

🚀 Ready for Phase 2: Backend API Development

✨ Phase 1 Enhancement Complete! ✨
