In [None]:
# Import required libraries
import pandas as pd
import numpy as np
print("✅ Libraries imported successfully!")


In [None]:
# Load the morphology file (tab-separated text) from the remote URL into a
# DataFrame. This cell needs network access every time it runs.
url = 'http://textminingthequran.com/data/quranic-corpus-morphology-0.4.txt'
print("📥 Downloading morphological data...")
# skiprows=56 discards the first 56 lines of the file before the header row
# (presumably a licence/preamble block -- TODO confirm against the file).
qdforiginal = pd.read_csv(url, sep='\t', skiprows=56)
print(f"✅ Loaded {len(qdforiginal)} entries")
# Last expression of the cell: rich preview of the first rows.
qdforiginal.head()


In [None]:
# Snapshot the untouched download as CSV in the current working directory.
# NOTE: index=False is not passed here, so the DataFrame index is written as an
# extra unnamed first column (the final export further down does pass index=False).
qdforiginal.to_csv('quran-morphology-v1.csv')
print("💾 Original data saved as quran-morphology-v1.csv")


In [None]:
# Keep only the rows whose FEATURES string carries a ROOT: tag.
# Rows with a missing FEATURES value count as "no root" (na=False).
print("🔍 Filtering entries with ROOT information...")
root_entries = qdforiginal.loc[qdforiginal['FEATURES'].str.contains('ROOT:', na=False)]
print(f"Found {len(root_entries)} entries with root information")
# Preview of the first three matching rows.
root_entries.head(3)


In [None]:
# Derive structured columns from the raw LOCATION and FEATURES strings.
print("⚙️ Processing full dataset...")
# LOCATION: four colon-separated digit groups, captured as the named columns
# sura, aya, word and w_seg. The captured values are strings at this point.
tmp1 = qdforiginal.LOCATION.str.extract(r'(?P<sura>\d*):(?P<aya>\d*):(?P<word>\d*):(?P<w_seg>\d*)')
# FEATURES: capture everything after 'ROOT:' up to the next '|' (or end of string).
# Rows without a ROOT: tag get NaN in the Root column.
tmp2 = qdforiginal.FEATURES.str.extract(r'ROOT:(?P<Root>[^|]*)')
# Same pattern for the 'LEM:' tag, captured as the Lemma column.
tmp3 = qdforiginal.FEATURES.str.extract(r'LEM:(?P<Lemma>[^|]*)')
# Column-wise concat (aligned on the shared index): location parts first,
# then the original columns, then Root and Lemma.
df_qruan = pd.concat([tmp1, qdforiginal, tmp2, tmp3], axis=1)
print(f"✅ Processed dataset shape: {df_qruan.shape}")


In [None]:
# Clean extracted data: strip leading/trailing whitespace from the extracted
# Root and Lemma strings. NaN entries (rows without the tag) stay NaN.
print("🧹 Cleaning extracted data...")
df_qruan['Root'] = df_qruan['Root'].str.strip()
df_qruan['Lemma'] = df_qruan['Lemma'].str.strip()
print("✅ Data cleaned")
# Fixed seed so the preview shows the same five rows on every
# Restart Kernel -> Run All instead of a different random draw each time.
df_qruan.sample(5, random_state=42)


In [None]:
# Load the sura table of contents from 'toc.csv' (path is relative to the
# current working directory, so the file must be present there).
# Later cells read its 'No.' (sura number) and 'Place' columns.
print("📖 Loading sura classification data...")
qtoc = pd.read_csv('toc.csv')
print(f"✅ Loaded classification for {len(qtoc)} suras")
# Preview of the first rows.
qtoc.head()


In [None]:
# The sura column came out of a regex extract as strings; cast it to int so it
# can be matched against the TOC's 'No.' column in the merge.
# NOTE: astype(int) raises if any row has no extracted sura value.
print("🔧 Converting sura column to integer...")
df_qruan['sura'] = df_qruan['sura'].astype(int)
# Print both key dtypes side by side so a mismatch is visible before merging.
print(f"✅ Sura column type: {df_qruan['sura'].dtype}")
print(f"TOC No. column type: {qtoc['No.'].dtype}")


In [None]:
# Merge morphological data with classification.
# Left join: every morphology row is kept; a sura number that is absent from
# the TOC ends up with NaN in 'Place'. Only 'No.' and 'Place' are taken from
# the TOC, so the result also carries a 'No.' column alongside 'sura'.
print("🔗 Merging data with Meccan/Medinan classification...")
quran = df_qruan.merge(
    qtoc.loc[:, ['No.', 'Place']],
    how='left',
    left_on='sura',
    right_on='No.',
    # Each sura number must appear at most once in the TOC. Without this check
    # a duplicated 'No.' would silently duplicate morphology rows; with it
    # pandas raises MergeError instead.
    validate='many_to_one',
)
print(f"✅ Merged dataset shape: {quran.shape}")
quran.info()


In [None]:
# Write the merged frame to CSV in the current working directory.
# index=False keeps the DataFrame index out of the file.
quran.to_csv('quran-morphology-final.csv', index=False)
print("💾 Final data saved as quran-morphology-final.csv")


In [None]:
# Create the Arabic <-> Buckwalter character tables.
#   abjad    : Arabic character     -> Buckwalter symbol
#   alphabet : Buckwalter symbol    -> Arabic character (exact inverse of abjad)
# Characters missing from a table are passed through unchanged by the
# converters that use .get(x, x), so every symbol that should be converted has
# to be listed here.
print("🔤 Setting up Buckwalter to Arabic conversion...")

# Arabic to Buckwalter mapping: base letters
abjad = {
    "\u0627": 'A',
    "\u0628": 'b', "\u062A": 't', "\u062B": 'v', "\u062C": 'j',
    "\u062D": 'H', "\u062E": 'x', "\u062F": 'd', "\u0630": '*', "\u0631": 'r',
    "\u0632": 'z', "\u0633": 's', "\u0634": '$', "\u0635": 'S', "\u0636": 'D',
    "\u0637": 'T', "\u0638": 'Z', "\u0639": 'E', "\u063A": 'g', "\u0641": 'f',
    "\u0642": 'q', "\u0643": 'k', "\u0644": 'l', "\u0645": 'm', "\u0646": 'n',
    "\u0647": 'h', "\u0648": 'w', "\u0649": 'Y', "\u064A": 'y'
}

# Additional characters: space, hamza forms, short vowels and other diacritics
abjad.update({
    ' ': ' ',
    "\u0621": "'",   # hamza
    "\u0623": '>',   # alif with hamza above
    "\u0625": '<',   # alif with hamza below
    "\u0624": '&',   # waw with hamza above
    "\u0626": '}',   # ya with hamza above
    "\u0622": '|',   # alif with madda
    "\u064E": 'a',   # fatha
    "\u064F": 'u',   # damma
    "\u0650": 'i',   # kasra
    "\u0651": '~',   # shadda
    "\u0652": 'o',   # sukun
    "\u064B": 'F',   # fathatan
    "\u064C": 'N',   # dammatan
    "\u064D": 'K',   # kasratan
    "\u0640": '_',   # tatweel
    "\u0670": '`',   # dagger (superscript) alif
    "\u0629": 'p',   # ta marbuta
})

# Alif wasla plus the extended symbols of the Quranic Arabic Corpus
# transliteration table. Without these entries the symbols would stay as raw
# Latin characters in converted output. Note that this makes buck_to_arabic
# interpret punctuation such as ',' '.' ':' as Quranic marks.
abjad.update({
    "\u0671": '{',   # alif with hamzat wasl
    "\u0653": '^',   # maddah above
    "\u0654": '#',   # hamza above
    "\u06DC": ':',   # small high seen
    "\u06DF": '@',   # small high rounded zero
    "\u06E0": '"',   # small high upright rectangular zero
    "\u06E2": '[',   # small high meem (isolated form)
    "\u06E3": ';',   # small low seen
    "\u06E5": ',',   # small waw
    "\u06E6": '.',   # small ya
    "\u06E8": '!',   # small high noon
    "\u06EA": '-',   # empty centre low stop
    "\u06EB": '+',   # empty centre high stop
    "\u06EC": '%',   # rounded high stop with filled centre
    "\u06ED": ']',   # small low meem
})

# Create reverse mapping (Buckwalter to Arabic)
alphabet = {buck: arabic for arabic, buck in abjad.items()}

# The two tables must be exact inverses; a repeated Buckwalter symbol would
# silently drop an Arabic character from the reverse table.
assert len(alphabet) == len(abjad), "Buckwalter symbols must be unique"

print(f"✅ Mappings created with {len(alphabet)} characters")


In [None]:
# Define conversion functions
def arabic_to_buc(ara):
    """Convert Arabic text to Buckwalter transliteration.

    Each character is looked up in the module-level ``abjad`` table;
    characters without an entry are copied through unchanged.

    Parameters:
    ara: text to convert (any iterable of characters)

    Returns: the transliterated string. Input that cannot be iterated or
    joined as text (e.g. None or a float NaN) is returned unchanged.
    """
    try:
        return ''.join(abjad.get(ch, ch) for ch in ara)
    except TypeError:
        # Only "not text" inputs are tolerated here; any other error
        # (including KeyboardInterrupt) propagates instead of being hidden.
        return ara

def buck_to_arabic(buc):
    """Convert Buckwalter transliteration to Arabic text.

    Each symbol is looked up in the module-level ``alphabet`` table;
    symbols without an entry are copied through unchanged.

    Parameters:
    buc: transliterated text to convert (any iterable of characters)

    Returns: the Arabic-script string. Input that cannot be iterated or
    joined as text (e.g. None or a float NaN) is returned unchanged.
    """
    try:
        return ''.join(alphabet.get(ch, ch) for ch in buc)
    except TypeError:
        # Only "not text" inputs are tolerated here; any other error
        # (including KeyboardInterrupt) propagates instead of being hidden.
        return buc

print("✅ Conversion functions defined")


In [None]:
# Distinct words / roots / lemmas occurring in a set of suras
def sura_words(s_list, kind='W'):
    """
    Collect the distinct entries found in the given suras.

    Parameters:
    s_list: list of sura numbers to look in
    kind: 'R' selects roots, 'L' selects lemmas, anything else selects
          word forms (default 'W')

    Returns: list of the entries converted to Arabic script, in order of
    first appearance; missing and empty values are left out
    """
    # Pick the column once instead of repeating the query per branch.
    column = 'Root' if kind == 'R' else 'Lemma' if kind == 'L' else 'FORM'
    in_selection = quran['sura'].isin(s_list)
    distinct = quran.loc[in_selection, column].dropna().unique().tolist()

    converted = []
    for entry in distinct:
        if entry:  # skip empty strings
            converted.append(buck_to_arabic(entry))
    return converted

print("✅ sura_words function defined")


In [None]:
# Function to get unique words (appearing only in specified suras)
def unique_sura_words(s_list, kind='W'):
    """
    Get words that appear ONLY in specified suras

    Parameters:
    s_list: list of sura numbers
    kind: 'W' for words, 'R' for roots, 'L' for lemmas
          (any value other than 'R' or 'L' selects word forms)

    Returns: list of unique words in Arabic script, ordered by first
    appearance inside the specified suras. Missing and empty values are
    left out. The order is deterministic, so repeated runs give the same
    list.
    """
    column = 'Root' if kind == 'R' else 'Lemma' if kind == 'L' else 'FORM'
    in_selection = quran['sura'].isin(s_list)

    # Distinct entries inside the selection, in order of first appearance.
    inside = quran.loc[in_selection, column].dropna().unique().tolist()
    # Everything seen anywhere else; a set gives O(1) membership tests.
    outside = set(quran.loc[~in_selection, column].dropna().unique())

    # Filtering the ordered list (rather than subtracting two sets) keeps the
    # result order stable across runs.
    return [buck_to_arabic(x) for x in inside if x and x not in outside]

print("✅ unique_sura_words function defined")


In [None]:
# Collect the distinct non-null roots for each revelation label in 'Place'
print("🌿 Analyzing unique root words...")

# Roots found in rows labelled 'Meccan'
meccan_roots = set(quran.loc[quran['Place'] == 'Meccan', 'Root'].dropna())
print(f"Meccan unique roots: {len(meccan_roots)}")

# Roots found in rows labelled 'Medinan'
medinan_roots = set(quran.loc[quran['Place'] == 'Medinan', 'Root'].dropna())
print(f"Medinan unique roots: {len(medinan_roots)}")


In [None]:
# Calculate exclusive and shared roots from the two per-label root sets
print("📊 Calculating root word statistics...")

# Meccan-only roots: present in Meccan rows, absent from Medinan rows
meccan_only = meccan_roots - medinan_roots
print(f"Meccan-only roots: {len(meccan_only)}")

# Medinan-only roots: present in Medinan rows, absent from Meccan rows
medinan_only = medinan_roots - meccan_roots
print(f"Medinan-only roots: {len(medinan_only)}")

# Shared roots: present under both labels
shared_roots = meccan_roots & medinan_roots
print(f"Shared roots: {len(shared_roots)}")

# Summary
# Compute the union once instead of once per printed line.
total_roots = len(meccan_roots | medinan_roots)
# If both sets are empty (e.g. the 'Place' labels did not match), every count
# is 0; dividing by 1 then reports 0.0% instead of raising ZeroDivisionError.
root_denominator = total_roots or 1
print("\n📈 Summary:")
print(f"Total unique roots: {total_roots}")
print(f"Meccan-only: {len(meccan_only)} ({len(meccan_only)/root_denominator*100:.1f}%)")
print(f"Medinan-only: {len(medinan_only)} ({len(medinan_only)/root_denominator*100:.1f}%)")
print(f"Shared: {len(shared_roots)} ({len(shared_roots)/root_denominator*100:.1f}%)")


In [None]:
# Smoke test of sura_words on a single sura (number 1): fetch its distinct
# roots (kind 'R'), which sura_words returns converted to Arabic script.
print("🧪 Testing with Al-Fatiha (Sura 1):")
fatiha_roots = sura_words([1], 'R')
print(f"Roots in Al-Fatiha: {len(fatiha_roots)}")
# Slicing is safe even if fewer than ten roots come back.
print("First 10 roots:", fatiha_roots[:10])


In [None]:
# Print a plain-text recap of the merged dataset and the root statistics.
# Relies on `quran` plus the sets meccan_only, medinan_only, shared_roots,
# meccan_roots and medinan_roots computed in earlier cells.
print("📊 FINAL ANALYSIS SUMMARY")
print("=" * 50)
# Row count of the merged frame, with thousands separators.
print(f"📖 Total Quranic entries analyzed: {len(quran):,}")
# Rows whose Root / Lemma value is non-null.
print(f"🌿 Total entries with root information: {quran.Root.notna().sum():,}")
print(f"📝 Total entries with lemma information: {quran.Lemma.notna().sum():,}")
# Number of distinct sura numbers present in the data.
print(f"📚 Total suras covered: {quran.sura.nunique()}")
print("\n🏛️ REVELATION ANALYSIS:")
print(f"🕌 Meccan-only root words: {len(meccan_only)}")
print(f"🏛️ Medinan-only root words: {len(medinan_only)}")
print(f"🤝 Shared root words: {len(shared_roots)}")
# Union of both sets: only roots from rows labelled 'Meccan' or 'Medinan' count.
print(f"🌟 Total unique root words: {len(meccan_roots | medinan_roots)}")
print("\n✅ Analysis complete! All functions ready for use.")
