In [1]:
# Import required libraries
import pandas as pd
import numpy as np
print("✅ Libraries imported successfully!")


✅ Libraries imported successfully!


In [2]:
# Load morphological data from URL
url = 'http://textminingthequran.com/data/quranic-corpus-morphology-0.4.txt'
print("📥 Downloading morphological data...")
qdforiginal = pd.read_csv(url, sep='\t', skiprows=56)
print(f"✅ Loaded {len(qdforiginal)} entries")
qdforiginal.head()


📥 Downloading morphological data...
✅ Loaded 128219 entries


Unnamed: 0,LOCATION,FORM,TAG,FEATURES
0,(1:1:1:1),bi,P,PREFIX|bi+
1,(1:1:1:2),somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN
2,(1:1:2:1),{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN
3,(1:1:3:1),{l,DET,PREFIX|Al+
4,(1:1:3:2),r~aHoma`ni,ADJ,STEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN


In [3]:
# Save original data locally
qdforiginal.to_csv('quran-morphology-v1.csv')
print("💾 Original data saved as quran-morphology-v1.csv")


💾 Original data saved as quran-morphology-v1.csv


In [4]:
# Filter for entries with ROOT information
print("🔍 Filtering entries with ROOT information...")
root_entries = qdforiginal[qdforiginal.FEATURES.str.contains('ROOT:', na=False)]
print(f"Found {len(root_entries)} entries with root information")
root_entries.head(3)


🔍 Filtering entries with ROOT information...
Found 49968 entries with root information


Unnamed: 0,LOCATION,FORM,TAG,FEATURES
1,(1:1:1:2),somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN
2,(1:1:2:1),{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN
4,(1:1:3:2),r~aHoma`ni,ADJ,STEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN


In [5]:
# Apply extraction to full dataset
print("⚙️ Processing full dataset...")
tmp1 = qdforiginal.LOCATION.str.extract(r'(?P<sura>\d*):(?P<aya>\d*):(?P<word>\d*):(?P<w_seg>\d*)')
tmp2 = qdforiginal.FEATURES.str.extract(r'ROOT:(?P<Root>[^|]*)')
tmp3 = qdforiginal.FEATURES.str.extract(r'LEM:(?P<Lemma>[^|]*)')
df_qruan = pd.concat([tmp1, qdforiginal, tmp2, tmp3], axis=1)
print(f"✅ Processed dataset shape: {df_qruan.shape}")


⚙️ Processing full dataset...
✅ Processed dataset shape: (128219, 10)


In [6]:
# Clean extracted data (remove extra spaces)
print("🧹 Cleaning extracted data...")
df_qruan['Root'] = df_qruan['Root'].str.strip()
df_qruan['Lemma'] = df_qruan['Lemma'].str.strip()
print("✅ Data cleaned")
df_qruan.sample(5)


🧹 Cleaning extracted data...
✅ Data cleaned


Unnamed: 0,sura,aya,word,w_seg,LOCATION,FORM,TAG,FEATURES,Root,Lemma
45318,10,50,11,1,(10:50:11:1),mino,P,STEM|POS:P|LEM:min,,min
19686,4,98,5,1,(4:98:5:1),wa,CONJ,PREFIX|w:CONJ+,,
57797,16,46,1,1,(16:46:1:1),>awo,CONJ,STEM|POS:CONJ|LEM:>aw,,>aw
72205,22,53,10,1,(22:53:10:1),wa,CONJ,PREFIX|w:CONJ+,,
8696,2,253,52,1,(2:253:52:1),yuriydu,V,STEM|POS:V|IMPF|(IV)|LEM:>araAda|ROOT:rwd|3MS,rwd,>araAda


In [7]:
# Load table of contents with Meccan/Medinan classification
print("📖 Loading sura classification data...")
qtoc = pd.read_csv('toc.csv')
print(f"✅ Loaded classification for {len(qtoc)} suras")
qtoc.head()


📖 Loading sura classification data...
✅ Loaded classification for 114 suras


Unnamed: 0,No.,Name Arabic,Name,English Meaning,No of verses,Place,Chronology
0,1,الفاتحة,Al-Fatiha,The Opening,7,Meccan,5
1,2,البقرة,Al-Baqara,The Cow,286,Medinan,87
2,3,آل عمران,Al Imran,The House of Joachim,200,Medinan,89
3,4,النساء,An-Nisa',Women,176,Medinan,92
4,5,المائدة,Al-Ma'ida,The Table Spread,120,Medinan,112


In [8]:
# Convert sura column to integer for proper merging
print("🔧 Converting sura column to integer...")
df_qruan['sura'] = df_qruan['sura'].astype(int)
print(f"✅ Sura column type: {df_qruan['sura'].dtype}")
print(f"TOC No. column type: {qtoc['No.'].dtype}")


🔧 Converting sura column to integer...
✅ Sura column type: int64
TOC No. column type: int64


In [9]:
# Merge morphological data with classification
print("🔗 Merging data with Meccan/Medinan classification...")
quran = df_qruan.merge(qtoc.loc[:, ['No.', 'Place']], how='left', left_on='sura', right_on='No.')
print(f"✅ Merged dataset shape: {quran.shape}")
quran.info()


🔗 Merging data with Meccan/Medinan classification...
✅ Merged dataset shape: (128219, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128219 entries, 0 to 128218
Data columns (total 12 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   sura      128219 non-null  int64 
 1   aya       128219 non-null  object
 2   word      128219 non-null  object
 3   w_seg     128219 non-null  object
 4   LOCATION  128219 non-null  object
 5   FORM      128011 non-null  object
 6   TAG       128219 non-null  object
 7   FEATURES  128219 non-null  object
 8   Root      49968 non-null   object
 9   Lemma     74608 non-null   object
 10  No.       128219 non-null  int64 
 11  Place     128219 non-null  object
dtypes: int64(2), object(10)
memory usage: 11.7+ MB


In [10]:
# Save final processed data
quran.to_csv('quran-morphology-final.csv', index=False)
print("💾 Final data saved as quran-morphology-final.csv")


💾 Final data saved as quran-morphology-final.csv


In [11]:
# Create Buckwalter to Arabic mapping
print("🔤 Setting up Buckwalter to Arabic conversion...")

# Arabic to Buckwalter mapping
abjad = {
    "\u0627": 'A',
    "\u0628": 'b', "\u062A": 't', "\u062B": 'v', "\u062C": 'j',
    "\u062D": 'H', "\u062E": 'x', "\u062F": 'd', "\u0630": '*', "\u0631": 'r',
    "\u0632": 'z', "\u0633": 's', "\u0634": '$', "\u0635": 'S', "\u0636": 'D',
    "\u0637": 'T', "\u0638": 'Z', "\u0639": 'E', "\u063A": 'g', "\u0641": 'f',
    "\u0642": 'q', "\u0643": 'k', "\u0644": 'l', "\u0645": 'm', "\u0646": 'n',
    "\u0647": 'h', "\u0648": 'w', "\u0649": 'Y', "\u064A": 'y'
}

# Additional characters
abjad[' '] = ' '
abjad["\u0621"] = "'"
abjad["\u0623"] = '>'
abjad["\u0625"] = '<'
abjad["\u0624"] = '&'
abjad["\u0626"] = '}'
abjad["\u0622"] = '|'
abjad["\u064E"] = 'a'
abjad["\u064F"] = 'u'
abjad["\u0650"] = 'i'
abjad["\u0651"] = '~'
abjad["\u0652"] = 'o'
abjad["\u064B"] = 'F'
abjad["\u064C"] = 'N'
abjad["\u064D"] = 'K'
abjad["\u0640"] = '_'
abjad["\u0670"] = '`'
abjad["\u0629"] = 'p'

# Create reverse mapping (Buckwalter to Arabic)
alphabet = {}
for key in abjad:
    alphabet[abjad[key]] = key

print(f"✅ Mappings created with {len(alphabet)} characters")


🔤 Setting up Buckwalter to Arabic conversion...
✅ Mappings created with 47 characters


In [12]:
# Define conversion functions
def arabic_to_buc(ara):
    """Convert Arabic text to Buckwalter transliteration"""
    try:
        return ''.join(map(lambda x: abjad.get(x, x), list(ara)))
    except:
        return ara

def buck_to_arabic(buc):
    """Convert Buckwalter transliteration to Arabic text"""
    try:
        return ''.join(map(lambda x: alphabet.get(x, x), list(buc)))
    except:
        return buc

print("✅ Conversion functions defined")


✅ Conversion functions defined


In [13]:
# Function to get words for specific suras
def sura_words(s_list, kind='W'):
    """
    Get unique words from specified suras
    
    Parameters:
    s_list: list of sura numbers
    kind: 'W' for words, 'R' for roots, 'L' for lemmas
    
    Returns: list of words in Arabic script
    """
    if kind == 'R':
        result = quran[quran.sura.isin(s_list)].Root.dropna().unique().tolist()
    elif kind == 'L':
        result = quran[quran.sura.isin(s_list)].Lemma.dropna().unique().tolist()
    else:
        result = quran[quran.sura.isin(s_list)].FORM.dropna().unique().tolist()
    
    return [buck_to_arabic(x) for x in result if x]

print("✅ sura_words function defined")


✅ sura_words function defined


In [14]:
# Function to get unique words (appearing only in specified suras)
def unique_sura_words(s_list, kind='W'):
    """
    Get words that appear ONLY in specified suras
    
    Parameters:
    s_list: list of sura numbers
    kind: 'W' for words, 'R' for roots, 'L' for lemmas
    
    Returns: list of unique words in Arabic script
    """
    if kind == 'R':
        first = quran[quran.sura.isin(s_list)].Root.dropna().unique().tolist()
        second = quran[~quran.sura.isin(s_list)].Root.dropna().unique().tolist()
        result = list(set(first) - set(second))
    elif kind == 'L':
        first = quran[quran.sura.isin(s_list)].Lemma.dropna().unique().tolist()
        second = quran[~quran.sura.isin(s_list)].Lemma.dropna().unique().tolist()
        result = list(set(first) - set(second))
    else:
        first = quran[quran.sura.isin(s_list)].FORM.dropna().unique().tolist()
        second = quran[~quran.sura.isin(s_list)].FORM.dropna().unique().tolist()
        result = list(set(first) - set(second))
    
    return [buck_to_arabic(x) for x in result if x]

print("✅ unique_sura_words function defined")


✅ unique_sura_words function defined


In [15]:
# Get unique root words by revelation type
print("🌿 Analyzing unique root words...")

# Meccan roots
meccan_roots = set(quran[quran.Place == 'Meccan'].Root.dropna().unique())
print(f"Meccan unique roots: {len(meccan_roots)}")

# Medinan roots
medinan_roots = set(quran[quran.Place == 'Medinan'].Root.dropna().unique())
print(f"Medinan unique roots: {len(medinan_roots)}")


🌿 Analyzing unique root words...
Meccan unique roots: 1444
Medinan unique roots: 1095


In [16]:
# Calculate exclusive and shared roots
print("📊 Calculating root word statistics...")

# Meccan-only roots
meccan_only = meccan_roots - medinan_roots
print(f"Meccan-only roots: {len(meccan_only)}")

# Medinan-only roots
medinan_only = medinan_roots - meccan_roots
print(f"Medinan-only roots: {len(medinan_only)}")

# Shared roots
shared_roots = meccan_roots & medinan_roots
print(f"Shared roots: {len(shared_roots)}")

# Summary
print("\n📈 Summary:")
print(f"Total unique roots: {len(meccan_roots | medinan_roots)}")
print(f"Meccan-only: {len(meccan_only)} ({len(meccan_only)/(len(meccan_roots | medinan_roots))*100:.1f}%)")
print(f"Medinan-only: {len(medinan_only)} ({len(medinan_only)/(len(meccan_roots | medinan_roots))*100:.1f}%)")
print(f"Shared: {len(shared_roots)} ({len(shared_roots)/(len(meccan_roots | medinan_roots))*100:.1f}%)")


📊 Calculating root word statistics...
Meccan-only roots: 547
Medinan-only roots: 198
Shared roots: 897

📈 Summary:
Total unique roots: 1642
Meccan-only: 547 (33.3%)
Medinan-only: 198 (12.1%)
Shared: 897 (54.6%)


In [17]:
# Test with Al-Fatiha (Sura 1)
print("🧪 Testing with Al-Fatiha (Sura 1):")
fatiha_roots = sura_words([1], 'R')
print(f"Roots in Al-Fatiha: {len(fatiha_roots)}")
print("First 10 roots:", fatiha_roots[:10])


🧪 Testing with Al-Fatiha (Sura 1):
Roots in Al-Fatiha: 18
First 10 roots: ['سمو', 'اله', 'رحم', 'حمد', 'ربب', 'علم', 'ملك', 'يوم', 'دين', 'عبد']


In [18]:
# Generate final summary report
print("📊 FINAL ANALYSIS SUMMARY")
print("=" * 50)
print(f"📖 Total Quranic entries analyzed: {len(quran):,}")
print(f"🌿 Total entries with root information: {quran.Root.notna().sum():,}")
print(f"📝 Total entries with lemma information: {quran.Lemma.notna().sum():,}")
print(f"📚 Total suras covered: {quran.sura.nunique()}")
print("\n🏛️ REVELATION ANALYSIS:")
print(f"🕌 Meccan-only root words: {len(meccan_only)}")
print(f"🏛️ Medinan-only root words: {len(medinan_only)}")
print(f"🤝 Shared root words: {len(shared_roots)}")
print(f"🌟 Total unique root words: {len(meccan_roots | medinan_roots)}")
print("\n✅ Analysis complete! All functions ready for use.")


📊 FINAL ANALYSIS SUMMARY
📖 Total Quranic entries analyzed: 128,219
🌿 Total entries with root information: 49,968
📝 Total entries with lemma information: 74,608
📚 Total suras covered: 114

🏛️ REVELATION ANALYSIS:
🕌 Meccan-only root words: 547
🏛️ Medinan-only root words: 198
🤝 Shared root words: 897
🌟 Total unique root words: 1642

✅ Analysis complete! All functions ready for use.
