In [54]:
import os
import re
from unicodedata import normalize
from unicodedata import name as uname

from IPython.display import display, HTML

from tf.writing.transcription import Transcription as tr


In [55]:
tr.to_arabic("bisomi")

'بِسْمِ'

In [56]:
tr.from_arabic("بِسْمِ")

'bisomi'

In [19]:
BASE = os.path.expanduser("~/github/q-ran/quran")
SOURCES = f"{BASE}/sources"
WRITING = f"{BASE}/writing"
MORPH_FILE = "quranic-corpus-morphology-0.4.txt"
TEXT_FILE = "quran-uthmani.xml"
MORPH_PATH = f"{SOURCES}/{MORPH_FILE}"
TEXT_PATH = f"{SOURCES}/{TEXT_FILE}"
TRANS_TABLE = f"{WRITING}/table.html"

In [21]:
suraPat = r"<sura([^>]*)>(.*?)</sura>"
suraRe = re.compile(suraPat, re.S)

attPat = r'index="([0-9]+)"\s+name="([^"]+)"'
attRe = re.compile(attPat)

ayaPat = r"<aya([^>]*)/>"
ayaRe = re.compile(ayaPat, re.S)

aattPat = r'index="([0-9]+)"\s+text="([^"]+)"'
aattRe = re.compile(aattPat)

battPat = r'bismillah="(.*)"'
battRe = re.compile(battPat)


def uNorm(words):
    return words
    return [normalize("NFKC", word) for word in words]


def readText():
    suraDb = {}
    ayaDb = {}
    with open(TEXT_PATH) as fh:
        text = fh.read()
    suras = suraRe.findall(text)
    for (atts, content) in suras:
        (suraIndex, name) = attRe.findall(atts)[0]
        suraIndex = int(suraIndex)
        suraDb[suraIndex] = dict(name=name)
        ayas = ayaRe.findall(content)
        for aya in ayas:
            (ayaIndex, text) = aattRe.findall(aya)[0]
            text = uNorm(text.split())
            ayaIndex = int(ayaIndex)
            data = dict(text=text)
            bsl = battRe.findall(aya)
            if bsl:
                data["basmala"] = uNorm(bsl[0].split())
            ayaDb.setdefault(suraIndex, {})[ayaIndex] = data

    return (suraDb, ayaDb)

In [22]:
(suraDb, ayaDb) = readText()

In [23]:
ayaDb[112]

{1: {'text': ['قُلْ', 'هُوَ', 'ٱللَّهُ', 'أَحَدٌ'],
  'basmala': ['بِسْمِ', 'ٱللَّهِ', 'ٱلرَّحْمَٰنِ', 'ٱلرَّحِيمِ']},
 2: {'text': ['ٱللَّهُ', 'ٱلصَّمَدُ']},
 3: {'text': ['لَمْ', 'يَلِدْ', 'وَلَمْ', 'يُولَدْ']},
 4: {'text': ['وَلَمْ', 'يَكُن', 'لَّهُۥ', 'كُفُوًا', 'أَحَدٌۢ']}}

In [24]:
def readMorph():
    morphDb = {}

    with open(MORPH_PATH) as fh:
        inPrefix = True
        dataLines = 0
        for (i, line) in enumerate(fh):
            if inPrefix:
                if line.startswith("LOCATION\t"):
                    inPrefix = False
                    continue
            else:
                dataLines += 1
                (locationRep, form, tag, features) = line.rstrip("\n").split("\t")
                (suraIndex, ayaIndex, groupIndex, wordIndex) = (
                    int(x) for x in locationRep[1:-1].split(":")
                )
                morphDb.setdefault(suraIndex, {}).setdefault(ayaIndex, {}).setdefault(
                    groupIndex, {}
                )[wordIndex] = (
                    form,
                    tag,
                    features,
                )
        print(f"{dataLines:>5} lines done")
    return morphDb

In [25]:
morphDb = readMorph()

128219 lines done


In [26]:
keyTrans = dict(
    LEM="lemma",
)

valIndex = {
    "NOM": (("case", "nominative"),),
    "ACC": (("case", "accusative"),),
    "GEN": (("case", "genitive"),),
    "(II)": (("form", "II"),),
    "(III)": (("form", "III"),),
    "(IV)": (("form", "IV"),),
    "(IX)": (("form", "IX"),),
    "(V)": (("form", "V"),),
    "(VI)": (("form", "VI"),),
    "(VII)": (("form", "VII"),),
    "(VIII)": (("form", "VIII"),),
    "(X)": (("form", "X"),),
    "(XI)": (("form", "XI"),),
    "(XII)": (("form", "XII"),),
    "+VOC": (("interjection", "allahuma"),),
    "1P": (("person", "1"), ("number", "p")),
    "1S": (("person", "1"), ("number", "s")),
    "2D": (("person", "2"), ("number", "d")),
    "2FD": (("person", "2"), ("number", "d"), ("gender", "f")),
    "2FP": (("person", "2"), ("number", "p"), ("gender", "f")),
    "2FS": (("person", "2"), ("number", "s"), ("gender", "f")),
    "2MD": (("person", "2"), ("number", "d"), ("gender", "m")),
    "2MP": (("person", "2"), ("number", "p"), ("gender", "m")),
    "2MS": (("person", "2"), ("number", "s"), ("gender", "m")),
    "3D": (("person", "3"), ("number", "d")),
    "3FD": (("person", "3"), ("number", "d"), ("gender", "f")),
    "3FP": (("person", "3"), ("number", "p"), ("gender", "f")),
    "3FS": (("person", "3"), ("number", "s"), ("gender", "f")),
    "3MD": (("person", "3"), ("number", "d"), ("gender", "m")),
    "3MP": (("person", "3"), ("number", "p"), ("gender", "m")),
    "3MS": (("person", "3"), ("number", "s"), ("gender", "m")),
    "MD": (("number", "d"), ("gender", "m")),
    "MP": (("number", "p"), ("gender", "m")),
    "M": (("gender", "m"),),
    "MS": (("gender", "m"), ("number", "s")),
    "FD": (("number", "d"), ("gender", "f")),
    "FP": (("number", "p"), ("gender", "f")),
    "FS": (("number", "s"), ("gender", "f")),
    "F": (("gender", "f"),),
    "P": (("number", "p"),),
    "ACT": (("voice", "active"),),
    "PASS": (("voice", "passive"),),
    "IMPF": (("tense", "imperfect"),),
    "IMPV": (("tense", "imperative"),),
    "PERF": (("tense", "perfect"),),
    "PCPL": (("tense", "participle"),),
    "VN": (("tense", "verbalNoun"),),
    "INDEF": (("definite", 1),),
    "PREFIX": (("component", "prefix"),),
    "STEM": (("component", "main"),),
    "SUFFIX": (("component", "suffix"),),
    "Al+": (),
    "bi+": (),
    "ha+": (),
    "ka+": (),
    "sa+": (),
    "ta+": (),
    "ya+": (),
}


def parseMorph(tag, featureStr):
    features = dict(pos=tag.lower())
    unknowns = set()
    fItems = featureStr.split("|")
    for fItem in fItems:
        if ":" in fItem:
            (key, value) = fItem.split(":", 1)
            if key == "POS":
                continue
            keyRep = keyTrans.get(key, key.lower())
            features[keyRep] = value
        else:
            value = fItem
            keyValues = valIndex.get(value, None)
            if keyValues is None:
                unknowns.add(value)
            else:
                for (k, v) in keyValues:
                    features[k] = v
    return (features, unknowns)

In [27]:
wordFeatures = {}
unknowns = set()

for (sura, suraData) in morphDb.items():
    for (aya, ayaData) in suraData.items():
        for (group, groupData) in ayaData.items():
            for (word, (form, tag, featureStr)) in groupData.items():
                wordFeatures.setdefault("transcription", {})[
                    (sura, aya, group, word)
                ] = form
                (theseFeatures, theseUnknowns) = parseMorph(tag, featureStr)
                for (k, v) in theseFeatures.items():
                    wordFeatures.setdefault(k, {})[(sura, aya, group, word)] = v
                unknowns |= theseUnknowns

len(unknowns)

0

In [28]:
def link(ayaDb, morphDb, limit):
    i = 0
    unequal = {}
    transcription = {}
    for sI in morphDb:
        if limit is not None and i > limit:
            break
        for aI in morphDb[sI]:
            if limit is not None and i > limit:
                break
            for gI in morphDb[sI][aI]:
                if limit is not None and i > limit:
                    break
                tWordGroup = ""
                for wI in morphDb[sI][aI][gI]:
                    tWordGroup += morphDb[sI][aI][gI][wI][0]
                aWordGroup = ayaDb[sI][aI]["text"][gI - 1]
                if sI == 37 and aI == 130 and gI == 3:
                    aWordGroup += f" {ayaDb[sI][aI]['text'][gI]}"
                lA = len(aWordGroup)
                lT = len(tWordGroup)
                if lA != lT:
                    unequal[(tWordGroup, aWordGroup)] = (sI, aI, gI)
                else:
                    for c in range(lT):
                        transcription.setdefault(aWordGroup[c], set()).add(
                            tWordGroup[c]
                        )
                i += 1
    return (unequal, transcription, i)

In [29]:
(unequal, transcription, i) = link(ayaDb, morphDb, None)

In [30]:
len(unequal)

0

In [31]:
transcription

{'ب': {'b'},
 'ِ': {'i'},
 'س': {'s'},
 'ْ': {'o'},
 'م': {'m'},
 'ٱ': {'{'},
 'ل': {'l'},
 'ّ': {'~'},
 'َ': {'a'},
 'ه': {'h'},
 'ر': {'r'},
 'ح': {'H'},
 'ٰ': {'`'},
 'ن': {'n'},
 'ي': {'y'},
 'د': {'d'},
 'ُ': {'u'},
 'ع': {'E'},
 'ك': {'k'},
 'و': {'w'},
 'إ': {'<'},
 'ا': {'A'},
 'ت': {'t'},
 'ص': {'S'},
 'ط': {'T'},
 'ق': {'q'},
 'ذ': {'*'},
 'أ': {'>'},
 'غ': {'g'},
 'ض': {'D'},
 'ٓ': {'^'},
 'ف': {'f'},
 'ً': {'F'},
 'ى': {'Y'},
 'ؤ': {'&'},
 'ة': {'p'},
 'ز': {'z'},
 'ء': {"'"},
 'خ': {'x'},
 '۟': {'@'},
 'ئ': {'}'},
 'ٌ': {'N'},
 'ش': {'$'},
 'ظ': {'Z'},
 'ۢ': {'['},
 'ج': {'j'},
 'ث': {'v'},
 'ۥ': {','},
 'ٍ': {'K'},
 'ۦ': {'.'},
 'ـ': {'_'},
 'ٔ': {'#'},
 'ۭ': {']'},
 'ۜ': {':'},
 '۠': {'"'},
 '۪': {'-'},
 '۫': {'+'},
 'ۨ': {'!'},
 ' ': {' '},
 '۬': {'%'},
 'ۣ': {';'}}

In [50]:
def makeTranscriptionTable(transcription):
    errors = []
    table = []
    for a in sorted(transcription):
        t = transcription[a]
        if len(t) != 1:
            errors.append(a)
            continue
        t = list(t)[0]
        table.append((t, a, uname(a)))
    if errors:
        chars = " - ".join(errors)
        print(f"There are {len(errors)} errors: {chars}")
    tableStr = """
  <table>
    <tbody>
"""
    mappingStr = """
  arabic_mapping = {"""
    for (t, a, unm) in table:
        tableStr += f"""
<tr><td class="t">{t}</td><td class="g">{a}</td><td class="p"></td><td class="r"></td><td class="n">{unm}</td><td class="u">{ord(a):04x}</td></tr>"""
        qu = '"' if t == "'" else "'"
        mappingStr += f"""
      {qu}{t}{qu}: "\\u{ord(a):04x}",  # {unm}"""
    tableStr += """
    </tbody>
  </table>
"""
    mappingStr += """
  }
"""
    return (tableStr, mappingStr)

In [51]:
print(makeTranscriptionTable(transcription)[1])


  arabic_mapping = {
      ' ': "\u0020",  # SPACE
      "'": "\u0621",  # ARABIC LETTER HAMZA
      '>': "\u0623",  # ARABIC LETTER ALEF WITH HAMZA ABOVE
      '&': "\u0624",  # ARABIC LETTER WAW WITH HAMZA ABOVE
      '<': "\u0625",  # ARABIC LETTER ALEF WITH HAMZA BELOW
      '}': "\u0626",  # ARABIC LETTER YEH WITH HAMZA ABOVE
      'A': "\u0627",  # ARABIC LETTER ALEF
      'b': "\u0628",  # ARABIC LETTER BEH
      'p': "\u0629",  # ARABIC LETTER TEH MARBUTA
      't': "\u062a",  # ARABIC LETTER TEH
      'v': "\u062b",  # ARABIC LETTER THEH
      'j': "\u062c",  # ARABIC LETTER JEEM
      'H': "\u062d",  # ARABIC LETTER HAH
      'x': "\u062e",  # ARABIC LETTER KHAH
      'd': "\u062f",  # ARABIC LETTER DAL
      '*': "\u0630",  # ARABIC LETTER THAL
      'r': "\u0631",  # ARABIC LETTER REH
      'z': "\u0632",  # ARABIC LETTER ZAIN
      's': "\u0633",  # ARABIC LETTER SEEN
      '$': "\u0634",  # ARABIC LETTER SHEEN
      'S': "\u0635",  # ARABIC LETTER SAD
      'D': "\u0636"

In [52]:
print(makeTranscriptionTable(transcription)[0])


  <table>
    <tbody>

<tr><td class="t"> </td><td class="g"> </td><td class="p"></td><td class="r"></td><td class="n">SPACE</td><td class="u">0020</td></tr>
<tr><td class="t">'</td><td class="g">ء</td><td class="p"></td><td class="r"></td><td class="n">ARABIC LETTER HAMZA</td><td class="u">0621</td></tr>
<tr><td class="t">></td><td class="g">أ</td><td class="p"></td><td class="r"></td><td class="n">ARABIC LETTER ALEF WITH HAMZA ABOVE</td><td class="u">0623</td></tr>
<tr><td class="t">&</td><td class="g">ؤ</td><td class="p"></td><td class="r"></td><td class="n">ARABIC LETTER WAW WITH HAMZA ABOVE</td><td class="u">0624</td></tr>
<tr><td class="t"><</td><td class="g">إ</td><td class="p"></td><td class="r"></td><td class="n">ARABIC LETTER ALEF WITH HAMZA BELOW</td><td class="u">0625</td></tr>
<tr><td class="t">}</td><td class="g">ئ</td><td class="p"></td><td class="r"></td><td class="n">ARABIC LETTER YEH WITH HAMZA ABOVE</td><td class="u">0626</td></tr>
<tr><td class="t">A</td><td class=

In [47]:
with open(TRANS_TABLE, "w") as fh:
    fh.write(makeTranscriptionTable(transcription)[0])

In [48]:
display(HTML(makeTranscriptionTable(transcription)[0]))

0,1,2,3,4,5
,,,,SPACE,0020
',ء,,,ARABIC LETTER HAMZA,0621
>,أ,,,ARABIC LETTER ALEF WITH HAMZA ABOVE,0623
&,ؤ,,,ARABIC LETTER WAW WITH HAMZA ABOVE,0624
<,إ,,,ARABIC LETTER ALEF WITH HAMZA BELOW,0625
},ئ,,,ARABIC LETTER YEH WITH HAMZA ABOVE,0626
A,ا,,,ARABIC LETTER ALEF,0627
b,ب,,,ARABIC LETTER BEH,0628
p,ة,,,ARABIC LETTER TEH MARBUTA,0629
t,ت,,,ARABIC LETTER TEH,062a


In [73]:
cases = sorted(unequal)

In [74]:
def show(case):
    (tW, aW) = cases[case]
    print(tW, aW)
    (sI, aI, gI) = unequal[(tW, aW)]
    print(f"sura {sI} aya {aI} word group {gI}")
    print(len(tW), len(aW))
    print(list(tW))
    print("\n".join(f"{x} = {uname(x)}" for x in aW))

In [75]:
show(0)

<ilo yaAsiyna إِلْ
sura 37 aya 130 word group 3
13 4
['<', 'i', 'l', 'o', ' ', 'y', 'a', 'A', 's', 'i', 'y', 'n', 'a']
إ = ARABIC LETTER ALEF WITH HAMZA BELOW
ِ = ARABIC KASRA
ل = ARABIC LETTER LAM
ْ = ARABIC SUKUN


In [40]:
show(10)

$i}ota شِئْتَ
6 7
['$', 'i', '}', 'o', 't', 'a']
ش = ARABIC LETTER SHEEN
ِ = ARABIC KASRA
ي = ARABIC LETTER YEH
ْ = ARABIC SUKUN
ٔ = ARABIC HAMZA ABOVE
ت = ARABIC LETTER TEH
َ = ARABIC FATHA


In [37]:
for (tG, aG) in cases[0:20]:
    print(f"{tG} != {aG} {len(unequal[(tG, aG)])}")

$a>onK != شَأْنٍ 1
$a>onN != شَأْنٌ 1
$a>onihimo != شَأْنِهِمْ 1
$aAni}aka != شَانِئَكَ 1
$aEaA^}iri != شَعَآئِرِ 1
$aEa`^}ira != شَعَٰٓئِرَ 1
$aEa`^}iri != شَعَٰٓئِرِ 1
$a`Ti}i != شَٰطِئِ 1
$amaA^}ilihimo != شَمَآئِلِهِمْ 1
$i}onaA != شِئْنَا 1
$i}ota != شِئْتَ 1
$i}otum != شِئْتُم 1
$i}otumaA != شِئْتُمَا 1
$i}otumo != شِئْتُمْ 1
$ufaEa`^&uA@ != شُفَعَٰٓؤُا۟ 1
$ufaEa`^&unaA != شُفَعَٰٓؤُنَا 1
$urakaA^&uhum != شُرَكَآؤُهُم 1
$urakaA^&uhumo != شُرَكَآؤُهُمْ 1
$urakaA^&ukumu != شُرَكَآؤُكُمُ 1
$urakaA^&unaA != شُرَكَآؤُنَا 1
