In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [218]:
# Lookup table: one Arabic character (letter, diacritic or space) -> the Latin
# phoneme string emitted for it. Characters that are not listed here
# (for example the commented-out entry near the end) have no mapping.
arabic_to_ipa = {
    # --- letters ---
    "ا": "a",
    "ب": "b",
    "ت": "t",
    "ث": "ṡ",
    "ج": "j",
    "ح": "ḥ",
    "خ": "kh",
    "د": "d",
    "ذ": "ż",
    "ر": "r",
    "ز": "z",
    "س": "s",
    "ش": "sy",
    "ص": "ṣ",
    "ض": "ḍ",
    "ط": "ṭ",
    "ظ": "ẓ",
    "ع": "'",
    "غ": "g",
    "ف": "f",
    "ق": "q",
    "ك": "k",
    "ل": "l",
    "م": "m",
    "ن": "n",
    "و": "w",
    "ه": "h",
    "ء": "’",
    "ي": "y",
    # --- letters mapped to 'â' / '’' ---
    "آ": "â",
    "أ": "’",
    "ؤ": "’",
    "إ": "’",
    # --- vowel marks mapped to a / i / u and an / in / un ---
    "َ": "a",
    "ِ": "i",
    "ُ": "u",
    "ً": "an",
    "ٍ": "in",
    "ٌ": "un",
    # --- word separator is kept as-is ---
    " ": " ",
    # --- placeholder symbols ---
    "ْ": ".",  # sukun
    "ّ": ",",  # tashdid (shadda)
    "ة": "ṫ",
    # "ئ": "ʔ",
    "ى": "a",
}


In [240]:
# Ordered (pattern, replacement) rewrite rules applied to the raw phoneme string.
# Order matters: every rule is applied to the whole string, one after another.
# '#' is a temporary marker that protects "iy"/"uw" followed by a vowel or a
# doubled consonant from being turned into the long vowels 'î'/'û'; the marker is
# restored to 'y'/'w' further down. '.' is the sukun placeholder and ',' the
# shadda placeholder produced by the character table.
_PHONEME_REWRITE_RULES = (
    ('aal.', 'al'),
    ('ṫ.', 'h'),

    ('iya', 'i#a'),
    ('iyi', 'i#i'),
    ('iyu', 'i#u'),
    ('iyy', 'i#y'),
    ('iy', 'î'),

    ('uwa', 'u#a'),
    ('uwi', 'u#i'),
    ('uwu', 'u#u'),
    ('uww', 'u#w'),
    ('uw', 'û'),

    ('un. l', 'u ll'),
    ('an. l', 'a ll'),
    ('in. l', 'i ll'),

    ('ana a', 'an a'),
    ('ana y', 'a yy'),
    ('ana n', 'a nn'),
    ('ana m', 'a mm'),
    ('ana w', 'a ww'),
    ('an. y', 'a yy'),
    ('an. n', 'a nn'),
    ('an. m', 'a mm'),
    ('an. w', 'a ww'),
    ('aa', 'â'),

    ('un y', 'u yy'),
    ('un n', 'u nn'),
    ('un m', 'u mm'),
    ('un w', 'u ww'),

    ('in y', 'i yy'),
    ('in n', 'i nn'),
    ('in m', 'i mm'),
    ('in w', 'i ww'),

    ('ay.', 'ai'),
    ('aw.', 'au'),

    ('al,', ''),
    (' al.', ' l'),
    (' al', ' '),

    # drop every remaining sukun placeholder
    ('.', ''),

    ('alllah', 'allâh'),
    ('allah', 'allâh'),
    ('illah', 'illâh'),
    ('ullah', 'ullâh'),

    ('’a', 'a'),
    ('’i', 'i'),
    ('’u', 'u'),

    # restore the protected 'y' / 'w'
    ('i#a', 'iya'),
    ('i#i', 'iyi'),
    ('i#u', 'iyu'),
    ('i#y', 'iyy'),

    ('u#a', 'uwa'),
    ('u#i', 'uwi'),
    ('u#u', 'uwu'),
    ('u#w', 'uww'),
    ('ûa ', 'û '),

    # The original dict literal repeated the '’a' / '’i' / '’u' keys here; duplicate
    # keys in a dict literal keep their first position, so the repeats never ran.
    ('’â', 'â'),
    ('’î', 'î'),
    ('’û', 'û'),
    # 'ṫ' -> 't' is intentionally NOT done here: it is applied at the very end of
    # arabic_to_ipa_string, after the pausal rule that needs to see 'ṫ'.
)


def _drop_assimilated_article(text, marker, replacement):
    """Replace each `marker` with `replacement` when the two characters after it are identical."""
    index = text.find(marker)
    while index != -1:
        if index + 3 < len(text) and text[index + 2] == text[index + 3]:
            text = text[:index] + replacement + text[index + 2:]
        # continue scanning just after the current match position
        index = text.find(marker, index + 1)
    return text


def arabic_to_ipa_string(arabic_string, mapping=None):
    """Transliterate vocalised Arabic text into the notebook's Latin phoneme notation.

    Steps, in order:
      1. Map each character through `mapping`; characters without a mapping are
         skipped. A shadda doubles the phoneme of the character right before it.
      2. Apply the ordered rewrite rules in `_PHONEME_REWRITE_RULES`.
      3. Remove "al" (and turn "âl" into "a") when it is followed by a doubled
         character.
      4. Apply end-of-string rules: drop a final a/i/u, turn final "an" into 'â',
         drop final "in"/"un", shorten final "ana"/"ina"/"una", and turn a final
         'ṫ' into 'h'.
      5. Convert every remaining 'ṫ' to 't'.

    Compared with the previous version, step 4 only rewrites a final 'ṫ'
    (the symbol the table assigns to ة); a final plain 't' is now kept instead
    of being turned into 'h'. Empty input, and input that becomes empty after
    the final vowel is dropped, now return '' instead of raising IndexError.

    Args:
        arabic_string: the text to convert.
        mapping: optional character -> phoneme dict. Defaults to the
            module-level `arabic_to_ipa` table. The rewrite rules assume the
            symbols used by that table ('.', ',', 'ṫ', '’', ...).

    Returns:
        The transliterated string.
    """
    if mapping is None:
        mapping = arabic_to_ipa

    shadda = '\u0651'  # ARABIC SHADDA (tashdid)

    latin_text = ''
    # walk over every character of the Arabic text
    for i, char in enumerate(arabic_string):
        if char == shadda:
            # shadda: repeat the phoneme of the previous character
            if i > 0:  # nothing to double at the very start
                prev_phoneme = mapping.get(arabic_string[i - 1], arabic_string[i - 1])
                latin_text = latin_text[:-len(prev_phoneme)] + prev_phoneme * 2
        elif char in mapping:
            latin_text += mapping[char]

    # ordered rewrite rules (long vowels, tanwin assimilation, article, hamza, ...)
    for pattern, replacement in _PHONEME_REWRITE_RULES:
        latin_text = latin_text.replace(pattern, replacement)

    # "al" / "âl" followed by a doubled character
    latin_text = _drop_assimilated_article(latin_text, "al", '')
    latin_text = _drop_assimilated_article(latin_text, "âl", 'a')

    # End-of-string rules. The checks run one after another on purpose: each
    # one sees the result of the previous one. endswith() is used instead of
    # indexing so an empty string is handled without raising.
    if latin_text.endswith(('a', 'i', 'u')):
        latin_text = latin_text[:-1]
    if latin_text.endswith('an'):
        latin_text = latin_text[:-2] + 'â'
    if latin_text.endswith('in'):
        latin_text = latin_text[:-2]
    if latin_text.endswith('un'):
        latin_text = latin_text[:-2]
    if latin_text.endswith(('ana', 'ina', 'una')):
        latin_text = latin_text[:-2]
    if latin_text.endswith('ṫ'):
        latin_text = latin_text[:-1] + 'h'

    # every 'ṫ' that is not at the end is written as a plain 't'
    return latin_text.replace('ṫ', 't')

In [239]:
# Two sample inputs, transliterated and printed with a 100-dash rule between them.
text = "وَلَمْ يَكُنْ لَهُ كُفُوًا أَحَدٌ"
text2 = "عَمَّ يَتَسَاءَلُونَ"

separator = "-" * 100
for position, sample in enumerate((text, text2)):
    if position:
        # rule goes between the samples, not before the first one
        print(separator)
    print(arabic_to_ipa_string(sample))

walam yaku llahu kufuwana aḥad
----------------------------------------------------------------------------------------------------
'amma yatasâalûn


In [230]:
# Load the Arabic transcription file.
filename = 'data/transcription_with_vocal.txt'

# Read the whole file. The encoding is given explicitly so decoding does not
# depend on the platform default.
# NOTE(review): UTF-8 is assumed for this file — confirm.
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

# Split into lines, dropping blank ones (e.g. the empty string left by a
# trailing newline), which would otherwise break the 3-way unpacking below.
lines = [line for line in text.split('\n') if line.strip()]

# Print every line together with its transliteration.
for line in lines:
    # each line is "<surat>|<ayat>|<arabic text>"
    surat, ayat, ayah_text = line.split('|')
    # convert the Arabic text to the Latin phoneme notation
    ipa_text = arabic_to_ipa_string(ayah_text)
    print(f'Surat: {surat}, Ayat: {ayat}, Text: {ayah_text}, IPA: {ipa_text}')

Surat: 78, Ayat: 1, Text: عَمَّ يَتَسَاءَلُونَ, IPA: 'amma yatasâalûn
Surat: 78, Ayat: 2, Text: عَنِ النَّبَإِ الْعَظِيمِ, IPA: 'ani nnabai l'aẓîm
Surat: 78, Ayat: 3, Text: الَّذِي هُمْ فِيهِ مُخْتَلِفُونَ, IPA: allażî hum fîhi mukhtalifûn
Surat: 78, Ayat: 4, Text: كَلَّا سَيَعْلَمُونَ, IPA: kallâ saya'lamûn
Surat: 78, Ayat: 5, Text: ثُمَّ كَلَّا سَيَعْلَمُونَ, IPA: ṡumma kallâ saya'lamûn
Surat: 78, Ayat: 6, Text: أَلَمْ نَجْعَلِ الْأَرْضَ مِهَادًا, IPA: alam naj'ali larḍa mihâdâ
Surat: 78, Ayat: 7, Text: وَالْجِبَالَ أَوْتَادًا, IPA: waljibâla autâdâ
Surat: 78, Ayat: 8, Text: وَخَلَقْنَاكُمْ أَزْوَاجًا, IPA: wakhalaqnâkum azwâjâ
Surat: 78, Ayat: 9, Text: وَجَعَلْنَا نَوْمَكُمْ سُبَاتًا, IPA: waja'alnâ naumakum subâtâ
Surat: 78, Ayat: 10, Text: وَجَعَلْنَا اللَّيْلَ لِبَاسًا, IPA: waja'alnâ llaila libâsâ
Surat: 78, Ayat: 11, Text: وَجَعَلْنَا النَّهَارَ مَعَاشًا, IPA: waja'alnâ nnahâra ma'âsyâ
Surat: 78, Ayat: 12, Text: وَبَنَيْنَا فَوْقَكُمْ سَبْعًا شِدَادًا, IPA: wabanainâ fauqakum s

In [114]:
# Write the transliterated lines to data/raw_transcription_phonem.csv as
# "<surat>|<ayat>|<phonemes>", one line per written record.
#
# A line is written only when the line AFTER it does not have ayat == '1'.
# In effect the line preceding each "ayat 1" line, and the very last line of
# the input, are skipped.
# NOTE(review): presumably this drops the final ayah of every surah on purpose — confirm.
records = [line for line in lines if line.strip()]  # ignore blank lines

# The output contains non-ASCII phoneme symbols, so the encoding is explicit.
with open('data/raw_transcription_phonem.csv', 'w', encoding='utf-8') as file:
    # Pair every line with its successor; this replaces the repeated
    # lines.index(line) lookups, which were quadratic and returned the wrong
    # position when two lines were identical.
    for line, following in zip(records, records[1:]):
        surat, ayat, ayah_text = line.split('|')
        _, next_ayat, _ = following.split('|')
        if next_ayat != '1':
            ipa_text = arabic_to_ipa_string(ayah_text)
            file.write(f'{surat}|{ayat}|{ipa_text}\n')

In [2]:
pred = "faqa:la: a:ʔana rabukumu a:lʔaʕla:"
truth = "faqa:la: a:ʔana rabukumu a:l0ʔaʕlaa:"

# Number of non-overlapping 'aa:' sequences in `pred`.
# NOTE(review): the earlier comment said "in target", but the count is taken on
# `pred` and `truth` is never used — confirm which string was intended.
count_aa = len(pred.split('aa:')) - 1

print(count_aa)

0
