In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [26]:
# Lookup table mapping a single Arabic character (letter or diacritic) to the
# IPA-like text emitted for it. arabic_to_ipa_string() consults this table one
# character at a time; characters that are not listed here are copied through
# unchanged by that function.
arabic_to_ipa = {
    # consonants and long-vowel letters
    'ا': 'a:',
    'ب': 'b',
    'ت': 't',
    'ث': 'θ',
    'ج': 'dʒ',
    'ح': 'ħ',
    'خ': 'x',
    'د': 'd',
    'ذ': 'ð',
    'ر': 'r',
    'ز': 'z',
    'س': 's',
    'ش': 'ʃ',
    'ص': 'sˤ',
    'ض': 'dˤ',
    'ط': 'tˤ',
    'ظ': 'ðˤ',
    'ع': 'ʕ',
    'غ': 'ɣ',
    'ف': 'f',
    'ق': 'q',
    'ك': 'k',
    'ل': 'l',
    'م': 'm',
    'ن': 'n',
    'ه': 'h',
    'و': 'w',
    'ي': 'j',
    # hamza and its carrier forms (all include a glottal stop)
    'ء': 'ʔ',
    'آ': 'ʔa:',
    'أ': 'ʔ',
    'ؤ': 'ʔw',
    'إ': 'ʔ',
    # short-vowel diacritics
    'َ': 'a',
    'ِ': 'i',
    'ُ': 'u',
    # diacritics rendered as vowel + n
    'ً': 'an',
    'ٍ': 'in',
    'ٌ': 'un',
    # These two emit nothing themselves; arabic_to_ipa_string() inspects them
    # to rewrite the text that was already emitted before them.
    'ْ': '', # sukun
    'ّ': '', # tasydid (shadda)
    # remaining letter forms
    'ة' : 't',
    'ئ' : 'ʔ',
    'ى' : 'a:',
}


In [110]:
# When a sukun comes right after an emitted "a:l", that "a:" is removed;
# a tasydid is handled the same way (and may also absorb the "l").
def arabic_to_ipa_string(arabic_string):
    """Convert a vocalised Arabic string into an IPA-like transcription.

    Each character is looked up in the module-level ``arabic_to_ipa`` table;
    characters missing from the table are copied to the output unchanged.
    Before a sukun or tasydid is looked up (both map to ''), the text emitted
    so far is rewritten:

    * sukun: a trailing "a:l" becomes "l".
    * tasydid: a trailing "a:l" + X (X being the last emitted character)
      becomes X; otherwise a trailing "a:l" becomes "l".

    Afterwards the result is stripped, double spaces are collapsed once, the
    mad replacements below are applied in order, and a leading "laðij" is
    turned into "alaðij".

    Differences from the previous implementation:

    * A sukun/tasydid seen when fewer than two characters have been emitted
      no longer raises ``IndexError``.
    * Suffixes are matched exactly (``str.endswith``) instead of with a
      substring test, so short prefixes such as "ln" are no longer rewritten
      by accident.
    * The old "a: a:" sukun branch was removed: its guard required ':' as the
      second-to-last character while the pattern ends in ':', so it could
      never modify the string.

    Parameters
    ----------
    arabic_string : str
        Text to transcribe.

    Returns
    -------
    str
        The transcription; '' for empty input.
    """
    alif_lam = "a:l"  # what the letters alif + lam emit via arabic_to_ipa
    new_str = ''
    for char in arabic_string:
        if char == 'ْ':  # sukun
            if new_str.endswith(alif_lam):
                new_str = new_str[:-len(alif_lam)] + "l"
        if char == 'ّ':  # tasydid
            if len(new_str) >= 2 and new_str[-2] == 'l':
                # "a:l" + X -> X: both the "a:" and the "l" are dropped
                doubled = new_str[-1]
                assimilated = alif_lam + doubled
                if new_str.endswith(assimilated):
                    new_str = new_str[:-len(assimilated)] + doubled
            elif new_str.endswith(alif_lam):
                # tasydid directly on the "l": only the "a:" is dropped
                new_str = new_str[:-len(alif_lam)] + "l"
        # unknown characters (spaces, punctuation, ...) pass through as-is
        new_str += arabic_to_ipa.get(char, char)

    # remove surrounding whitespace and collapse double spaces (single pass)
    new_str = new_str.strip()
    new_str = new_str.replace('  ', ' ')

    # rules of mad, applied in insertion order
    mads = {
        # mad jaiz
        "aa: ʔ": "a:: ʔ",
        "ij ʔ": "i:: ʔ",
        "uw ʔ": "u:: ʔ",
        # mad wajib
        "aa:ʔ": "a::ʔ",
        "ijʔ": "i::ʔ",
        "uwʔ": "u::ʔ",
    }

    for key, value in mads.items():
        new_str = new_str.replace(key, value)

    # a string that starts with "laðij" gets its initial "a" back
    if new_str.startswith('laðij'):
        new_str = 'a' + new_str

    return new_str

In [113]:
# load the vocalised Arabic transcription file
filename = 'data/transcription_with_vocal.txt'

# Read the whole file. The encoding is given explicitly so the result does not
# depend on the platform default; assumes the file is UTF-8 -- TODO confirm.
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

# split the text into lines
lines = text.split('\n')

# print every line together with its IPA transcription
for line in lines:
    # a blank line (e.g. from a trailing newline) has no 'surat|ayat|text'
    # fields and would make the unpacking below fail
    if not line.strip():
        continue
    # split the line into no.surat, no.ayat, and text
    surat, ayat, text = line.split('|')
    # convert arabic text to ipa
    ipa_text = arabic_to_ipa_string(text)
    # print the no.surat, no.ayat, text and its transcription
    print(f'Surat: {surat}, Ayat: {ayat}, Text: {text}, IPA: {ipa_text}')

Surat: 78, Ayat: 1, Text: عَمَّ يَتَسَاءَلُونَ, IPA: ʕama jatasa::ʔaluwna
Surat: 78, Ayat: 2, Text: عَنِ النَّبَإِ الْعَظِيمِ, IPA: ʕani nabaʔi lʕaðˤijmi
Surat: 78, Ayat: 3, Text: الَّذِي هُمْ فِيهِ مُخْتَلِفُونَ, IPA: alaðij hum fijhi muxtalifuwna
Surat: 78, Ayat: 4, Text: كَلَّا سَيَعْلَمُونَ, IPA: kalaa: sajaʕlamuwna
Surat: 78, Ayat: 5, Text: ثُمَّ كَلَّا سَيَعْلَمُونَ, IPA: θuma kalaa: sajaʕlamuwna
Surat: 78, Ayat: 6, Text: أَلَمْ نَجْعَلِ الْأَرْضَ مِهَادًا, IPA: ʔalam nadʒʕali lʔardˤa mihaa:dana:
Surat: 78, Ayat: 7, Text: وَالْجِبَالَ أَوْتَادًا, IPA: waldʒibaa:la ʔawtaa:dana:
Surat: 78, Ayat: 8, Text: وَخَلَقْنَاكُمْ أَزْوَاجًا, IPA: waxalaqnaa:kum ʔazwaa:dʒana:
Surat: 78, Ayat: 9, Text: وَجَعَلْنَا نَوْمَكُمْ سُبَاتًا, IPA: wadʒaʕalnaa: nawmakum subaa:tana:
Surat: 78, Ayat: 10, Text: وَجَعَلْنَا اللَّيْلَ لِبَاسًا, IPA: wadʒaʕalnaa: lajla libaa:sana:
Surat: 78, Ayat: 11, Text: وَجَعَلْنَا النَّهَارَ مَعَاشًا, IPA: wadʒaʕalnaa: nahaa:ra maʕaa:ʃana:
Surat: 78, Ayat: 12, Text: وَب

In [114]:
# Save the IPA transcriptions to data/raw_transcription_phonem.csv as
# 'surat|ayat|ipa' rows.
#
# A line is written only when the line that follows it does NOT have ayat
# number '1'. The last line is therefore never written (nothing follows it),
# and neither is any line directly followed by an ayat 1.
# NOTE(review): this looks like it deliberately drops the final ayat of each
# surat -- confirm that this is intended.
#
# Blank lines are ignored, and each line is paired with its successor via zip
# instead of lines.index(line), which rescanned the list on every iteration
# and returned the first match for duplicate lines. The encoding is explicit
# because the output contains non-ASCII IPA characters.
records = [line for line in lines if line.strip()]
with open('data/raw_transcription_phonem.csv', 'w', encoding='utf-8') as file:
    for line, following in zip(records, records[1:]):
        # split the line into no.surat, no.ayat, and text
        surat, ayat, text = line.split('|')
        next_surat, next_ayat, next_text = following.split('|')
        # skip this line when the following one starts a new surat (ayat 1)
        if next_ayat != '1':
            # convert arabic text to ipa
            ipa_text = arabic_to_ipa_string(text)
            # write to file
            file.write(f'{surat}|{ayat}|{ipa_text}\n')

In [2]:
pred = "faqa:la: a:ʔana rabukumu a:lʔaʕla:"
truth = "faqa:la: a:ʔana rabukumu a:l0ʔaʕlaa:"

# Number of non-overlapping 'aa:' occurrences in the prediction string:
# splitting on the marker yields one more piece than there are markers.
pieces = pred.split('aa:')
count_aa = len(pieces) - 1

print(count_aa)

0
