In [1]:
import re
import json
import unicodedata
import pandas as pd
from pathlib import Path
from collections import Counter

import sys

# Add the parent directory to the module search path so that the
# `text_processing_utils` import below can be resolved
# (presumably the module lives one level above this notebook — TODO confirm).
sys.path.append("../")
from text_processing_utils import (
    preprocess_text,
    tokenize_text,
    form_wildcard_pattern,
    get_groups_of_characters_with_diacritics,
)

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

In [2]:
# Example call on a sample string mixing digits, punctuation and diacritized
# Arabic; the cell output shows the list returned by the helper.
form_wildcard_pattern("123 - فَمِنَ الرَّامِي السُّوِيدِيِّ أُوسْكَار")

['1',
 '2',
 '3',
 ' ',
 '-',
 ' ',
 'ف',
 '<*>',
 'م',
 '<*>',
 'ن',
 '<*>',
 ' ',
 'ا',
 '<*>',
 'ل',
 '<*>',
 'ر',
 '<*>',
 'ا',
 '<*>',
 'م',
 '<*>',
 'ي',
 '<*>',
 ' ',
 'ا',
 '<*>',
 'ل',
 '<*>',
 'س',
 '<*>',
 'و',
 '<*>',
 'ي',
 '<*>',
 'د',
 '<*>',
 'ي',
 '<*>',
 ' ',
 'أ',
 '<*>',
 'و',
 '<*>',
 'س',
 '<*>',
 'ك',
 '<*>',
 'ا',
 '<*>',
 'ر',
 '<*>']

In [3]:
# Ad-hoc fix of the json files' format
def load_file_lines(filename):
    """Load a metadata file made of back-to-back JSON objects.

    The file as a whole is not a valid JSON document: it holds a sequence of
    objects that are not separated by commas nor wrapped in a list. This
    function drops blank lines, strips surrounding whitespace from each line,
    inserts a comma wherever one object ends on a line and the next begins on
    the following line, wraps everything in ``[...]`` and decodes the result.

    Args:
        filename: Path (``str`` or ``os.PathLike``) of the file to read.

    Returns:
        The decoded JSON array (a ``list`` with one entry per object).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the patched text is still not valid JSON.
    """
    # The transcripts are Arabic text, so decode explicitly as UTF-8 instead
    # of relying on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        lines = "\n".join([l.strip() for l in f if l.strip()])

    # "}" at the end of one line followed by "{" at the start of the next
    # marks the boundary between two consecutive objects.
    lines = re.sub(r"}\n{", r"},\n{", lines)
    lines = "[\n" + lines + "\n]"
    return json.loads(lines)


# Directory holding the metadata files, and the sorted list of every entry in
# it whose name contains "_metadata".
DATA_DIR = Path("../data/")
filenames = sorted(DATA_DIR.glob("*_metadata*"))

In [4]:
def _unicode_name(ch):
    """Return the Unicode name of `ch`, or a ``U+XXXX`` placeholder if it has none."""
    return unicodedata.name(ch, f"U+{ord(ch):04X}")


def generate_information_report(filename):
    """Build a per-character statistics table for one metadata file.

    The file is loaded with `load_file_lines`, its transcript column
    (`transcript` or `transcription`) is renamed to `text` and passed through
    `preprocess_text`. Every character of the preprocessed texts is then
    counted and described.

    Args:
        filename: Path of the metadata file to analyse.

    Returns:
        A DataFrame with one row per distinct character and the columns:
        - `Character`: the character itself.
        - `Count`: number of occurrences over all preprocessed texts.
        - `Code`: the code point (`ord`).
        - `Name`: the Unicode name, or a ``U+XXXX`` placeholder for code points
          that have no name (e.g. control characters).
        - `Type`: the Unicode general category.
        - `is_diacritic`: True when the category is `Mn`.
        - `Following_diacritics`: list of
          ``((names joined with "+", diacritics), count)`` tuples, most
          frequent first, for the diacritics groups reported after that
          character by `get_groups_of_characters_with_diacritics`.
        Rows are sorted by `is_diacritic`, then `Code`, then `Count`.

    Raises:
        KeyError: If the file has neither a `transcript` nor a
            `transcription` (nor a `text`) field.
    """
    df = pd.DataFrame(load_file_lines(filename)).rename(
        columns={"transcript": "text", "transcription": "text"}
    )
    df["preprocessed_text"] = df["text"].apply(preprocess_text)

    # Find the diacritics that follow each character. The helper is expected
    # to return (character, diacritics) pairs, as unpacked in the loop below.
    df["characters_diacritics"] = df["preprocessed_text"].apply(
        get_groups_of_characters_with_diacritics
    )

    char_counts = Counter(
        ch for text in df["preprocessed_text"] for ch in text
    ).most_common()

    info_df = pd.DataFrame(char_counts, columns=["Character", "Count"])
    info_df["Code"] = info_df["Character"].apply(ord)
    # unicodedata.name() raises ValueError for unnamed code points; fall back
    # to a placeholder so that one odd character does not abort the report.
    info_df["Name"] = info_df["Character"].apply(_unicode_name)

    # Mn -> diacritic
    info_df["Type"] = info_df["Character"].apply(unicodedata.category)
    info_df["is_diacritic"] = info_df["Type"] == "Mn"

    # Count the diacritics groups following each character. Counting
    # incrementally keeps first-seen order, so ties in most_common() are
    # ordered exactly as if the groups had been collected in a list first.
    following_counts = {}
    for groups in df["characters_diacritics"]:
        for ch, dia_list in groups:
            key = ("+".join(_unicode_name(d) for d in dia_list), dia_list)
            counts = following_counts.get(ch)
            if counts is None:
                counts = following_counts[ch] = Counter()
            counts[key] += 1

    info_df["Following_diacritics"] = info_df["Character"].apply(
        lambda c: following_counts.get(c, Counter()).most_common()
    )

    return info_df.sort_values(
        by=["is_diacritic", "Code", "Count"], ascending=True
    )

## ArVoice dataset

### Summary:
- Vowels such as أاإ sometimes have diacritics following them
    - e.g., `أُعْلِنَ يَوْمَ ...` 
    - <span style="color:red">Question: Are we going to add wildcard tokens after these characters, based on this finding?</span>
- **Note:** I decided not to resolve any of the potential errors reported below in the preprocessing script, as they are all rare.
##### Annotation errors (ArVoice - test set):
- Rare occurrences of an incomplete diacritics combination:
    - 11 times `ARABIC SHADDA` is on its own, without an accompanying short vowel
    - **Note:** This is going to be mapped to `<UNK_DIAC>` in the main experiment
- Two cases of a diacritic following the `SPACE` character
    - <span style="color:red">Question: How is this handled when computing the different error rates?</span>

##### Annotation errors (ArVoice - train set):
- Rare occurrences of invalid diacritics combinations:
    - 4 times `ARABIC KASRA+ARABIC KASRA`, once `ARABIC KASRA+ARABIC SUKUN`, once `ARABIC FATHA+ARABIC KASRA`
    - **Note:** This is going to be mapped to `<UNK_DIAC>` in the main experiment
- The `TATWEEL` character only exists in this dataset
    - Out of its 26 occurrences, it was (mistakenly?) diacritized once

In [5]:
# Display the rows of the first metadata file whose transcription contains an
# alef-with-hamza-above directly followed by a damma.
df = pd.DataFrame(load_file_lines(filenames[0]))
df.loc[df["transcription"].map(lambda transcription: "أُ" in transcription)]

Unnamed: 0,file_name,transcription,speaker_id,source,duration,sampling_rate,audio_filepath
5,/l/ArVoice/v1/part-1/tashkeela/tashkeela-test/female_ab_00018.wav,صَعَدَتْ أَسْعَارُ النَّفْطِ الْجُمْعَةَ مُوَسِّعَةً مَكَاسِبَهَا إلَى أَكْثَرَ مِنْ خَمْسَةٍ وَ عِشْرينْ مُنْذُ أَنْ هَوَتِ الْأُسْبُوعَ الْمَاضِيَ إلَى أَدْنَى مُسْتَوَيَاتِهَا فِي إِثْنا عَشَرَ عَامًا وَذَلِكَ بِدَعْمٍ مِنْ تَوَقُّعَاتٍ بِاتِّفَاقٍ بَيْنَ مُصَدِّرِي النَّفْطِ الرَّئِيسِيِّينَ لِخَفْضِ الْإِنْتَاجِ وَكَبْحِ وَاحِدَةٍ مِنْ أَكْبَرِ تُخَمِ الْإِمْدَادَاتِ فِي التَّارِيخِ,female_ab,tashkeela,21.302041,16000,/home/rufael/Projects/diac-btc/data/arvoice/raw/test/part-1/tashkeela/tashkeela-test/female_ab_00018.wav
6,/l/ArVoice/v1/part-1/tashkeela/tashkeela-test/female_ab_00022.wav,يُعْرَضُ الْقِيثارُ الْكَهْرَبَائِيُّ الَّذِي عَزَفَ عَلَيْهِ جُونْ لِينُون عِنْدَمَا كَانَ عُضْوًا فِي فِرْقَةِ الْبِيتِلْز أُغْنِيَةَ بِيبَرْبَاك رَايْتَر فِي مَلْعَبِ آبِي رُودَ عَامَ أَلْفٍ وَ تِسْعَمِئَةٍ وَ سِتَّةٍ وَ سِتِّينَ لِلْبَيْعِ فِي مَزَادٍ عَلَنِيٍّ الْيَوْمَ الْأَحَدَ لِلْأَشْيَاءِ التَّذْكَارِيَّةِ الْمُتَعَلِّقَةِ بِمُوسِيقَى الرُّوك آنْد رُول فِي لُنْدُنَ,female_ab,tashkeela,19.919093,16000,/home/rufael/Projects/diac-btc/data/arvoice/raw/test/part-1/tashkeela/tashkeela-test/female_ab_00022.wav
7,/l/ArVoice/v1/part-1/tashkeela/tashkeela-test/female_ab_00010.wav,أُعْلِنَ يَوْمَ الْثَالثَ عَشَرْ مِنْ أُكْتُوبَرْ تِشْرِينَ الْأَوَّلِ أَلْفَيْنِ وَ سِتَّةَ عَشَرْ عَنْ فَوْزِ بُوب ديلان بِجَائِزَةِ نُوبِلَ لِلْآدَابِ فِي قَرَارٍ اعْتُبِرَ مُفَاجِئًا,female_ab,tashkeela,10.465034,16000,/home/rufael/Projects/diac-btc/data/arvoice/raw/test/part-1/tashkeela/tashkeela-test/female_ab_00010.wav
8,/l/ArVoice/v1/part-1/tashkeela/tashkeela-test/female_ab_00023.wav,سَجَّلَت الدَّوْرَةُ الخَامِسة وَالثَّلَاثِينَ لِمَعْرِضِ الشَّارِقَةِ الدَّوْلِيِّ لِلْكِتَابِ الَّتِي أُسْدِلَ السِّتارُ عَلَيْهَا يَوْمَ السَّبْتِ رَقْمًا قِيَاسِيًّا فِي عَدَدِ الزُّوَارِ وَحَجْمِ الْمَبِيعَاتِ,female_ab,tashkeela,13.357098,16000,/home/rufael/Projects/diac-btc/data/arvoice/raw/test/part-1/tashkeela/tashkeela-test/female_ab_00023.wav
10,/l/ArVoice/v1/part-1/tashkeela/tashkeela-test/female_ab_00039.wav,وَذَهَبَتْ أَبْحَاثٌ أُخْرَى إِلَى أنَّ السَّبَبَ قَدْ يَكُونُ امْتِصَاصَ الْمُحِيطَاتِ الْعَمِيقَةِ لِلْمَزِيدِ مِنَ الْحَرَارَةِ,female_ab,tashkeela,7.618095,16000,/home/rufael/Projects/diac-btc/data/arvoice/raw/test/part-1/tashkeela/tashkeela-test/female_ab_00039.wav
21,/l/ArVoice/v1/part-1/tashkeela/tashkeela-test/female_ab_00015.wav,فَعِنْدَ إنْشَاءِ الْأُمَمِ الْمُتَّحِدَةِ عَامَ أَلْفٍ وَ تِسْعَمِئَةٍ وَ خَمْسَةٍ وَ أَرْبَعينْ اعْتَمَدَتِ الْمُنَظَّمَةُ الدَّوْلِيَّةُ خَمْسَ لُغَاتٍ حَيَّةٍ هِيَ الْإسْبَانِيَّةُ وَالْإنْجِلِيزِيَّةُ وَالرُّوسِيَّةُ وَالصِّينِيَّةُ وَالْفَرَنْسِيَّةُ بِحَيْثُ تَكُونُ لُغَاتِ تَوْثِيقِ الْمَحَاضِرِ الرَّسْمِيَّةِ وَأوْرَاقِ الْعَمَلِ أثْنَاءَ الاجْتِمَاعَاتِ وَتُعْتَمَدُ فِي التَّرْجَمَةِ الْحَيَّةِ الْمُبَاشِرَةِ أثْنَاءَ الْمُؤْتَمَرَاتِ تَحَدُّثًا وَكِتَابَةً,female_ab,tashkeela,25.252018,16000,/home/rufael/Projects/diac-btc/data/arvoice/raw/test/part-1/tashkeela/tashkeela-test/female_ab_00015.wav
28,/l/ArVoice/v1/part-1/tashkeela/tashkeela-test/female_ab_00025.wav,مَعْرِضٌ أُوزْبَكِيٌّ فِي قَطَر مِنْ كُنُوزِ الْمَعْرِفَةِ إِلَى اللَّوْحَاتِ الْفَنِّيَّةِ وَالْمَنْحُوتَاتِ وَالْمُجَوْهَرَاتِ تَتَنَوَّعُ ذَخِيرَةُ مَعْرِضِ أُوزْبَكِسْتَانَ الْفَنِيِّ الَّذِي افْتُتِحَ فِي الدَّوْحَةِ مَطْلَعَ الْأُسْبُوعِ وَيَسْتَقْبِلُ الزُّوَّارَ حَتَّى السَادِسَ عَشَرْ مِنَ الشَّهْرِ الْجَارِي,female_ab,tashkeela,16.506032,16000,/home/rufael/Projects/diac-btc/data/arvoice/raw/test/part-1/tashkeela/tashkeela-test/female_ab_00025.wav
41,/l/ArVoice/v1/part-1/tashkeela/tashkeela-test/female_ad_00060.wav,بِمُشَارَكَةِ اثْنينٍ وَ سِتِّينَ فِيلْمًا مِنْ تِسْعَةَعَشْرَةَ دَوْلَةً أُورُوبِّيَّة وَالْبَلَدِ الْمُضِيف مِصْر انْطَلَقَتْ فَعَّالِيَّاتُ الدَّوْرَةِ الثَّانِيَةِ مِنْ مِهْرَجَانِ الْأُقْصُرِ لِلسِّينَمَا الْمِصْرِيَّةِ وَالْأُورُوبِّيَّةِ وَتَحْضُرُ السِّينَمَا الْأَلْمَانِيَّةُ ضَيْفَ شَرَفْ عَلَى الْمِهْرَجَانِ الَّذِي يُفْتَتَحُ بِالْفِيلْمِ الْمِصْرِيِّ الْجَدِيدْ لَا مُؤَاخَذَة لِلْمُخْرِجْ عَمْرو سَلَامَة,female_ad,tashkeela,23.98907,16000,/home/rufael/Projects/diac-btc/data/arvoice/raw/test/part-1/tashkeela/tashkeela-test/female_ad_00060.wav
43,/l/ArVoice/v1/part-1/tashkeela/tashkeela-test/female_ad_00062.wav,لِنَيْلِ الدُّكْتُورَاه فِي أُصُولِ الفِقْهِ بَعْدَ أَيَّامٍ مِنْ نَيْلِهِ دَرَجَةَ المَاجِسْتِير مِنْ جَامِعَةِ أَبُودِيسَ فِي القُدْسِ المُحْتَلَّةِ,female_ad,tashkeela,8.939093,16000,/home/rufael/Projects/diac-btc/data/arvoice/raw/test/part-1/tashkeela/tashkeela-test/female_ad_00062.wav
48,/l/ArVoice/v1/part-1/tashkeela/tashkeela-test/female_ad_00061.wav,سَبْعِينِيٌّ فِلَسْطِينِيٌّ يَسْتَعِدُّ لِلدُّكْتُورَاه يَتَطَلَّعُ الحَاجُّ تَوفِيق مَحَامِيد سَبْعُونَ عَامًا مِنْ سُكَّانِ بَلْدَةِ أُمِّ الفَحْمِ بِالدَّاخِلِ الفِلَسْطِينِيِّ,female_ad,tashkeela,10.930023,16000,/home/rufael/Projects/diac-btc/data/arvoice/raw/test/part-1/tashkeela/tashkeela-test/female_ad_00061.wav


In [6]:
# Test set
print(filenames[0])
info_df_0 = generate_information_report(filenames[0])
print("\nChecking the diacritics combinations in the dataset:\n")

# Tally how often each diacritics combination follows a non-space character.
combination_counts = Counter()
following_per_character = info_df_0.loc[
    info_df_0["Character"] != " ", "Following_diacritics"
]
for following in following_per_character:
    # Each entry is ((combination name, diacritics), number of occurrences).
    for (combination_name, _diacritics), occurrences in following:
        combination_counts[combination_name] += occurrences
print(combination_counts.most_common())
print()

info_df_0

../data/arvoice_test_metadata.json

Checking the diacritics combinations in the dataset:

[('', 7231), ('ARABIC FATHA', 6814), ('ARABIC KASRA', 4212), ('ARABIC SUKUN', 2848), ('ARABIC DAMMA', 1592), ('ARABIC FATHA+ARABIC SHADDA', 889), ('ARABIC KASRA+ARABIC SHADDA', 300), ('ARABIC KASRATAN', 288), ('ARABIC FATHATAN', 176), ('ARABIC DAMMA+ARABIC SHADDA', 155), ('ARABIC DAMMATAN', 61), ('ARABIC FATHATAN+ARABIC SHADDA', 30), ('ARABIC SHADDA', 11), ('ARABIC KASRATAN+ARABIC SHADDA', 8), ('ARABIC DAMMATAN+ARABIC SHADDA', 6)]



Unnamed: 0,Character,Count,Code,Name,Type,is_diacritic,Following_diacritics
1,,4867,32,SPACE,Zs,False,"[((, ), 4865), ((ARABIC DAMMA, ُ), 1), ((ARABIC FATHA, َ), 1)]"
46,(,1,40,LEFT PARENTHESIS,Ps,False,"[((, ), 1)]"
47,),1,41,RIGHT PARENTHESIS,Pe,False,"[((, ), 1)]"
45,Q,1,81,LATIN CAPITAL LETTER Q,Lu,False,"[((, ), 1)]"
40,ء,75,1569,ARABIC LETTER HAMZA,Lo,False,"[((ARABIC KASRA, ِ), 29), ((ARABIC FATHA, َ), 24), ((, ), 7), ((ARABIC DAMMA, ُ), 6), ((ARABIC FATHATAN, ً), 5), ((ARABIC KASRATAN, ٍ), 4)]"
43,آ,32,1570,ARABIC LETTER ALEF WITH MADDA ABOVE,Lo,False,"[((, ), 32)]"
20,أ,522,1571,ARABIC LETTER ALEF WITH HAMZA ABOVE,Lo,False,"[((ARABIC FATHA, َ), 379), ((, ), 73), ((ARABIC DAMMA, ُ), 58), ((ARABIC SUKUN, ْ), 12)]"
44,ؤ,28,1572,ARABIC LETTER WAW WITH HAMZA ABOVE,Lo,False,"[((ARABIC FATHA, َ), 16), ((ARABIC DAMMA, ُ), 6), ((ARABIC SUKUN, ْ), 3), ((ARABIC KASRA, ِ), 2), ((, ), 1)]"
33,إ,182,1573,ARABIC LETTER ALEF WITH HAMZA BELOW,Lo,False,"[((ARABIC KASRA, ِ), 121), ((, ), 61)]"
37,ئ,146,1574,ARABIC LETTER YEH WITH HAMZA ABOVE,Lo,False,"[((ARABIC KASRA, ِ), 83), ((ARABIC FATHA, َ), 57), ((, ), 4), ((ARABIC FATHATAN, ً), 2)]"


In [7]:
# Train set
print(filenames[1])
info_df_1 = generate_information_report(filenames[1])
print("\nChecking the diacritics combinations in the dataset:\n")

# Tally how often each diacritics combination follows a non-space character.
combination_counts = Counter()
following_per_character = info_df_1.loc[
    info_df_1["Character"] != " ", "Following_diacritics"
]
for following in following_per_character:
    # Each entry is ((combination name, diacritics), number of occurrences).
    for (combination_name, _diacritics), occurrences in following:
        combination_counts[combination_name] += occurrences
print(combination_counts.most_common())
print()

info_df_1

../data/arvoice_train_metadata.json

Checking the diacritics combinations in the dataset:

[('', 105490), ('ARABIC FATHA', 59812), ('ARABIC KASRA', 39828), ('ARABIC SUKUN', 27018), ('ARABIC DAMMA', 14903), ('ARABIC FATHA+ARABIC SHADDA', 8682), ('ARABIC KASRA+ARABIC SHADDA', 2939), ('ARABIC KASRATAN', 2444), ('ARABIC FATHATAN', 2150), ('ARABIC DAMMA+ARABIC SHADDA', 1424), ('ARABIC DAMMATAN', 786), ('ARABIC SHADDA', 303), ('ARABIC KASRATAN+ARABIC SHADDA', 189), ('ARABIC FATHATAN+ARABIC SHADDA', 157), ('ARABIC DAMMATAN+ARABIC SHADDA', 97), ('ARABIC KASRA+ARABIC KASRA', 4), ('ARABIC KASRA+ARABIC SUKUN', 1), ('ARABIC FATHA+ARABIC KASRA', 1)]



Unnamed: 0,Character,Count,Code,Name,Type,is_diacritic,Following_diacritics
1,,50882,32,SPACE,Zs,False,"[((, ), 50875), ((ARABIC FATHA, َ), 3), ((ARABIC KASRA, ِ), 2), ((ARABIC SUKUN, ْ), 2)]"
49,"""",4,34,QUOTATION MARK,Po,False,"[((, ), 4)]"
51,-,2,45,HYPHEN-MINUS,Pd,False,"[((, ), 2)]"
47,،,5,1548,ARABIC COMMA,Po,False,"[((, ), 5)]"
40,ء,915,1569,ARABIC LETTER HAMZA,Lo,False,"[((ARABIC KASRA, ِ), 339), ((ARABIC FATHA, َ), 217), ((, ), 203), ((ARABIC DAMMA, ُ), 81), ((ARABIC KASRATAN, ٍ), 36), ((ARABIC FATHATAN, ً), 27), ((ARABIC DAMMATAN, ٌ), 11), ((ARABIC SUKUN, ْ), 1)]"
44,آ,278,1570,ARABIC LETTER ALEF WITH MADDA ABOVE,Lo,False,"[((, ), 277), ((ARABIC FATHA, َ), 1)]"
21,أ,5707,1571,ARABIC LETTER ALEF WITH HAMZA ABOVE,Lo,False,"[((ARABIC FATHA, َ), 3358), ((, ), 1736), ((ARABIC DAMMA, ُ), 444), ((ARABIC SUKUN, ْ), 160), ((ARABIC SHADDA, ّ), 4), ((ARABIC FATHATAN, ً), 2), ((ARABIC KASRA, ِ), 1), ((ARABIC FATHA+ARABIC SHADDA, َّ), 1), ((ARABIC KASRATAN, ٍ), 1)]"
43,ؤ,344,1572,ARABIC LETTER WAW WITH HAMZA ABOVE,Lo,False,"[((ARABIC FATHA, َ), 121), ((, ), 98), ((ARABIC DAMMA, ُ), 61), ((ARABIC SUKUN, ْ), 58), ((ARABIC KASRA, ِ), 6)]"
30,إ,2436,1573,ARABIC LETTER ALEF WITH HAMZA BELOW,Lo,False,"[((ARABIC KASRA, ِ), 1462), ((, ), 972), ((ARABIC FATHA, َ), 1), ((ARABIC KASRA+ARABIC KASRA, ِِ), 1)]"
39,ئ,1242,1574,ARABIC LETTER YEH WITH HAMZA ABOVE,Lo,False,"[((ARABIC KASRA, ِ), 758), ((ARABIC FATHA, َ), 228), ((, ), 211), ((ARABIC SUKUN, ْ), 19), ((ARABIC DAMMA, ُ), 16), ((ARABIC FATHATAN, ً), 6), ((ARABIC KASRATAN, ٍ), 4)]"


## ClArTTS dataset

In [8]:
# Test set
print(filenames[2])
info_df_2 = generate_information_report(filenames[2])
print("\nChecking the diacritics combinations in the dataset:\n")

# Tally how often each diacritics combination follows a non-space character.
combination_counts = Counter()
following_per_character = info_df_2.loc[
    info_df_2["Character"] != " ", "Following_diacritics"
]
for following in following_per_character:
    # Each entry is ((combination name, diacritics), number of occurrences).
    for (combination_name, _diacritics), occurrences in following:
        combination_counts[combination_name] += occurrences
print(combination_counts.most_common())
print()

info_df_2

../data/clartts_test_metadata.json

Checking the diacritics combinations in the dataset:

[('ARABIC FATHA', 2446), ('', 1448), ('ARABIC KASRA', 961), ('ARABIC SUKUN', 934), ('ARABIC DAMMA', 633), ('ARABIC FATHA+ARABIC SHADDA', 266), ('ARABIC KASRATAN', 61), ('ARABIC FATHATAN', 49), ('ARABIC KASRA+ARABIC SHADDA', 46), ('ARABIC DAMMATAN', 44), ('ARABIC DAMMA+ARABIC SHADDA', 39), ('ARABIC FATHATAN+ARABIC SHADDA', 7), ('ARABIC DAMMATAN+ARABIC SHADDA', 3), ('ARABIC KASRATAN+ARABIC SHADDA', 2), ('ARABIC SHADDA', 1)]



Unnamed: 0,Character,Count,Code,Name,Type,is_diacritic,Following_diacritics
1,,1489,32,SPACE,Zs,False,"[((, ), 1487), ((ARABIC FATHA, َ), 2)]"
39,ء,33,1569,ARABIC LETTER HAMZA,Lo,False,"[((ARABIC KASRA, ِ), 15), ((ARABIC FATHA, َ), 8), ((ARABIC DAMMA, ُ), 7), ((ARABIC FATHATAN, ً), 2), ((ARABIC KASRATAN, ٍ), 1)]"
43,آ,6,1570,ARABIC LETTER ALEF WITH MADDA ABOVE,Lo,False,"[((, ), 6)]"
19,أ,183,1571,ARABIC LETTER ALEF WITH HAMZA ABOVE,Lo,False,"[((ARABIC FATHA, َ), 163), ((ARABIC DAMMA, ُ), 13), ((ARABIC SUKUN, ْ), 7)]"
44,ؤ,5,1572,ARABIC LETTER WAW WITH HAMZA ABOVE,Lo,False,"[((ARABIC FATHA, َ), 4), ((ARABIC DAMMA, ُ), 1)]"
29,إ,70,1573,ARABIC LETTER ALEF WITH HAMZA BELOW,Lo,False,"[((ARABIC KASRA, ِ), 42), ((, ), 28)]"
41,ئ,23,1574,ARABIC LETTER YEH WITH HAMZA ABOVE,Lo,False,"[((ARABIC KASRA, ِ), 17), ((ARABIC FATHA, َ), 2), ((ARABIC KASRATAN, ٍ), 1), ((ARABIC FATHATAN, ً), 1), ((ARABIC DAMMA, ُ), 1), ((ARABIC SUKUN, ْ), 1)]"
4,ا,918,1575,ARABIC LETTER ALEF,Lo,False,"[((, ), 914), ((ARABIC FATHA, َ), 2), ((ARABIC DAMMA, ُ), 1), ((ARABIC SHADDA, ّ), 1)]"
13,ب,288,1576,ARABIC LETTER BEH,Lo,False,"[((ARABIC KASRA, ِ), 103), ((ARABIC FATHA, َ), 93), ((ARABIC SUKUN, ْ), 40), ((ARABIC DAMMA, ُ), 30), ((ARABIC FATHA+ARABIC SHADDA, َّ), 9), ((ARABIC KASRATAN, ٍ), 5), ((ARABIC KASRA+ARABIC SHADDA, ِّ), 3), ((ARABIC FATHATAN, ً), 2), ((ARABIC DAMMATAN, ٌ), 2), ((ARABIC DAMMA+ARABIC SHADDA, ُّ), 1)]"
24,ة,87,1577,ARABIC LETTER TEH MARBUTA,Lo,False,"[((ARABIC KASRA, ِ), 32), ((ARABIC KASRATAN, ٍ), 17), ((ARABIC DAMMA, ُ), 13), ((ARABIC FATHATAN, ً), 9), ((ARABIC DAMMATAN, ٌ), 8), ((ARABIC FATHA, َ), 7), ((, ), 1)]"


In [9]:
# Train set
print(filenames[3])
info_df_3 = generate_information_report(filenames[3])
print("\nChecking the diacritics combinations in the dataset:\n")

# Tally how often each diacritics combination follows a non-space character.
combination_counts = Counter()
following_per_character = info_df_3.loc[
    info_df_3["Character"] != " ", "Following_diacritics"
]
for following in following_per_character:
    # Each entry is ((combination name, diacritics), number of occurrences).
    for (combination_name, _diacritics), occurrences in following:
        combination_counts[combination_name] += occurrences
print(combination_counts.most_common())
print()

info_df_3

../data/clartts_train_metadata.json

Checking the diacritics combinations in the dataset:

[('ARABIC FATHA', 108430), ('', 69968), ('ARABIC KASRA', 43507), ('ARABIC SUKUN', 43105), ('ARABIC DAMMA', 28448), ('ARABIC FATHA+ARABIC SHADDA', 11981), ('ARABIC FATHATAN', 2529), ('ARABIC KASRA+ARABIC SHADDA', 2335), ('ARABIC KASRATAN', 2225), ('ARABIC DAMMA+ARABIC SHADDA', 2145), ('ARABIC DAMMATAN', 2121), ('ARABIC FATHATAN+ARABIC SHADDA', 177), ('ARABIC DAMMATAN+ARABIC SHADDA', 160), ('ARABIC KASRATAN+ARABIC SHADDA', 136), ('ARABIC SHADDA', 39), ('ARABIC KASRA+ARABIC KASRA', 1), ('ARABIC SHADDA+ARABIC SUKUN', 1), ('ARABIC DAMMA+ARABIC KASRA', 1), ('ARABIC FATHA+ARABIC DAMMA', 1)]



Unnamed: 0,Character,Count,Code,Name,Type,is_diacritic,Following_diacritics
1,,66780,32,SPACE,Zs,False,"[((, ), 66736), ((ARABIC FATHA, َ), 27), ((ARABIC KASRA, ِ), 7), ((ARABIC DAMMA, ُ), 4), ((ARABIC SUKUN, ْ), 4), ((ARABIC KASRATAN, ٍ), 1), ((ARABIC SHADDA, ّ), 1)]"
37,ء,1874,1569,ARABIC LETTER HAMZA,Lo,False,"[((ARABIC KASRA, ِ), 1005), ((ARABIC FATHA, َ), 333), ((ARABIC DAMMA, ُ), 277), ((ARABIC KASRATAN, ٍ), 88), ((ARABIC DAMMATAN, ٌ), 72), ((ARABIC FATHATAN, ً), 61), ((, ), 33), ((ARABIC SUKUN, ْ), 4), ((ARABIC KASRA+ARABIC KASRA, ِِ), 1)]"
43,آ,361,1570,ARABIC LETTER ALEF WITH MADDA ABOVE,Lo,False,"[((, ), 361)]"
19,أ,8382,1571,ARABIC LETTER ALEF WITH HAMZA ABOVE,Lo,False,"[((ARABIC FATHA, َ), 7419), ((ARABIC DAMMA, ُ), 503), ((ARABIC SUKUN, ْ), 411), ((ARABIC FATHATAN, ً), 20), ((, ), 19), ((ARABIC KASRA, ِ), 6), ((ARABIC DAMMATAN, ٌ), 3), ((ARABIC KASRATAN, ٍ), 1)]"
44,ؤ,337,1572,ARABIC LETTER WAW WITH HAMZA ABOVE,Lo,False,"[((ARABIC SUKUN, ْ), 132), ((ARABIC FATHA, َ), 126), ((ARABIC DAMMA, ُ), 72), ((ARABIC DAMMATAN, ٌ), 6), ((ARABIC KASRA, ِ), 1)]"
25,إ,3882,1573,ARABIC LETTER ALEF WITH HAMZA BELOW,Lo,False,"[((, ), 2217), ((ARABIC KASRA, ِ), 1663), ((ARABIC FATHA, َ), 2)]"
41,ئ,930,1574,ARABIC LETTER YEH WITH HAMZA ABOVE,Lo,False,"[((ARABIC KASRA, ِ), 670), ((ARABIC FATHA, َ), 85), ((ARABIC DAMMA, ُ), 65), ((ARABIC SUKUN, ْ), 43), ((ARABIC FATHATAN, ً), 42), ((ARABIC KASRATAN, ٍ), 19), ((, ), 4), ((ARABIC DAMMATAN, ٌ), 2)]"
3,ا,44070,1575,ARABIC LETTER ALEF,Lo,False,"[((, ), 43907), ((ARABIC FATHA, َ), 99), ((ARABIC DAMMA, ُ), 44), ((ARABIC KASRA, ِ), 10), ((ARABIC FATHATAN, ً), 8), ((ARABIC SUKUN, ْ), 2)]"
15,ب,12396,1576,ARABIC LETTER BEH,Lo,False,"[((ARABIC KASRA, ِ), 4533), ((ARABIC FATHA, َ), 3771), ((ARABIC SUKUN, ْ), 1781), ((ARABIC DAMMA, ُ), 1215), ((ARABIC FATHA+ARABIC SHADDA, َّ), 257), ((ARABIC KASRATAN, ٍ), 198), ((ARABIC FATHATAN, ً), 177), ((ARABIC DAMMATAN, ٌ), 162), ((ARABIC DAMMA+ARABIC SHADDA, ُّ), 114), ((ARABIC KASRA+ARABIC SHADDA, ِّ), 88), ((, ), 74), ((ARABIC FATHATAN+ARABIC SHADDA, ًّ), 12), ((ARABIC SHADDA, ّ), 5), ((ARABIC DAMMATAN+ARABIC SHADDA, ٌّ), 5), ((ARABIC KASRATAN+ARABIC SHADDA, ٍّ), 4)]"
24,ة,4266,1577,ARABIC LETTER TEH MARBUTA,Lo,False,"[((ARABIC KASRA, ِ), 1484), ((ARABIC DAMMA, ُ), 884), ((ARABIC DAMMATAN, ٌ), 540), ((ARABIC FATHA, َ), 506), ((ARABIC FATHATAN, ً), 396), ((ARABIC KASRATAN, ٍ), 360), ((, ), 95), ((ARABIC SUKUN, ْ), 1)]"
