In [1]:
!pip install tokenize-uk



In [2]:
import csv
import phonemizer_test as phonemizer
import re

# NOTE(review): `phonemized` is not referenced anywhere else in the visible cells.
phonemized = []
# Lookup table from a rule's regex pattern (stored as a plain string, never
# compiled here) to a short human-readable label. The CSV-writing loop below
# indexes it as dict_rules[rule] for every rule reported by the transcriber,
# so a reported pattern that is missing from this table raises KeyError.
# Several patterns intentionally share one label (e.g. "дз", "q" and "д͡з" all
# map to "dz"); the Latin letters q / s / j / f inside the patterns look like
# single-character placeholders used by the transcriber — TODO confirm against
# phonemizer_test.
dict_rules = {r"[^ а-яієїІґ́’'\+]": "clean",
            r"\+": "stress",
            "щ": "shch",
            "ь": "soft_sign",
            "['’]": "apostrophe",
            "дз": "dz",
            "дж": "dzh",
            "я": "ja",
            "ю": "ju",
            "є": "je",
            "ї": "ji",
            "(?<=[дзлнрстцq])j": "j_palatal",
            "(?<=[бвгжкмпфхчшґs])j": "j_half_palatal",
            "й": "y",
            "(?<=[дзлнрстцq])і": "i_palatal",
            "f": "apostrophe",
            "т´с(?=´а)": "tsia",
            "(?<=с)ц(?=´ц)": "ststs_cluster",
            "(?<=д)[шч](?=с)": "dshs_cluster",
            "ст(?=с´к)": "stsk_cluster",
            "(?<=[сшн])т(?=[чцдс])": "t_clusters",
            "тс(?=´к|тв)": "ts_clusters",
            "(?<=с)т(?=н)": "stn_cluster",
            r"(?:(?<=на|[вп]і)|(?<=[сп]ере))(́?)q": "dz_boundary",
            r"(?:(?<=\sна|\sві)|(?<=пі|^на|^ві)|(?<=\sпере)|(?<=^пере))(́?)s": "dzh_boundary",
            r"(?:(?<=\s)|(?<=^))з(?=[сцчш])": "s_boundary",
            r"(?:(?<=\s)|(?<=^))з(?=[кптфх])": "s_boundary_word",
            "п(?=[бдзжгґqs])": "p_voiced",
            "к(?=[бдзжгґqs])": "k_voiced",
            "х(?=[бдзжгґqs])": "kh_voiced",
            "т(?=´?[бдзжгґqs])": "d_voiced",
            "с(?=´?[бдзжгґqs])": "z_voiced",
            "ш(?=[бдзжгґqs])": "zh_voiced",
            "ц(?=´?[бдзжгґqs])": "ts_voiced",
            "ч(?=[бдзжгґqs])": "ch_voiced",
            "[дs](?=´?[цзсq])": "D_hissing",
            "[тч](?=´?[цзсq])": "T_hissing",
            "[дq]´?(?=[чжшs])": "D_hushing",
            "[тц]´?(?=[чжшs])": "T_hushing",
            "з´?(?=[чжшs])": "z_assimilation",
            "с´?(?=[чжшs])": "s_assimilation",
            "ж(?=[зсцq])": "zh_assimilation",
            "ш(?=[зсцq])": "sh_assimilation",
            r"(?:(?<=н´і|д´[оі])|(?<=во|[лд]е|кі))(́?)г(?=[кт])": "h_voiceless",
            "(?<=[дтнзсцлq])(?=[дтнзсцлq]´)": "palatalization",
            "(?<![бвгґжкпфхчшдзлрстцмнqsj'´])в(?!['аоеуіи])": "w_vocalization",
            "ф(?=[бдзжгґqs])": "f_voiced",
            "(?<=[бвгжкмпфхчшґs])і": "i_half_palatal",
            r"([бвгґжкпфхчшрмs])\1'": "geminated_half",
            r"рр´": "geminated_r",
            "q": "dz",
            "s": "dzh",
            "д͡з": "dz",
            "д͡ж": "dzh",
            "j(?![аоеуіи])": "j_vocalization",
            "(?<=[мн][аеиоу])|(?<=м'[аеіуо])|(?<=н´[аеоуі])": "nasalization_p",
            "(?<=[j´'])(?=[аеоу])": "i_articulation_p",
            "(?<=[бвгґжкпфхчшдзлрстцмнqsj'´])(?=·?[оу])": "labialization",
            r"(?<=[иіаеоу])(?=[мн])": "nasalization_r",
            r"(?:(?<=[аеоу])|(?<=[аеоу]̃))(?=[дзлрстцнq]´|j|[бвгжкмпфхчшґs]')": "i_articulation_r",
            r"(?<=[иіаеоу])́(?=[мн])": "nasalization_r_stress",
            r"(?:(?<=[аеоу]́)|(?<=[аеоу]̃́))(?=[дзлрстцнq]´|j|[бвгжкмпфхчшґs]')": "i_articulation_r_stress",
            r"([бвгґжкпфхчшдзлрстцмнqs]´?)\1(['´]?°?)": "geminated_all",
            r"(?<=·е)(̃?)(?=·)": "e_to_i",
            r"(е̃?)(?!́|̃́|\(|\))": "e_to_y",
            r"(и̃?)(?!́|̃́|\(|\))": "y_to_e",
            r"(?<=о)(̃?)(?=[бвгґжкпфхчшдзлрстцмнqsўjĭ͡'´°·:]+[у]̃?́)": "o_to_u",
            r"\(и\)(?=(·в'і|м|·j°·у)?(\s|$))": "e_affix",
            r"\(і\)(?=(·в'і|·j°·у)(\s|$))": "e_affix_i",
            r"\(е\)(?=(х|ĭ|м|ш|т´)?(\s|$))": "y_affix",
            r"(?:(?<=\sе)|(?<=^е))\(и\)": "e_start",
            r"(?:(?<=\sи)|(?<=^и))\(е\)": "y_start"}


# Load the word list: one entry per line of words_spell.txt.
# The encoding is pinned to UTF-8 so the Cyrillic input is decoded the same
# way on every platform instead of depending on the locale default.
with open("words_spell.txt", "r", encoding="utf-8") as f:
    file = f.read()

# Drop blank lines — in particular the empty string that split("\n") leaves
# behind when the file ends with a newline — so no empty "word" is sent to
# the transcriber or written to the output table.
list_of_words = [line for line in file.split("\n") if line.strip()]


def _unique_rule_names(patterns, leading=()):
    """Translate rule patterns into their labels, without repeats.

    Args:
        patterns: iterable of pattern strings; each must be a key of
            ``dict_rules`` (an unknown pattern raises ``KeyError``).
        leading: labels to place in front of the translated ones.

    Returns:
        A list of labels in first-seen order. Unlike ``list(set(...))`` the
        order is the same on every run, so the generated CSV is reproducible.
    """
    names = list(leading) + [dict_rules[pattern] for pattern in patterns]
    return list(dict.fromkeys(names))


# Compiled once, outside the loop: matches the upper-case skeleton of a word
# (an optional vowel, two or more consonants, an optional vowel).
abbreviation_re = re.compile(r"\b[АЕОИУІЇЮЯЄ]?[БВГҐДЖЗПТЦЧЛМНРСФКХШЩЙ]{2,}[АЕОИУІЇЮЯЄ]?\b")

# Write one row per input word. UTF-8 is set explicitly so the Cyrillic text
# and transcription symbols do not depend on the platform's default encoding.
with open("Transcription.csv", "w", newline="", encoding="utf-8") as csvfile:
    file_writer = csv.writer(csvfile, delimiter=',')
    file_writer.writerow(["Word", "Broad transcription", "Broad rules", "Narrow transcription", "Narrow rules", "IPA transcription", "IPA rules", "Comment"])
    for word in list_of_words:
        output_word = phonemizer.remove_punctuation(word)
        # The result is indexed 0..5 below as three (transcription, rules)
        # pairs: broad, narrow, IPA.
        output_word = phonemizer.Transcriptor(output_word).transcribe_baseline()

        # Strip lower-case letters and apostrophes; what remains is the
        # upper-case part of the word that the abbreviation pattern inspects.
        word_to_match = re.sub(r"[а-яіїєґ’']", "", word)
        is_abbreviation = word[:3].isupper() and abbreviation_re.match(word_to_match)

        rules_broad = _unique_rule_names(output_word[1], ["abbreviation"] if is_abbreviation else [])
        rules_narrow = _unique_rule_names(output_word[3])
        rules_ipa = _unique_rule_names(output_word[5])

        # The trailing empty cell is the "Comment" column, left blank here.
        file_writer.writerow([word, output_word[0], rules_broad, output_word[2], rules_narrow, output_word[4], rules_ipa, ""])

In [6]:
import csv

import re
# Pattern -> label table for the rules reported in the "Broad rules" column
# (it is the dictionary passed to create_table(1, 2, ...) below, which only
# uses its values). The entries repeat the first part of the dict_rules table
# defined in the earlier cell and add an "abbr" key for the "abbreviation"
# label; this assignment rebinds the name dict_rules.
dict_rules = {r"[^ а-яієїІґ́’'\+]": "clean",
            r"\+": "stress",
            "щ": "shch",
            "ь": "soft_sign",
            "['’]": "apostrophe",
            "дз": "dz",
            "дж": "dzh",
            "я": "ja",
            "ю": "ju",
            "є": "je",
            "ї": "ji",
            "(?<=[дзлнрстцq])j": "j_palatal",
            "(?<=[бвгжкмпфхчшґs])j": "j_half_palatal",
            "й": "y",
            "(?<=[дзлнрстцq])і": "i_palatal",
            "f": "apostrophe",
            "т´с(?=´а)": "tsia",
            "(?<=с)ц(?=´ц)": "ststs_cluster",
            "(?<=д)[шч](?=с)": "dshs_cluster",
            "ст(?=с´к)": "stsk_cluster",
            "(?<=[сшн])т(?=[чцдс])": "t_clusters",
            "тс(?=´к|тв)": "ts_clusters",
            "(?<=с)т(?=н)": "stn_cluster",
            r"(?:(?<=на|[вп]і)|(?<=[сп]ере))(́?)q": "dz_boundary",
            r"(?:(?<=\sна|\sві)|(?<=пі|^на|^ві)|(?<=\sпере)|(?<=^пере))(́?)s": "dzh_boundary",
            r"(?:(?<=\s)|(?<=^))з(?=[сцчш])": "s_boundary",
            r"(?:(?<=\s)|(?<=^))з(?=[кптфх])": "s_boundary_word",
            "п(?=[бдзжгґqs])": "p_voiced",
            "к(?=[бдзжгґqs])": "k_voiced",
            "х(?=[бдзжгґqs])": "kh_voiced",
            "т(?=´?[бдзжгґqs])": "d_voiced",
            "с(?=´?[бдзжгґqs])": "z_voiced",
            "ш(?=[бдзжгґqs])": "zh_voiced",
            "ц(?=´?[бдзжгґqs])": "ts_voiced",
            "ч(?=[бдзжгґqs])": "ch_voiced",
            "[дs](?=´?[цзсq])": "D_hissing",
            "[тч](?=´?[цзсq])": "T_hissing",
            "[дq]´?(?=[чжшs])": "D_hushing",
            "[тц]´?(?=[чжшs])": "T_hushing",
            "з´?(?=[чжшs])": "z_assimilation",
            "с´?(?=[чжшs])": "s_assimilation",
            "ж(?=[зсцq])": "zh_assimilation",
            "ш(?=[зсцq])": "sh_assimilation",
            r"(?:(?<=н´і|д´[оі])|(?<=во|[лд]е|кі))(́?)г(?=[кт])": "h_voiceless",
            "(?<=[дтнзсцлq])(?=[дтнзсцлq]´)": "palatalization",
            "abbr": "abbreviation"}

# Pattern -> label table for the rules reported in the "Narrow rules" column
# (passed to create_table(3, 4, ...) below; only the label values are used
# there, one output file per distinct label).
# NOTE(review): the labels "dz" and "dzh" also occur in dict_rules, so the
# files written for them by the earlier create_table call are overwritten.
dict_rules_n = {
            "д͡з": "dz",
            "д͡ж": "dzh",
            "(?<![бвгґжкпфхчшдзлрстцмнqsj'´])в(?!['аоеуіи])": "w_vocalization",
            "ф(?=[бдзжгґqs])": "f_voiced",
            "(?<=[бвгжкмпфхчшґs])і": "i_half_palatal",
            "j(?![аоеуіи])": "j_vocalization",
            "(?<=[мн][аеиоу])|(?<=м'[аеіуо])|(?<=н´[аеоуі])": "nasalization_p",
            "(?<=[j´'])(?=[аеоу])": "i_articulation_p",
            "(?<=[бвгґжкпфхчшдзлрстцмнqsj'´])(?=·?[оу])": "labialization",
            r"(?<=[иіаеоу])(?=[мн])": "nasalization_r",
            r"(?:(?<=[аеоу])|(?<=[аеоу]̃))(?=[дзлрстцнq]´|j|[бвгжкмпфхчшґs]')": "i_articulation_r",
            r"(?<=[иіаеоу])́(?=[мн])": "nasalization_r_stress",
            r"(?:(?<=[аеоу]́)|(?<=[аеоу]̃́))(?=[дзлрстцнq]´|j|[бвгжкмпфхчшґs]')": "i_articulation_r_stress",
            r"([бвгґжкпфхчшдзлрстцмнqs]´?)\1(['´]?°?)": "geminated_all"}

# Pattern -> label table for the rules reported in the "IPA rules" column
# (passed to create_table(5, 6, ...) below; only the label values are used).
# NOTE(review): "w_vocalization", "f_voiced", "i_half_palatal", "dz" and "dzh"
# are also labels in dict_rules / dict_rules_n, and create_table names its
# output files by label alone, so this last call overwrites those files.
dict_rules_ipa = {"(?<![бвгґжкпфхчшдзлрстцмнqsj'´])в(?!['аоеуіи])": "w_vocalization",
            "ф(?=[бдзжгґqs])": "f_voiced",
            "(?<=[бвгжкмпфхчшґs])і": "i_half_palatal",
            r"([бвгґжкпфхчшрмs])\1'": "geminated_half",
            r"рр´": "geminated_r",
            "q": "dz",
            "s": "dzh"}

def _rule_labels(cell):
    """Parse a "rules" CSV cell back into the set of labels it contains.

    The cell holds the ``str()`` of a Python list, e.g. ``"['dz', 'stress']"``.
    Cells that are not a valid Python literal fall back to a comma split so a
    hand-edited file is still handled.
    """
    import ast  # local import: only this helper needs it

    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        parsed = cell.strip("[]").split(",")
    if isinstance(parsed, str) or not hasattr(parsed, "__iter__"):
        parsed = [parsed]
    return {str(label).strip().strip("'\"") for label in parsed}


def create_table(t_row, r_row, dict_r, rows=None):
    """Write one CSV per rule label, listing the words that rule applied to.

    For every distinct value of ``dict_r`` a file
    ``results/Transcription_<label>.csv`` is (re)written with the header
    ``Word, Transcription, Rules, Comment`` followed by the matching rows.

    Args:
        t_row: column index of the transcription to copy into the output.
        r_row: column index of the stringified rule list that is searched.
        dict_r: pattern -> label mapping; only its values are used.
        rows: table rows to filter. Defaults to the module-level ``file``
            list, which is what the original three-argument calls rely on.

    Notes:
        * A row matches only when the label is an element of its rule list.
          A plain substring test would also put ``dzh`` / ``dz_boundary``
          rows into the ``dz`` table and any label containing "y" into ``y``.
        * Each output row has the four columns named in the header (word,
          column ``t_row``, column ``r_row``, last column) instead of the
          whole input row.
        * Files are named by label only, so calls whose dictionaries share a
          label overwrite each other's file.
    """
    import os  # local import: needed only to create the output directory

    source_rows = file if rows is None else rows
    widest_index = max(t_row, r_row)

    # Project and parse every row once instead of once per label; rows too
    # short to hold the requested columns (e.g. blank lines) are skipped.
    projected = [
        ([row[0], row[t_row], row[r_row], row[-1]], _rule_labels(row[r_row]))
        for row in source_rows
        if len(row) > widest_index
    ]

    os.makedirs("results", exist_ok=True)
    for value in set(dict_r.values()):
        with open(f"results/Transcription_{value}.csv", "w", newline="", encoding="utf-8") as csvfile:
            file_writer = csv.writer(csvfile, delimiter=',')
            file_writer.writerow(["Word", "Transcription", "Rules", "Comment"])
            file_writer.writerows(out_row for out_row, labels in projected if value in labels)


# Load the combined table (minus its header row) into the module-level name
# `file`, which create_table reads. newline="" is what the csv module expects
# for files handed to csv.reader, and UTF-8 is set explicitly so decoding does
# not depend on the platform default.
with open("Transcription.csv", "r", newline="", encoding="utf-8") as f:
    file = list(csv.reader(f))[1:]

# The input is fully loaded and closed at this point. Each call pairs a
# transcription column with its rules column: broad (1, 2), narrow (3, 4),
# IPA (5, 6).
create_table(1, 2, dict_rules)
create_table(3, 4, dict_rules_n)
create_table(5, 6, dict_rules_ipa)

In [7]:
# Bundle the per-rule CSV tables into a single archive; -r recurses into the
# results directory.
# NOTE(review): the absolute /content paths assume the notebook's working
# directory is /content (the tables are written to the relative path
# results/) — confirm before running elsewhere.
!zip -r /content/results.zip /content/results

  adding: content/results/ (stored 0%)
  adding: content/results/Transcription_D_hushing.csv (deflated 93%)
  adding: content/results/Transcription_abbreviation.csv (deflated 84%)
  adding: content/results/Transcription_nasalization_r_stress.csv (stored 0%)
  adding: content/results/Transcription_geminated_r.csv (deflated 89%)
  adding: content/results/Transcription_shch.csv (deflated 93%)
  adding: content/results/Transcription_t_clusters.csv (deflated 92%)
  adding: content/results/Transcription_D_hissing.csv (deflated 93%)
  adding: content/results/Transcription_clean.csv (deflated 92%)
  adding: content/results/Transcription_nasalization_p.csv (deflated 92%)
  adding: content/results/Transcription_w_vocalization.csv (deflated 92%)
  adding: content/results/Transcription_ju.csv (deflated 92%)
  adding: content/results/Transcription_i_articulation_r.csv (deflated 92%)
  adding: content/results/Transcription_T_hushing.csv (deflated 91%)
  adding: content/results/Transcription_dzh.csv 