In [None]:
# Add the parent directory to the module search path. The entry is relative,
# so it is resolved against the kernel's current working directory.
# NOTE(review): presumably this is what makes the project-local packages
# imported in the next cell (training, testing, config) importable — confirm.
import sys
sys.path.append('../')

In [None]:
import pandas as pd
import numpy as np
import importlib

import training.preprocess as preprocess
import testing.tagger as tagger
import testing.syllabification as syllabification
import config
import testing.stemmer as stemmer

from pprint import pprint

## Import result data

In [None]:
# Header-less, tab-separated result file; column names are supplied here.
fpath = "../../data/g2p/var/test/20201214222029_rules_stem_gkn_n=7_B=15_fold_1.txt"

# na_filter=False turns off missing-value detection, so fields such as "NA"
# or "null" stay as the literal text found in the file instead of becoming NaN.
res_data = pd.read_csv(fpath, sep='\t', header=None, na_filter=False,
                       names=['word', 'syllables', 'prediction', 'mmc'])
res_data.head()

## Wrong phoneme frequencies

In [None]:
# For every row, compare `syllables` and `prediction` position by position,
# record each mismatch as a "real/predicted" pair, and count how often each
# pair occurs over the whole result set.
wrong_phonemes = []   # one summary string per row; "-" when mmc is 0
pair_counts = {}      # (real, predicted) -> number of occurrences

for row in res_data.itertuples():
    if row.mmc == 0:
        wrong_phonemes.append("-")
        continue

    row_pairs = []
    for i, real in enumerate(row.syllables):
        # Indexed access (rather than zip) keeps a prediction that is shorter
        # than the reference loud: it raises instead of being truncated.
        pred = row.prediction[i]
        if real != pred:
            wp = f"{real}/{pred}"
            row_pairs.append(wp)
            pair_counts[(real, pred)] = pair_counts.get((real, pred), 0) + 1

    # join() puts exactly one space between pairs and none at the end, so
    # several errors in one word stay readable ("a/b c/d", not "a/bc/d").
    wrong_phonemes.append(" ".join(row_pairs))

# String-keyed view of the counts ("real/predicted" -> frequency), in
# first-seen order.
wrong_phonemes_dict = {
    f"{real}/{pred}": freq for (real, pred), freq in pair_counts.items()
}

res_data_new = res_data.copy()
res_data_new["wrong_phonemes"] = wrong_phonemes

# Build the rows from the tuple keys instead of splitting the "real/pred"
# string, so a symbol that is itself "/" cannot corrupt the columns.
freq_data_list = [(real, pred, freq) for (real, pred), freq in pair_counts.items()]

freq_data = (
    pd.DataFrame(freq_data_list, columns=["real_phoneme", "pred_phoneme", "frequency"])
    .sort_values("frequency", ascending=False)
    .reset_index(drop=True)
)

freq_data

In [None]:
# Write the frequency table as tab-separated text, without the row index and
# without a header line.
fpath = "../../data/g2p/var/test/wfp_b=15_fold_1.txt"

freq_data.to_csv(fpath, sep='\t', header=False, index=False)

## Error source frequencies

In [None]:
# Label every mispredicted row by where its mismatches fall relative to the
# stemmer's split of the word: "root", "affix" or "both". Rows with mmc == 0
# get the placeholder "-" and are not counted.
# NOTE(review): positions in `syllables`/`prediction` are compared against
# offsets derived from `word`; this assumes the strings are aligned character
# by character — confirm.
wrong_types = []
wrong_types_dict = {"affix": 0, "root": 0, "both": 0}

st = stemmer.Stemmer()

for row in res_data.itertuples():
    if row.mmc == 0:
        wrong_types.append("-")
        continue

    prefix, stem, d_suffix, i_suffix = st.getRoot(row.word)

    # A word without any affix can only be wrong in its root.
    if (prefix, d_suffix, i_suffix) == ('', '', ''):
        wrong_types.append("root")
        wrong_types_dict["root"] += 1
        continue

    # Half-open span [root_start, root_end) of positions treated as the root;
    # everything before it or from root_end onwards counts as affix.
    root_start = len(prefix)
    root_end = len(row.word) - len(d_suffix) - len(i_suffix)

    root_hit = False
    affix_hit = False
    for pos in range(len(row.syllables)):
        if row.syllables[pos] == row.prediction[pos]:
            continue
        if root_start <= pos < root_end:
            root_hit = True
        else:
            affix_hit = True

    # Affix-only errors are "affix", errors in both parts are "both", and
    # everything else falls back to "root".
    if affix_hit and root_hit:
        label = "both"
    elif affix_hit:
        label = "affix"
    else:
        label = "root"

    wrong_types.append(label)
    wrong_types_dict[label] += 1

# Start again from the raw results, so this frame carries only the
# `wrong_types` annotation.
res_data_new = res_data.copy()
res_data_new["wrong_types"] = wrong_types

pprint(wrong_types_dict)
print(f"Total word error: {sum(wrong_types_dict[x] for x in wrong_types_dict)}")

## Save to csv

In [None]:
# Write the annotated results as tab-separated text. `index` and `header` are
# left at their pandas defaults, so the row index and the column names are
# written as well.
fpath = "../../data/g2p/var/wrong_types_vanilla.txt"

res_data_new.to_csv(fpath, sep='\t')