## Prepare the wav transcription CSV

In [1]:
def write_lines(lines, file):
    """Write every item of ``lines`` to ``file`` as one newline-terminated line.

    Newline characters already present inside an item are removed before
    writing, so each item occupies exactly one line of the output file.

    Args:
        lines: Iterable of strings to write.
        file: Path of the file to create or overwrite.
    """
    # Encode explicitly as UTF-8 so that non-ASCII text is written the same
    # way regardless of the platform's default encoding.
    with open(file, 'w', encoding='utf-8') as f:
        f.writelines("%s\n" % line.replace("\n", "") for line in lines)

In [2]:
import glob
import os

def illegal_name(name):
    """Report whether ``name`` contains one of the excluded markers.

    Returns True when the substring "DS_Store" or "._" occurs anywhere in
    ``name``, and False otherwise.
    """
    excluded_markers = ("DS_Store", "._")
    return any(marker in name for marker in excluded_markers)

def generate_csv(base_dir, output_file):
    """Write a CSV that pairs each recording under ``base_dir`` with its transcription.

    Every entry of ``base_dir`` that is not rejected by ``illegal_name`` is
    expected to be a directory containing a ``trans.txt`` file. Every other
    entry of that directory (apart from rejected names and names containing
    ".txt") is treated as a recording whose name, with ".wav" removed, is the
    zero-based line index of its transcription inside ``trans.txt``.

    One row of the form ``<transcription>,<base_dir>/<subdir>/<index>.wav`` is
    produced per recording and the rows are written with ``write_lines``.

    Args:
        base_dir: Directory holding one subdirectory per group of recordings.
        output_file: Path of the CSV file to create or overwrite.

    Raises:
        Exception: If a recording's index has no matching line in trans.txt.
        ValueError: If a recording's name is not an integer (from ``int``).
    """
    rows = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # generated file reproducible between runs and machines.
    for subdir in sorted(os.listdir(base_dir)):
        if illegal_name(subdir):
            continue
        subdir_path = "{}/{}".format(base_dir, subdir)
        # Read the transcriptions as UTF-8 explicitly rather than relying on
        # the platform's default encoding.
        with open(subdir_path + "/trans.txt", encoding="utf-8") as f:
            trans = f.readlines()
        for entry in sorted(os.listdir(subdir_path)):
            stem = entry.replace(".wav", "")
            if illegal_name(stem) or ".txt" in stem:
                continue
            index = int(stem)
            if index < 0 or index >= len(trans):
                raise Exception(
                    "File {}/{}.wav does not have a corresponding transcription. "
                    "Index should be in range [0, {})".format(subdir_path, stem, len(trans))
                )
            # Drop the line terminator here so the transcription and the path
            # end up on the same row without relying on the writer to fix it.
            rows.append("{},{}/{}.wav".format(trans[index].rstrip("\n"), subdir_path, stem))
    write_lines(rows, output_file)

# Build the combined transcription CSV from the recordings directory.
generate_csv(
    base_dir="MelaNetDataIVR/data_ivr/wav",
    output_file="MelaNetDataIVR/data_ivr/trans.csv",
)

## Prepare JSON files describing the training and validation data

In [3]:
from src.json_generator import generate_json
from random import shuffle

def split_file(file, train_fraction=.95):
    """Randomly split the lines of ``file`` into a training and a validation part.

    The lines are shuffled in place with ``random.shuffle`` and then cut at
    ``int(len(lines) * train_fraction)``.

    Args:
        file: Path of the text file to read (decoded as UTF-8).
        train_fraction: Share of the lines, between 0 and 1 inclusive, that
            goes into the training part. Defaults to 0.95.

    Returns:
        A ``(train, valid)`` tuple of lists of lines; each line keeps the line
        terminator it had in the file.

    Raises:
        ValueError: If ``train_fraction`` is outside the range [0, 1].
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(
            "train_fraction must be between 0 and 1, got {}".format(train_fraction)
        )
    # Use a context manager so the file handle is closed deterministically,
    # and decode explicitly instead of relying on the platform default.
    with open(file, encoding="utf-8") as f:
        lines = f.readlines()
    shuffle(lines)
    train_end = int(len(lines) * train_fraction)
    return lines[:train_end], lines[train_end:]

# Shuffle the combined CSV and divide it into training and validation rows.
train, valid = split_file("MelaNetDataIVR/data_ivr/trans.csv")

# Persist each subset to its own CSV file.
for subset_rows, subset_csv in (
    (train, "MelaNetDataIVR/data_ivr/train_trans.csv"),
    (valid, "MelaNetDataIVR/data_ivr/valid_trans.csv"),
):
    write_lines(subset_rows, subset_csv)

# Produce one corpus JSON per subset CSV, passing "." as the second argument.
for subset_csv, subset_json in (
    ("MelaNetDataIVR/data_ivr/train_trans.csv", "MelaNetDataIVR/data_ivr/train_corpus.json"),
    ("MelaNetDataIVR/data_ivr/valid_trans.csv", "MelaNetDataIVR/data_ivr/valid_corpus.json"),
):
    generate_json(subset_csv, ".", subset_json)

In [4]:
from src.csv_generator import generate_csv

# NOTE: generate_csv here is the function imported from src.csv_generator in
# this cell; it shadows the generate_csv helper defined earlier in the notebook.
for corpus_json, corpus_csv in (
    ("MelaNetDataIVR/data_ivr/train_corpus.json", "MelaNetDataIVR/data_ivr/train_trans_md.csv"),
    ("MelaNetDataIVR/data_ivr/valid_corpus.json", "MelaNetDataIVR/data_ivr/valid_trans_md.csv"),
):
    generate_csv(corpus_json, corpus_csv)

## Remove characters with the same sound

In [5]:
from src.json_generator import generate_json
from src.redundant import remove_redundant
# Characters handed to remove_redundant as its char_set argument.
supported = "ሁ ለ ም ራ ሮ ሰ ስ ሶ ባ ት ን ኝ አ ዎ ዘ ዜ ይ ደ ድ ጠ".split()

# Run the same clean-up over the training corpus first, then the validation one.
for corpus_json in (
    "MelaNetDataIVR/data_ivr/train_corpus.json",
    "MelaNetDataIVR/data_ivr/valid_corpus.json",
):
    remove_redundant(corpus_json, char_set=supported)

== Unsupported characters ==
set()
== Character stat ==
[(' ', 51282), ('ት', 33215), ('አ', 23935), ('ስ', 23833), ('ም', 14059), ('ድ', 9808), ('ለ', 9794), ('ን', 9327), ('ሁ', 5030), ('ዎ', 4970), ('ይ', 4764), ('ደ', 4764), ('ባ', 4716), ('ሰ', 4716), ('ሮ', 4667), ('ዜ', 4667), ('ራ', 4655), ('ዘ', 4575), ('ጠ', 4575), ('ኝ', 4575), ('ሶ', 4500)]
== Unsupported characters ==
set()
== Character stat ==
[(' ', 2700), ('ት', 1713), ('አ', 1272), ('ስ', 1241), ('ም', 739), ('ድ', 530), ('ን', 508), ('ለ', 491), ('ዎ', 280), ('ሰ', 261), ('ሮ', 261), ('ዜ', 261), ('ባ', 261), ('ይ', 246), ('ደ', 246), ('ሁ', 245), ('ዘ', 235), ('ኝ', 235), ('ጠ', 235), ('ራ', 231), ('ሶ', 218)]
