In [None]:
import re

from collections import Counter
from pathlib import Path

# Corpus index: one "<utterance id>\t<sentence>" record per line.
# pathlib picks the right separator for the current OS.
file_path = Path("ne_np_female") / "line_index.tsv"

with open(file_path, "r", encoding="utf-8") as f:
    # `text` stays a notebook global: later cells split it into records again.
    text = f.read()

# Collect the words of every sentence, record by record. Splitting each line at
# its first tab avoids depending on the width of the id column or on the file
# ending with a newline.
words = []
for line in text.split("\n"):
    _, tab, sentence = line.partition("\t")
    if not tab:
        # Blank line (e.g. after the final newline) or a line without a
        # sentence column: nothing to count.
        continue
    # Consecutive spaces yield empty strings; those are not words.
    words.extend(word for word in sentence.split(" ") if word)

# word -> number of occurrences in the whole corpus
word_count = Counter(words)

with open("dict_ne.txt", "w", encoding="utf-8") as f:
    # One word per line, most frequent first; words with equal counts keep
    # their first-seen order.
    words_sorted = [word for word, _ in word_count.most_common()]
    f.writelines(f"{word}\n" for word in words_sorted)

In [None]:
# Nepali word -> English rendering. The two dictionaries are paired by line
# number: line N of dict_ne.txt maps to line N of dict_en.txt.
words_map = {}

with (
    open("dict_ne.txt", "r", encoding="utf-8") as f1,
    open("dict_en.txt", "r", encoding="utf-8") as f2,
):
    # zip() stops at the end of the shorter file, so surplus lines in the
    # longer one are ignored.
    for w1, w2 in zip(f1, f2):
        # Remove only the line terminator from the key; a final line without a
        # trailing newline keeps its last character.
        words_map[w1.rstrip("\n")] = w2.strip()

In [None]:
import csv

# Write one "segment_<i>|<english words>" row per record of `text`, translating
# word by word through `words_map`.
# The encoding is explicit so the output does not depend on the platform
# default (and cannot fail on non-ASCII entries of the dictionary).
with open("metadata.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, delimiter='|')

    # [:-1] discards the last piece of the split, which is the empty string
    # when the index file ends with a newline.
    for i, line in enumerate(text.split("\n")[:-1]):
        # Each record must have exactly two tab-separated fields; the id field
        # is not needed because rows are renumbered as segment_<i>.
        _, words_ne = line.split("\t")
        # Words missing from words_map are skipped. Every kept word is followed
        # by a single space, so a non-empty result ends with a space.
        words_en = "".join(
            f"{words_map[word]} " for word in words_ne.split(" ") if word in words_map
        )

        writer.writerow([f"segment_{i}", words_en])

In [None]:
class R:
    """Table-driven romanizer for Devanagari (Nepali) text.

    The lookup table ``self.map`` pairs every "consonant + vowel mark"
    combination, plus the independent vowels, with a Latin spelling.
    ``process`` applies the table to a string by plain substring replacement.
    """

    def __init__(self):
        # Independent vowels (and OM), which are not attached to a consonant.
        self.special_chars_ne = ["अ", "आ", "इ", "ई", "उ", "ऊ", "ऋ", "ॠ", "ए", "ऐ", "ओ", "औ", "ॐ"]
        # Consonants; the last two entries are multi-character conjuncts.
        self.consonants_ne    = ["क", "ख", "ग", "घ", "ङ", "च", "छ", "ज", "झ", "ञ", "ट", "ठ", "ड", "ढ", "ण", "त", "थ", "द", "ध", "न", "प", "फ", "ब", "भ", "म", "य", "र", "ल", "व", "श", "ष", "स", "ह", "क्ष", "ज्ञ"]
        # Dependent signs. The trailing "" stands for a bare consonant, which
        # is spelled with the inherent "a" (see vowel_marks_en).
        self.vowel_marks_ne   = ["ा", "ि", "ी", "ु", "ू", "ृ", "े", "ै", "ो", "ौ", "्", "ं", "ँ", ""]

        # Latin spellings, index-aligned with the three lists above.
        self.special_chars_en = ["a", "aa", "i", "ii", "u", "uu", "ri", "ree", "e", "ai", "o", "au", "om"]
        self.consonants_en    = ["k", "kh", "g", "gh", "ng", "ch", "chh", "j", "jh", "yn", "t", "th", "d", "dh", "n", "t", "th", "d", "dh", "n", "p", "ph", "b", "bh", "m", "y", "r", "l", "b", "sh", "sh", "s", "h", "ksh", "gy"]
        self.vowel_marks_en   = ["aa", "i", "ii", "u", "uu", "ri", "e", "ai", "o", "au", "", "a", "aa", "a"]

        assert len(self.special_chars_ne) == len(self.special_chars_en)
        assert len(self.consonants_ne   ) == len(self.consonants_en   )
        assert len(self.vowel_marks_ne  ) == len(self.vowel_marks_en  )

        all_ne = [consonant + vowel_mark for consonant in self.consonants_ne for vowel_mark in self.vowel_marks_ne] + self.special_chars_ne
        all_en = [consonant + vowel_mark for consonant in self.consonants_en for vowel_mark in self.vowel_marks_en] + self.special_chars_en

        # process() applies the entries in dict order, so longer keys must come
        # first: otherwise the two-character key "ज्" would consume the start
        # of the conjunct "ज्ञ" before the conjunct itself is tried. The sort
        # is stable, so keys of equal length keep their construction order
        # ("consonant + mark" before the bare consonant).
        ordered = sorted(zip(all_ne, all_en), key=lambda pair: len(pair[0]), reverse=True)
        self.map = dict(ordered)


    def process(self, text):
        """Return ``text`` with every Devanagari sequence found in ``self.map`` romanized.

        Characters that match no key are left as they are, except for leftover
        dependent signs (those not directly preceded by a consonant from the
        table), which are removed.
        """
        for key, value in self.map.items():
            text = text.replace(key, value)

        # Drop dependent signs that did not take part in any replacement.
        for mark in self.vowel_marks_ne:
            if mark:  # the "" placeholder has nothing to remove
                text = text.replace(mark, "")

        return text

In [None]:
# Romanizer instance shared by every row written below.
t = R()

# Emit one "segment_<n>|<romanized sentence>" row per record of `text`.
with open("metadata_.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, delimiter='|')

    # The last piece of the split is discarded (it is empty when the index
    # file ends with a newline).
    records = text.split("\n")[:-1]

    for idx, record in enumerate(records):
        # Exactly two tab-separated fields are expected; only the sentence is used.
        _, sentence_ne = record.split("\t")
        row = [f"segment_{idx}", t.process(sentence_ne)]
        writer.writerow(row)

In [None]:
# from pathlib import Path

# paths = Path(r"test_nep_pinyin\wavs").glob("*.wav")
# for i, path in enumerate(paths):
#     path.rename(path.parent/f"segment_{i}.wav")