#Инициализация

In [1]:
import librosa
import numpy as np


# Sample rate passed to librosa.load (as `sr`) whenever audio is read in this file.
SR = 11025
# FFT size passed to librosa.stft (as `n_fft`) when building spectrogram frames.
N_FFT = 1024

# Convert an audio file into a sequence of STFT frames
def audio_reformatting(audio_path:str="./audio.mp3"):
    """Load an audio file and return its transposed short-time Fourier transform.

    The file is loaded with ``librosa.load`` at sample rate ``SR`` and passed
    through ``librosa.stft`` with ``n_fft=N_FFT``; the resulting matrix is
    transposed before being returned, so iterating over the result walks
    along what was the second axis of the STFT output.
    """
    samples, _ = librosa.load(audio_path, sr=SR)
    spectrogram = librosa.stft(samples, n_fft=N_FFT)
    return spectrogram.T


In [2]:
# Phonetic criteria used as the feature vector of a sound.
# Each entry is [human-readable name, three-letter tag, count].
# NOTE(review): the third column looks like the number of discrete levels the
# criterion can take (2 = binary, 3 for "lbz", 9 for "con") — confirm; it is not
# read anywhere in the visible part of this file.
CRITERIA_NAMES = [["Thinking", "thi", 2],
                  ["Space", "spa", 2],
                  ["Long", "lon", 2],
                  ["Stress", "str", 2],
                  ["Consonant (Open, Close, Approximant, Non-sibilant fricative)", "con", 9],
                  ["Nasal", "nas", 2],
                  ["Plosive", "plo", 2],
                  ["Sibilant fricative", "sib", 2],
                  ["Trill", "tri", 2],
                  ["Lateral", "lat", 2],
                  ["Labialization, Rounded", "lbz", 3],
                  ["Bilabial", "bil", 2],
                  ["Dental", "den", 2],
                  ["Alveolar", "alv", 2],
                  ["Retroflex, Rhotic", "ret", 2],
                  ["Palatal, Front", "pal", 2],
                  ["Velar, Back", "vel", 2],
                  ["Uvular", "uvu", 2],
                  ["Pharyngeal/epiglottal", "pha", 2],
                  ["Glottal", "glo", 2],
                  ["Aspiration", "asp", 2],
                  ["Voiceless", "vls", 2]
                  ]
# Maps a criterion tag (e.g. "spa") to its position in a feature vector.
CRITERIA_TAGS_ID = {i[1]: n for n, i in enumerate(CRITERIA_NAMES)}
# Length of a feature vector.
CRITERIA_LEN = len(CRITERIA_NAMES)

# Criteria that have no slot of their own in the feature vector; the decoder
# below derives them from pairs of base criteria (plosive+trill -> tap,
# bilabial+dental -> labiodental, alveolar+retroflex -> postalveolar).
COMBINATION_CRITERIA_NAMES = [["Tap/flap", "tap", 2],
                              ["Labiodental", "lad", 2],
                              ["Postalveolar", "plv", 2]
                              ]
# Tag -> index and length for the combination criteria, built the same way as above.
COMBINATION_CRITERIA_TAGS_ID = {i[1]: n for n, i in enumerate(COMBINATION_CRITERIA_NAMES)}
COMBINATION_CRITERIA_LEN = len(COMBINATION_CRITERIA_NAMES)

import numpy as np
import torch
from torch.utils.data import Dataset
import librosa
# import soundfile as sf

# Common sounds collection for classifying
class SoundClassifying:
    """Lookup tables describing IPA symbols and IPA modifiers.

    Both tables are read from tab-separated text files relative to the current
    working directory. The first column of every row is the key.

    * ``symbols_table`` maps a symbol to a list of floats (remaining columns).
    * ``mods_table`` maps a modifier to a list of raw strings (remaining
      columns); empty cells are kept as ``""``.
    """

    def __init__(self):
        self.symbols_table = {
            name: [float(value) for value in values]
            for name, *values in self._read_rows('data/ipa_symbols.txt')
        }
        self.mods_table = {
            name: values
            for name, *values in self._read_rows('data/ipa_mods.txt')
        }

    @staticmethod
    def _read_rows(path):
        """Return the rows of a tab-separated file as lists of cell strings."""
        with open(path, 'r', encoding="utf8") as f:
            # Strip only the line terminator: slicing off the last character
            # unconditionally would truncate the final cell of a file that
            # does not end with a newline, and stripping all whitespace would
            # drop meaningful trailing empty cells.
            return [line.rstrip("\n").split("\t") for line in f]

    def symbol_info(self, sym):
        """Return a fresh copy of the feature list stored for symbol ``sym``."""
        return self.symbols_table[sym].copy()

    def mod_info(self, sym):
        """Return a fresh copy of the raw cell list stored for modifier ``sym``."""
        return self.mods_table[sym].copy()

    def empty(self):
        """Return an all-zero feature list with one slot per criterion."""
        return [0.0 for i in CRITERIA_NAMES]

# Creating all combinations
def combs(a):
    """Return every sub-selection of the sequence ``a`` as a list of lists.

    The empty selection comes first. Elements inside each selection appear in
    reverse order of their position in ``a`` (e.g. ``[2, 1]`` for input
    ``[1, 2]``).
    """
    subsets = [[]]
    # Walk the input back to front; each element doubles the set of selections
    # by emitting every existing selection without and then with the element.
    for position in range(len(a) - 1, -1, -1):
        item = a[position]
        subsets = [variant for base in subsets for variant in (base, base + [item])]
    return subsets

# Diversifying notation with including and excluding values in brackets
def brackets_split(notation):
    """Expand optional ``(...)`` groups of a notation into all variants.

    Every bracketed group may be kept or dropped independently, so a notation
    with ``k`` groups yields ``2**k`` strings (ordered as produced by
    ``combs``). A notation without ``(`` is returned as the result of
    splitting it on ``)``, i.e. a one-element list when it contains no
    brackets at all.
    """
    pieces = [chunk.split(")") for chunk in notation.split("(")]
    if len(pieces) == 1:
        return pieces[0]

    prefix = pieces[0][0]
    groups = pieces[1:]
    variants = []
    for chosen in combs(range(len(groups))):
        parts = [prefix]
        for index, group in enumerate(groups):
            # group[0] is the bracket content, group[1] the text that follows it
            if index in chosen:
                parts.append(group[0])
            parts.append(group[1])
        variants.append("".join(parts))
    return variants

# Main classification function
def classify(notation_stage_1:str, classifying:SoundClassifying, amplify:float=1.0, reverse:bool=False, bias:float=0.0):
    """Turn an IPA notation string into a list of per-sound feature entries.

    Every entry has the shape ``[[kind, text, subtype], weight, features]``:

    * ``kind`` is ``"_"`` (space/pause), ``"s"`` (sound) or ``"m"`` (modifier);
    * ``text`` is the notation fragment the entry stands for;
    * ``subtype`` is ``"l"``/``"s"`` for spaces, ``"v"``/``"c"`` for sounds and
      ``"←"``/``"→"`` for modifiers (attach to the previous / a following sound);
    * ``weight`` is always ``1.0`` here;
    * ``features`` is a list indexed through ``CRITERIA_TAGS_ID``.

    The notation is processed in stages: tokenising, merging backward
    modifiers, applying forward modifiers, automatic stressing, merging of
    repeated sounds into long ones, trimming of leading/trailing spaces and
    finally scaling of the feature values.

    Args:
        notation_stage_1: the notation to classify, read character by character.
        classifying: lookup tables for symbols and modifiers.
        amplify: factor applied to every feature value at the end.
        reverse: when true, features are inverted (``amplify - amplify * v``)
            before ``bias`` is added.
        bias: offset added to every feature value at the end.

    Returns:
        The list of entries after all stages.
    """
    vowels = ["i", "y", "ɨ", "ʉ", "ɯ", "u", "ɪ", "ʏ", "ʊ", "e", "ø",
              "ɘ", "ɵ", "ɤ", "o", "ə", "ɚ", "ɛ", "œ", "ɜ", "ɝ", "ɞ",
              "ʌ", "ɔ", "æ", "ɐ", "a", "ɶ", "ä", "ɑ", "ɒ", "̩"]
    curr_weight = 1.0
    sound_symbols = classifying.symbols_table.keys()
    sound_mods = classifying.mods_table.keys()

    # Stage 2: tokenise the notation character by character.
    # Characters that match none of the branches below are silently dropped.
    notation_stage_2 = []
    for char in notation_stage_1:
        if char == " ":
            # Plain space: a short space entry
            curr_classifying = classifying.empty()
            curr_classifying[CRITERIA_TAGS_ID["spa"]] = 1.0
            curr_classifying[CRITERIA_TAGS_ID["lon"]] = 0.0
            notation_stage_2.append([["_", char, "l"], curr_weight, curr_classifying])
        elif char == "|":
            # Bar: a long space entry ("lon" set)
            curr_classifying = classifying.empty()
            curr_classifying[CRITERIA_TAGS_ID["spa"]] = 1.0
            curr_classifying[CRITERIA_TAGS_ID["lon"]] = 1.0
            notation_stage_2.append([["_", char, "l"], curr_weight, curr_classifying])
        elif char == "‿":
            # Undertie: a short space entry with subtype "s"
            curr_classifying = classifying.empty()
            curr_classifying[CRITERIA_TAGS_ID["spa"]] = 1.0
            curr_classifying[CRITERIA_TAGS_ID["lon"]] = 0.0
            notation_stage_2.append([["_", char, "s"], curr_weight, curr_classifying])
        elif char in sound_symbols:
            if char in vowels:
                notation_stage_2.append([["s", char, "v"], curr_weight, classifying.symbol_info(char)])
            else:
                notation_stage_2.append([["s", char, "c"], curr_weight, classifying.symbol_info(char)])
        elif "◌"+char in sound_mods:
                # Modifier stored in the table with a "◌" placeholder in front: applies to the previous entry
                notation_stage_2.append([["m", char, "←"], curr_weight, classifying.mod_info("◌"+char)])
        elif char in sound_mods:
            # Modifier stored without placeholder: applies to a following sound
            notation_stage_2.append([["m", char, "→"], curr_weight, classifying.mod_info(char)])

    # Stage 3: merge every "←" modifier into the entry that precedes it.
    # NOTE(review): a "←" modifier at the very start hits notation_stage_3[-1]
    # on an empty list and raises IndexError.
    notation_stage_3 = []
    for sound in notation_stage_2:
        if sound[0][0] == "m":
            if sound[0][2] == "←":
                # Work on copies so the entries of stage 2 are not modified
                last_sound = notation_stage_3[-1].copy()
                last_sound[0] = last_sound[0].copy()
                last_sound[2] = last_sound[2].copy()
                # Each cell of the modifier row describes how one criterion changes:
                #   ""      -> unchanged
                #   "-x"    -> subtract x, not below 0
                #   "+x"    -> add x, not above 1
                #   "to x"  -> average of the current value and x
                #   "up x"  -> raise to x if currently lower
                #   other   -> replace with the number
                # In every case the modifier character is appended to the text.
                for cat, val in enumerate(sound[2]):
                    if val == "":
                        last_sound[0][1] = notation_stage_3[-1][0][1] + sound[0][1]
                    elif val[0] == "-":
                        last_sound[0][1] = notation_stage_3[-1][0][1] + sound[0][1]
                        last_sound[2][cat] = notation_stage_3[-1][2][cat] - float(val[1:])
                        if last_sound[2][cat] < 0: last_sound[2][cat] = 0
                    elif val[0] == "+":
                        last_sound[0][1] = notation_stage_3[-1][0][1] + sound[0][1]
                        last_sound[2][cat] = notation_stage_3[-1][2][cat] + float(val[1:])
                        if last_sound[2][cat] > 1: last_sound[2][cat] = 1
                    elif val[:3] == "to ":
                        last_sound[0][1] = notation_stage_3[-1][0][1] + sound[0][1]
                        last_sound[2][cat] = (notation_stage_3[-1][2][cat] + float(val[3:])) / 2
                    elif val[:3] == "up ":
                        last_sound[0][1] = notation_stage_3[-1][0][1] + sound[0][1]

                        curr_val = notation_stage_3[-1][2][cat]
                        new_val = float(val[3:])
                        if curr_val < new_val:
                            last_sound[2][cat] = new_val
                    else:
                        last_sound[0][1] = notation_stage_3[-1][0][1] + sound[0][1]
                        last_sound[2][cat] = float(val)
                notation_stage_3[-1] = last_sound
            else:
                notation_stage_3.append(sound)
        else:
            notation_stage_3.append(sound)

    # Stage 4: apply "→" modifiers to a following sound.
    # The list is walked back to front, so notation_stage_4 holds the entries
    # that come AFTER the current one, nearest entry last.
    notation_stage_4 = []
    for sound in notation_stage_3[::-1]:
        if sound[0][0] == "m":
            if sound[0][2] == "→":
                # Search from the nearest following entry outwards
                for n in range(len(notation_stage_4)):
                    n_rev = len(notation_stage_4) - n - 1
                    # Skip non-sounds and sounds whose text carries the "̯" mark
                    if notation_stage_4[n_rev][0][0] != "s" or "̯" in notation_stage_4[n_rev][0][1]:
                        pass
                    # Target: a sound carrying the "̩" mark or with "con" below 0.8125
                    # (the same threshold the decoder uses to tell vowels from consonants)
                    elif "̩" in notation_stage_4[n_rev][0][1] or \
                    notation_stage_4[n_rev][2][CRITERIA_TAGS_ID["con"]] < 0.8125:
                        notation_stage_4[n_rev][0][1] = sound[0][1] + notation_stage_4[n_rev][0][1]
                        # Non-empty cells are added as plain numbers scaled by the modifier weight
                        for s in range(len(sound[2])):
                            if sound[2][s] != "":
                                notation_stage_4[n_rev][2][s] += float(sound[2][s]) * sound[1]
                        break
                # The "→" modifier itself is never kept, whether or not a target was found
            else:
                notation_stage_4.append(sound)
        else:
            notation_stage_4.append(sound)

    # Restore the original order
    notation_stage_4 = notation_stage_4[::-1]

    # Stress (structure of words): one character per entry
    #   "_" long space, "-" other space, "V" stressed vowel, "v" unstressed vowel,
    #   "c" consonant, "o" anything that is neither space nor sound.
    # NOTE(review): a sound whose text contains both the "̩" and the "̯" mark
    # matches neither sound branch and adds no character, which would shift the
    # indices used below — confirm this cannot occur.
    structure = ""
    for sound in notation_stage_4:
        if sound[0][0] == "_" and sound[2][CRITERIA_TAGS_ID["lon"]] > 0.375:
            structure += "_"
        elif sound[0][0] == "_":
            structure += "-"
        elif sound[0][0] == "s":
            if (sound[0][2] == "v" or "̩" in sound[0][1]) and "̯" not in sound[0][1]:
                if sound[2][CRITERIA_TAGS_ID["str"]] > 0:
                    structure += "V"
                else:
                    structure += "v"
            elif (sound[0][2] == "c" or "̯" in sound[0][1]) and "̩" not in sound[0][1]:
                structure += "c"
        else:
            structure += "o"

    # Stress (stressing needed sounds)
    # Segments between long spaces that contain exactly one unstressed vowel and
    # no stressed vowel are flagged with 0 (stress must be added); all others with 1.
    structure_check = [[0]*len(i)+["_"] if "v" in i and i.count("v") == 1 and i.count("V") == 0
                       else [1]*len(i)+["_"] for i in structure.split("_")]
    structure_list = []
    for i in structure_check:
        structure_list += i
    # Drop the separator appended after the last segment
    structure_list = structure_list[:-1]

    # Stress the single vowel of every flagged segment
    stressed = False
    for n, struc in enumerate(structure_list):
        if struc == 0 and not stressed:
            if structure[n] == "v":
                notation_stage_4[n][2][CRITERIA_TAGS_ID["str"]] = 1.0
                stressed = True
        elif struc == "_":
            stressed = False

    # Long sounds: merge an entry into the previous one when both have the same
    # features (ignoring "lon"), neither text contains a stress mark and both
    # agree on the presence of the "̩" and "̯" marks.
    if len(notation_stage_4) != 0:
        notation_stage_5 = [notation_stage_4[0].copy()]
    else:
        notation_stage_5 = []
    for n in range(1, len(notation_stage_4)):
        sound1 = notation_stage_4[n][2].copy()
        sound2 = notation_stage_5[-1][2].copy()
        sound1[CRITERIA_TAGS_ID["lon"]] = 0.0
        sound2[CRITERIA_TAGS_ID["lon"]] = 0.0
        # True when the text contains neither of the two stress marks
        stress1 = sum([i in notation_stage_4[n][0][1] for i in "ˌˈ"]) == 0
        stress2 = sum([i in notation_stage_5[-1][0][1] for i in "ˌˈ"]) == 0
        cv1 = "̩" in notation_stage_4[n][0][1]
        cv2 = "̩" in notation_stage_5[-1][0][1]
        vc1 = "̯" in notation_stage_4[n][0][1]
        vc2 = "̯" in notation_stage_5[-1][0][1]
        if sound1 == sound2 and (stress1 and stress2) and cv1 == cv2 and vc1 == vc2:
            # Same sound repeated: extend the text and mark the entry as long
            notation_stage_5[-1][0][1] += notation_stage_4[n][0][1]
            notation_stage_5[-1][2][CRITERIA_TAGS_ID["lon"]] = 1.0
        else:
            notation_stage_5.append(notation_stage_4[n])

    # Removing spaces at the beginning and at the end
    # (an entry counts as a space when its "spa" value is exactly 1.0)
    notation_stage_6 = []
    no_longer_spaces = False
    for sound in notation_stage_5:
        if sound[2][CRITERIA_TAGS_ID["spa"]] != 1.0:
            no_longer_spaces = True
        if no_longer_spaces:
            notation_stage_6.append(sound)
    notation_stage_7 = []
    no_longer_spaces = False
    for sound in notation_stage_6[::-1]:
        if sound[2][CRITERIA_TAGS_ID["spa"]] != 1.0:
            no_longer_spaces = True
        if no_longer_spaces:
            notation_stage_7.append(sound)
    notation_stage_7 = notation_stage_7[::-1]

    # Multiplying and reversing: feature lists are rescaled in place
    notation_stage_amp = []
    for sound in notation_stage_7:
        if reverse:
            for cat in range(len(sound[2])):
                sound[2][cat] = amplify - (amplify * sound[2][cat]) + bias
        else:
            for cat in range(len(sound[2])):
                sound[2][cat] = amplify * sound[2][cat] + bias
        notation_stage_amp.append(sound)

    return notation_stage_amp

# Dataset collect
class ListenIPADataset(Dataset):
    """Dataset pairing audio ids with their IPA transcriptions.

    ``notations`` is a list of pairs. ``pair[0]`` is the audio id used to build
    file paths under ``data/``; ``pair[1]`` is an iterable of 2-item records
    whose second item is a notation string. The first and last character of
    that string are cut off before it is classified (presumably enclosing
    delimiters — verify against the data).

    ``amplify``, ``reverse`` and ``bias`` are forwarded to ``classify`` and
    define the value range of the produced feature vectors.

    Note that the list passed as ``notations`` is stored by reference, so
    ``append``/``pop`` also modify the caller's list.
    """

    def __init__(self, notations:list, amplify:float=1.0, reverse:bool=False, bias:float=0.0):
        self.classifying = SoundClassifying()
        self.amplify = amplify
        self.reverse = reverse
        self.bias = bias
        self.pairs = notations

    def append(self, pair):
        """Add one (audio id, transcriptions) pair to the dataset."""
        self.pairs.append(pair)

    def pop(self, index:int=-1):
        """Remove the pair at ``index`` and return it, like ``list.pop``."""
        return self.pairs.pop(index)

    def criteria_max(self):
        """Return the value a criterion of 1 is mapped to by the scaling."""
        return self.bias if self.reverse else self.amplify+self.bias

    def criteria_min(self):
        """Return the value a criterion of 0 is mapped to by the scaling."""
        return self.amplify+self.bias if self.reverse else self.bias

    def thinking_empty(self):
        """Return a scaled feature tuple with only the first criterion set."""
        if self.reverse:
            returning_array = [self.bias] + [self.amplify + self.bias for i in CRITERIA_NAMES[1:]]
        else:
            returning_array = [self.amplify + self.bias] + [self.bias for i in CRITERIA_NAMES[1:]]

        return tuple(returning_array)

    def output_audio_npy(self, index:int):
        """Load the precomputed array stored for the audio of pair ``index``."""
        return np.load(f'data/audio_data_stft/{self.pairs[index][0]}.npy')

    def output_audio_mp3(self, index:int, curr_sr:int=SR):
        """Load the mp3 of pair ``index`` with librosa at sample rate ``curr_sr``."""
        return librosa.load(f'data/audio_data/{self.pairs[index][0]}.mp3', sr=curr_sr)

    def output_labels(self, index:int, is_shortened=True):
        """Return the distinct label sequences for pair ``index``.

        Every transcription is expanded with ``brackets_split`` and each
        variant is classified. With ``is_shortened`` only the feature tuples
        are kept; otherwise each sound is ``(header, weight, features)``.
        Duplicates are removed through a set, so the order of the returned
        tuple is not defined.
        """
        curr_weights = [brackets_split(i[1:-1].strip()) for ipa_id, i in self.pairs[index][1]]
        t = []
        for weight in curr_weights:
            for notation in weight:
                variant = classify(notation, self.classifying, self.amplify, self.reverse, self.bias)
                if is_shortened:
                    t.append(tuple([tuple(k[2]) for k in variant]))
                else:
                    t.append(tuple([(tuple(k[0]), k[1], tuple(k[2])) for k in variant]))
        return tuple(set(t))

    def output_audio_id(self, index:int):
        """Return the audio id of pair ``index``."""
        return self.pairs[index][0]

    def output_labels_text(self, index:int):
        """Return the raw transcription records of pair ``index``."""
        return self.pairs[index][1]

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, index:int):
        return {"audio": self.output_audio_npy(index), "labels": self.output_labels(index),
                "audio_id": self.output_audio_id(index), "labels_text": self.output_labels_text(index)}


In [3]:
import torch
import torch.nn as nn
from torch import complex64


class ListenIPA(nn.Module):
    """Recurrent network mapping one complex spectrum frame to criteria values.

    Pipeline of ``forward``: two complex 1-d convolutions, concatenation of the
    real and imaginary parts, two bias-free linear layers, a recurrent block and
    a final bias-free linear layer with ``criteria_len`` outputs.

    ``is_var1`` selects the variant: when true the first convolution has no
    bias and the recurrent block is an LSTM; otherwise the first convolution
    has a bias and the recurrent block is a GRU.
    """

    def __init__(self, audio_input_size, criteria_len, kernel_size, num_layers, is_var1):
        super().__init__()
        self.audio_input_size = audio_input_size
        self.criteria_len = criteria_len
        self.new_hidden_size = 129
        self.kernel_size = kernel_size
        self.num_layers = num_layers
        self.is_var1 = is_var1

        width = self.new_hidden_size
        conv_options = dict(kernel_size=kernel_size, stride=kernel_size//7,
                            padding=kernel_size//2, dtype=complex64)

        # Only the first convolution differs between the two variants (bias on/off)
        self.conv1 = nn.Conv1d(1, 1, bias=not is_var1, **conv_options)
        self.conv2 = nn.Conv1d(1, 1, bias=False, **conv_options)

        # Input is twice the hidden width: real and imaginary parts side by side
        self.initial_linears2 = nn.Sequential(
            nn.Linear(width * 2, width, bias=False),
            nn.Linear(width, width, bias=False),
        )

        recurrent_cls = nn.LSTM if is_var1 else nn.GRU
        self.recc = recurrent_cls(width, width, num_layers, batch_first=False)

        self.final_linears1 = nn.Sequential(
            nn.Linear(width, criteria_len, bias=False),
        )

    def forward(self, x, hidden):
        """Process ``x`` with the recurrent state ``hidden``; return (output, new state)."""
        convolved = self.conv2(self.conv1(x))
        features = torch.cat((convolved.real, convolved.imag), dim=-1)
        features = self.initial_linears2(features)
        features, hidden = self.recc(features, hidden)
        return self.final_linears1(features), hidden

    def init_hidden(self):
        """Return a zero initial state: a pair of tensors for the LSTM variant, one tensor otherwise."""
        def zeros():
            return torch.zeros(self.num_layers, 1, self.new_hidden_size)

        if self.is_var1:
            return (zeros(), zeros())
        return zeros()

In [4]:
# Vowel symbols looked up as vowels_table[closeness][backness][roundness].
# As used by output_to_symbols: closeness runs from 0 (first row) to 6 (last
# row) and is derived from the "con" criterion; backness is 0 when only the
# palatal flag is set, 2 when only the velar flag is set and 1 otherwise;
# roundness is 0 (none), 1 (half labialization) or 2 (labialization).
vowels_table = [[["i","i̹","y"], ["ɨ","ɨ̹","ʉ"], ["ɯ","u̜","u"]],
                [["ɪ","ɪ̹","ʏ"], ["ɪ̈","ɪ̹̈","ʏ̈"], ["ɯ̽","ʊ̜","ʊ"]],
                [["e","e̹","ø"], ["ɘ","ɘ̹","ɵ"], ["ɤ","o̜","o"]],
                [["e̞","e̹̞","ø̞"], ["ə̜","ə","ə̹"], ["ɤ̞","o̜̞","o̞"]],
                [["ɛ","ɛ̹","œ"], ["ɜ","ɜ̹","ɞ"], ["ʌ","ɔ̜","ɔ"]],
                [["æ","æ̹","œ̞"], ["æ̈","ɐ","ɐ̹"], ["ʌ̞","ɔ̜̞","ɔ̞"]],
                [["a","a̹","ɶ"], ["ä","ä̹","ɶ̈"], ["ɑ","ɒ̜","ɒ"]]]

# Marks for flags that are still set after the base symbol has been chosen.
# output_to_symbols appends MODS_TABLE_ADD marks to the base part of the symbol
# (the part before an optional "-") and SUB_TABLE_ADD marks to the part after it.
MODS_TABLE_ADD = {"approx": "̞", "nasal": "̃", "plosive": "͐", "sibilant": "͒", "non-sib fr":"̐", "trill": "͙", "tap": "͓",
              "dental": "̪", "alveolar": "͇", "half_labialization": "̹", "retroflex": "˞"}
SUB_TABLE_ADD = {"lateral": "ˡ", "labialization": "ʷ", "bilabial": "ᵙ", "labiodental": "ᶹ", "palatal": "ʲ", "velar": "ˠ",
              "uvular": "ʶ", "pharyngeal": "ˤ", "glottal": "ˀ", "aspiration": "ʰ"} #"labialization": "ᵝ"

# Base symbols for lateral consonants (used when the "lateral" flag is set).
# Layout: [outer flag, [[inner flag, "voiced voiceless"], ...]]. The lookup
# helpers take the first outer flag that is set and then the first inner flag
# that is set, so the order of the rows is a priority order.
l_sounds = [["approx", [["dental", "l̪ l̪̊"], ["postalveolar", "l̠ l̠̊"],
                        ["alveolar", "l l̥"], ["retroflex", "ɭ ɭ̊"],
                        ["velar", "ʟ ʟ̥"], ["uvular", "ʟ̠ ʟ̠̊"],
                        ["palatal", "ʎ ʎ̥"]]],
            ["tap",[["dental", "ɺ̪ ɺ̪̊"], ["postalveolar", "ɺ̠ ɺ̠̊"],
                    ["alveolar", "ɺ ɺ̥"], ["retroflex", "𝼈 𝼈̥"],
                    ["velar", "ʟ̆ ʟ̥̆"], ["uvular", "ʟ̠̆ ʟ̠̥̆"],
                    ["palatal", "ʎ̆ ʎ̥̆"]]],
            ["sibilant",   [["dental", "ʫ̪ ʪ̪"], ["postalveolar", "ʫ̠ ʪ̠"],
                            ["alveolar", "ʫ ʪ"], ["retroflex", "ʫ̢ ʪ̢"]]],
            ["non-sib fr", [["dental", "l̪̝ l̪̝̊"], ["postalveolar", "l̠̝ l̠̝̊"],
                            ["alveolar", "ɮ ɬ"], ["retroflex", "𝼅 ꞎ"],
                            ["velar", "𝼄̬ 𝼄"], ["uvular", "ʟ̠̝ ʟ̠̝̊"],
                            ["palatal", "𝼆̬ 𝼆"]]]]
# Base symbols for non-lateral consonants, in the same layout as l_sounds:
# [outer flag, [[inner flag, "voiced voiceless"], ...]], rows in priority order.
# A "-" inside a symbol separates the base part from a suffix part; the decoder
# splits on it so that diacritics go onto the base and superscripts after the suffix.
# NOTE(review): the leading "*" presumably marks symbols without a dedicated
# IPA letter — it is emitted unchanged; confirm the intent.
p_sounds = [["approx", [["dental", "ɹ̪ ɹ̪̊"], ["postalveolar", "ɹ̠ ɹ̠̊"],
                        ["alveolar", "ɹ ɹ̥"], ["retroflex", "ɻ ɻ̊"],
                        ["labiodental", "ʋ ʋ̥"], ["bilabial", "β̞ ɸ̞"],
                        ["velar", "ɰ ɰ̊"], ["uvular", "ʁ̞ χ̞"],
                        ["palatal", "j j̊"], ["pharyngeal", "ʕ̞ ħ̞"],
                        ["glottal", "ʔ̬̞ ʔ̞"]]],
            ["tap",[["dental", "ɾ̪ ɾ̪̊"], ["postalveolar", "ɾ̠ ɾ̠̊"],
                    ["alveolar", "ɾ ɾ̥"], ["retroflex", "ɽ ɽ̊"],
                    ["labiodental", "ⱱ ⱱ̥"], ["bilabial", "ⱱ̟ ⱱ̟̊"],
                    ["velar", "ɡ̆ k̆"], ["uvular", "ɢ̆ q̆"],
                    ["palatal", "j̆ j̊̆"], ["pharyngeal", "ʢ̆ ʜ̆"],
                    ["glottal", "ʔ̬̆ ʔ̆"]]],
            ["trill",  [["dental", "r̪ r̪̊"], ["postalveolar", "r̠ r̠̊"],
                        ["alveolar", "r r̥"], ["retroflex", "ɽ͙ ɽ͙̊"],
                        ["labiodental", "ʙ̪ ʙ̪̊"], ["bilabial", "ʙ ʙ̥"],
                        ["velar", "ɡ͙ k͙"], ["uvular", "ʀ ʀ̥"],
                        ["palatal", "j͙ j͙̊"], ["pharyngeal", "ʢ ʜ"],
                        ["glottal", "ʔ̬͙ ʔ͙"]]],
            ["sibilant",   [["dental", "z̪ s̪"], ["postalveolar", "ʒ ʃ"],
                            ["alveolar", "z s"], ["retroflex", "ʐ ʂ"],
                            ["labiodental", "*z-ᶹ *s-ᶹ"], ["bilabial", "*z-ᵙ *s-ᵙ"],
                            ["velar", "*z-ˠ *s-ˠ"], ["uvular", "*z-ʶ *s-ʶ"],
                            ["palatal", "*z-ʲ *s-ʲ"], ["pharyngeal", "*z-ˤ *s-ˤ"],
                            ["glottal", "*z-ˀ *s-ˀ"]]],
            ["plosive",[["dental", "d̪ t̪"], ["postalveolar", "d̠ t̠"],
                        ["alveolar", "d t"], ["retroflex", "ɖ ʈ"],
                        ["labiodental", "ȸ ȹ"], ["bilabial", "b p"],
                        ["velar", "ɡ k"], ["uvular", "ɢ q"],
                        ["palatal", "ɟ c"], ["pharyngeal", "ʡ̬ ʡ"],
                        ["glottal", "ʔ̬ ʔ"]]],
            ["nasal",  [["dental", "n̪ n̪̊"], ["postalveolar", "n̠ n̠̊"],
                        ["alveolar", "n n̥"], ["retroflex", "ɳ ɳ̊"],
                        ["labiodental", "ɱ ɱ̊"], ["bilabial", "m m̥"],
                        ["velar", "ŋ ŋ̊"], ["uvular", "ɴ ɴ̥"],
                        ["palatal", "ɲ ɲ̊"], ["pharyngeal", "*n-ˤ *n̥-ˤ"],
                        ["glottal", "*n-ˀ *n̥-ˀ"]]],
            ["non-sib fr", [["dental", "ð θ"], ["postalveolar", "ɹ̠̝ ɹ̠̝̊"],
                            ["alveolar", "ð̠ θ̠"], ["retroflex", "ɻ̝ ɻ̝̊"],
                            ["labiodental", "v f"], ["bilabial", "β ɸ"],
                            ["velar", "ɣ x"], ["uvular", "ʁ χ"],
                            ["palatal", "ʝ ç"], ["pharyngeal", "ʕ ħ"],
                            ["glottal", "ɦ h"]]]]


def output_to_symbols_reformat(outputs:list, think_value=0.5, reverse=False, amplify=1.0, bias=0.0):
    """Undo the output scaling and drop the "thinking" frames.

    Steps, in order:
      1. if ``reverse`` is set, mirror every value back (``amplify + 2*bias - v``);
      2. if ``amplify``/``bias`` differ from 1/0, map values back with
         ``(v - bias) / amplify``;
      3. keep only the frames whose "thi" criterion is at most ``think_value``.

    Returns a new list; the frames inside are new lists only when step 1 or 2
    was applied.
    """
    # Reverse back
    if reverse:
        outputs = [[-i+bias*2+amplify for i in frame] for frame in outputs]

    # Max to 1 and min to 0
    if amplify != 1 or bias != 0:
        outputs = [[(i-bias)/amplify for i in frame] for frame in outputs]

    # All thinking is out
    return [frame for frame in outputs
            if frame[CRITERIA_TAGS_ID["thi"]] <= think_value]

def select_sound(symbols:str, voiceless:bool):
    """Pick one symbol from a ``"voiced voiceless"`` pair separated by a single space."""
    voiced, unvoiced = symbols.split(" ")
    if voiceless:
        return unvoiced
    return voiced

def sound_manner(nc:dict, manner_list:list[list], used_artic:str):
    """Find the first entry of ``manner_list`` whose flag is set in ``nc``.

    On a hit the voiced/voiceless symbol is chosen according to ``nc["vl"]``,
    the matched flag and ``nc[used_artic]`` are cleared in place, and
    ``(nc, symbol, True)`` is returned. Without a hit the result is
    ``(nc, "", False)`` and ``nc`` is left untouched.
    """
    hit = next(((flag, pair) for flag, pair in manner_list if nc[flag]), None)
    if hit is None:
        return nc, "", False

    flag, pair = hit
    symbol = select_sound(pair, nc["vl"])
    nc[flag] = False
    nc[used_artic] = False
    return nc, symbol, True

def sound_artic(nc:dict, artic_list:list[list]):
    """Walk ``artic_list`` and return the first symbol that matches ``nc``.

    Each entry is ``[flag, candidates]``. For every flag set in ``nc`` the
    candidates are tried with ``sound_manner``; the first successful lookup
    ends the search. Returns ``(nc, symbol)``, with ``symbol == ""`` when
    nothing matched.
    """
    for flag, candidates in artic_list:
        if not nc[flag]:
            continue
        nc, symbol, found = sound_manner(nc, candidates, flag)
        if found:
            return nc, symbol
    return nc, ""

def create_new_criteria_output(output):
    """Convert one numeric criteria vector into a dict of boolean flags.

    Most flags are true when the corresponding criterion is not below 0.5.
    Exceptions: "approx" is true when "con" is below 0.9375, and the "lbz"
    criterion yields two flags ("labialization" when not below 0.75,
    "half_labialization" when above 0.25 and at most 0.75).

    Afterwards pairs of flags are folded into combined ones (tap, labiodental,
    postalveolar), and "non-sib fr" is set when no other manner flag remains.
    """
    def reaches_half(tag):
        # Written as a negated "<" so that the outcome matches the original
        # comparison for every input value
        return not (output[CRITERIA_TAGS_ID[tag]] < 0.5)

    nc = {"approx": bool(output[CRITERIA_TAGS_ID["con"]] < 0.9375)}

    half_threshold_flags = (
        ("nasal", "nas"), ("plosive", "plo"), ("sibilant", "sib"),
        ("trill", "tri"), ("lateral", "lat"), ("bilabial", "bil"),
        ("dental", "den"), ("alveolar", "alv"), ("retroflex", "ret"),
        ("palatal", "pal"), ("velar", "vel"), ("uvular", "uvu"),
        ("pharyngeal", "pha"), ("glottal", "glo"), ("aspiration", "asp"),
        ("vl", "vls"),
    )
    for flag, tag in half_threshold_flags:
        nc[flag] = reaches_half(tag)

    rounding = output[CRITERIA_TAGS_ID["lbz"]]
    nc["labialization"] = not (rounding < 0.75)
    nc["half_labialization"] = bool(rounding > 0.25 and rounding <= 0.75)

    for derived in ("tap", "labiodental", "postalveolar", "non-sib fr"):
        nc[derived] = False

    # Combined criteria: each pair of base flags is replaced by one derived flag
    if nc["plosive"] and nc["trill"]:
        nc["tap"] = True
        nc["plosive"] = False
        nc["trill"] = False
    if nc["bilabial"] and nc["dental"]:
        nc["labiodental"] = True
        nc["bilabial"] = False
        nc["dental"] = False
        nc["alveolar"] = False
    if nc["alveolar"] and nc["retroflex"]:
        nc["postalveolar"] = True
        nc["alveolar"] = False
        nc["retroflex"] = False
    if nc["dental"] and nc["alveolar"]:
        # Dental wins over alveolar when both are still set
        nc["dental"] = True
        nc["alveolar"] = False

    other_manners = ("approx", "tap", "trill", "sibilant", "plosive", "nasal")
    nc["non-sib fr"] = not any(nc[manner] for manner in other_manners)
    return nc

def output_to_symbols(outputs:list, think_value=0.5, ignore_first=False, reverse=False, amplify=1.0, bias=0.0, spacing=""):
    """Decode a sequence of criteria vectors into an IPA string.

    Args:
        outputs: list of criteria vectors (indexable through CRITERIA_TAGS_ID).
            The list is not modified.
        think_value: frames whose "thi" criterion exceeds this value are dropped.
        ignore_first: skip the first frame.
        reverse, amplify, bias: scaling that was applied to the values and has
            to be undone before decoding.
        spacing: text appended after every decoded symbol.

    Returns:
        The concatenated symbols; an empty string when there is nothing to decode.
    """
    # Slice instead of pop(0): the caller's list stays intact and an empty
    # list no longer raises IndexError.
    if ignore_first:
        outputs = outputs[1:]
    if len(outputs) == 0:
        return ""

    outputs = output_to_symbols_reformat(outputs, think_value, reverse, amplify, bias)
    symbols = []
    for output in outputs:
        curr_sym = ""
        is_vowel = False
        is_consonant = False
        is_space = False
        if output[CRITERIA_TAGS_ID["spa"]] >= 0.5: #Space
            is_space = True
            curr_sym = " " if output[CRITERIA_TAGS_ID["lon"]] < 0.5 else "|"
        elif output[CRITERIA_TAGS_ID["con"]] < 0.8125: # Vowel
            is_vowel = True
        else:
            is_consonant = True

        if is_consonant:
            nc = create_new_criteria_output(output)

            # Flag combinations with a dedicated letter are handled first;
            # the flags they consume are cleared so no diacritic is added for them.
            if nc["alveolar"] and nc["velar"] and nc["lateral"] and nc["approx"]:
                curr_sym = select_sound("ɫ ɫ̥", nc["vl"])
                nc["alveolar"], nc["velar"], nc["lateral"], nc["approx"] = False, False, False, False
            elif nc["labialization"] and nc["velar"] and nc["approx"]:
                curr_sym = select_sound("w ʍ", nc["vl"])
                nc["labialization"], nc["velar"], nc["approx"] = False, False, False
            elif nc["labialization"] and nc["palatal"] and nc["approx"]:
                curr_sym = select_sound("ɥ ɥ̊", nc["vl"])
                nc["labialization"], nc["palatal"], nc["approx"] = False, False, False
            elif nc["postalveolar"] and nc["palatal"] and nc["sibilant"] and not nc["approx"]:
                curr_sym = select_sound("ʑ ɕ", nc["vl"])
                nc["postalveolar"], nc["palatal"], nc["sibilant"] = False, False, False
            else:
                if nc["lateral"]:
                    nc, curr_sym = sound_artic(nc, l_sounds)
                    if curr_sym != "":
                        nc["lateral"] = False
                else:
                    nc, curr_sym = sound_artic(nc, p_sounds)

        if is_vowel:
            nc = create_new_criteria_output(output)
            nc["approx"] = False

            roundness = 2 if nc["labialization"] else 1 if nc["half_labialization"] else 0
            nc["labialization"], nc["half_labialization"] = False, False

            # Both or neither of palatal/velar -> middle column
            backness = 1 if nc["palatal"] == nc["velar"] else 0 if nc["palatal"] else 2
            nc["palatal"], nc["velar"] = False, False

            # Clamp "con" to [0, 0.75] and map it onto the seven table rows
            curr_crit = output[CRITERIA_TAGS_ID["con"]]
            curr_crit = 0 if curr_crit < 0 else 0.75 if curr_crit > 0.75 else curr_crit
            curr_crit = curr_crit * 16 + 1
            closeness = 6 - int(curr_crit // 2)

            curr_sym = vowels_table[closeness][backness][roundness]
            if nc["retroflex"]:
                if "ə" in curr_sym:
                    curr_sym = curr_sym.replace("ə", "ɚ")
                elif "ɜ" in curr_sym:
                    curr_sym = curr_sym.replace("ɜ", "ɝ")
                else:
                    curr_sym += "˞"
            nc["retroflex"] = False

            if nc["vl"]:
                # The voiceless mark goes above when the symbol already has
                # something below it, otherwise below
                if "y" in curr_sym or "̹" in curr_sym or "̞" in curr_sym:
                    curr_sym += "̊"
                else:
                    curr_sym += "̥"

        if not is_space:
            if curr_sym == "":
                curr_sym = "◌"

            # "base-suffix": diacritics attach to the base, superscripts follow the suffix
            curr_sym_split = curr_sym.split("-", 1)
            if len(curr_sym_split) == 1: curr_sym_split.append("")

            # Every flag that is still set becomes an extra mark
            for i in MODS_TABLE_ADD:
                if nc[i]:
                    curr_sym_split[0] += MODS_TABLE_ADD[i]
                    nc[i] = False
            for i in SUB_TABLE_ADD:
                if nc[i]:
                    curr_sym_split[1] += SUB_TABLE_ADD[i]
                    nc[i] = False
            curr_sym = curr_sym_split[0]+curr_sym_split[1]

            # Labialization together with palatalization is written as one combined mark
            if SUB_TABLE_ADD["labialization"] in curr_sym and SUB_TABLE_ADD["palatal"] in curr_sym:
                curr_sym = curr_sym.replace(SUB_TABLE_ADD["labialization"], "ᶣ")
                curr_sym = curr_sym.replace(SUB_TABLE_ADD["palatal"], "")

            # Long sound and stress
            if output[CRITERIA_TAGS_ID["lon"]] >= 0.5:
                curr_sym += "ː"
            if output[CRITERIA_TAGS_ID["str"]] >= 0.5:
                curr_sym = "ˈ"+curr_sym

        # Adding spacing
        symbols.append(curr_sym + spacing)
    return "".join(symbols)

In [5]:
# Print tensors with 8 digits of precision
torch.set_printoptions(precision=8)
# Inference is pinned to the CPU; the line below is the GPU-aware alternative.
# DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device("cpu")

# Hyper-parameters handed to the ListenIPA constructor when the model is built
kernel_size = 15
num_layers = 4
# Selects the model variant (passed as `is_var1`) and the checkpoint file name
old_version = True

# audio_path = "./zzzzzzz.wav"
def audio_to_ipa(audio_path="./audio.mp3", saving=True, think_value=0.5):
    """Transcribe one audio file to IPA with the global ``MODEL``.

    The file is converted to STFT frames, the frames are fed to the model one
    at a time while carrying the recurrent state along, and the collected
    outputs are decoded with ``output_to_symbols``. When ``saving`` is true the
    result is also written to ``<audio_path>.txt`` (UTF-8).

    Returns the IPA string.
    """
    frames = audio_reformatting(audio_path)
    with torch.no_grad():
        state = MODEL.init_hidden()
        predictions = []
        for frame in frames:
            # Add the two leading axes the model expects in front of the frame
            model_input = torch.from_numpy(frame).unsqueeze(0).unsqueeze(0)
            prediction, state = MODEL(model_input, state)
            predictions.append(prediction.to(DEVICE))
        rows = [[float(value) for value in prediction[0][0]] for prediction in predictions]
        ipa = output_to_symbols(rows, think_value)

    if saving:
        with open(audio_path+".txt", 'w', encoding="utf8") as f:
            f.write(ipa)
    return ipa

def multiple_audio_to_ipa(audio_paths:list, saving=True, think_value=0.5):
    """Transcribe several files; return a list of ``[path, ipa]`` pairs in input order."""
    results = []
    for path in audio_paths:
        transcription = audio_to_ipa(path, saving, think_value)
        results.append([path, transcription])
    return results

def save_all(ipas:list, folder_path:str):
    """Write all transcriptions to ``<folder_path>!all_files.txt``.

    One line per entry: ``entry[0]``, a tab, ``entry[1]``. ``folder_path`` is
    used as a plain string prefix, so it has to end with a path separator.
    """
    with open(folder_path+"!all_files.txt", 'w', encoding="utf8") as f:
        f.writelines(f"{entry[0]}\t{entry[1]}\n" for entry in ipas)

# File extensions (lower case, with dot) treated as audio
SUPPORTED_FILES = [".mp3", ".ogg", ".wav", ".flac", ".m4a"]
def folder_audio_to_ipa(folder_path:str, saving=True, think_value=0.5):
    """Transcribe every supported audio file found directly inside a folder.

    Args:
        folder_path: folder to scan; backslashes are converted to forward
            slashes and a trailing slash is added when missing.
        saving: when true, each transcription is written next to its audio
            file and a summary ``!all_files.txt`` is written into the folder.
        think_value: forwarded to the decoder.

    Returns:
        A list of ``[file path, ipa]`` pairs, ordered by file name.
    """
    import os
    folder_path = folder_path.replace("\\", "/")
    folder_path = folder_path if folder_path[-1] == "/" else folder_path+"/"

    audio_folder_files = []
    # os.listdir returns names in arbitrary order; sort for reproducible results
    for file in sorted(os.listdir(path=folder_path)):
        full_file_path = folder_path+file
        _, file_extension = os.path.splitext(full_file_path)
        # Compare case-insensitively so that e.g. "TRACK.MP3" is not skipped
        if file_extension.lower() in SUPPORTED_FILES:
            audio_folder_files.append(full_file_path)
    ipas = multiple_audio_to_ipa(audio_folder_files, saving, think_value)
    if saving:
        save_all(ipas, folder_path)
    return ipas


In [6]:
!gdown 1CKfW-A6MZjGjpEaIe5UXkc0q_rjexN99
!unzip "listen_ipa_archive.zip"

Downloading...
From: https://drive.google.com/uc?id=1CKfW-A6MZjGjpEaIe5UXkc0q_rjexN99
To: /content/listen_ipa_archive.zip
100% 10.2M/10.2M [00:00<00:00, 19.1MB/s]
Archive:  listen_ipa_archive.zip
   creating: data/
   creating: data/audio_examples/
  inflating: data/audio_examples/113.mp3  
  inflating: data/audio_examples/13809.mp3  
  inflating: data/audio_examples/13868.mp3  
  inflating: data/audio_examples/153747.mp3  
  inflating: data/audio_examples/165331.mp3  
  inflating: data/audio_examples/22311.mp3  
  inflating: data/audio_examples/22404.mp3  
  inflating: data/audio_examples/22545.mp3  
  inflating: data/audio_examples/228166.mp3  
  inflating: data/audio_examples/228408.mp3  
  inflating: data/audio_examples/259561.mp3  
  inflating: data/examples.json      
  inflating: !!!test.wav             
  inflating: model_epoch_1.pth       
  inflating: model_epoch_2.pth       
  inflating: model_epoch_3.pth       
  inflating: model_new_epoch_1.pth   
  inflating: model_new_ep

# Загрузка модели

In [None]:
# Choose the model: checkpoint epoch and variant
epoch = 1
old_version = True

# 513 is passed as audio_input_size; CRITERIA_LEN is the number of outputs
MODEL = ListenIPA(513, CRITERIA_LEN, kernel_size, num_layers, old_version)
# Checkpoints are named model_epoch_<n>.pth (old variant) or model_new_epoch_<n>.pth
model_path = f'/content/model{"" if old_version else "_new"}_epoch_{epoch}.pth'
# NOTE(review): torch.load unpickles the file, and this checkpoint comes from a
# downloaded archive — only load files from a trusted source (consider
# weights_only=True where the installed torch version supports it).
MODEL.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
# Switch the module to evaluation mode
MODEL.eval()

# Преобразование аудио в транскрипцию

In [None]:
# Transcribe a single file and print the result
# (saving is left at its default, so a .txt file is written next to the audio)
print(audio_to_ipa("/content/!!!test.wav", think_value=0.5))
# Transcribe every supported file of the examples folder; print "path<TAB>ipa" per file
for i in folder_audio_to_ipa('/content/data/audio_examples/', think_value=0.5):
    print(i[0], i[1], sep="\t")