In [2]:
import torch
import numpy as np
import models as mo
import WordMetrics
import WordMatching as wm
import epitran
import ModelInterfaces as mi
import AIModels
import RuleBasedModels
from string import punctuation
import time

ModuleNotFoundError: No module named 'torch'

In [None]:
import ModelInterfaces
import torch
import numpy as np
import nemo.collections.asr as nemo_asr

from omegaconf import OmegaConf, open_dict


class NeuralASR(ModelInterfaces.IASRModel):
    word_locations_in_samples = None
    audio_transcript = None

    def __init__(self, model) -> None:
        super().__init__()
        self.model = model

    def getTranscript(self) -> str:
        """Get the transcripts of the process audio"""
        assert (
            self.audio_transcript != None,
            "Can get audio transcripts without having processed the audio",
        )
        return self.audio_transcript

    def getWordLocations(self) -> list:
        """Get the pair of words location from audio"""
        assert (
            self.word_locations_in_samples != None,
            "Can get word locations without having processed the audio",
        )

        return self.word_locations_in_samples

    def processAudio(self, audio: str):
        """Process the audio"""
        decoding_cfg = self.model.cfg.decoding
        with open_dict(decoding_cfg):
            decoding_cfg.preserve_alignments = True
            decoding_cfg.compute_timestamps = True
            self.model.change_decoding_strategy(decoding_cfg)

        hypotheses = self.model.transcribe([audio], return_hypotheses=True)
        if type(hypotheses) == tuple and len(hypotheses) == 2:
            hypotheses = hypotheses[0]

        timestamp_dict = hypotheses[0].timestep
        time_stride = 8 * self.model.cfg.preprocessor.window_stride
        word_timestamps = timestamp_dict["word"]

        for stamp in word_timestamps:
            stamp["start_ts"] = stamp.pop("start_offset") * time_stride
            stamp["end_ts"] = stamp.pop("end_offset") * time_stride

        self.word_locations_in_samples = word_timestamps
        self.audio_transcript = hypotheses[0].text


class NeuralTTS(ModelInterfaces.ITextToSpeechModel):
    def __init__(self, model: torch.nn.Module, sampling_rate: int) -> None:
        super().__init__()
        self.model = model
        self.sampling_rate = sampling_rate

    def getAudioFromSentence(self, sentence: str) -> np.array:
        with torch.inference_mode():
            audio_transcript = self.model.apply_tts(
                texts=[sentence], sample_rate=self.sampling_rate
            )[0]

        return audio_transcript

    
    
    
    
      assert (
    
      assert (
    


In [None]:
import torch
import numpy as np
import models as mo
import torchaudio
import WordMetrics
import WordMatching as wm
import epitran
import ModelInterfaces as mi
import AIModels
import RuleBasedModels
from string import punctuation
import time


class PronunciationTrainer:
    current_transcript: str
    current_ipa: str
    current_recorded_audio: torch.Tensor
    current_recorded_transcript: str
    current_recorded_word_locations: list
    current_recorded_intonations: torch.tensor
    current_words_pronunciation_accuracy = []
    categories_thresholds = np.array([80, 60, 59])

    sampling_rate = 16000

    def __init__(
        self, asr_model: mi.IASRModel, word_to_ipa_coverter: mi.ITextToPhonemModel
    ) -> None:
        self.asr_model = asr_model
        self.ipa_converter = word_to_ipa_coverter

    def getTranscriptAndWordsLocations(self, audio_length_in_samples: int):
        audio_transcript = self.asr_model.getTranscript()
        word_locations_in_samples = self.asr_model.getWordLocations()
        # print("ASR transcript-output: ", audio_transcript)
        # print("ASR word-locations: ", word_locations_in_samples)

        return audio_transcript, word_locations_in_samples

    ##################### ASR Functions ###########################

    def processAudioForGivenText(self, recordedAudio: str = None, real_text=None):
        start = time.time()
        recording_transcript, recording_ipa, word_locations = self.getAudioTranscript(
            recordedAudio
        )
        print(recording_ipa)
        print("Time for NN to transcript audio: ", str(time.time() - start))

        start = time.time()
        (
            real_and_transcribed_words,
            real_and_transcribed_words_ipa,
            mapped_words_indices,
        ) = self.matchSampleAndRecordedWords(real_text, recording_transcript)
        print("Time for matching transcripts: ", str(time.time() - start))

        start_time, end_time = self.getWordLocationsFromRecordInSeconds(
            word_locations, mapped_words_indices
        )
        pronunciation_accuracy, current_words_pronunciation_accuracy = (
            self.getPronunciationAccuracy(real_and_transcribed_words)
        )  # _ipa
        pronunciation_categories = self.getWordsPronunciationCategory(
            current_words_pronunciation_accuracy
        )

        result = {
            "recording_transcript": recording_transcript,
            "real_and_transcribed_words": real_and_transcribed_words,
            "recording_ipa": recording_ipa,
            "start_time": start_time,
            "end_time": end_time,
            "real_and_transcribed_words_ipa": real_and_transcribed_words_ipa,
            "pronunciation_accuracy": pronunciation_accuracy,
            "pronunciation_categories": pronunciation_categories,
        }

        return result

    def getAudioTranscript(self, recordedAudio: str = None):
        current_recorded_audio, sample_rate = torchaudio.load(recordedAudio)
        transform = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=16000
        )
        current_recorded_audio = transform(current_recorded_audio)
        current_recorded_audio = self.preprocessAudio(current_recorded_audio)

        self.asr_model.processAudio(recordedAudio)
        current_recorded_transcript, current_recorded_word_locations = (
            self.getTranscriptAndWordsLocations(recordedAudio)
        )
        current_recorded_ipa = self.ipa_converter.convertToPhonem(
            current_recorded_transcript
        )

        return (
            current_recorded_transcript,
            current_recorded_ipa,
            current_recorded_word_locations,
        )

    def getWordLocationsFromRecordInSeconds(
        self, word_locations, mapped_words_indices
    ) -> list:
        start_time = []
        end_time = []
        for word_idx in range(len(mapped_words_indices)):
            start_time.append(
                float(word_locations[mapped_words_indices[word_idx]]["start_ts"])
            )
            end_time.append(
                float(word_locations[mapped_words_indices[word_idx]]["end_ts"])
            )

        return " ".join([str(time) for time in start_time]), " ".join(
            [str(time) for time in end_time]
        )

    ##################### END ASR Functions ###########################

    ##################### Evaluation Functions ###########################

    def matchSampleAndRecordedWords(self, real_text, recorded_transcript):
        words_estimated = recorded_transcript.split()

        if real_text is None:
            words_real = self.current_transcript.split()
        else:
            words_real = real_text.split()
            

        mapped_words, mapped_words_indices = wm.get_best_mapped_words(
            words_estimated, words_real
        )
        # print(mapped_words, words_estimated, words_real)
        real_and_transcribed_words = []
        real_and_transcribed_words_ipa = []
        for word_idx in range(len(words_real)):
            if word_idx >= len(mapped_words) - 1:
                mapped_words.append("-")

            real_and_transcribed_words.append(
                (words_real[word_idx], mapped_words[word_idx])
            )
            real_and_transcribed_words_ipa.append(
                (
                   self.ipa_converter.convertToPhonem(words_real[word_idx]), 
                   self.ipa_converter.convertToPhonem(mapped_words[word_idx]),
                )
            )
            
        print(real_and_transcribed_words_ipa)

        return (
            real_and_transcribed_words,
            real_and_transcribed_words_ipa,
            mapped_words_indices,
        )

    def getPronunciationAccuracy(self, real_and_transcribed_words_ipa) -> float:
        total_mismatches = 0.0
        number_of_phonemes = 0.0
        current_words_pronunciation_accuracy = []
        for pair in real_and_transcribed_words_ipa:

            real_without_punctuation = self.removePunctuation(pair[0]).lower()
            number_of_word_mismatches = WordMetrics.edit_distance_python(
                real_without_punctuation, self.removePunctuation(pair[1]).lower()
            )
            total_mismatches += number_of_word_mismatches
            number_of_phonemes_in_word = len(real_without_punctuation)
            number_of_phonemes += number_of_phonemes_in_word

            current_words_pronunciation_accuracy.append(
                float(number_of_phonemes_in_word - number_of_word_mismatches)
                / number_of_phonemes_in_word
                * 100
            )

        percentage_of_correct_pronunciations = (
            (number_of_phonemes - total_mismatches) / number_of_phonemes * 100
        )

        return (
            np.round(percentage_of_correct_pronunciations),
            current_words_pronunciation_accuracy,
        )

    def removePunctuation(self, word: str) -> str:
        return "".join([char for char in word if char not in punctuation])

    def getWordsPronunciationCategory(self, accuracies) -> list:
        categories = []

        for accuracy in accuracies:
            categories.append(self.getPronunciationCategoryFromAccuracy(accuracy))

        return categories

    def getPronunciationCategoryFromAccuracy(self, accuracy) -> int:
        return np.argmin(abs(self.categories_thresholds - accuracy))

    def preprocessAudio(self, audio: torch.tensor) -> torch.tensor:
        audio = audio - torch.mean(audio)
        audio = audio / torch.max(torch.abs(audio))
        return audio

In [None]:
def getTrainer(language: str):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = mo.getASRModel2()
    model = model.to(device)
    model.eval()
    asr_model = NeuralASR(model)

    if language == "de":
        phonem_converter = RuleBasedModels.EpitranPhonemConverter(
            epitran.Epitran("eng-Latn")
        )
    elif language == "en":
        phonem_converter = RuleBasedModels.EngPhonemConverter()
    else:
        raise ValueError("Language not implemented")

    trainer = PronunciationTrainer(asr_model, phonem_converter)

    return trainer


import torchaudio

trainer = getTrainer("en")

[NeMo I 2024-05-29 18:20:14 nemo_logging:381] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2024-05-29 18:20:14 nemo_logging:393] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    
[NeMo W 2024-05-29 18:20:14 nemo_logging:393] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    num_workers: 8

[NeMo I 2024-05-29 18:20:14 nemo_logging:381] PADDING: 0
[NeMo I 2024-05-29 18:20:16 nemo_logging:381] Model EncDecCTCModelBPE was successfully restored from /home/ubuntu/.cache/huggingface/hub/models--nvidia--stt_en_fastconformer_ctc_large/snapshots/42b3eb6bd6f86465f0691f9ea33ddf8f4c5d1c10/stt_en_fastconformer_ctc_large.nemo.


In [None]:
real_transcript = "Printing in the only sense with which we are at present concerned differs from most if not from all the arts and crafts represented in the Exhibition"
file_path = "output.wav"

result = trainer.processAudioForGivenText(file_path, real_transcript)

[NeMo I 2024-05-29 18:20:16 nemo_logging:381] Changed decoding strategy to 
    strategy: greedy
    preserve_alignments: true
    compute_timestamps: true
    word_seperator: ' '
    ctc_timestamp_type: all
    batch_dim_index: 0
    greedy:
      preserve_alignments: false
      compute_timestamps: false
      preserve_frame_confidence: false
      confidence_method_cfg: null
    beam:
      beam_size: 4
      search_type: default
      preserve_alignments: false
      compute_timestamps: false
      return_best_hypothesis: true
      beam_alpha: 1.0
      beam_beta: 0.0
      kenlm_path: null
      flashlight_cfg:
        lexicon_path: null
        boost_path: null
        beam_size_token: 16
        beam_threshold: 20.0
        unk_weight: -.inf
        sil_weight: 0.0
      pyctcdecode_cfg:
        beam_prune_logp: -10.0
        token_min_logp: -5.0
        prune_history: false
        hotwords: null
        hotword_weight: 10.0
    confidence_cfg:
      preserve_frame_confidence:

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

    


prɪntɪŋ ɪn ðə oʊnli sɛns wɪð wɪtʃ wiː ɑːr æt prɛzənt kəlsɑːnuː dɪfɝz frʌm moʊst ɪf nɑːt frʌm ɔl ɑːrtən ənd tʃɛrəftɝ rɛprəzɛntəd ɪn ætʃɪgɪsən
Time for NN to transcript audio:  0.9944329261779785
[('prɪntɪŋ', 'prɪntɪŋ'), ('ɪn', 'ɪn'), ('ðə', 'ðə'), ('oʊnli', 'oʊnli'), ('sɛns', 'sɛns'), ('wɪð', 'wɪð'), ('wɪtʃ', 'wɪtʃ'), ('wiː', 'wiː'), ('ɑːr', 'ɑːr'), ('æt', 'æt'), ('prɛzənt', 'prɛzənt'), ('kənsɝːnd', 'kəlsɑːnuː'), ('dɪfɝz', 'dɪfɝz'), ('frʌm', 'frʌm'), ('moʊst', 'moʊst'), ('ɪf', 'ɪf'), ('nɑːt', 'nɑːt'), ('frʌm', 'frʌm'), ('ɔl', 'ɔl'), ('ðə', '-'), ('ɑːrts', 'ɑːrtən'), ('ənd', 'ənd'), ('kræfts', 'tʃɛrəftɝ'), ('rɛprəzɛntəd', 'rɛprəzɛntəd'), ('ɪn', 'ɪn'), ('ðə', '-'), ('ɛksəbɪʃən', 'ætʃɪgɪsən')]
Time for matching transcripts:  8.331892251968384


In [None]:
print(result)

{'recording_transcript': 'printing in the only sense with which we are at present callsanu differs from most if not from all artan and charaafter represented in achigisson', 'real_and_transcribed_words': [('Printing', 'printing'), ('in', 'in'), ('the', 'the'), ('only', 'only'), ('sense', 'sense'), ('with', 'with'), ('which', 'which'), ('we', 'we'), ('are', 'are'), ('at', 'at'), ('present', 'present'), ('concerned', 'callsanu'), ('differs', 'differs'), ('from', 'from'), ('most', 'most'), ('if', 'if'), ('not', 'not'), ('from', 'from'), ('all', 'all'), ('the', '-'), ('arts', 'artan'), ('and', 'and'), ('crafts', 'charaafter'), ('represented', 'represented'), ('in', 'in'), ('the', '-'), ('Exhibition', 'achigisson')], 'recording_ipa': 'prɪntɪŋ ɪn ðə oʊnli sɛns wɪð wɪtʃ wiː ɑːr æt prɛzənt kəlsɑːnuː dɪfɝz frʌm moʊst ɪf nɑːt frʌm ɔl ɑːrtən ənd tʃɛrəftɝ rɛprəzɛntəd ɪn ætʃɪgɪsən', 'start_time': '0.72 1.2 1.36 1.52 1.68 2.16 2.64 2.8000000000000003 3.04 3.2 3.44 3.6 4.72 5.68 5.92 6.16 6.640000000

In [None]:
print("Pronunciation Score: ", result["pronunciation_accuracy"])

Pronunciation Score:  80.0


In [None]:
import json

In [None]:
event_tts = {
    "body": json.dumps(
        {
            "value": "Printing in the only sense with which we are at present concerned differs from most if not from all the arts and crafts represented in the Exhibition",
            "language": "de",
        }
    )
}

In [None]:
real_transcript = json.loads(event_tts["body"])["value"]
print(real_transcript)

Printing in the only sense with which we are at present concerned differs from most if not from all the arts and crafts represented in the Exhibition


In [None]:
import lambdaTTS

output_tts = lambdaTTS.lambda_handler(event_tts, [])

body = json.loads(output_tts["body"])
wavBase64 = body["wavBase64"]
print(wavBase64[:100])

Using cache found in /home/ubuntu/.cache/torch/hub/snakers4_silero-models_master
    


UklGRqSjBABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YYCjBAATAC0ANgA/AE8AUQBbAGUAbQBpAGYAYwBaAE8ASABC


In [None]:
encoded_audio = wavBase64

dummy_event = {
    "body": json.dumps(
        {
            "title": real_transcript,
            "base64Audio": "data:audio/ogg;base64," + encoded_audio,
            "language": "de",
        }
    )
}

In [None]:
import torch
import json
import os
import WordMatching as wm
import utilsFileIO
import pronunciationTrainer
import base64
import time
import audioread
import numpy as np
from torchaudio.transforms import Resample


trainer_SST_lambda = {}
trainer_SST_lambda["de"] = getTrainer("de")
trainer_SST_lambda["en"] = getTrainer("en")

transform = Resample(orig_freq=48000, new_freq=16000)


def lambda_handler(event, result):

    data = json.loads(event["body"])

    real_text = data["title"]
    file_bytes = base64.b64decode(data["base64Audio"][22:].encode("utf-8"))
    language = data["language"]

    if len(real_text) == 0:
        return {
            "statusCode": 200,
            "headers": {
                "Access-Control-Allow-Headers": "*",
                "Access-Control-Allow-Credentials": "true",
                "Access-Control-Allow-Origin": "*",
                "Access-Control-Allow-Methods": "OPTIONS,POST,GET",
            },
            "body": "",
        }

    start = time.time()
    random_file_name = "./" + utilsFileIO.generateRandomString() + ".ogg"
    f = open(random_file_name, "wb")
    f.write(file_bytes)
    f.close()
    print("Time for saving binary in file: ", str(time.time() - start))

    # start = time.time()
    # signal, fs = audioread_load(random_file_name)
 
    # signal = transform(torch.Tensor(signal)).unsqueeze(0)

    print("Time for loading .ogg file file: ", str(time.time() - start))

    result = trainer_SST_lambda[language].processAudioForGivenText(random_file_name, real_text)

    start = time.time()
    os.remove(random_file_name)
    print("Time for deleting file: ", str(time.time() - start))

    start = time.time()
    real_transcripts_ipa = " ".join(
        [word[0] for word in result["real_and_transcribed_words_ipa"]]
    )
    matched_transcripts_ipa = " ".join(
        [word[1] for word in result["real_and_transcribed_words_ipa"]]
    )

    real_transcripts = " ".join(
        [word[0] for word in result["real_and_transcribed_words"]]
    )
    matched_transcripts = " ".join(
        [word[1] for word in result["real_and_transcribed_words"]]
    )

    words_real = real_transcripts.lower().split()
    mapped_words = matched_transcripts.split()

    is_letter_correct_all_words = ""
    for idx, word_real in enumerate(words_real):

        mapped_letters, mapped_letters_indices = wm.get_best_mapped_words(
            mapped_words[idx], word_real
        )

        is_letter_correct = wm.getWhichLettersWereTranscribedCorrectly(
            word_real, mapped_letters
        )  # , mapped_letters_indices)

        is_letter_correct_all_words += (
            "".join([str(is_correct) for is_correct in is_letter_correct]) + " "
        )

    pair_accuracy_category = " ".join(
        [str(category) for category in result["pronunciation_categories"]]
    )
    print("Time to post-process results: ", str(time.time() - start))

    res = {
        "real_transcript": result["recording_transcript"],
        "ipa_transcript": result["recording_ipa"],
        "pronunciation_accuracy": str(int(result["pronunciation_accuracy"])),
        "real_transcripts": real_transcripts,
        "matched_transcripts": matched_transcripts,
        "real_transcripts_ipa": real_transcripts_ipa,
        "matched_transcripts_ipa": matched_transcripts_ipa,
        "pair_accuracy_category": pair_accuracy_category,
        "start_time": result["start_time"],
        "end_time": result["end_time"],
        "is_letter_correct_all_words": is_letter_correct_all_words,
    }

    return json.dumps(res)


# From Librosa


def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
    """Load an audio buffer using audioread.

    This loads one block at a time, and then concatenates the results.
    """

    y = []
    with audioread.audio_open(path) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration)) * n_channels)

        n = 0

        for frame in input_file:
            frame = buf_to_float(frame, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end.  stop reading
                break

            if s_end < n:
                # the end is in this frame.  crop.
                frame = frame[: s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev) :]

            # tack on the current frame
            y.append(frame)

    if y:
        y = np.concatenate(y)
        if n_channels > 1:
            y = y.reshape((-1, n_channels)).T
    else:
        y = np.empty(0, dtype=dtype)

    return y, sr_native


# From Librosa


def buf_to_float(x, n_bytes=2, dtype=np.float32):
    """Convert an integer buffer to floating point values.
    This is primarily useful when loading integer-valued wav data
    into numpy arrays.

    Parameters
    ----------
    x : np.ndarray [dtype=int]
        The integer-valued data buffer

    n_bytes : int [1, 2, 4]
        The number of bytes per sample in ``x``

    dtype : numeric type
        The target output type (default: 32-bit float)

    Returns
    -------
    x_float : np.ndarray [dtype=float]
        The input data buffer cast to floating point
    """

    # Invert the scale of the data
    scale = 1.0 / float(1 << ((8 * n_bytes) - 1))

    # Construct the format string
    fmt = "<i{:d}".format(n_bytes)

    # Rescale and format the data buffer
    return scale * np.frombuffer(x, fmt).astype(dtype)


[NeMo I 2024-05-29 18:20:32 nemo_logging:381] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2024-05-29 18:20:32 nemo_logging:393] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    
[NeMo W 2024-05-29 18:20:32 nemo_logging:393] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    num_workers: 8

[NeMo I 2024-05-29 18:20:32 nemo_logging:381] PADDING: 0
[NeMo I 2024-05-29 18:20:34 nemo_logging:381] Model EncDecCTCModelBPE was successfully restored from /home/ubuntu/.cache/huggingface/hub/models--nvidia--stt_en_fastconformer_ctc_large/snapshots/42b3eb6bd6f86465f0691f9ea33ddf8f4c5d1c10/stt_en_fastconformer_ctc_large.nemo.
[NeMo I 2024-05-29 18:20:36 nemo_logging:381] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2024-05-29 18:20:36 nemo_logging:393] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    
[NeMo W 2024-05-29 18:20:36 nemo_logging:393] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    num_workers: 8

[NeMo I 2024-05-29 18:20:36 nemo_logging:381] PADDING: 0
[NeMo I 2024-05-29 18:20:38 nemo_logging:381] Model EncDecCTCModelBPE was successfully restored from /home/ubuntu/.cache/huggingface/hub/models--nvidia--stt_en_fastconformer_ctc_large/snapshots/42b3eb6bd6f86465f0691f9ea33ddf8f4c5d1c10/stt_en_fastconformer_ctc_large.nemo.


In [None]:
res = lambda_handler(dummy_event, result)

Time for saving binary in file:  0.0006592273712158203
Time for loading .ogg file file:  0.0007236003875732422
[NeMo I 2024-05-29 18:20:38 nemo_logging:381] Changed decoding strategy to 
    strategy: greedy
    preserve_alignments: true
    compute_timestamps: true
    word_seperator: ' '
    ctc_timestamp_type: all
    batch_dim_index: 0
    greedy:
      preserve_alignments: false
      compute_timestamps: false
      preserve_frame_confidence: false
      confidence_method_cfg: null
    beam:
      beam_size: 4
      search_type: default
      preserve_alignments: false
      compute_timestamps: false
      return_best_hypothesis: true
      beam_alpha: 1.0
      beam_beta: 0.0
      kenlm_path: null
      flashlight_cfg:
        lexicon_path: null
        boost_path: null
        beam_size_token: 16
        beam_threshold: 20.0
        unk_weight: -.inf
        sil_weight: 0.0
      pyctcdecode_cfg:
        beam_prune_logp: -10.0
        token_min_logp: -5.0
        prune_history:

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]



IndexError: list index out of range

In [None]:
print(res)

{"real_transcript": "printing in the only zens with which we are at present concerned differs from most if not from altar arts and crafts represented in the exhibits zone", "ipa_transcript": "p\u0280\u026anti\u02d0\u014b i\u02d0n te\u02d0 \u0254nly\u02d0 t\u0361s\u0259ns vi\u02d0t vhi\u02d0x ve\u02d0 \u0251\u02d0\u0280e\u02d0 \u0251\u02d0t p\u0280e\u02d0s\u0259nt k\u0254\u014bk\u0259\u0280ne\u02d0d d\u026af\u0259\u0280s f\u0280o\u02d0m m\u0254st i\u02d0f no\u02d0t f\u0280o\u02d0m alt\u0251\u02d0\u0280 a\u0280ts and k\u0280afts \u0280\u025bp\u0280e\u02d0s\u0259nte\u02d0d i\u02d0n te\u02d0 \u025bkshi\u02d0b\u026ats t\u0361so\u02d0n\u0259", "pronunciation_accuracy": "91", "real_transcripts": "Printing in the only sense with which we are at present concerned differs from most if not from all the arts and crafts represented in the Exhibition", "matched_transcripts": "printing in the only zens with which we are at present concerned differs from most if not from - - arts and crafts represente

In [None]:
import pandas as pd

# Assuming res is your dictionary
df = pd.DataFrame([res])
df

Unnamed: 0,0
0,"{""real_transcript"": ""printing in the only zens..."


In [None]:
import pandas as pd

# Data provided by the user
data = {
    "real_transcript": "printing in the only sense with which we are at present callsanu differs from most if not from all artan and charaafter represented in achigisson",
    "ipa_transcript": "ˈprɪnɪŋ ɪn ðə ˈoʊnli sɛns wɪθ wɪʃ wi ər æt ˈprɛzənt callsanu ˈdɪfərz frəm moʊst ɪf nɑt frəm ɔl artan ənd charaafter ˌrɛprɪˈzɛnɪd ɪn achigisson",
    "pronunciation_accuracy": 49,
    "real_transcripts": "Printing in the only sense with which we are at present concerned differs from most if not from all the arts and crafts represented in the Exhibition",
    "matched_transcripts": "printing in the only sense with which we are at present callsanu differs from most if not from all artan and charaafter represented in achigisson - -",
    "real_transcripts_ipa": "ˈprɪnɪŋ ɪn ðə ˈoʊnli sɛns wɪθ wɪʃ wi ər æt ˈprɛzənt kənˈsɜrnd ˈdɪfərz frəm moʊst ɪf nɑt frəm ɔl ðə ɑrts ənd kræfts ˌrɛprɪˈzɛnɪd ɪn ðə ˌɛksəˈbɪʃən",
    "matched_transcripts_ipa": "ˈprɪnɪŋ ɪn ðə ˈoʊnli sɛns wɪθ wɪʃ wi ər æt ˈprɛzənt callsanu ˈdɪfərz frəm moʊst ɪf nɑt frəm ɔl artan ənd charaafter ˌrɛprɪˈzɛnɪd ɪn achigisson  ",
    "pair_accuracy_category": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2],
    "start_time": [0.72, 1.2, 1.36, 1.52, 1.68, 2.16, 2.64, 2.8000000000000003, 3.04, 3.2, 3.44, 3.6, 4.72, 5.68, 5.92, 6.16, 6.640000000000001, 6.96, 7.28, 7.5200000000000005, 8.24, 8.4, 9.040000000000001, 10.8, 10.96, 10.96, 10.96],
    "end_time": [1.2, 1.36, 1.52, 1.68, 2.16, 2.64, 2.8000000000000003, 3.04, 3.2, 3.44, 3.6, 4.72, 5.68, 5.92, 6.16, 6.640000000000001, 6.96, 7.28, 7.5200000000000005, 8.24, 8.4, 9.040000000000001, 10.8, 10.96, 11.92, 11.92, 11.92],
    "is_letter_correct_all_words": "11111111 11 111 1111 11111 1111 11111 11 111 11 1111111 100000000 1111000 1111 1111 11 111 1111 111 100 1000 100 010000 00000000000 11 000 0000000000"
}

# Convert the data into a DataFrame for better visualization
df = pd.DataFrame(data)
%pip install ace-tools
import ace_tools as tools; tools.display_dataframe_to_user(name="Pronunciation Data", dataframe=df)

[31mERROR: Could not find a version that satisfies the requirement ace-tools (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for ace-tools[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'ace_tools'

In [None]:
real = 'Printing in the only sense with which we are at present concerned differs from most if not from all the arts and crafts represented in the Exhibition'
recorded = 'printing in the only sense with which we are at present callsanu differs from most if not from all artan and charaafter represented in achigisson'

In [None]:
words_real = real.split()
words_estimated = recorded.split()

In [None]:
import WordMatching as wm
import WordMetrics
from ortools.sat.python import cp_model
import numpy as np
from string import punctuation
from dtwalign import dtw_from_distance_matrix
import time

offset_blank = 1
TIME_THRESHOLD_MAPPING = 5.0


word_distance_matrix = wm.get_word_distance_matrix(words_estimated, words_real)
print(word_distance_matrix)

[[ 1.  6.  7.  7.  7.  6.  7.  8.  7.  7.  5.  8.  7.  7.  7.  7.  7.  7.
   8.  7.  6.  7.  6.  8.  6.  7.  7.]
 [ 6.  0.  3.  3.  4.  3.  4.  2.  3.  2.  6.  8.  6.  4.  4.  1.  3.  4.
   3.  3.  4.  2.  6. 10.  0.  3.  8.]
 [ 7.  3.  0.  4.  4.  3.  4.  2.  2.  3.  6.  8.  6.  4.  4.  3.  3.  4.
   3.  0.  4.  3.  6. 10.  3.  0.  9.]
 [ 7.  3.  4.  0.  4.  4.  5.  4.  4.  4.  7.  7.  7.  4.  4.  4.  3.  4.
   3.  4.  4.  3.  6. 10.  3.  4. 10.]
 [ 7.  4.  4.  4.  0.  5.  5.  4.  4.  5.  5.  7.  6.  5.  4.  5.  4.  5.
   5.  4.  4.  4.  6.  7.  4.  4. 10.]
 [ 6.  3.  3.  4.  5.  0.  2.  3.  4.  3.  7.  9.  6.  4.  4.  3.  3.  4.
   4.  3.  3.  4.  5. 10.  3.  3.  8.]
 [ 7.  4.  4.  5.  5.  2.  0.  4.  5.  5.  7.  8.  7.  5.  5.  4.  5.  5.
   5.  4.  5.  5.  6. 11.  4.  4.  8.]
 [ 8.  2.  2.  4.  4.  3.  4.  0.  2.  2.  6.  8.  6.  4.  4.  2.  3.  4.
   3.  2.  4.  3.  6. 10.  2.  2. 10.]
 [ 7.  3.  2.  4.  4.  4.  5.  2.  0.  2.  5.  7.  6.  3.  4.  3.  3.  3.
   2.  2.  2.  2.  5. 

In [None]:
def get_best_path_from_distance_matrix(word_distance_matrix):
    modelCpp = cp_model.CpModel()
    print("check cp")
    number_of_real_words = word_distance_matrix.shape[1]
    number_of_estimated_words = word_distance_matrix.shape[0]-1

    number_words = np.maximum(number_of_real_words, number_of_estimated_words)

    estimated_words_order = [modelCpp.NewIntVar(0, int(
        number_words - 1 + offset_blank), 'w%i' % i) for i in range(number_words+offset_blank)]

    # They are in ascending order
    for word_idx in range(number_words-1):
        modelCpp.Add(
            estimated_words_order[word_idx+1] >= estimated_words_order[word_idx])

    total_phoneme_distance = 0
    real_word_at_time = {}
    for idx_estimated in range(number_of_estimated_words):
        for idx_real in range(number_of_real_words):
            real_word_at_time[idx_estimated, idx_real] = modelCpp.NewBoolVar(
                'real_word_at_time'+str(idx_real)+'-'+str(idx_estimated))
            modelCpp.Add(estimated_words_order[idx_estimated] == idx_real).OnlyEnforceIf(
                real_word_at_time[idx_estimated, idx_real])
            total_phoneme_distance += word_distance_matrix[idx_estimated,
                                                           idx_real]*real_word_at_time[idx_estimated, idx_real]

    # If no word in time, difference is calculated from empty string
    for idx_real in range(number_of_real_words):
        word_has_a_match = modelCpp.NewBoolVar(
            'word_has_a_match'+str(idx_real))
        modelCpp.Add(sum([real_word_at_time[idx_estimated, idx_real] for idx_estimated in range(
            number_of_estimated_words)]) == 1).OnlyEnforceIf(word_has_a_match)
        total_phoneme_distance += word_distance_matrix[number_of_estimated_words,
                                                       idx_real]*word_has_a_match.Not()

    # Loss should be minimized
    modelCpp.Minimize(total_phoneme_distance)

    solver = cp_model.CpSolver()
    solver.parameters.max_time_in_seconds = TIME_THRESHOLD_MAPPING
    status = solver.Solve(modelCpp)
    print('check 2')
    print(status)
    print(solver)
    mapped_indices = []
    try:
        for word_idx in range(number_words):
            mapped_indices.append(
                (solver.Value(estimated_words_order[word_idx])))
        print("checkkkk")
        return np.array(mapped_indices, dtype=int)
    except:
        return []

In [None]:
mapped_indices = get_best_path_from_distance_matrix(word_distance_matrix)

check cp


check 2
4
<ortools.sat.python.cp_model.CpSolver object at 0x7f99c9150510>
checkkkk


In [None]:
print(mapped_indices)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 20 21 22 23 24
 26 26 26]


In [None]:
import eng_to_ipa as ipa
ipa.convert("hurray")

'həˈreɪ'