# Installing Requirements

In [None]:
!pip install allosaurus

Collecting allosaurus
  Downloading allosaurus-1.0.2-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.1/52.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting resampy (from allosaurus)
  Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting panphon (from allosaurus)
  Downloading panphon-0.20.0-py2.py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting unicodecsv (from panphon->allosaurus)
  Downloading unicodecsv-0.14.1.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting munkres (from panphon->allosaurus)
  Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->allosaurus)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py

# Installing the latest model from releases

In [None]:
#!python -m allosaurus.bin.list_model

Available Models
- uni2005 (default)


In [None]:
!python -m allosaurus.bin.download_model -m "latest"

downloading model  latest
from:  https://github.com/xinjli/allosaurus/releases/download/v1.0/latest.tar.gz
to:    /usr/local/lib/python3.10/dist-packages/allosaurus/pretrained
please wait...


In [None]:
!python -m allosaurus.bin.update_phone --lang "deu" --input "/content/phone_dir"

In [None]:
!python -m allosaurus.bin.list_phone --lang "deu"

a aː b d d̠ d̺ e eː f h i iː j k kʰ l m n oː p pʰ s t tʰ t̠ u uː v x y yː z øː ŋ œ ɐ ɔ ə ɛ ɛː ɡ ɪ ʀ ʁ ʃ ʊ ʏ ʏː ʒ ʔ ʋ ɕ o ɔɪ ç ɑː ɑ r ts p͡f ø tʃ dʒ aɪ aʊ g t͡s ɔʏ


In [None]:
#!python -m allosaurus.bin.write_phone --lang "deu" --output "phone_dir"

# Preparing train and test data for modelling

In [None]:
!unzip data_100.zip

In [None]:
!python -m allosaurus.bin.prep_feat --model="uni2005" --path="/content/data/validate"

100% 20/20 [00:05<00:00,  3.66it/s]


/content/test/SNM-test/test_20/clips1/common_voice_de_17639965.wav

In [None]:
!python -m allosaurus.bin.prep_feat --model="uni2005" --path="/content/data/train"

100% 100/100 [00:21<00:00,  4.57it/s]


# Preparing train and test text features

In [None]:
!python -m allosaurus.bin.prep_token --model="uni2005" --lang="deu" --path="/content/data/train"

  0% 0/100 [00:00<?, ?it/s]100% 100/100 [00:00<00:00, 44705.86it/s]


In [None]:
!python -m allosaurus.bin.prep_token --model="uni2005" --lang="deu" --path="/content/data/validate"

  0% 0/20 [00:00<?, ?it/s]100% 20/20 [00:00<00:00, 31524.27it/s]


# Training the model

In [None]:
!python -m allosaurus.bin.adapt_model --pretrained_model="uni2005" --new_model="allo_100_1" --path="/content/data" --lang="deu" --device_id=0 --epoch=50

epoch[batch]: 00[0000] | train loss 4.10741 train per 0.59939
epoch0 | validate per : 0.55771
saving model
epoch[batch]: 01[0000] | train loss 3.71454 train per 0.52862
epoch1 | validate per : 0.57929
epoch[batch]: 02[0000] | train loss 3.29648 train per 0.52213
epoch2 | validate per : 0.54693
saving model
epoch[batch]: 03[0000] | train loss 3.17889 train per 0.51976
epoch3 | validate per : 0.52751
saving model
epoch[batch]: 04[0000] | train loss 2.81023 train per 0.47119
epoch4 | validate per : 0.50917
saving model
epoch[batch]: 05[0000] | train loss 2.72573 train per 0.49422
epoch5 | validate per : 0.49515
saving model
epoch[batch]: 06[0000] | train loss 2.40107 train per 0.45058
epoch6 | validate per : 0.49622
epoch[batch]: 07[0000] | train loss 2.25515 train per 0.43481
epoch7 | validate per : 0.48544
saving model
epoch[batch]: 08[0000] | train loss 2.28269 train per 0.43502
epoch8 | validate per : 0.47896
saving model
epoch[batch]: 09[0000] | train loss 2.02656 train per 0.40995
e

# Testing the model

In [None]:
!python -m allosaurus.bin.list_model

In [None]:
import allosaurus
from allosaurus.app import read_recognizer

# Load the fine-tuned model produced by the adapt_model training cell, which was run
# with --new_model="allo_100_1" (the previous "your_new_model_name" was a placeholder).
model = read_recognizer("allo_100_1")

In [None]:
# Run the loaded recognizer on one audio file, passing 'deu' as the language id
# (the same id given to --lang in the CLI cells of this notebook).
# NOTE(review): `path_to_voice_file` is not assigned in any visible cell — it must be
# defined (path to the audio file to transcribe) before this cell can run on a fresh kernel.
output = model.recognize(path_to_voice_file,'deu')

In [None]:
import numpy as np
from typing import List, Tuple

def levenshtein_distance(reference: List[str], hypothesis: List[str]) -> Tuple[int, int, int]:
    """
    Align two phoneme sequences and count the edit operations between them.

    A standard edit-distance table is filled with unit costs, then walked back
    from the bottom-right corner to classify each step of one minimal alignment.
    When several minimal alignments exist, a diagonal step (match/substitution)
    is preferred over a deletion, and a deletion over an insertion.

    :param reference: List of phonemes in the reference transcription.
    :param hypothesis: List of phonemes in the hypothesis transcription.
    :return: Tuple ``(insertions, deletions, substitutions)`` needed to turn
             ``reference`` into ``hypothesis``.
    """
    n_ref, n_hyp = len(reference), len(hypothesis)

    # table[r, c] = edit distance between reference[:r] and hypothesis[:c].
    table = np.zeros((n_ref + 1, n_hyp + 1), dtype=int)
    table[:, 0] = np.arange(n_ref + 1)   # r deletions turn reference[:r] into []
    table[0, :] = np.arange(n_hyp + 1)   # c insertions turn [] into hypothesis[:c]

    for row, ref_phone in enumerate(reference, start=1):
        for col, hyp_phone in enumerate(hypothesis, start=1):
            mismatch = 0 if ref_phone == hyp_phone else 1
            table[row, col] = min(
                table[row - 1, col] + 1,             # Deletion
                table[row, col - 1] + 1,             # Insertion
                table[row - 1, col - 1] + mismatch,  # Substitution / match
            )

    # Walk back from the bottom-right corner, classifying each step.
    ins_count = del_count = sub_count = 0
    row, col = n_ref, n_hyp
    while row > 0 or col > 0:
        if row > 0 and col > 0:
            differs = reference[row - 1] != hypothesis[col - 1]
            if table[row, col] == table[row - 1, col - 1] + (1 if differs else 0):
                # Diagonal step: a substitution only if the phonemes differ.
                if differs:
                    sub_count += 1
                row -= 1
                col -= 1
                continue
        if row > 0 and table[row, col] == table[row - 1, col] + 1:
            # Step up: a reference phoneme has no counterpart in the hypothesis.
            del_count += 1
            row -= 1
            continue
        # Step left: the hypothesis contains an extra phoneme.
        ins_count += 1
        col -= 1

    return ins_count, del_count, sub_count

def phoneme_error_rate(reference: List[str], hypothesis: List[str]) -> float:
    """
    Calculate the Phoneme Error Rate (PER).

    PER = (insertions + deletions + substitutions) / len(reference), where the
    edit counts come from ``levenshtein_distance(reference, hypothesis)``.
    Because insertions are counted, the result can exceed 1.0.

    :param reference: List of phonemes in the reference transcription.
    :param hypothesis: List of phonemes in the hypothesis transcription.
    :return: Phoneme Error Rate as a float; 0.0 when both sequences are empty.
    :raises ValueError: If ``reference`` is empty but ``hypothesis`` is not,
        since the rate is undefined without reference phonemes.
    """
    total_phonemes = len(reference)

    # Guard the division: an empty reference previously failed with a bare
    # ZeroDivisionError that gave no hint about the cause.
    if total_phonemes == 0:
        if len(hypothesis) == 0:
            return 0.0  # nothing expected, nothing produced: no errors
        raise ValueError(
            "phoneme_error_rate is undefined for an empty reference "
            "with a non-empty hypothesis"
        )

    insertions, deletions, substitutions = levenshtein_distance(reference, hypothesis)
    total_errors = insertions + deletions + substitutions

    return total_errors / total_phonemes

# Example usage
# The two sequences have the same length and differ at two positions ("n"/"m" swapped),
# which the aligner counts as 2 substitutions over 7 reference phonemes.
reference_phonemes = ["p", "h", "o", "n", "e", "m", "e"]
hypothesis_phonemes = ["p", "h", "o", "m", "e", "n", "e"]

per = phoneme_error_rate(reference_phonemes, hypothesis_phonemes)
# Formats the ratio as a percentage with two decimals; prints "Phoneme Error Rate: 28.57%".
print(f"Phoneme Error Rate: {per:.2%}")
