# Exercise 4: Automatic Speech Recognition (ASR) System

## Libraries

In [1]:
!pip install vosk soundfile jiwer noisereduce scipy webrtcvad tabulate

import json
import re
import string
import unicodedata
import wave

import librosa as lr
import noisereduce as nr
import numpy as np
import scipy.signal as sp
import soundfile as sf
from tabulate import tabulate
from vosk import Model, KaldiRecognizer

Collecting vosk
  Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (1.8 kB)
Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Collecting webrtcvad
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting srt (from vosk)
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting websockets (from vosk)
  Downloading websockets-11.0.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting click<9.0.0,>=8.1.3 (from jiwer)
  Downloading click-8.1.8-py3-none-any.whl.

## Filters

In [2]:
# Noise reduction
def dynamic_noise_reduction(audio, sample_rate):
    """Denoise ``audio`` with ``nr.reduce_noise``, profiling noise from its opening.

    Args:
        audio: Audio samples to clean.
        sample_rate: Sampling rate of ``audio`` in Hz.

    Returns:
        Whatever ``nr.reduce_noise`` returns for the given signal.
    """
    # The first half second of the recording serves as the noise sample.
    noise_sample_count = int(0.5 * sample_rate)
    return nr.reduce_noise(
        y=audio,
        sr=sample_rate,
        y_noise=audio[:noise_sample_count],
        prop_decrease=0.8,
    )

# Apply high-pass filter with dynamic cutoff frequency
def apply_highpass_filter(audio, sample_rate, lowcut):
    """Run ``audio`` through a 10th-order Butterworth high-pass filter.

    Args:
        audio: Audio samples to filter.
        sample_rate: Sampling rate of ``audio`` in Hz.
        lowcut: Cutoff frequency of the high-pass filter in Hz.

    Returns:
        The filtered samples, as produced by ``scipy.signal.sosfilt``.
    """
    # Second-order sections are requested from the filter design step.
    highpass_sections = sp.butter(
        N=10, Wn=lowcut, btype='high', output='sos', fs=sample_rate
    )
    return sp.sosfilt(highpass_sections, audio)

# Normalize and Amplify audio
def adaptive_amplify_audio(audio, target_peak=0.9):
    """Rescale ``audio`` so that its largest absolute sample equals ``target_peak``.

    Args:
        audio: Audio samples (array-like).
        target_peak: Desired absolute peak after scaling.

    Returns:
        The rescaled samples, clamped to ``[-1.0, 1.0]``. Empty or all-zero
        input is returned unchanged.
    """
    # np.max raises ValueError on a zero-size array, so bail out early.
    if np.size(audio) == 0:
        return audio

    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio * (target_peak / max_val)
        # Safety clamp: only has an effect when target_peak exceeds 1.0.
        audio = np.clip(audio, -1.0, 1.0)
    return audio

# Preprocess audio for a specific lowcut frequency
def preprocess_audio(audio_path, target_sample_rate=16000, lowcut=180):
    """Load an audio file and run it through the cleaning pipeline.

    The stages run in this order: noise reduction, high-pass filtering at
    ``lowcut`` Hz, then peak scaling to 0.5.

    Args:
        audio_path: Path of the audio file to load.
        target_sample_rate: Sampling rate requested from the loader, in Hz.
        lowcut: Cutoff frequency for the high-pass filter, in Hz.

    Returns:
        A ``(samples, sample_rate)`` tuple with the processed signal.
    """
    signal, sample_rate = lr.load(audio_path, sr=target_sample_rate, mono=True)

    # Each stage consumes the output of the previous one.
    signal = dynamic_noise_reduction(signal, sample_rate)
    signal = apply_highpass_filter(signal, sample_rate, lowcut)
    signal = adaptive_amplify_audio(signal, target_peak=0.5)

    return signal, sample_rate

## Post-processing

In [3]:
# Normalize text
def normalize_text(text):
    """Canonicalize a sentence so reference and transcription can be compared.

    Steps, in order:
      1. Map typographic apostrophes to the ASCII apostrophe so the
         contraction table below matches them.
      2. Strip diacritics (NFD decomposition, combining marks dropped).
      3. Lowercase.
      4. Expand the known contractions.
      5. Remove punctuation: everything in ``string.punctuation`` plus any
         character whose Unicode category is a punctuation class, which
         covers the inverted Spanish marks and typographic quotes.
      6. Split the standalone word "checkin" into "check in".

    Args:
        text: The sentence to normalize.

    Returns:
        The normalized sentence.
    """
    # Typographic apostrophes are not in string.punctuation; without this
    # step they would survive normalization and break word matching.
    for curly_quote in ("\u2018", "\u2019"):
        text = text.replace(curly_quote, "'")

    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')
    text = text.lower()

    contractions = {
        "i've": "i have",
        "where's": "where is",
    }
    for contraction, expanded in contractions.items():
        text = text.replace(contraction, expanded)

    ascii_punctuation = set(string.punctuation)
    text = ''.join(
        c for c in text
        if c not in ascii_punctuation and not unicodedata.category(c).startswith('P')
    )

    # Whole-word match only, so words such as "checking" are left intact.
    text = re.sub(r"\bcheckin\b", "check in", text)
    return text

# Calculate WER
def calculate_wer(reference, hypothesis):
    """Compute the word error rate of ``hypothesis`` against ``reference``.

    Both strings are split on whitespace. The word-level edit distance
    (substitutions, insertions, deletions, each costing 1) is divided by the
    number of reference words.

    Args:
        reference: The ground-truth sentence.
        hypothesis: The transcription to score.

    Returns:
        The error rate as a float. With an empty reference the ratio is
        undefined, so the function returns ``0.0`` when the hypothesis is also
        empty and ``inf`` otherwise, instead of dividing by zero.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    n_ref = len(ref_words)
    n_hyp = len(hyp_words)

    # Guard the division below against an empty reference.
    if n_ref == 0:
        return 0.0 if n_hyp == 0 else float("inf")

    # d[i, j] = edit distance between the first i reference words and the
    # first j hypothesis words.
    d = np.zeros((n_ref + 1, n_hyp + 1), dtype=int)
    d[:, 0] = np.arange(n_ref + 1)
    d[0, :] = np.arange(n_hyp + 1)

    for i in range(1, n_ref + 1):
        for j in range(1, n_hyp + 1):
            if ref_words[i - 1] == hyp_words[j - 1]:
                d[i, j] = d[i - 1, j - 1]
            else:
                d[i, j] = 1 + min(
                    d[i - 1, j - 1],  # Substitution
                    d[i, j - 1],      # Insertion
                    d[i - 1, j],      # Deletion
                )
    return float(d[n_ref, n_hyp]) / n_ref

## Model Initialization & Transcription

In [4]:
# Initialize model
def initialize_model(language_model_path):
    """Build and return a ``Model`` from the directory at ``language_model_path``."""
    return Model(language_model_path)

# Transcribe audio
def transcribe_audio(model, audio_path, lowcut):
    """Preprocess an audio file and transcribe it with the given model.

    The preprocessed signal is written to ``filtered_audio_{lowcut}.wav`` in
    the working directory, then read back in chunks and fed to a
    ``KaldiRecognizer``.

    Args:
        model: Model object handed to ``KaldiRecognizer``.
        audio_path: Path of the audio file to transcribe.
        lowcut: High-pass cutoff in Hz used during preprocessing. It also
            determines the name of the intermediate WAV file.

    Returns:
        The recognized text. Non-empty segment texts are joined with single
        spaces, so empty segments no longer leave stray whitespace.
    """
    audio, sample_rate = preprocess_audio(audio_path, lowcut=lowcut)

    filtered_path = f"filtered_audio_{lowcut}.wav"
    # The raw frames read back below go straight to the recognizer, so make
    # the 16-bit PCM encoding explicit rather than relying on the default.
    sf.write(filtered_path, audio, sample_rate, subtype="PCM_16")

    recognizer = KaldiRecognizer(model, sample_rate)
    segments = []
    with wave.open(filtered_path, "rb") as wf:
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            # AcceptWaveform returning true means a segment result is ready.
            if recognizer.AcceptWaveform(data):
                result = json.loads(recognizer.Result())
                segments.append(result.get("text", ""))
    final_result = json.loads(recognizer.FinalResult())
    segments.append(final_result.get("text", ""))

    # Skip empty segments; joining them produced trailing or double spaces.
    return " ".join(segment for segment in segments if segment)

## Evaluation Logic

In [5]:
if __name__ == "__main__":
    # Model directory for each language code.
    models = {
        "EN": "Models/vosk-model-small-en-us-0.15",
        "IT": "Models/vosk-model-small-it-0.22",
        "ES": "Models/vosk-model-small-es-0.42"
    }
    language_models = {lang: initialize_model(path) for lang, path in models.items()}

    # Per language: audio file path -> reference sentence.
    audio_files = {
        "EN": {
            "Audio/EN/checkin.wav": "Where is the check-in desk?",
            "Audio/EN/checkin_child.wav": "Where is the check-in desk?",
            "Audio/EN/parents.wav": "I have lost my parents.",
            "Audio/EN/parents_child.wav": "I have lost my parents.",
            "Audio/EN/suitcase.wav": "Please, I have lost my suitcase.",
            "Audio/EN/suitcase_child.wav": "Please, I have lost my suitcase.",
            "Audio/EN/what_time.wav": "What time is my plane?",
            "Audio/EN/what_time_child.wav": "What time is my plane?",
            "Audio/EN/where.wav": "Where are the restaurants and shops?",
            "Audio/EN/where_child.wav": "Where are the restaurants and shops?",
            "Audio/i_love_computer_science.wav": "I love computer science",
            "Audio/my_name_is_david.wav": "My name is david"
        },
        "IT": {
            "Audio/IT/checkin_it.wav": "Dove e' il bancone?",
            "Audio/IT/parents_it.wav": "Ho perso i miei genitori.",
            "Audio/IT/suitcase_it.wav": "Per favore, ho perso la mia valigia.",
            "Audio/IT/what_time_it.wav": "A che ora e’ il mio aereo?",
            "Audio/IT/where_it.wav": "Dove sono i ristoranti e i negozi?"
        },
        "ES": {
            "Audio/ES/checkin_es.wav": "¿Dónde están los mostradores?",
            "Audio/ES/parents_es.wav": "He perdido a mis padres.",
            "Audio/ES/suitcase_es.wav": "Por favor, he perdido mi maleta.",
            "Audio/ES/what_time_es.wav": "¿A qué hora es mi avión?",
            "Audio/ES/where_es.wav": "¿Dónde están los restaurantes y las tiendas?"
        }
    }

    # High-pass cutoffs (Hz) compared for every file; the table columns
    # below are derived from this list.
    lowcuts = [180, 250]

    # The evaluation loop lives under the same guard as the data it reads.
    for language, files in audio_files.items():
        print(f"\n--- Evaluating {language} Audio Files ---")
        table_data = []
        for audio_path, reference in files.items():
            # The reference does not depend on the cutoff: normalize it once.
            reference_normalized = normalize_text(reference)

            # WER per cutoff frequency.
            results = {}
            for lowcut in lowcuts:
                transcription = transcribe_audio(language_models[language], audio_path, lowcut)
                transcription_normalized = normalize_text(transcription)
                results[lowcut] = calculate_wer(reference_normalized, transcription_normalized)

            # Find the best lowcut frequency (first one wins on ties).
            best_lowcut = min(results, key=results.get)
            best_wer = results[best_lowcut]

            # Append row data
            table_data.append([
                language,
                audio_path,
                *(f"{results[lowcut] * 100:.2f}%" for lowcut in lowcuts),
                f"{best_wer * 100:.2f}%"
            ])

        # Print table
        headers = ["Language", "File", *(f"{lowcut}Hz WER" for lowcut in lowcuts), "Best WER"]
        print(tabulate(table_data, headers=headers, tablefmt="grid"))


--- Evaluating EN Audio Files ---
+------------+-----------------------------------+-------------+-------------+------------+
| Language   | File                              | 180Hz WER   | 250Hz WER   | Best WER   |
| EN         | Audio/EN/checkin.wav              | 0.00%       | 0.00%       | 0.00%      |
+------------+-----------------------------------+-------------+-------------+------------+
| EN         | Audio/EN/checkin_child.wav        | 0.00%       | 0.00%       | 0.00%      |
+------------+-----------------------------------+-------------+-------------+------------+
| EN         | Audio/EN/parents.wav              | 20.00%      | 60.00%      | 20.00%     |
+------------+-----------------------------------+-------------+-------------+------------+
| EN         | Audio/EN/parents_child.wav        | 20.00%      | 20.00%      | 20.00%     |
+------------+-----------------------------------+-------------+-------------+------------+
| EN         | Audio/EN/suitcase.wav         

In [6]:
# Show the raw (un-normalized) transcription of every file at both cutoffs,
# next to its reference sentence.
for language, files in audio_files.items():
    print(f"\n--- Evaluating {language} Audio Files ---")
    language_model = language_models[language]

    for audio_path, reference in files.items():
        print(f"\nProcessing file: {audio_path}")

        # Cutoff frequency (Hz) -> transcription obtained with that cutoff.
        transcriptions = {
            cutoff_hz: transcribe_audio(language_model, audio_path, cutoff_hz)
            for cutoff_hz in (180, 250)
        }

        print(f"Reference: {reference}")
        print(f"Transcription at 180 Hz: {transcriptions[180]}")
        print(f"Transcription at 250 Hz: {transcriptions[250]}\n")


--- Evaluating EN Audio Files ---

Processing file: Audio/EN/checkin.wav
Reference: Where is the check-in desk?
Transcription at 180 Hz: where is the check in desk
Transcription at 250 Hz: where is the check in desk


Processing file: Audio/EN/checkin_child.wav
Reference: Where is the check-in desk?
Transcription at 180 Hz: where's the check in desk 
Transcription at 250 Hz: where is the check in desk


Processing file: Audio/EN/parents.wav
Reference: I have lost my parents.
Transcription at 180 Hz: i lost my parents
Transcription at 250 Hz: i lost my air is


Processing file: Audio/EN/parents_child.wav
Reference: I have lost my parents.
Transcription at 180 Hz: i have lost my area 
Transcription at 250 Hz: i've lost my area


Processing file: Audio/EN/suitcase.wav
Reference: Please, I have lost my suitcase.
Transcription at 180 Hz: please i've lost my suitcase
Transcription at 250 Hz: please i've lost my suitcase


Processing file: Audio/EN/suitcase_child.wav
Reference: Please, I hav