In [1]:
# Install the required libraries
!pip install vosk
!pip install soundfile

Collecting vosk
  Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting srt (from vosk)
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting websockets (from vosk)
  Downloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: srt
  Building wheel for srt (setup.py) ... [?25l[?25hdone
  Created wheel for srt: filename=srt-3.5.3-py3-none-any.whl size=22428 sha256=f49d9fdbbc6a12b74f5d2b8dd99294590c2c9b932650ed85a778261e9e2b97d3
  Stored in directory: /root/.cache/pip/wheels/d7/31/a1/18e1e7e8bfdafd19e6803d7eb919b563dd11de380e4304e332
Successfu

In [2]:


import os
import re
import pandas as pd
import wave
import json
from tqdm import tqdm
from vosk import Model, KaldiRecognizer

# Download the Vosk model
!wget -O vosk-model-small-en-us-0.15.zip https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
!unzip vosk-model-small-en-us-0.15.zip -d .

# Load Vosk model
model_path = "vosk-model-small-en-us-0.15"
model = Model(model_path)

# Function to transcribe audio using Vosk
def transcribe_audio_vosk(audio_path):
    """Transcribe a WAV file with the module-level Vosk ``model``.

    Args:
        audio_path: Path to a WAV file. It must be mono, 16-bit (sample width
            of 2 bytes) and uncompressed PCM.

    Returns:
        A ``(text, confidence)`` tuple. ``text`` is the upper-cased, stripped
        transcription; segments recognised separately are joined with a single
        space. ``confidence`` is a placeholder: always ``1.0`` on success.
        ``("", 0.0)`` is returned when the file has the wrong format or when
        any exception occurs (the error is printed, not raised).
    """
    try:
        # The context manager guarantees the file handle is released on every
        # path, including the early return and exceptions.
        with wave.open(audio_path, "rb") as wf:
            if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
                print("Audio file must be WAV format mono PCM.")
                return "", 0.0

            rec = KaldiRecognizer(model, wf.getframerate())
            segments = []
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                # AcceptWaveform returning True means a segment was finalised
                # and its text can be collected from Result().
                if rec.AcceptWaveform(data):
                    segments.append(json.loads(rec.Result()).get("text", ""))
            segments.append(json.loads(rec.FinalResult()).get("text", ""))

        # Join with spaces so words from consecutive segments do not fuse
        # together; empty segments are dropped to avoid doubled spaces.
        result_text = " ".join(part.strip() for part in segments if part and part.strip())
        return result_text.upper().strip(), 1.0  # Assume confidence as 1.0 for simplicity
    except Exception as e:
        print(f"An error occurred: {e}")
        return "", 0.0

# Normalise an expected-word label for comparison with the transcription
def clean_expected_word(word):
    """Return ``word`` without a leading run of digits, upper-cased and stripped.

    Any whitespace directly following the leading digits is removed with them.
    """
    without_prefix = re.sub(r'^\d+\s*', '', word)
    uppercased = without_prefix.upper()
    return uppercased.strip()

# Audio files directory
audio_dir = "/content/drive/MyDrive/audio_files/Dataset 1 (Cleaned with Separated Words)/NART Words Ss 1 to 71"

# Collect every WAV recording below audio_dir (searched recursively).
# The extension check is case-insensitive so files named *.WAV are not
# silently skipped, and the list is sorted because os.walk yields entries in
# arbitrary order, which would make the row order of the results vary by run.
wav_files = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk(audio_dir)
    for file in files
    if file.lower().endswith(".wav")
)

# Transcribe every recording and score it against the word it should contain
results = []
for file_path in tqdm(wav_files, desc="Processing audio files"):
    detected_word, confidence = transcribe_audio_vosk(file_path)

    # The expected word is taken from the name of the folder holding the file,
    # normalised the same way as the transcription (upper-case, no prefix).
    folder_name = os.path.basename(os.path.dirname(file_path))
    expected_word = clean_expected_word(folder_name.upper().strip())

    # 1 when the transcription is an exact match for the expected word, else 0
    correct = int(detected_word == expected_word)

    record = {
        "File": file_path,
        "Expected Word": expected_word,
        "Detected Word": detected_word,
        "Confidence": confidence,
        "Correct": correct,
    }
    results.append(record)

# Tabulate the per-file results: one row per recording
df = pd.DataFrame(results)

# Persist the table as CSV (without the index column), then show it
output_csv = "transcription_results_vosk.csv"
df.to_csv(output_csv, index=False)
print(df)


--2024-07-08 10:29:15--  https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
Resolving alphacephei.com (alphacephei.com)... 188.40.21.16, 2a01:4f8:13a:279f::2
Connecting to alphacephei.com (alphacephei.com)|188.40.21.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41205931 (39M) [application/zip]
Saving to: ‘vosk-model-small-en-us-0.15.zip’


2024-07-08 10:29:15 (94.2 MB/s) - ‘vosk-model-small-en-us-0.15.zip’ saved [41205931/41205931]

Archive:  vosk-model-small-en-us-0.15.zip
   creating: ./vosk-model-small-en-us-0.15/
   creating: ./vosk-model-small-en-us-0.15/am/
  inflating: ./vosk-model-small-en-us-0.15/am/final.mdl  
   creating: ./vosk-model-small-en-us-0.15/graph/
  inflating: ./vosk-model-small-en-us-0.15/graph/disambig_tid.int  
  inflating: ./vosk-model-small-en-us-0.15/graph/HCLr.fst  
  inflating: ./vosk-model-small-en-us-0.15/graph/Gr.fst  
   creating: ./vosk-model-small-en-us-0.15/graph/phones/
  inflating: ./vosk-model-small

Processing audio files: 100%|██████████| 3500/3500 [1:01:26<00:00,  1.05s/it]

                                                   File Expected Word  \
0     /content/drive/MyDrive/audio_files/Dataset 1 (...       BEATIFY   
1     /content/drive/MyDrive/audio_files/Dataset 1 (...       BEATIFY   
2     /content/drive/MyDrive/audio_files/Dataset 1 (...       BEATIFY   
3     /content/drive/MyDrive/audio_files/Dataset 1 (...       BEATIFY   
4     /content/drive/MyDrive/audio_files/Dataset 1 (...       BEATIFY   
...                                                 ...           ...   
3495  /content/drive/MyDrive/audio_files/Dataset 1 (...       PRELATE   
3496  /content/drive/MyDrive/audio_files/Dataset 1 (...       PRELATE   
3497  /content/drive/MyDrive/audio_files/Dataset 1 (...       PRELATE   
3498  /content/drive/MyDrive/audio_files/Dataset 1 (...       PRELATE   
3499  /content/drive/MyDrive/audio_files/Dataset 1 (...       PRELATE   

      Detected Word  Confidence  Correct  
0          EAT FOIE         1.0        0  
1           BEATIFY         1.0      


