In [None]:
# Install the required libraries
!pip install vosk
!pip install soundfile

Collecting vosk
  Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting srt (from vosk)
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting websockets (from vosk)
  Downloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: srt
  Building wheel for srt (setup.py) ... [?25l[?25hdone
  Created wheel for srt: filename=srt-3.5.3-py3-none-any.whl size=22428 sha256=4e994a62c08c509ffb169369fef8aa85b329a81b8ef61d023da9b74675787b3c
  Stored in directory: /root/.cache/pip/wheels/d7/31/a1/18e1e7e8bfdafd19e6803d7eb919b563dd11de380e4304e332
Successfu

In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import os

# Root folder that holds the dataset (searched recursively below)
audio_dir = "/content/drive/MyDrive/audio_files/Dataset_2_Cleaned"

# Full path of every file under audio_dir whose name ends in ".wav"
wav_files = [
    os.path.join(dirpath, name)
    for dirpath, _subdirs, names in os.walk(audio_dir)
    for name in names
    if name.endswith(".wav")
]

# How many recordings were found
wav_file_count = len(wav_files)

# Report the total
print(f"Number of .wav files in '{audio_dir}': {wav_file_count}")

Number of .wav files in '/content/drive/MyDrive/audio_files/Dataset_2_Cleaned': 1848


In [None]:
import os
import re
import shutil
import pandas as pd
import wave
import json
from tqdm import tqdm
from pydub import AudioSegment
from vosk import Model, KaldiRecognizer

# Install pydub and ffmpeg
!pip install pydub
!apt-get install ffmpeg

# Download the Vosk model
!wget -O vosk-model-small-en-us-0.15.zip https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
!unzip vosk-model-small-en-us-0.15.zip -d .

# Load Vosk model
model_path = "vosk-model-small-en-us-0.15"
model = Model(model_path)

# Function to convert audio to required format
def convert_audio_to_mono_pcm(source_path, target_path, frame_rate=16000):
    """Re-encode an audio file as a mono, 16-bit WAV file.

    Args:
        source_path: Path of the audio file to read.
        target_path: Path the converted WAV file is written to.
        frame_rate: Sample rate of the output in Hz (defaults to 16000, the
            value this function always used before the parameter existed).

    Returns:
        None. The converted audio is written to ``target_path``.
    """
    audio = AudioSegment.from_file(source_path)
    audio = audio.set_channels(1)
    audio = audio.set_frame_rate(frame_rate)
    # check_and_convert_audio sends files here when their sample width is not
    # 2 bytes, so the width has to be forced as well; otherwise an 8-bit or
    # 24-bit source would be written back out with its original width.
    audio = audio.set_sample_width(2)
    audio.export(target_path, format="wav")

# Function to check and convert audio if necessary
def check_and_convert_audio(file_path, temp_dir):
    """Return a path to a mono, 16-bit, uncompressed version of ``file_path``.

    Args:
        file_path: Path of the WAV file to inspect.
        temp_dir: Existing directory that receives the converted copy when a
            conversion is needed. The copy keeps the source file's base name.

    Returns:
        ``file_path`` itself when the file already has one channel, a 2-byte
        sample width and compression type "NONE"; otherwise the path of the
        converted copy inside ``temp_dir``. ``None`` when the file cannot be
        read or converted (the error is printed, not raised).
    """
    try:
        try:
            # The context manager closes the file handle as soon as the header
            # has been inspected.
            with wave.open(file_path, "rb") as wf:
                already_compatible = (
                    wf.getnchannels() == 1
                    and wf.getsampwidth() == 2
                    and wf.getcomptype() == "NONE"
                )
        except wave.Error:
            # The wave module rejected the file's format. Hand it to the
            # converter instead of skipping it.
            already_compatible = False

        if already_compatible:
            return file_path  # File is already in the correct format

        # Convert the file to the correct format and save in temporary directory
        temp_path = os.path.join(temp_dir, os.path.basename(file_path))
        convert_audio_to_mono_pcm(file_path, temp_path)
        return temp_path
    except Exception as e:
        print(f"An error occurred while checking/converting file {file_path}: {e}")
        return None

# Function to transcribe audio using Vosk
def transcribe_audio_vosk(audio_path):
    """Transcribe a WAV file with the module-level Vosk ``model``.

    The file is fed to a ``KaldiRecognizer`` in chunks of 4000 frames. The text
    of every completed segment, plus the final result, is collected and joined
    with single spaces.

    Args:
        audio_path: Path of the WAV file to transcribe.

    Returns:
        A ``(text, confidence)`` tuple. ``text`` is the upper-cased transcript.
        ``confidence`` is a fixed placeholder of 1.0 on success, not a value
        reported by the recognizer. On any error the message is printed and
        ``("", 0.0)`` is returned.
    """
    try:
        segments = []
        # The context manager closes the file even if recognition fails midway.
        with wave.open(audio_path, "rb") as wf:
            rec = KaldiRecognizer(model, wf.getframerate())
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    result = json.loads(rec.Result())
                    segments.append(result.get("text", ""))
            result = json.loads(rec.FinalResult())
            segments.append(result.get("text", ""))

        # Join with a space so consecutive segments do not fuse into one token
        # ("hello" + "world" must not become "helloworld"); empty segments are
        # dropped so they add no stray spaces.
        result_text = " ".join(part.strip() for part in segments if part and part.strip())
        return result_text.upper().strip(), 1.0  # Assume confidence as 1.0 for simplicity
    except Exception as e:
        print(f"An error occurred: {e}")
        return "", 0.0

# Function to clean and normalize expected words
def clean_expected_word(word):
    """Normalize a label by dropping its numeric prefix and upper-casing it.

    Args:
        word: Raw label, e.g. ``"01_zealot"`` or ``"12 thyme"``.

    Returns:
        The label without its leading run of digits/underscores (and any
        whitespace that follows that run), upper-cased and stripped of
        surrounding whitespace.
    """
    # Strip first: the pattern is anchored at the start of the string, so
    # leading whitespace would otherwise stop the prefix from matching.
    word = word.strip()
    # Remove any numeric prefix, underscores, and spaces
    word = re.sub(r'^[\d_]+\s*', '', word)
    return word.upper().strip()

# Audio files directory
audio_dir = "/content/drive/MyDrive/audio_files/Dataset_2_Cleaned"
# Temporary directory for converted audio files
temp_dir = "/content/temp_audio"

# Ensure temp directory exists
os.makedirs(temp_dir, exist_ok=True)

# Get all wav files. Sorted so the row order of the results does not depend on
# the order in which the filesystem happens to list directories.
wav_files = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk(audio_dir)
    for file in files
    if file.endswith(".wav")
)

# Print the count of wav files
wav_file_count = len(wav_files)
print(f"Number of .wav files in '{audio_dir}': {wav_file_count}")

# Process each wav file
results = []
try:
    for file_path in tqdm(wav_files, desc="Processing audio files"):
        checked_file_path = check_and_convert_audio(file_path, temp_dir)
        if not checked_file_path:
            # The file could not be read or converted; it gets no result row.
            continue

        detected_word, confidence = transcribe_audio_vosk(checked_file_path)

        # The expected word is the name of the folder that contains the file.
        expected_word = os.path.basename(os.path.dirname(file_path)).upper().strip()
        # Clean the expected word
        expected_word = clean_expected_word(expected_word)

        # Check if the detected word matches the expected word
        correct = 1 if detected_word == expected_word else 0

        results.append({
            "File": file_path,
            "Expected Word": expected_word,
            "Detected Word": detected_word,
            "Confidence": confidence,
            "Correct": correct
        })

    # Convert results to a DataFrame
    df = pd.DataFrame(results)

    # Save results to CSV file
    df.to_csv("transcription_results_vosk.csv", index=False)
    print(df)

    print("Transcription results saved to transcription_results_vosk.csv")
finally:
    # Remove the temporary directory even when the run is interrupted or fails,
    # so converted copies do not pile up across runs.
    shutil.rmtree(temp_dir)
    print(f"Temporary directory '{temp_dir}' has been removed.")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
--2024-07-09 21:04:55--  https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
Resolving alphacephei.com (alphacephei.com)... 188.40.21.16, 2a01:4f8:13a:279f::2
Connecting to alphacephei.com (alphacephei.com)|188.40.21.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41205931 (39M) [application/zip]
Saving to: ‘vosk-model-small-en-us-0.15.zip’


2024-07-09 21:04:58 (18.4 MB/s) - ‘vosk-model-small-en-us-0.15.zip’ saved [41205931/41205931]

Archive:  vosk-model-small-en-us-0.15.zip
replace ./vosk-model-small-en-us-0.15/am/final.mdl? [y]es, [n]o, [A]ll, [N]one, [r]ename: all
error:  invalid response [all]
replace ./vosk-model-small-en-us-0.15/am/final.mdl? [y]es, [n]o, [A]ll, [N]one, [r]ename: All
  inflating: ./vosk-model-sm

Processing audio files: 100%|██████████| 1848/1848 [29:43<00:00,  1.04it/s]

                                                   File Expected Word  \
0     /content/drive/MyDrive/audio_files/Dataset_2_C...        ZEALOT   
1     /content/drive/MyDrive/audio_files/Dataset_2_C...        ZEALOT   
2     /content/drive/MyDrive/audio_files/Dataset_2_C...        ZEALOT   
3     /content/drive/MyDrive/audio_files/Dataset_2_C...        ZEALOT   
4     /content/drive/MyDrive/audio_files/Dataset_2_C...        ZEALOT   
...                                                 ...           ...   
1843  /content/drive/MyDrive/audio_files/Dataset_2_C...         THYME   
1844  /content/drive/MyDrive/audio_files/Dataset_2_C...         THYME   
1845  /content/drive/MyDrive/audio_files/Dataset_2_C...         THYME   
1846  /content/drive/MyDrive/audio_files/Dataset_2_C...         THYME   
1847  /content/drive/MyDrive/audio_files/Dataset_2_C...         THYME   

     Detected Word  Confidence  Correct  
0         SEE THAT         1.0        0  
1            XO IT         1.0        0


