# Speech-to-Text Application for Accessibility

In [1]:
# Install dependencies
# NOTE(review): versions are unpinned, so a fresh run may pull newer releases than
# the ones this notebook was last executed with — consider pinning.
# NOTE(review): `%pip` is the IPython magic intended to install into the running
# kernel's environment; consider it in place of the `!pip` shell escape.
!pip install soundfile vosk SpeechRecognition pydub pandas --quiet
# openai-whisper is the package that provides the `whisper` module used by the Whisper cell.
!pip install openai-whisper --quiet

import os
import json
import time
import pandas as pd
import soundfile as sf
import speech_recognition as sr

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for srt (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


In [2]:
# Upload audio file (use Colab's upload button)
from google.colab import files

# The upload result is keyed by the name of each file that was uploaded.
uploaded = files.upload()

# A cancelled or empty upload leaves nothing to pick from; fail with an
# actionable message instead of an opaque IndexError on the lookup below.
if not uploaded:
    raise ValueError("No audio file was uploaded. Re-run this cell and choose an audio file.")

# Pick the first uploaded file
audio_file = list(uploaded.keys())[0]
print(f"Using audio file: {audio_file}")

Saving lab3sample.wav to lab3sample.wav
Using audio file: lab3sample.wav


In [3]:
# ==============================================
# 1. Google Web Speech Recognition (Online)
# ==============================================
def recognize_with_google(audio_path):
    """Transcribe an audio file with the Google Web Speech API.

    Args:
        audio_path: Path of the audio file to read through ``sr.AudioFile``.

    Returns:
        A dict with the keys ``"success"`` (bool), ``"text"`` (the transcript,
        or ``None`` on failure) and ``"error"`` (a message, or ``None`` on
        success).
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as audio_source:
        recording = recognizer.record(audio_source)
    print("Recognizing... (Google Web Speech)")

    try:
        transcript = recognizer.recognize_google(recording)
    except sr.UnknownValueError:
        # The service answered but could not make sense of the speech.
        failure = "Speech Recognition could not understand audio. Please try speaking more clearly."
    except sr.RequestError as exc:
        # The request itself failed.
        failure = f"Google API unavailable: {exc}"
    else:
        print("Speech successfully converted to text!")
        return {"success": True, "text": transcript, "error": None}

    return {"success": False, "text": None, "error": failure}

In [4]:
# ==============================================
# 2. Vosk (Offline)
# ==============================================
!wget -q https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
!unzip -q vosk-model-small-en-us-0.15.zip
vosk_model_path = "vosk-model-small-en-us-0.15"

from vosk import Model, KaldiRecognizer

def recognize_with_vosk(audio_path, model_path=vosk_model_path):
    """Transcribe an audio file offline with a Vosk model.

    Args:
        audio_path: Path of the audio file to open with ``soundfile``.
        model_path: Directory of the unpacked Vosk model. Defaults to the
            module-level ``vosk_model_path``.

    Returns:
        A dict with the keys ``"success"`` (bool), ``"text"`` (the joined
        transcript, or ``None`` when nothing was recognized) and ``"error"``
        (a message, or ``None`` on success).
    """
    result_texts = []

    # Open the file in a context manager so the handle is closed even when
    # model loading or recognition raises (the handle was previously leaked).
    with sf.SoundFile(audio_path) as wf:
        model = Model(model_path)
        rec = KaldiRecognizer(model, wf.samplerate)
        # NOTE(review): frames are passed through as read; presumably the
        # recognizer expects single-channel audio — confirm for stereo input.

        while True:
            data = wf.buffer_read(4000, dtype='int16')
            if not data:
                break
            # Convert the CFFI buffer to bytes before handing it to the recognizer.
            byte_data = bytes(data)
            if rec.AcceptWaveform(byte_data):
                res = json.loads(rec.Result())
                if res.get("text"):
                    result_texts.append(res["text"])

    # Collect whatever the recognizer still holds after the last chunk.
    final = json.loads(rec.FinalResult())
    if final.get("text"):
        result_texts.append(final["text"])

    text = " ".join(result_texts).strip()
    if text == "":
        return {"success": False, "text": None, "error": "Vosk could not understand audio."}
    return {"success": True, "text": text, "error": None}

In [5]:
import whisper

def recognize_with_whisper(audio_path, model_size="small"):
    """Transcribe an audio file with an OpenAI Whisper model.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        model_size: Name passed to ``whisper.load_model`` (default ``"small"``).

    Returns:
        A dict with the keys ``"success"`` (bool), ``"text"`` (the stripped
        transcript, or ``None`` when it is empty) and ``"error"`` (a message,
        or ``None`` on success).
    """
    print(f"Recognizing... (Whisper {model_size})")
    transcription = whisper.load_model(model_size).transcribe(audio_path)
    transcript = transcription.get("text", "").strip()

    # An empty or whitespace-only transcript is reported as a failure.
    if not transcript:
        return {"success": False, "text": None, "error": "Whisper could not transcribe."}
    return {"success": True, "text": transcript, "error": None}

In [6]:
# ==============================================
# Run All Methods and Compare
# ==============================================
def _comparison_row(method, outcome):
    """Flatten one recognizer result dict into a row of the comparison table."""
    return {
        "method": method,
        "success": outcome["success"],
        "recognized_text": outcome["text"],
        "error": outcome["error"],
    }


# One row per engine, appended right after that engine has run.
results = []

print("\n--- Google Web Speech ---")
res_google = recognize_with_google(audio_file)
results.append(_comparison_row("Google Web Speech (Online)", res_google))

print("\n--- Vosk ---")
res_vosk = recognize_with_vosk(audio_file)
results.append(_comparison_row("Vosk (Offline)", res_vosk))

print("\n--- Whisper ---")
res_whisper = recognize_with_whisper(audio_file, model_size="small")
results.append(_comparison_row("Whisper Small (Offline)", res_whisper))

# Show the comparison table
df = pd.DataFrame(results)
print("\nComparison Results:")
display(df)


--- Google Web Speech ---
Recognizing... (Google Web Speech)
Speech successfully converted to text!

--- Vosk ---

--- Whisper ---
Recognizing... (Whisper small)


100%|███████████████████████████████████████| 461M/461M [00:05<00:00, 93.4MiB/s]



Comparison Results:


Unnamed: 0,method,success,recognized_text,error
0,Google Web Speech (Online),True,I believe you are just talking nonsense,
1,Vosk (Offline),True,i believe you're just talking nonsense,
2,Whisper Small (Offline),True,I believe you're just talking nonsense.,


# Workflow Overview
The pipeline takes an uploaded audio file.

It runs speech-to-text conversion using three methods:

- Google Web Speech (online, via the SpeechRecognition library)
- Vosk (offline, lightweight open model)
- OpenAI Whisper (offline, powerful neural model)

Results (including success/failure and transcribed text) are compiled into a comparison table for review.

# Methods Compared
| Method                | Mode    | Key Strengths                          | Limitations                 |
|-----------------------|---------|----------------------------------------|-----------------------------|
| Google Web Speech     | Online  | High accuracy, easy to use, fast       | Requires internet           |
| Vosk                  | Offline | Fast, works offline, multilingual      | May be less accurate        |
| Whisper (OpenAI)      | Offline | Handles noise & accents, robust model  | Computationally intensive   |

# Summary

Google Web Speech is simple and generally accurate but needs an internet connection.

Vosk is suitable for real-time or privacy-focused offline tasks, supporting many languages on modest hardware, but with some accuracy trade-offs.

Whisper is highly robust and accurate even in noisy or multilingual scenarios, but it requires more computational resources and is best suited for offline bulk or high-quality transcription.

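Running all three engines on the same audio lets users compare the transcripts side by side and select the best speech-to-text engine for their needs. This notebook compares output text only; it does not measure runtime, so a speed comparison would require timing each call.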