In [2]:
# Pin the versions this notebook was last run with, and use %pip so the
# packages are installed into the environment of the running kernel.
%pip install SpeechRecognition==3.14.3 pydub==0.25.1

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub, SpeechRecognition
Successfully installed SpeechRecognition-3.14.3 pydub-0.25.1


In [3]:
# -*- coding: utf-8 -*-
"""
Speech Recognition System

Deliverable:
A functional system capable of transcribing short audio clips.
"""

import os
import tempfile

import speech_recognition as sr
from pydub import AudioSegment
from pydub.utils import make_chunks

def _recognize_segment(recognizer, segment, wav_path):
    """Export one pydub segment to ``wav_path`` and return its transcription.

    Raises whatever ``recognizer.recognize_google`` raises (notably
    ``sr.UnknownValueError`` and ``sr.RequestError``) so the caller can decide
    how to report the failure.
    """
    # export() hands back the open output handle; close it so the data is
    # flushed before it is re-read and so the file can be deleted afterwards.
    segment.export(wav_path, format="wav").close()
    with sr.AudioFile(wav_path) as source:
        audio_data = recognizer.record(source)  # Read the whole segment
    return recognizer.recognize_google(audio_data)


def transcribe_audio(audio_file_path, chunk_duration=10000):
    """
    Transcribes an audio file (wav or mp3) to text.

    The audio is loaded with pydub and re-exported as WAV before recognition,
    because speech_recognition's ``AudioFile`` cannot read MP3 directly.
    Audio longer than ``chunk_duration`` is split into chunks that are
    transcribed one by one, so each recognition request stays short.

    Args:
        audio_file_path (str): Path to the audio file (.wav or .mp3).
        chunk_duration (int, optional): Duration of each audio chunk in
            milliseconds. Must be positive. Defaults to 10000 (10 seconds).

    Returns:
        str: The transcribed text, or an error message. For chunked audio, a
        chunk that fails is reported inline as "Chunk <i>: ..." and the
        remaining chunks are still transcribed. This function does not raise;
        unexpected failures are returned as "An error occurred: ...".
    """
    # Check if the file exists
    if not os.path.exists(audio_file_path):
        return f"Error: Audio file not found at {audio_file_path}"

    try:
        # A zero or negative chunk length would make the chunking step fail
        # or silently produce nothing, so reject it up front.
        if chunk_duration <= 0:
            return "Error: chunk_duration must be a positive number of milliseconds."

        # Determine file type and load audio
        lowered_path = audio_file_path.lower()
        if lowered_path.endswith(".wav"):
            audio = AudioSegment.from_wav(audio_file_path)
        elif lowered_path.endswith(".mp3"):
            audio = AudioSegment.from_mp3(audio_file_path)
        else:
            return "Error: Unsupported audio format.  Please use .wav or .mp3."

        # Check if the audio file is empty
        if len(audio) == 0:
            return "Error: Audio file is empty."

        recognizer = sr.Recognizer()

        # Intermediate WAV files live in a private temporary directory that is
        # removed automatically, so nothing is written to (or can clobber
        # files in) the current working directory.
        with tempfile.TemporaryDirectory() as work_dir:
            if len(audio) <= chunk_duration:
                # Short clip: transcribe it in one request.
                try:
                    return _recognize_segment(
                        recognizer, audio, os.path.join(work_dir, "clip.wav")
                    )
                except sr.UnknownValueError:
                    return "Unable to recognize speech"
                except sr.RequestError as e:
                    return f"Error making request; {e}"

            # Long clip: transcribe chunk by chunk, recording per-chunk
            # failures inline instead of aborting the whole transcription.
            pieces = []
            for i, chunk in enumerate(make_chunks(audio, chunk_duration)):
                chunk_path = os.path.join(work_dir, f"chunk_{i}.wav")
                try:
                    pieces.append(_recognize_segment(recognizer, chunk, chunk_path))
                except sr.UnknownValueError:
                    pieces.append(f"Chunk {i}: Unable to recognize speech.")
                except sr.RequestError as e:
                    pieces.append(f"Chunk {i}: Error making request; {e}")
            return " ".join(pieces).strip()  # Remove leading/trailing spaces

    except Exception as e:
        # Deliberate catch-all: callers expect an error string, not an exception.
        return f"An error occurred: {e}"



def main(audio_file_path="/content/audio_1_test.mp3"):
    """
    Demonstrate the speech-to-text system on a single audio file.

    Args:
        audio_file_path (str, optional): Path to the .wav or .mp3 file to
            transcribe. Defaults to the sample clip path this notebook was
            originally run with; pass your own path to transcribe another file.
    """
    # Transcribe the audio file (returns either the text or an error message)
    transcribed_text = transcribe_audio(audio_file_path)

    # Print the transcribed text
    print("\nTranscribed Text:")
    print(transcribed_text)



# Run the demo when this code is executed directly (as a script, or as a
# notebook cell where __name__ is "__main__"), but not when it is imported.
if __name__ == "__main__":
    main()



Transcribed Text:
chapter 11 of Tom Swift and the caves of ice this is a liver box recording all over Fox recordings are in the public domain for more information or to volunteer please visit librivox.org Tom Swift and the caves of ice by Victor Appleton off for the Frozen North Tom Swift felt as if he was struggling in some dream or nightmare he felt strong hands holding him and saw evil faces his muscles that had been weakened by the Cowardly blow grew strong he felt his fist then came the sound of footsteps running Tom heard the pain of a no I can't find so suddenly that he staggered about and almost fell the next moment Tom was looking into play some of the big policeman who is half supporting him what's the matter ask the officer hold up I guess Mom of the lad there they he pointed toward two dark forms slipping along down the dimly Light Street the officer drew his revolver and fired two shots in the air but the Chunk 9: Unable to recognize speech. he saw the packages containing