# Displaying speech to text using Whisper

In [1]:
!pip install speechrecognition whisper torch




In [2]:
import io
import os
import speech_recognition as sr
import whisper
import torch
import time
from datetime import datetime, timedelta
from queue import Queue
from tempfile import NamedTemporaryFile
from sys import platform


In [3]:
def speech_to_text(whisper_model="tiny", duration=10, energy_threshold=1000, record_timeout=2, phrase_timeout=3):
    """Listen on the microphone for ``duration`` seconds and transcribe the audio with Whisper.

    Audio is captured in the background, accumulated per phrase, and re-transcribed
    every time new audio arrives, so the last line of the transcript is refined until
    a pause longer than ``phrase_timeout`` starts a new line.

    Args:
        whisper_model: Whisper model name. ``"base"``, ``"small"`` and ``"medium"`` are
            loaded as their ``.en`` variant; every other name (``"tiny"``, ``"large"``,
            names that already end in ``.en``, ...) is passed to ``whisper.load_model``
            unchanged.
        duration: How long to keep listening, in seconds.
        energy_threshold: Initial ``energy_threshold`` set on the recognizer.
        record_timeout: Passed as ``phrase_time_limit`` to ``listen_in_background``
            (upper bound, in seconds, on each captured chunk).
        phrase_timeout: Gap in seconds between two processed chunks after which the
            next chunk starts a new transcript line.

    Returns:
        list[str]: One stripped transcript string per detected phrase (empty if nothing
        was captured).
    """
    data_queue = Queue()
    recorder = sr.Recognizer()
    recorder.energy_threshold = energy_threshold
    recorder.dynamic_energy_threshold = False

    source = sr.Microphone(sample_rate=16000)

    # Only add ".en" where the caller relied on it before and the name stays valid.
    # "tiny" was always loaded as-is; "large" must not become "large.en".
    english_only_sizes = ("base", "small", "medium")
    model_name = f"{whisper_model}.en" if whisper_model in english_only_sizes else whisper_model
    audio_model = whisper.load_model(model_name)
    use_fp16 = torch.cuda.is_available()

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        # Runs on the listener thread; hand the raw bytes to the main loop.
        data_queue.put(audio.get_raw_data())

    def clear_screen() -> None:
        # NOTE(review): this clears a terminal; inside a notebook it only emits
        # escape codes into the cell output.
        os.system('cls' if os.name == 'nt' else 'clear')

    # Reserve a real file that we own: delete=False keeps it on disk after the handle
    # is closed so it can be reopened by name; it is removed in the ``finally`` below.
    with NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        temp_file = handle.name

    transcription = []
    last_sample = bytes()
    phrase_time = None
    stop_listening = None

    try:
        stop_listening = recorder.listen_in_background(
            source, record_callback, phrase_time_limit=record_timeout
        )

        print("Model loaded.\n")
        print("Start recording or say something that you would like to record...\n")
        clear_screen()

        # Monotonic clock: unaffected by wall-clock adjustments while we measure intervals.
        deadline = time.monotonic() + duration

        try:
            while time.monotonic() < deadline:
                if data_queue.empty():
                    time.sleep(0.1)
                    continue

                now = time.monotonic()
                phrase_complete = phrase_time is not None and now - phrase_time > phrase_timeout

                if phrase_complete:
                    # Long enough pause: start accumulating a fresh phrase.
                    last_sample = bytes()

                phrase_time = now

                while not data_queue.empty():
                    last_sample += data_queue.get()

                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

                with open(temp_file, 'wb') as f:
                    f.write(audio_data.get_wav_data())

                result = audio_model.transcribe(temp_file, fp16=use_fp16)
                text = result['text'].strip()

                if phrase_complete or not transcription:
                    transcription.append(text)
                else:
                    # Same phrase, more audio: replace the previous attempt.
                    transcription[-1] = text

                clear_screen()
                print("\n".join(transcription))
                print('', end='', flush=True)
        except KeyboardInterrupt:
            # Allow the user to stop early and still get what was transcribed so far.
            pass
    finally:
        if stop_listening is not None:
            # Stop the background listener so it does not outlive this call.
            stop_listening(wait_for_stop=False)
        try:
            os.remove(temp_file)
        except OSError:
            pass

    return transcription


In [6]:
# Executing this cell opens the microphone and records for ten seconds
# using the "medium" Whisper model.
print("Starting transcription process...\n")
transcription = speech_to_text(duration=10, whisper_model="medium")

# Once recording is over, show every recognised phrase on its own line.
final_text = "\n".join(transcription)
print("\nFinal Transcription:")
print(final_text)


Starting transcription process...



  checkpoint = torch.load(fp, map_location=device)


Model loaded.

Start recording or say something that you would like to record...

[H[2J[H[2JDog running on moon

Final Transcription:
Dog running on moon
