In [1]:
import pyaudio, wave, os, torch, keyboard
import numpy as np
from faster_whisper import WhisperModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from queue import Queue
from threading import Thread

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hand-off queues shared by the worker threads:
#   messages         - run flag; the workers keep looping while it is non-empty
#   recordings       - lists of raw audio buffers waiting to be transcribed
#   transcribed_text - transcribed strings waiting to be translated
messages, recordings, transcribed_text = (Queue() for _ in range(3))

def start_recording():
    """Start the recording, transcription and translation worker threads.

    Puts a token on ``messages`` (the run flag every worker polls) and then
    launches one thread each for ``record_microphone``, ``transcription`` and
    ``translate``. Those functions are defined in later cells and must exist
    before this is called.

    Calling this while a session is already running is a no-op: a second
    token plus a second set of workers would compete for the microphone and
    could not be stopped by a single stop request.
    """
    if not messages.empty():
        print("Recording is already in progress.")
        return

    # Resolve every worker up front so a missing definition raises before
    # any thread has been started.
    workers = (record_microphone, transcription, translate)

    messages.put(True)
    print("Start Recording:")
    for worker in workers:
        Thread(target=worker).start()

def stop_recording():
    """Clear the run flag and print the translations collected so far.

    Empties ``messages`` so the worker loops see an empty queue and exit,
    then prints everything currently in ``translated_text`` joined by spaces.

    Safe to call more than once: the flag is drained without blocking, so a
    second call (e.g. silence detection and the 'q' key both firing) returns
    instead of waiting forever on an empty queue.

    NOTE(review): the workers run on other threads, so text still being
    transcribed or translated when this runs may be missing from the printout.
    """
    from queue import Empty

    try:
        # Drain every token so a single stop always ends the session.
        while True:
            messages.get_nowait()
    except Empty:
        pass
    print("Translated Text: ", " ".join(translated_text))

In [3]:
# Audio capture settings, used both to open the input stream and to write the WAV header.
CHANNELS = 1               # single (mono) channel
FRAME_RATE = 16_000        # samples per second
RECORD_SECONDS = 3         # seconds of audio per segment handed to the transcriber
FORMAT = pyaudio.paInt16   # 16-bit signed integer samples
def record_microphone(chunk=1024, input_device_index=1, silence_threshold=1000):
    """Capture microphone audio and queue it in RECORD_SECONDS-long segments.

    Reads ``chunk`` frames at a time while ``messages`` is non-empty. Once
    roughly ``RECORD_SECONDS`` of audio has accumulated, the buffered chunks
    are put on ``recordings`` as one list of byte strings. If the peak
    absolute sample value of that segment is below ``silence_threshold``,
    ``stop_recording()`` is called first.

    Args:
        chunk: Frames per read, also used as the stream buffer size.
        input_device_index: PyAudio input device to open (default 1, the
            previously hard-coded value).
        silence_threshold: Peak int16 amplitude below which a segment counts
            as silence (default 1000, the previously hard-coded value).

    The stream and the PyAudio instance are always released, even if opening
    or reading the stream raises.
    """
    chunks_per_segment = (FRAME_RATE * RECORD_SECONDS) / chunk

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=FRAME_RATE,
            input=True,
            input_device_index=input_device_index,
            frames_per_buffer=chunk,
        )

        frames = []
        while not messages.empty():
            frames.append(stream.read(chunk))
            if len(frames) < chunks_per_segment:
                continue

            samples = np.frombuffer(b''.join(frames), dtype=np.int16)
            # Widen before abs(): abs(-32768) wraps back to -32768 in int16.
            peak = np.abs(samples.astype(np.int32)).max()
            # Skip the stop request if another thread already cleared the flag.
            if peak < silence_threshold and not messages.empty():
                stop_recording()

            # The segment is queued even when silent so that a transcriber
            # blocked on recordings.get() wakes up and can see the cleared flag.
            recordings.put(frames)
            frames = []
    finally:
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

In [4]:
# Whisper large-v3: float16 on CUDA when a GPU is available, otherwise int8 on CPU.
transcription_model = WhisperModel(
    "large-v3",
    device="cuda" if torch.cuda.is_available() else "cpu",
    compute_type="float16" if torch.cuda.is_available() else "int8",
)
def transcription():
    """Transcribe queued audio segments until the run flag is cleared.

    Each segment taken from ``recordings`` is written to a temporary
    ``audio.wav`` file, transcribed with ``transcription_model``, and the
    joined segment text is put on ``transcribed_text`` and printed. After
    each segment, holding the 'q' key requests a stop.

    The queue is polled with a timeout rather than blocking indefinitely, so
    the thread exits once ``messages`` is empty even if no further audio
    arrives. The temporary file is removed even when transcription raises.
    """
    from queue import Empty

    wav_path = "audio.wav"

    while not messages.empty():
        try:
            frames = recordings.get(timeout=0.5)
        except Empty:
            # Nothing to do yet; loop round to re-check the run flag.
            continue

        try:
            with wave.open(wav_path, 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(2)  # bytes per sample (16-bit audio)
                wf.setframerate(FRAME_RATE)
                wf.writeframes(b''.join(frames))

            segments, _ = transcription_model.transcribe(wav_path, beam_size=5)
            # Consume the segments here, while the WAV file still exists.
            text = " ".join(segment.text for segment in segments)
        finally:
            if os.path.exists(wav_path):
                os.remove(wav_path)

        transcribed_text.put(text)
        print("live transcription: ", text)

        # Skip the stop request if another thread already cleared the flag.
        if keyboard.is_pressed('q') and not messages.empty():
            stop_recording()

In [5]:
# Translations accumulate here; stop_recording() joins and prints them.
translated_text = []

# Tokenizer and seq2seq model are loaded from the same NLLB checkpoint.
_NLLB_CHECKPOINT = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(_NLLB_CHECKPOINT)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(_NLLB_CHECKPOINT)
def translate(target_lang="hin_Deva"):
    """Translate queued transcriptions until the run flag is cleared.

    Takes strings from ``transcribed_text``, translates each with
    ``translation_model`` and appends the decoded result to
    ``translated_text``.

    Args:
        target_lang: Language token forced as the first generated token
            (default "hin_Deva", the previously hard-coded value).

    The queue is polled with a timeout rather than blocking indefinitely, so
    the thread exits once ``messages`` is empty even if no further text
    arrives. Blank transcriptions are skipped instead of being sent to the
    model.
    """
    from queue import Empty

    # Loop-invariant: look the target-language token id up once.
    target_token_id = tokenizer.convert_tokens_to_ids(target_lang)

    while not messages.empty():
        try:
            text = transcribed_text.get(timeout=0.5)
        except Empty:
            # Nothing to do yet; loop round to re-check the run flag.
            continue

        if not text.strip():
            continue

        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Move every input tensor onto the device the model lives on.
        encoded = {name: tensor.to(translation_model.device) for name, tensor in encoded.items()}
        generated = translation_model.generate(**encoded, forced_bos_token_id=target_token_id)
        translated_text.append(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])



In [6]:
# Kick off the recording -> transcription -> translation worker threads.
start_recording()

Start Recording:


live transcription:   Hi, this is Raunak and I am
live transcription:   trying new project, this should
live transcription:   work right now. And if it is working, thank you.
live transcription:   Thank you so much.
live transcription:   And hey, this is Arnold.
live transcription:   മേന്ടുന്ന അണ്ളമുത്
live transcription:   सब्सक्राइब
live transcription:   अगर मैं बोलना  अरे तो अगर तो
live transcription:   तो fine tune करना ही पड़ेगा मेरे को  अब इसमें मैंने यह भी implement करवा करना
live transcription:   बोलना चुप कर दो तो ऑटोमाटिकली स्टॉप हो जाती है
live transcription:   Aqui ó, em semelheira.
live transcription:   अगर मैं बोलना हूँ अरे तो अगर तो
live transcription:   चाहिए
live transcription:   सब्सक्राइब
live transcription:   नहीं बना रहे के फास्ता मिस्पर तो तेज है
live transcription:   Gopane aise.
live transcription:   नहीं है
live transcription:   सब्सक्राइब
live transcription:   सब्सक्राइब
live transcription:   Chukwu Chukwu Chukwu  and they clap their hands.
live transcription:

: 