In [5]:
import base64
from threading import Lock, Thread

import cv2
import openai
from cv2 import VideoCapture, imencode
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from pyaudio import PyAudio, paInt16
from speech_recognition import Microphone, Recognizer, UnknownValueError


# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is how the OpenAI / Google API keys used by
# the model and TTS clients below are supplied — confirm.
load_dotenv()


class WebcamStream:
    """Keeps the most recent frame of the default webcam available to readers.

    A background thread (started by `start()`) repeatedly reads from the
    capture device and stores the latest frame; `read()` hands out a copy of
    it. Access to the shared frame is serialized with a lock.

    The object can also be used as a context manager: entering starts the
    reader thread, exiting stops it and releases the capture device.
    """

    def __init__(self):
        self.stream = VideoCapture(index=0)
        # The initial grab may fail, in which case `frame` is None until the
        # reader thread captures something; `read()` reports that explicitly.
        _, self.frame = self.stream.read()
        self.running = False
        self.thread = None
        self.lock = Lock()

    def start(self):
        """Start the background reader (no-op if already running).

        Returns:
            This instance, so construction and start can be chained.
        """
        if self.running:
            return self

        self.running = True

        # Daemon thread: an interrupted main loop must not leave a reader
        # behind that keeps the interpreter (and the camera) alive.
        self.thread = Thread(target=self.update, args=(), daemon=True)
        self.thread.start()
        return self

    def update(self):
        """Reader loop: store every successfully captured frame until stopped."""
        while self.running:
            grabbed, frame = self.stream.read()

            # A failed grab yields no image; keep the last good frame rather
            # than overwriting it with None (which would break `read()`).
            if not grabbed or frame is None:
                continue

            with self.lock:
                self.frame = frame

    def read(self, encode=False):
        """Return a copy of the latest frame.

        Args:
            encode: When true, return the frame JPEG-encoded and then
                base64-encoded (bytes) instead of the raw image array.

        Returns:
            A copy of the frame, or base64 bytes when `encode` is true.

        Raises:
            RuntimeError: If no frame has been captured yet.
        """
        with self.lock:
            if self.frame is None:
                raise RuntimeError("No frame has been captured from the webcam yet.")
            frame = self.frame.copy()

        if encode:
            _, buffer = imencode(".jpeg", frame)
            return base64.b64encode(buffer)

        return frame

    def stop(self):
        """Ask the reader thread to finish and wait for it.

        Safe to call when `start()` was never called.
        """
        self.running = False
        thread = getattr(self, "thread", None)
        if thread is not None and thread.is_alive():
            thread.join()

    def __enter__(self):
        return self.start()

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Stop the reader first so it is not reading from a released device.
        try:
            self.stop()
        finally:
            self.stream.release()


class Assistant:
    """Answers a spoken prompt about a webcam frame and speaks the reply.

    The chat model passed to the constructor is wrapped in a chain made of a
    prompt template (system prompt, chat history, user text + image), the
    model itself and a string output parser. Replies are played back through
    OpenAI text-to-speech.
    """

    def __init__(self, model):
        """Build the inference chain around `model`."""
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Send `prompt` and `image` to the model and speak the response.

        Args:
            prompt: Transcribed user text. `None`, empty and whitespace-only
                prompts are ignored (speech recognition produces these for
                silence or noise).
            image: Base64-encoded JPEG as bytes; it is decoded to text and
                embedded in the request as a data URL.

        Returns:
            None. The response is printed and, if non-empty, spoken.
        """
        prompt = (prompt or "").strip()
        if not prompt:
            print("Prompt is empty or None, returning.")
            return

        print("Prompt:", prompt)

        response = self.chain.invoke(
            {"prompt": prompt, "image_base64": image.decode()},
            # The history factory ignores the session id, but the runnable
            # still requires one to be configured.
            config={"configurable": {"session_id": "unused"}},
        ).strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        """Stream `response` as speech to the default audio output.

        Audio is requested as raw PCM and written to a mono 16-bit output
        stream at 24 kHz. The stream and the PyAudio instance are always
        released, even if synthesis or playback fails.
        """
        audio = PyAudio()
        try:
            player = audio.open(format=paInt16, channels=1, rate=24000, output=True)
            try:
                with openai.audio.speech.with_streaming_response.create(
                    model="tts-1",
                    voice="alloy",
                    response_format="pcm",
                    input=response,
                ) as stream:
                    for chunk in stream.iter_bytes(chunk_size=1024):
                        player.write(chunk)
            finally:
                player.stop_stream()
                player.close()
        finally:
            audio.terminate()

    def _create_inference_chain(self, model):
        """Return the history-aware chain: prompt template -> model -> string.

        A single in-memory `ChatMessageHistory` is shared by every call, so
        the conversation accumulates for the lifetime of this object.
        """
        SYSTEM_PROMPT = """
        You are a witty assistant that will use the chat history and the image 
        provided by the user to answer its questions.

        Use few words on your answers. Go straight to the point. Do not use any
        emoticons or emojis. Do not ask the user any questions.

        Be friendly and helpful. Show some personality. Do not be too formal.
        """

        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        {
                            "type": "image_url",
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )

        chain = prompt_template | model | StrOutputParser()

        chat_message_history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            chain,
            # Same history object regardless of the requested session id.
            lambda _: chat_message_history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )


# Open the webcam and start its background reader thread right away.
webcam_stream = WebcamStream().start()

# Chat model that receives the prompt text together with the webcam image.
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")

# You can use OpenAI's GPT-4o model instead of Gemini Flash
# by uncommenting the following line:
# model = ChatOpenAI(model="gpt-4o")

# Wrap the model in the prompt/history chain and text-to-speech playback.
assistant = Assistant(model)


def audio_callback(recognizer, audio):
    """Handle one captured phrase: transcribe it and ask the assistant.

    Registered with `Recognizer.listen_in_background`. The audio is
    transcribed with Whisper ("base" model, English) and the text is passed
    to the assistant together with the current webcam frame (base64 JPEG).

    Every exception is reported on stdout instead of being raised, so a
    failure while handling one phrase does not propagate to the caller.
    """
    try:
        print("Processing audio...")
        transcript = recognizer.recognize_whisper(
            audio,
            model="base",
            language="english",
        )
        print(f"Recognized Prompt: {transcript}")
        assistant.answer(transcript, webcam_stream.read(encode=True))
    except UnknownValueError:
        print("Speech Recognition could not understand the audio.")
    except Exception as err:
        print(f"An error occurred: {err}")



# Calibrate the microphone, start speech recognition on a background thread,
# then show the live webcam preview until Esc or "q" is pressed.
print("Starting to listen in background...")


recognizer = Recognizer()
microphone = Microphone()
with microphone as source:
    # Sample the room noise so the recognizer's energy threshold is adapted.
    recognizer.adjust_for_ambient_noise(source)

print("Say something!")
# Returns a function that stops the background listener when called.
stop_listening = recognizer.listen_in_background(microphone, audio_callback)
print('Listening...')

try:
    while True:
        cv2.imshow("webcam", webcam_stream.read())
        # 27 is the Esc key code.
        if cv2.waitKey(1) in [27, ord("q")]:
            break
finally:
    # Always clean up, including when the cell is interrupted; otherwise the
    # webcam reader thread and the background listener keep running.
    webcam_stream.stop()
    cv2.destroyAllWindows()
    stop_listening(wait_for_stop=False)

Starting to listen in background...
Say something!
Listening...


KeyboardInterrupt: 

In [4]:
# Re-run of the listen-and-preview step: calibrate the microphone, start
# background speech recognition, then show the webcam until Esc or "q".
print("Starting to listen in background...")


recognizer = Recognizer()
microphone = Microphone()
with microphone as source:
    # Sample the room noise so the recognizer's energy threshold is adapted.
    recognizer.adjust_for_ambient_noise(source)

print("Say something!")
# Returns a function that stops the background listener when called.
stop_listening = recognizer.listen_in_background(microphone, audio_callback)
print('Listening...')

try:
    while True:
        cv2.imshow("webcam", webcam_stream.read())
        # 27 is the Esc key code.
        if cv2.waitKey(1) in [27, ord("q")]:
            break
finally:
    # Always clean up, including when the cell is interrupted; otherwise the
    # webcam reader thread and the background listener keep running.
    webcam_stream.stop()
    cv2.destroyAllWindows()
    stop_listening(wait_for_stop=False)

Starting to listen in background...
Say something!
Listening...
Processing audio...
Speech Recognition could not understand the audio.
Processing audio...
Speech Recognition could not understand the audio.


KeyboardInterrupt: 

In [5]:
from speech_recognition import Microphone, Recognizer, UnknownValueError


def audio_callback(recognizer, audio):
    """Transcription-only variant of the callback, used to check recognition.

    Transcribes the captured phrase with Whisper ("base" model, English) and
    prints the result. Nothing is sent to the assistant. Every exception is
    reported on stdout instead of being raised.
    """
    try:
        print("Processing audio...")
        transcript = recognizer.recognize_whisper(
            audio,
            model="base",
            language="english",
        )
        print(f"Recognized Prompt: {transcript}")
        # Assistant call intentionally disabled while testing transcription:
        # assistant.answer(transcript, webcam_stream.read(encode=True))
    except UnknownValueError:
        print("Speech Recognition could not understand the audio.")
    except Exception as err:
        print(f"An error occurred: {err}")



# Start background speech recognition only (no webcam preview in this cell).
print("Starting to listen in background...")


recognizer = Recognizer()
microphone = Microphone()
with microphone as source:
    # Sample the room noise before listening starts.
    recognizer.adjust_for_ambient_noise(source)

print("Say something!")
# Each captured phrase is passed to audio_callback.
# NOTE(review): stop_listening is never called in this cell, so the listener
# presumably keeps running until the kernel is restarted — confirm.
stop_listening = recognizer.listen_in_background(microphone, audio_callback)
print('Listening...')

Starting to listen in background...
Say something!
Listening...


Processing audio...


  checkpoint = torch.load(fp, map_location=device)


Recognized Prompt:  Can you listen us?
Processing audio...
Recognized Prompt:  Is that your name? Yes, yes. Can you listen us?
Processing audio...
Recognized Prompt:  Is that clear, man? Yes.
Processing audio...
Recognized Prompt:  Why do you think you are dumb?
Processing audio...
Recognized Prompt:  Yes, yes, you do think that you are dumb. You do think that you are dumb.
Processing audio...
Recognized Prompt:  dogare moore itani haraja teacherna chun chun khai ho maau
Processing audio...
Recognized Prompt: 
Processing audio...
Recognized Prompt: 
Processing audio...
Recognized Prompt: 
Processing audio...
Recognized Prompt: 
Processing audio...
Recognized Prompt:  I'm going to go to the next station. I'm going to go to the next station. I'm going to go to the next station. I'm going to go to the next station. I'm going to go to the next station. I'm going to go to the next station. I'm going to go to the next station. I'm going to go to the next station. I'm going to go to the next 

In [1]:
import torch
# Environment check: installed PyTorch version and CUDA availability.
print(torch.__version__)
print(torch.cuda.is_available())  # True only when a CUDA GPU can be used

2.4.0
False


In [10]:
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import write as write_wav
from IPython.display import Audio

# Optional: download and load all Bark models up front (currently disabled).
# NOTE(review): with this skipped, generate_audio presumably loads the models
# on first use — confirm.
# preload_models()

# Generate audio from text; "[laughs]" is left in the prompt as written.
text_prompt = """
     Hello, my name is Suno. And, uh — and I like pizza. [laughs] 
     But I also have other interests such as playing tic tac toe.
"""
audio_array = generate_audio(text_prompt)

# Save the generated samples to disk as a WAV file at Bark's sample rate.
write_wav("bark_generation.wav", SAMPLE_RATE, audio_array)
  
# Play the generated audio inline in the notebook (last expression of the cell).
Audio(audio_array, rate=SAMPLE_RATE)

In [15]:
import pyaudio
from bark import generate_audio

def _tts(response):
    """Synthesize `response` with Bark and play it on the default output.

    The output stream is opened as mono signed 16-bit PCM at 24 kHz, so the
    generated samples are converted to int16 before being written; writing
    floating-point samples into such a stream plays back as noise.

    NOTE(review): assumes Bark's output sample rate is 24 kHz (its
    SAMPLE_RATE constant is not imported in this cell) — confirm.

    Args:
        response: Text to speak.
    """
    import numpy as np  # local import: this cell does not import numpy

    samples = np.asarray(generate_audio(response)).reshape(-1)
    if np.issubdtype(samples.dtype, np.floating):
        # Float audio is expected in [-1.0, 1.0]; clip to avoid wrap-around
        # when scaling to the int16 range.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    else:
        pcm = samples.astype(np.int16)

    audio = pyaudio.PyAudio()
    try:
        player = audio.open(
            format=pyaudio.paInt16, channels=1, rate=24000, output=True
        )
        try:
            chunk_size = 1024  # samples per write
            for start in range(0, len(pcm), chunk_size):
                player.write(pcm[start:start + chunk_size].tobytes())
        finally:
            player.stop_stream()
            player.close()
    finally:
        # Release PortAudio resources even if synthesis or playback failed.
        audio.terminate()

In [1]:
%pip install gtts


Collecting gtts
  Using cached gTTS-2.5.3-py3-none-any.whl.metadata (4.1 kB)
Using cached gTTS-2.5.3-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.3
Note: you may need to restart the kernel to use updated packages.


In [8]:
from gtts import gTTS
# Synthesize a fixed English sentence with gTTS and save it as an MP3 file.
language = "en"
# NOTE(review): the sample text contains typos ("obilgation", "Stephan
# hawking"); it is synthesized exactly as written.
text = "Universe and Laws of Physics are eternal. They are fundamental. They are not changing. They Don't need Creator, as the Stephan hawking said, universe is under no obilgation to make sense to us."
# NOTE(review): tld="com.au" presumably selects the Australian-accent voice — confirm.
speech = gTTS(text=text, lang=language, slow=False, tld="com.au")
speech.save('TextToSpeech.mp3')

In [6]:
import io
from pyaudio import PyAudio, paInt16

# Render the gTTS result into an in-memory buffer rather than a file, then
# write it to an audio output stream in fixed-size chunks.
#
# NOTE(review): `speech` comes from the gTTS cell, which saves the same object
# as an .mp3 file, while the stream below is opened as raw 16-bit PCM. The
# payload presumably has to be decoded to PCM before it will play correctly
# — TODO confirm and add a decoding step.
with io.BytesIO() as audio_buffer:
    speech.write_to_fp(audio_buffer)
    audio_buffer.seek(0)

    audio_device = PyAudio()
    try:
        # Open PyAudio stream for playback
        player = audio_device.open(format=paInt16, channels=1, rate=24000, output=True)
        try:
            # Read and play audio in chunks
            chunk_size = 1024
            while chunk := audio_buffer.read(chunk_size):
                player.write(chunk)
        finally:
            # Close the audio stream even if playback failed part-way.
            player.stop_stream()
            player.close()
    finally:
        # Release PortAudio resources held by the PyAudio instance.
        audio_device.terminate()

In [12]:
%pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Note: you may need to restart the kernel to use updated packages.
