In [1]:
!pip install livekit-api openai-whisper gradio gTTS diffusers transformers accelerate safetensors torch

import whisper
from livekit import api
import gradio as gr
from gtts import gTTS
import asyncio
import os
from diffusers import StableDiffusionPipeline
import torch

# ------------------ LiveKit Connection ------------------
LIVEKIT_URL = "https://genai-mq7kpfih.livekit.cloud"
LIVEKIT_API_KEY = "APIVZJeW43TvC4Q"
LIVEKIT_API_SECRET = "w2WWueQHqYu3xlbTwdjpuhzst0tf55zWtwtV8bHTq3A"

async def test_livekit():
    client = api.LiveKitAPI(
        LIVEKIT_URL,
        api_key=LIVEKIT_API_KEY,
        api_secret=LIVEKIT_API_SECRET
    )
    rooms = await client.room.list_rooms(api.ListRoomsRequest())
    print("✅ LiveKit connected. Rooms:", rooms)
    await client.aclose()

await test_livekit()

# ------------------ Whisper Model ------------------
model = whisper.load_model("base")

# ------------------ Stable Diffusion Turbo (FAST) ------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/sd-turbo",
    torch_dtype=torch.float16 if device=="cuda" else torch.float32
).to(device)

# ------------------ Simple Rule-Based Chatbot ------------------
def generate_reply(user_text):
    text = user_text.lower()
    if "how are you" in text:
        return "I'm good, how about you?"
    elif "hello" in text or "hi" in text:
        return "Hello! Nice to meet you."
    elif "your name" in text:
        return "I’m your AI Voice Assistant connected with LiveKit."
    elif "bye" in text:
        return "Goodbye! Have a wonderful day."
    else:
        return "I heard you say: " + user_text

# ------------------ Voice Chat Pipeline ------------------
def voice_chat(audio_file):
    try:
        if audio_file is None:
            return "⚠️ No audio received", None, None

        # Step 1: Transcribe speech
        result = model.transcribe(str(audio_file))
        user_text = result["text"]

        # Step 2: Generate chatbot reply
        reply = generate_reply(user_text)

        # Step 3: Convert reply to speech
        reply_path = "reply.mp3"
        tts = gTTS(reply)
        tts.save(reply_path)

        # Step 4: Generate image ONLY if user asks
        image_path = None
        if any(word in user_text.lower() for word in ["draw", "generate", "create", "picture", "image"]):
            image = pipe(user_text, num_inference_steps=4).images[0]  # ⚡ super fast
            image_path = "generated.png"
            image.save(image_path)

        return f"🗣 You said: {user_text}\n🤖 Reply: {reply}", reply_path, image_path

    except Exception as e:
        return f"❌ Error: {str(e)}", None, None

# ------------------ Gradio UI ------------------
with gr.Blocks(css=".gradio-container {background: linear-gradient(to right, #fce4ec, #e3f2fd);} ") as demo:
    gr.Markdown("<h1 style='color:#1976d2;'>🎤 AI Voice + Image Assistant (LiveKit + Whisper + SD-Turbo)</h1>")
    gr.Markdown("Speak into the mic → AI will reply with text + voice 🎙 + generate an image if you ask 🎨")

    with gr.Row():
        inp = gr.Audio(sources=["microphone"], type="filepath", label="🎙 Speak here")

    with gr.Row():
        out_text = gr.Textbox(label="Chatbot Response", lines=3)
        out_audio = gr.Audio(label="🔊 AI Voice Reply")
        out_image = gr.Image(label="🎨 AI Generated Image")

    submit_btn = gr.Button("Submit 🎯")

    submit_btn.click(fn=voice_chat, inputs=inp, outputs=[out_text, out_audio, out_image])

demo.launch()


Collecting livekit-api
  Downloading livekit_api-1.0.5-py3-none-any.whl.metadata (1.5 kB)
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/803.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting types-protobuf>=4 (from livekit-api)
  Downloading types_protobuf-6.32.1.20250918-py3-none-any.whl.metadata (2.2 kB)
Collecting livekit-protocol<2.0.0,>=1.0.4 (from livekit-api)
  Downloading

100%|███████████████████████████████████████| 139M/139M [00:04<00:00, 31.5MiB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

scheduler_config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

text_encoder/model.safetensors:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

vae/diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://36e823d1818fc3f020.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


