In [1]:
import whisper
import sounddevice as sd
import numpy as np
import queue
import time
from diffusers import StableDiffusionPipeline
import torch
from PIL import Image
from IPython.display import display

In [2]:
# Hand-off buffer: the audio stream callback pushes captured blocks here and
# the transcription step pops them.
audio_queue = queue.Queue()

# Speech-to-text model, loaded once for the notebook ("base" checkpoint).
model = whisper.load_model("base")

# Callback function for audio streaming
def callback(indata, frames, time, status):
    """Audio stream callback: report status flags and queue the captured block.

    Args:
        indata: The block of captured audio; a copy of it is queued.
        frames: Unused here.
        time: Unused here. Note that this parameter shadows the ``time``
            module inside this function only.
        status: Printed when truthy.
    """
    if status:
        print(status)
    # Queue a copy so the stored block is independent of `indata`.
    block = indata.copy()
    audio_queue.put(block)

# Function to transcribe audio
def transcribe_audio(duration):
    """Collect queued audio for `duration` seconds and transcribe it with Whisper.

    Audio blocks are taken from the module-level ``audio_queue`` (filled by
    ``callback``) until the deadline passes. The blocks are then joined,
    fitted to Whisper's input length with ``whisper.pad_or_trim`` and decoded
    as English with the module-level ``model``.

    Args:
        duration: How long to keep collecting audio blocks, in seconds.

    Returns:
        The decoded text. It is also printed with a "Transcription:" prefix.

    Raises:
        ValueError: If no audio block arrived before the deadline (for
            example ``duration <= 0`` or no input stream is running).
    """
    # A monotonic clock keeps the deadline immune to system clock adjustments.
    deadline = time.monotonic() + duration
    audio_data_list = []

    # Collect audio data until the deadline. The timeout on get() ensures a
    # stalled stream cannot block this loop past `duration`.
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        try:
            audio_data_list.append(audio_queue.get(timeout=remaining))
        except queue.Empty:
            break

    if not audio_data_list:
        # Fail with a clear message instead of the opaque error that
        # np.concatenate raises on an empty list.
        raise ValueError("No audio was captured; is the input stream running?")

    # Join the blocks and drop the singleton channel axis.
    audio_data_combined = np.squeeze(np.concatenate(audio_data_list, axis=0))

    # Prepare audio for Whisper
    audio_data_combined = whisper.pad_or_trim(audio_data_combined)
    mel = whisper.log_mel_spectrogram(audio_data_combined).to(model.device)

    # Decode audio to text. Half precision is only requested off-CPU, since
    # CPU kernels do not reliably support fp16.
    use_fp16 = torch.device(model.device).type != "cpu"
    options = whisper.DecodingOptions(language="en", fp16=use_fp16)
    result = model.decode(mel, options)
    print("Transcription:", result.text)
    return result.text

# Function to generate an image using Stable Diffusion
def generate_image(prompt, num_inference_steps=50, guidance_scale=8.0):
    """Generate, show and return one Stable Diffusion image for `prompt`.

    Args:
        prompt: Text prompt passed to the pipeline.
        num_inference_steps: Number of denoising steps for the pipeline call.
        guidance_scale: Guidance scale for the pipeline call.

    Returns:
        The first generated image (``.images[0]`` of the pipeline result).
    """
    # Local import: only this function needs it (for the no-GPU fallback).
    from contextlib import nullcontext

    # Use the GPU when one is present instead of failing outright without it.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load Stable Diffusion model
    pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(device)

    print(f"Generating image for prompt: '{prompt}'")

    # Mixed precision is applied on CUDA only; on CPU the pipeline runs as loaded.
    autocast_ctx = torch.autocast("cuda") if device == "cuda" else nullcontext()

    # Generate image. The step count is passed to the call itself; assigning
    # it as an attribute on the scheduler does not configure the run.
    with autocast_ctx:
        generated_image = pipe(
            prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        ).images[0]

    # Display the generated image: external viewer via PIL, plus inline
    # rendering through IPython.
    generated_image.show()
    display(generated_image)
    return generated_image

  checkpoint = torch.load(fp, map_location=device)


In [None]:
def main(duration=15):
    """Record speech, transcribe it, and generate an image from the text.

    Args:
        duration: Seconds of audio to collect before transcribing.
    """
    # Discard blocks left over from an earlier run. The stream stays open
    # while transcribe_audio() decodes, so callback() keeps queueing audio
    # after collection has stopped; without this, a re-run would transcribe
    # that stale audio first.
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            break

    # Start recording and transcribing audio (mono, 16 kHz).
    with sd.InputStream(callback=callback, channels=1, samplerate=16000):
        # Report the actual duration rather than a separately hard-coded number.
        print(f"Recording for {duration} seconds...")
        transcription = transcribe_audio(duration)
        print("Finished recording.")

    # Use transcription as a prompt for image generation
    generate_image(transcription)

if __name__ == "__main__":
    main()

Recording for 15 seconds...
Transcription: The futuristic cityscape was sunset with flying cars.
Finished recording.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Generating image for prompt: 'The futuristic cityscape was sunset with flying cars.'


  0%|          | 0/50 [00:00<?, ?it/s]