<a href="https://colab.research.google.com/github/sliscak/notebooks/blob/main/Advanced_Whisper%2BStable_Diffusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Speech-to-image generation using [Whisper](https://github.com/openai/whisper) and [Stable Diffusion](https://github.com/CompVis/stable-diffusion) from the [Diffusers](https://github.com/huggingface/diffusers) library.

---




### Install requirements

In [None]:
# Install (or upgrade to the latest release of) the packages this notebook
# uses: diffusers, gradio and ftfy from PyPI, and Whisper straight from its
# GitHub repository.
!pip install --upgrade diffusers
!pip install --upgrade gradio
!pip install --upgrade ftfy
!pip install git+https://github.com/openai/whisper.git

In [None]:
import gradio as gr
import whisper
import os
import torch
from torch import autocast
from diffusers import StableDiffusionPipeline
from google.colab import output
from huggingface_hub import notebook_login

In [None]:
# Turn on Colab's custom widget manager.
# NOTE(review): presumably needed so the notebook_login() widget renders in
# Colab -- confirm.
output.enable_custom_widget_manager()

In [None]:
# List the GPUs attached to this runtime; the model-loading cell hard-codes
# device = 'cuda', so a GPU must be present.
!nvidia-smi -L

In [None]:
# Log in to the Hugging Face Hub to verify acceptance of the model license;
# the Stable Diffusion pipeline is loaded with use_auth_token=True.
notebook_login()

In [None]:
# Everything runs on the GPU. A CPU fallback
# ('cuda' if torch.cuda.is_available() else 'cpu') was left disabled here.
device = 'cuda'

# Stable Diffusion v1.4 text-to-image pipeline, moved onto `device`.
# use_auth_token=True makes the download use the stored Hugging Face login.
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    use_auth_token=True,
).to(device)

# Whisper speech model; "base" can be swapped for "small", etc.
model = whisper.load_model("base")
model = model.to(device)

def transcribe(audio, language=None):
    """Turn a spoken prompt into English text and an image generated from it.

    The recording is decoded by the module-level Whisper ``model`` with
    ``task='translate'`` and the resulting text is used as the prompt for the
    module-level Stable Diffusion ``pipe``.

    Args:
        audio: Path to the recorded audio file (the Gradio audio input is
            configured with ``type="filepath"``).
        language: Language of the speech as a Whisper language code. ``None``
            or ``'Autodetect'`` lets the model pick the most probable
            language.

    Returns:
        A 3-tuple ``(status, prompt, image)``: a status line naming the
        language that was used, the decoded prompt text, and the first image
        produced by the pipeline.

    Raises:
        ValueError: If ``audio`` is missing or empty.
    """
    # The form can be submitted without a recording, in which case no file
    # path arrives here. Fail with a clear message instead of an obscure
    # error from deep inside the audio loader.
    if not audio:
        raise ValueError("No audio received: record a prompt before submitting.")

    samples = whisper.load_audio(audio)
    # Bring the waveform to the fixed length the model works on.
    samples = whisper.pad_or_trim(samples)
    mel = whisper.log_mel_spectrogram(samples).to(model.device)

    if language is None or language == 'Autodetect':
        # Pick the language with the highest detection probability.
        _, probs = model.detect_language(mel)
        language = max(probs, key=probs.get)

    options = whisper.DecodingOptions(language=language, task='translate')
    prompt = whisper.decode(model, mel, options).text

    # Run the diffusion pipeline under autocast on the module-level device.
    with autocast(device):
        image = pipe(prompt).images[0]
    return f'Detected language: {language}', prompt, image

# Gradio UI: a microphone recording (passed to `transcribe` as a file path)
# plus a language selector, so the user can choose a language in case it was
# not correctly detected. Outputs are the status line, the prompt text and the
# generated image.
# NOTE(review): `source=` is the older Gradio keyword; newer releases appear to
# use `sources=[...]` -- confirm against the installed version.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Dropdown(
            ["Autodetect", *whisper.tokenizer.LANGUAGES],
            value="Autodetect",
        ),
    ],
    outputs=["text", "text", "image"],
)

# Start the app with debug output enabled.
demo.launch(debug=True)