In [1]:
!pip install -q transformers==4.37.2
!pip install bitsandbytes==0.41.3 accelerate==0.25.0
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q gradio
!pip install -q gTTS


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


^C


In [None]:
import torch
from transformers import BitsAndBytesConfig, pipeline

In [None]:
quant_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype= torch.float16
)

In [None]:
model_id = "llava-hf/llava-1.5-7b-hf"

In [None]:
pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs={"quantization_config": quant_config}
)

RuntimeError: No GPU found. A GPU is needed for quantization.

In [None]:
pipe

In [None]:
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
from PIL import Image

In [None]:
image_path = "../static/skin+problems+2.jpg"

In [None]:
image = Image.open((image_path))

In [None]:
image

In [None]:
import nltk
nltk.download('punkt')
from nltk import sent_tokenize

In [None]:
max_new_tokens = 250

In [None]:
prompt_instructions ="""
Describe the image using as much as detail as possible.
You are a helpful AI assistant who is able to answer questions about the image.
What is the image all about?
Now generate the helpful answer.
"""

In [None]:
prompt = "User: <image>\n" + prompt_instructions + "\nAssistant:"

In [None]:
outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})

In [None]:
outputs

In [None]:
for sent in sent_tokenize(outputs[0]["generated_text"]):
  print(sent)

In [None]:
warnings.filterwarnings("ignore")

In [None]:
import numpy as np

In [None]:
torch.cuda.is_available()

In [None]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
print(f"Using torch {torch.__version__} ({DEVICE})")

In [None]:
import whisper

In [None]:
model = whisper.load_model("small", device=DEVICE)

In [None]:
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'}"
    f"and has {sum(np.prod(v.shape) for v in model.parameters()):,} parameters."
)

In [None]:
import re
import datetime

In [None]:
##Logger file
tstamp = datetime.datetime.now()
tstamp = str(tstamp).replace(" ", "_")
logfile = f"log_{tstamp}.txt"

In [None]:
def writehistory(text):
    with open(logfile, "a", encoding='utf-8') as f:
      f.write(text)
      f.write("\n")
    f.close()

In [None]:
import requests

In [None]:
def img2txt(input_text, input_image):

    # load the image
    image = Image.open(input_image)

    writehistory(f"Input text: {input_text} - Type: {type(input_text)} - Dir: {dir(input_text)}")
    if type(input_text) == tuple:
        prompt_instructions = """
        Describe the image using as much as detail as possible.
        You are a helpful AI assistant who is able to answer questions about the image.
        What is the image all about?
        Now generate the helpful answer.
        """
    else:
      prompt_instructions = """
      Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
      """+ input_text

    writehistory(f"prompt_instructions: {prompt_instructions}")
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    #properly extract the response text
    if outputs is not None and len(outputs[0]["generated_text"]) > 0:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
        if match:
            #Extract the text after "ASSISTANT:"
            reply = match.group(1)
        else:
            reply = "No response found."
    else:
      reply = "No response generated."

    return reply

In [None]:
def transcribe(audio):

    # Check if the audio input is None or empty
    if audio is None or audio == '':
        return ('','',None)  # Return empty strings and None audio file

    # language = 'en'

    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    _, probs = model.detect_language(mel)

    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text

    return result_text

In [None]:
def text_to_speech(text, file_path):
    language = 'en'

    audioobj = gTTS(text = text,
                    lang = language,
                    slow = False)

    audioobj.save(file_path)

    return file_path

In [None]:
import locale

In [None]:
print(locale.getlocale())

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3

In [None]:
import gradio as gr
import base64
import os

# A function to handle audio and image inputs
def process_inputs(audio_path, image_path):
    # Process the audio file (assuming this is handled by a function called 'transcribe')
    speech_to_text_output = transcribe(audio_path)

    # Handle the image input
    if image_path:
        chatgpt_output = img2txt(speech_to_text_output, image_path)
    else:
        chatgpt_output = "No image provided."

    # Assuming 'transcribe' also returns the path to a processed audio file
    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")  # Replace with actual path if different

    return speech_to_text_output, chatgpt_output, processed_audio_path

# Create the interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="AI Output"),
        gr.Audio("Temp.mp3")
    ],
    title="LLM powered voice assistant",
    description="Upload an image and interact via voice input and audio response."
)

# Launch the interface
iface.launch(debug=True)