# Airline Customer Assistant Chatbot with Multi-modality

We can use DALL-E-3, the image generation model behind GPT-4o, to generate some images for us.

Let's put this in a function called artist.

### Price alert: each time I generate an image it costs about 4 cents - image generation is a bit expensive!

In [1]:
# imports

import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr

In [2]:
# Some imports for handling images and audio

import base64
from io import BytesIO
from PIL import Image
from IPython.display import Audio, display

In [3]:
# Initialization

# Load variables from the .env file, letting them override anything already set
load_dotenv(override=True)

# Report whether the API key is available, echoing only its first 8 characters
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    print("OpenAI API Key not set")
else:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")

# Chat model name and the client object used by the functions below
MODEL = "gpt-4o-mini"
openai = OpenAI()

# As an alternative, if you'd like to use Ollama instead of OpenAI
# Check that Ollama is running for you locally (see week1/day2 exercise) then uncomment these next 2 lines
# MODEL = "llama3.2"
# openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')


OpenAI API Key exists and begins sk-proj-


In [4]:
# System prompt for the FlightAI assistant: one-sentence, courteous, accurate answers
system_message = (
    "You are a helpful assistant for an Airline called FlightAI. "
    "Give short, courteous answers, no more than 1 sentence. "
    "Always be accurate. If you don't know the answer, say so."
)

In [5]:
# Let's start by writing a tool function that returns the ticket price for a city

# Ticket prices keyed by lower-cased city name; get_ticket_price lower-cases
# its argument, so every key here must be lower case for the lookup to match.
ticket_prices = {"chennai": "$799", "bangalore": "$899", "delhi": "$999", "kochin": "$1100"}

def get_ticket_price(destination_city):
    """Return the ticket price for a destination city.

    The lookup ignores letter case and surrounding whitespace.

    Args:
        destination_city: City name as supplied by the model, e.g. "Delhi".

    Returns:
        The price string from ``ticket_prices``, or "Unknown" when the city
        is not listed or the argument is not a string (e.g. ``None``).
    """
    print(f"Tool get_ticket_price called for {destination_city}")
    # The tool arguments come from the model, so the city may be missing
    if not isinstance(destination_city, str):
        return "Unknown"
    city = destination_city.strip().lower()
    return ticket_prices.get(city, "Unknown")

In [6]:
# There's a particular dictionary structure that's required to describe our function:

# Describes get_ticket_price to the model: its name, when to call it, and its
# single required string argument "destination_city".
price_function = dict(
    name="get_ticket_price",
    description="Get the price of a return ticket to the destination city. Call this whenever you need to know the ticket price, for example when a customer asks 'How much is a ticket to this city'",
    parameters=dict(
        type="object",
        properties=dict(
            destination_city=dict(
                type="string",
                description="The city that the customer wants to travel to",
            ),
        ),
        required=["destination_city"],
        additionalProperties=False,
    ),
)

In [7]:
# And this is included in a list of tools:

# The tool list passed to the chat completion call: only the price lookup
tools = [dict(type="function", function=price_function)]

In [8]:
# Getting OpenAI to use our tool
# The LLM never runs the tool itself: it tells us that it wants the tool run,
# and we run it and send the result back.

# That is the job of handle_tool_call:

def handle_tool_call(message):
    """Run the price lookup requested by the first tool call in ``message``.

    Only ``message.tool_calls[0]`` is processed.

    Returns:
        A tuple ``(tool_message, city)``: the "tool" role message carrying
        the JSON-encoded city and price, and the requested city.
    """
    first_call = message.tool_calls[0]
    city = json.loads(first_call.function.arguments).get('destination_city')
    payload = json.dumps({"destination_city": city, "price": get_ticket_price(city)})
    # tool_call_id ties this result back to the call the model made
    tool_message = {"role": "tool", "content": payload, "tool_call_id": first_call.id}
    return tool_message, city

In [9]:
# Function to generate images
def artist(city):
    """Generate a pop-art vacation image for ``city`` with DALL-E 3.

    Requests one 1024x1024 image as base64 and returns it decoded as a PIL
    image. Every call makes an image-generation request.
    """
    prompt = f"An image representing a vacation in {city}, showing tourist spots and everything unique about {city}, in a vibrant pop-art style"
    result = openai.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size="1024x1024",
        n=1,
        response_format="b64_json",
    )
    raw_bytes = base64.b64decode(result.data[0].b64_json)
    return Image.open(BytesIO(raw_bytes))

In [None]:
#image = artist("New York City")
#display(image)

In [10]:
# Function to generate audio

def talker(message):
    """Speak ``message`` using OpenAI text-to-speech.

    The audio is saved to "output_audio.mp3" in the working directory
    (overwritten on every call) and played through IPython's display.
    """
    speech = openai.audio.speech.create(
        model="tts-1",
        voice="fable",  # alternatives: "alloy", "echo", "onyx", "nova", "shimmer"
        input=message,
    )

    output_filename = "output_audio.mp3"
    with open(output_filename, "wb") as mp3_file:
        mp3_file.write(speech.content)

    # Play the saved file
    display(Audio(output_filename, autoplay=True))



In [None]:
#talker("Hello All! Welcome to AI learning session")

In [11]:
def transcribe_audio(audio_file):
    """Transcribe the audio file at path ``audio_file`` with Whisper.

    Returns the ``text`` attribute of the transcription result.
    """
    with open(audio_file, "rb") as audio_handle:
        result = openai.audio.transcriptions.create(model="whisper-1", file=audio_handle)
    return result.text

In [12]:
# Translate between languages using GPT
def translate(text, source_lang, target_lang):
    """Translate ``text`` from ``source_lang`` to ``target_lang``.

    Sends a single user message to "gpt-3.5-turbo" and returns the reply
    with leading and trailing whitespace removed.
    """
    request = [{
        "role": "user",
        "content": f"Translate the following text from {source_lang} to {target_lang}:\n\n{text}",
    }]
    completion = openai.chat.completions.create(model="gpt-3.5-turbo", messages=request)
    reply = completion.choices[0].message.content
    return reply.strip()

In [13]:
# Chatbot logic: handle both text and audio input
def chatbot_dual(history):
    """Produce the assistant's next reply for the conversation in ``history``.

    When the model asks for the price tool, the tool is run, an image of the
    requested city is generated, and the model is called again with the tool
    result. The reply is appended to ``history`` in place and spoken aloud.

    Returns:
        A tuple ``(history, image)``; ``image`` is None when no tool call
        was made.
    """
    system_turn = {"role": "system", "content": system_message}
    messages = [system_turn] + history
    completion = openai.chat.completions.create(model=MODEL, messages=messages, tools=tools)
    first_choice = completion.choices[0]
    image = None

    if first_choice.finish_reason == "tool_calls":
        tool_request = first_choice.message
        tool_result, city = handle_tool_call(tool_request)
        # The model's request and our result must both go back to the model
        messages.extend([tool_request, tool_result])
        image = artist(city)
        completion = openai.chat.completions.create(model=MODEL, messages=messages)

    reply = completion.choices[0].message.content
    history.append({"role": "assistant", "content": reply})

    # Comment out or delete the next line if you'd rather skip audio for now
    talker(reply)
    return history, image

In [14]:
# More involved Gradio code as we're not using the preset Chat interface!
# Passing in inbrowser=True in the last line will cause a Gradio window to pop up immediately.

with gr.Blocks() as ui:
    with gr.Row():
        chatbot = gr.Chatbot(height=500, type="messages")
        image_output = gr.Image(height=500)

    with gr.Row():
        text_input = gr.Textbox(label="Chat with our AI Assistant:")
        audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Or speak to the assistant")

    with gr.Row():
        # voice_output = gr.Audio(label="Bot Voice Reply", autoplay=True)
        clear = gr.Button("Clear")

    def do_entry(message, audio, history):
        """Add typed and/or spoken input to the history and reset both inputs.

        Returns:
            ``("", None, history)``: an empty textbox, a cleared audio
            input, and the updated history.
        """
        history = history or []  # tolerate a missing history value
        if message:
            history += [{"role":"user", "content":message}]
        if audio:
            transcript = transcribe_audio(audio)
            if transcript:  # skip recordings that produced no text
                history += [{"role":"user", "content":transcript}]
        return "", None, history

    def respond_if_pending(history):
        """Call the assistant only when the newest message is from the user.

        do_entry resets audio_input to None, and that value change fires
        audio_input.change a second time with nothing new to answer. Without
        this guard every spoken message (and every empty text submit) would
        trigger an extra chat, image and speech request.
        """
        if not history or history[-1].get("role") != "user":
            # gr.update() with no arguments leaves the current image untouched
            return history, gr.update()
        return chatbot_dual(history)

    # Both entry points share the same two-step pipeline
    for trigger in (text_input.submit, audio_input.change):
        trigger(
            do_entry,
            inputs=[text_input, audio_input, chatbot],
            outputs=[text_input, audio_input, chatbot],
        ).then(respond_if_pending, inputs=chatbot, outputs=[chatbot, image_output])

    # Clear the conversation together with the image that belonged to it
    clear.click(lambda: (None, None), inputs=None, outputs=[chatbot, image_output], queue=False)

ui.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7869
* To create a public link, set `share=True` in `launch()`.




ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "c:\Users\rathi\Projects\AI-Coding\.venv\Lib\site-packages\uvicorn\protocols\http\httptools_impl.py", line 409, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\rathi\Projects\AI-Coding\.venv\Lib\site-packages\uvicorn\middleware\proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\rathi\Projects\AI-Coding\.venv\Lib\site-packages\fastapi\applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "c:\Users\rathi\Projects\AI-Coding\.venv\Lib\site-packages\starlette\applications.py", line 112, in __call__
    await self.middleware_stack(scope, receive, send)
  File "c:\Users\rathi\Projects\AI-Coding\.venv\Lib\site-packages\starlette\middleware\errors.py", line 187, in __call_

Tool get_ticket_price called for Chennai


In [None]:
from pydub import AudioSegment
import tempfile
import os


In [None]:
def text_to_speech(text):
    """Convert text to speech using OpenAI TTS.

    Args:
        text: The text to speak.

    Returns:
        Path of a temporary .mp3 file holding the audio, or None when
        ``text`` is empty/blank/not a string or the request fails. The file
        is not deleted automatically.
    """
    # Nothing to speak: skip the API call entirely
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        response = openai.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text
        )
        # delete=False keeps the file on disk after the handle is closed so
        # the returned path stays usable. The bytes are written through the
        # open handle rather than via the deprecated stream_to_file(), which
        # re-opened the same path while this handle was still open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            tmp_file.write(response.content)
        return tmp_file.name
    except Exception as e:
        # Best effort: a failed TTS call must not break the chat reply
        print(f"TTS Error: {e}")
        return None

In [None]:
def speech_to_text(audio):
    """Transcribe the recording at path ``audio`` using OpenAI Whisper.

    Returns the transcribed text, or an empty string when ``audio`` is None
    or anything goes wrong (the error is printed).
    """
    if audio is None:
        return ""

    try:
        with open(audio, "rb") as recording:
            return openai.audio.transcriptions.create(
                model="whisper-1",
                file=recording,
            ).text
    except Exception as err:
        print(f"STT Error: {err}")
        return ""

In [None]:
def chat_with_ai(message, history):
    """Chat with AI and return both text and audio.

    Earlier turns in ``history`` are sent along with the new message so the
    model can answer in context.

    Args:
        message: The user's new message.
        history: List of ``[user_text, assistant_text]`` pairs; the new
            exchange is appended to it in place. None is treated as empty.

    Returns:
        A tuple ``("", audio_path, history)``: an empty string for the
        textbox, the path returned by text_to_speech (None when there was
        no message), and the updated history.
    """
    if history is None:
        history = []
    if not message or not message.strip():
        return "", None, history

    # Replay the earlier turns so the model sees the whole conversation
    messages = []
    for user_text, assistant_text in history:
        if isinstance(user_text, str) and user_text:
            messages.append({"role": "user", "content": user_text})
        if isinstance(assistant_text, str) and assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})

    # Get AI response
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages
    )
    ai_reply = response.choices[0].message.content

    # Record the exchange only once the call has succeeded, so a failed
    # request does not leave a half-filled row in the history
    history.append([message, ai_reply])

    # Generate audio for AI response
    audio_path = text_to_speech(ai_reply)

    return "", audio_path, history


In [None]:
def process_audio_input(audio, history):
    """Transcribe a recording and pass the text to chat_with_ai.

    Returns ``("", None, history)`` unchanged when there is no recording or
    the transcription is empty; otherwise returns chat_with_ai's result.
    """
    nothing_to_do = ("", None, history)
    if audio is None:
        return nothing_to_do

    # Convert audio to text, then let the chat function answer it
    transcript = speech_to_text(audio)
    return chat_with_ai(transcript, history) if transcript else nothing_to_do

def clear_chat():
    """Return the reset values: an empty chat history and no audio."""
    empty_history, no_audio = [], None
    return empty_history, no_audio

In [None]:
# Create the Gradio interface
with gr.Blocks(title="AI Chatbot with Audio") as ui:
    # Row 1: the conversation
    with gr.Row():
        chatbot = gr.Chatbot(label="Chat History", height=400, show_label=True)

    # Row 2: typed input beside recorded input
    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Type your message",
                placeholder="Enter your message here...",
                lines=2,
            )

        with gr.Column(scale=1):
            audio_input = gr.Audio(label="Or record audio", type="filepath")

    # Row 3: send button, spoken reply, clear button
    with gr.Row():
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send Message", variant="primary")

        with gr.Column(scale=1):
            voice_output = gr.Audio(
                label="AI Voice Response (Auto-play)",
                type="filepath",
                interactive=False,
                autoplay=True,  # play the reply as soon as it arrives
            )

        with gr.Column(scale=1):
            clear = gr.Button("Clear Chat", variant="secondary")

    # Event wiring: every reply updates the textbox, the voice player and the chat
    reply_outputs = [text_input, voice_output, chatbot]
    typed_inputs = [text_input, chatbot]

    text_input.submit(fn=chat_with_ai, inputs=typed_inputs, outputs=reply_outputs)
    submit_btn.click(fn=chat_with_ai, inputs=typed_inputs, outputs=reply_outputs)
    audio_input.change(fn=process_audio_input, inputs=[audio_input, chatbot], outputs=reply_outputs)
    clear.click(fn=clear_chat, outputs=[chatbot, voice_output])

# Start the app and open it in a browser tab
ui.launch(inbrowser=True)