In [1]:
import os
import json
import tempfile
from io import BytesIO
from typing import Tuple, Optional

from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr
from PIL import Image
import base64

In [2]:
# Load settings through python-dotenv, passing override=True.
load_dotenv(override=True)

# Model identifiers, one per modality used in this notebook.
MODEL_TEXT = "gpt-4o-mini"   # text chat
MODEL_STT = "whisper-1"      # speech-to-text
MODEL_TTS = "tts-1"          # text-to-speech
MODEL_IMG = "dall-e-3"       # image generation

# Report whether a key is present; only its first 8 characters are ever printed.
openai_api_key = os.getenv('OPENAI_API_KEY')
print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)

# Client shared by every helper below.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment - confirm.
openai = OpenAI()

OpenAI API Key exists and begins sk-proj-


In [3]:
# System prompt sent as the first message of every chat request:
# persona, answer length, and a nudge towards the city-facts tool.
system_message = " ".join([
    "You are FriendlyAfrica, a helpful, concise airline-style assistant focused on African cities.",
    "Keep answers short (1–2 sentences), accurate, and say when you don't know.",
    "If a user mentions a city, consider using available tools to fetch a quick, curated snapshot.",
])

In [4]:
# Curated city snapshots, keyed by the lower-cased city name.
# Every record carries the same four fields: country, population_approx,
# highlights (list of place names) and blurb (one-sentence summary).
city_facts = {
    "lagos": dict(
        country="Nigeria",
        population_approx="15–20M (metro)",
        highlights=["Eko Atlantic", "Lekki Conservation Centre", "National Theatre"],
        blurb="Lagos is West Africa’s commercial hub and one of the world’s fastest-growing cities.",
    ),
    "nairobi": dict(
        country="Kenya",
        population_approx="4–5M (city)",
        highlights=["Nairobi National Park", "Karen Blixen Museum", "Giraffe Centre"],
        blurb="Nairobi blends urban energy with wildlife at its doorstep.",
    ),
    "cape town": dict(
        country="South Africa",
        population_approx="4–5M (metro)",
        highlights=["Table Mountain", "V&A Waterfront", "Cape Point"],
        blurb="Cape Town is famed for dramatic landscapes, beaches, and wine country.",
    ),
    "cairo": dict(
        country="Egypt",
        population_approx="20–22M (metro)",
        highlights=["Pyramids of Giza", "Egyptian Museum", "Khan el-Khalili"],
        blurb="Cairo sits along the Nile and anchors millennia of history.",
    ),
    "accra": dict(
        country="Ghana",
        population_approx="2–3M (city)",
        highlights=["Jamestown", "Kwame Nkrumah Mausoleum", "Labadi Beach"],
        blurb="Accra offers coastal vibes, art, and growing culinary scenes.",
    ),
}

In [5]:
def get_city_info(city: str) -> dict:
    """
    Look up a city in the local ``city_facts`` table.

    Matching ignores surrounding whitespace and letter case; a falsy ``city``
    is treated as an empty name. The returned dict always echoes the argument
    unchanged under ``"city"``. A known city contributes its stored fields;
    anything else gets "Unknown" placeholders and an empty highlights list.
    """
    lookup_key = (city or "").strip().lower()

    if lookup_key not in city_facts:
        return {
            "city": city,
            "blurb": "Unknown",
            "country": "Unknown",
            "population_approx": "Unknown",
            "highlights": [],
        }

    snapshot = {"city": city}
    snapshot.update(city_facts[lookup_key])
    return snapshot

In [6]:
# Function-calling description of get_city_info: one required string
# argument, "city", and no other properties allowed.
city_info_function = {
    "name": "get_city_info",
    "description": "Get a compact snapshot for the named African city (country, approximate population, highlights, and a short blurb).",
    "parameters": dict(
        type="object",
        properties={
            "city": dict(
                type="string",
                description="The city name, e.g., 'Lagos', 'Nairobi', 'Cairo'.",
            ),
        },
        required=["city"],
        additionalProperties=False,
    ),
}

# Tool list passed to the chat completion request when tools are enabled.
tools = [dict(type="function", function=city_info_function)]

In [7]:
def handle_tool_call(message) -> Tuple[dict, str]:
    """
    Answer the first tool call on an assistant message with city data.

    Args:
        message: An object exposing ``tool_calls``; only ``tool_calls[0]`` is
            read (its ``id`` and ``function.arguments``).

    Returns:
        ``(tool_message, city)``. ``tool_message`` is a ``role="tool"`` dict
        whose content is the JSON-encoded result of ``get_city_info`` and whose
        ``tool_call_id`` matches the call. ``city`` is the requested city name,
        or ``""`` when the arguments were missing or unusable.
    """
    tool_call = message.tool_calls[0]
    raw_arguments = tool_call.function.arguments

    # The arguments are model-generated text: they may be malformed JSON, or
    # valid JSON that is not an object. Either way fall back to "no city"
    # instead of raising and aborting the whole chat turn.
    try:
        args = json.loads(raw_arguments) if raw_arguments else {}
    except (TypeError, ValueError):
        args = {}
    if not isinstance(args, dict):
        args = {}

    # get_city_info calls string methods on the name, and the declared return
    # type is str, so anything that is not a string becomes "".
    city = args.get("city", "")
    if not isinstance(city, str):
        city = ""

    data = get_city_info(city)
    response = {
        "role": "tool",
        "content": json.dumps(data),
        "tool_call_id": tool_call.id,
    }
    return response, city

In [8]:
def artist(city: str) -> Optional[Image.Image]:
    """
    Generate a pop-art travel poster for ``city`` with the image model.

    Returns ``None`` without calling the API when ``city`` is falsy. Otherwise
    requests a single 1024x1024 image as base64, decodes it, and returns the
    result of ``Image.open`` on the decoded bytes.
    """
    if not city:
        return None

    poster_prompt = (
        f"An inviting travel poster of {city} in Africa, showing iconic landmarks and local vibe, "
        "in a vibrant pop-art style, high detail."
    )
    generation = openai.images.generate(
        model=MODEL_IMG,
        prompt=poster_prompt,
        size="1024x1024",
        n=1,
        response_format="b64_json",
    )

    # Exactly one image was requested, so the payload is the first data entry.
    raw_bytes = base64.b64decode(generation.data[0].b64_json)
    return Image.open(BytesIO(raw_bytes))

In [9]:
def transcribe(filepath: str) -> str:
    """
    Turn a local audio file into text with the speech-to-text model.

    A falsy ``filepath`` yields ``""`` without touching the API. Otherwise the
    file is opened in binary mode and sent for transcription; the resulting
    text is returned, or ``""`` when the response carries no text.
    Acceptable formats include WAV/MP3/M4A/WEBM etc.
    """
    if filepath:
        with open(filepath, "rb") as audio_file:
            transcription = openai.audio.transcriptions.create(
                model=MODEL_STT,
                file=audio_file,
            )
        return transcription.text or ""
    return ""

In [10]:
def synthesize_tts(text: str, voice: str = "alloy") -> str:
    """
    Synthesize speech for ``text`` into a temporary ``.mp3`` file.

    Args:
        text: What to speak. A falsy value short-circuits with no API call.
        voice: Voice name forwarded to the text-to-speech request.

    Returns:
        Path of the written file, or ``""`` when ``text`` is falsy. The file is
        created with ``delete=False``, so it is not removed automatically.
    """
    if not text:
        return ""

    resp = openai.audio.speech.create(
        model=MODEL_TTS,
        voice=voice,
        input=text,
    )

    # The with-block closes the handle even if the write raises; leaving the
    # block also flushes the data, so the path is ready to read on return.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        tmp.write(resp.content)
    return tmp.name

In [11]:
def chat_once(user_message: str, history_messages: list, enable_tools: bool = True) -> Tuple[str, Optional[Image.Image], Optional[str]]:
    """
    Run one assistant turn, resolving any tool calls the model makes.

    Args:
        user_message: The new user utterance.
        history_messages: Earlier role/content message dicts; ``None`` is
            treated as an empty history.
        enable_tools: When true, the city-facts tool is offered to the model.

    Returns:
        ``(reply_text, image, city)``. ``reply_text`` is never ``None`` (an
        absent reply becomes ``""``). ``image`` is always ``None`` here; the
        slot exists for callers that unpack three values. ``city`` is the
        first non-empty city the model asked the tool about, else ``None``.
    """
    from types import SimpleNamespace

    messages = (
        [{"role": "system", "content": system_message}]
        + list(history_messages or [])
        + [{"role": "user", "content": user_message}]
    )

    # Leave the `tools` argument out entirely when tools are disabled instead
    # of sending an explicit null, which the endpoint may reject.
    request = {"model": MODEL_TEXT, "messages": messages}
    if enable_tools:
        request["tools"] = tools
    response = openai.chat.completions.create(**request)

    img = None
    chosen_city = None

    choice = response.choices[0]
    message = choice.message
    if enable_tools and choice.finish_reason == "tool_calls" and message.tool_calls:
        messages.append(message)
        # The assistant message is appended whole, so every tool call on it
        # needs its own tool reply; answering only the first leaves the rest
        # dangling when the model requests several cities in one turn.
        for tool_call in message.tool_calls:
            # handle_tool_call only looks at tool_calls[0], so present each
            # call to it on its own.
            single_call = SimpleNamespace(tool_calls=[tool_call])
            tool_response, city = handle_tool_call(single_call)
            messages.append(tool_response)
            if city and not chosen_city:
                chosen_city = city
        response = openai.chat.completions.create(model=MODEL_TEXT, messages=messages)

    reply = response.choices[0].message.content or ""
    return reply, img, chosen_city

In [12]:
def build_ui():
    """
    Assemble the Gradio Blocks app: chat panel, voice and image outputs,
    text/microphone inputs, and feature toggles.

    Conversation history is kept in a ``gr.State`` that every handler both
    reads and writes, so each LLM call receives the earlier turns and the chat
    panel accumulates them.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    with gr.Blocks(title="FriendlyAfrica: Multimodal AI for African Cities") as demo:
        gr.Markdown("## FriendlyAfrica: Ask about African cities via text or voice")
        with gr.Row():
            chatbot = gr.Chatbot(label="FriendlyAfrica", type="messages", height=400)
            with gr.Column():
                tts_audio = gr.Audio(label="Assistant Voice", type="filepath", autoplay=True)
                city_image = gr.Image(label="City Image (optional)", interactive=False)

        with gr.Row():
            with gr.Column(scale=3):
                text_in = gr.Textbox(placeholder="Type your question (e.g., 'Tell me about Nairobi') and press Enter", show_label=False)
                mic_in = gr.Audio(sources=["microphone"], type="filepath", label="Or ask by voice")
                with gr.Row():
                    btn_transcribe_and_ask = gr.Button("Transcribe & Ask (from mic)", variant="primary")
                    btn_clear = gr.Button("Clear")

            with gr.Column(scale=2):
                enable_tts = gr.Checkbox(value=True, label="Speak answers (TTS)")
                voice = gr.Radio(choices=["alloy", "onyx"], value="alloy", label="Voice")
                enable_image = gr.Checkbox(value=False, label="Generate city image if a city is detected")
                enable_tools = gr.Checkbox(value=True, label="Use curated city facts tool", info="Calls get_city_info when helpful")

        state = gr.State([])  # LLM message history (list of role/content dicts)

        # Every handler returns one value per component in this list, in order.
        # `state` must be listed here: a State that is only an input never
        # changes, which would leave each turn with an empty history.
        outputs = [chatbot, state, city_image, tts_audio, text_in]

        def leave_unchanged():
            """Return a no-op update for every output component."""
            return tuple(gr.update() for _ in outputs)

        def respond(user_msg, hist, do_tts, voice_opt, do_image, do_tools):
            """Run one chat turn and build the values for `outputs`."""
            history = list(hist or [])
            reply, _unused_img, city = chat_once(user_msg, history, enable_tools=bool(do_tools))
            reply = reply or ""
            new_hist = history + [
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": reply},
            ]

            # Voice and image are extras: a failure in either is reported but
            # must not cost the user the text reply.
            # "No audio" is None; an empty string is not a usable file path.
            audio_path = None
            if do_tts and reply:
                try:
                    audio_path = synthesize_tts(reply, voice=voice_opt or "alloy") or None
                except Exception as exc:
                    print(f"Speech synthesis failed: {exc}")

            img_out = None
            if do_image and city:
                try:
                    img_out = artist(city)
                except Exception as exc:
                    print(f"Image generation failed for {city!r}: {exc}")

            # Same history goes to the visible chat and to the stored state;
            # the trailing "" empties the textbox.
            return new_hist, new_hist, img_out, audio_path, ""

        def on_text_submit(user_msg, hist, do_tts, voice_opt, do_image, do_tools):
            """Handle Enter in the textbox; blank input changes nothing."""
            user_msg = (user_msg or "").strip()
            if not user_msg:
                return leave_unchanged()
            return respond(user_msg, hist, do_tts, voice_opt, do_image, do_tools)

        def on_transcribe_and_ask(mic_path, hist, do_tts, voice_opt, do_image, do_tools):
            """Transcribe the recording, then treat the text like a typed question."""
            user_msg = (transcribe(mic_path) if mic_path else "").strip()
            if not user_msg:
                return leave_unchanged()
            return respond(user_msg, hist, do_tts, voice_opt, do_image, do_tools)

        def on_clear():
            """Reset the chat panel, stored history, image, audio and textbox."""
            return [], [], None, None, ""

        # Wire events
        text_in.submit(
            on_text_submit,
            inputs=[text_in, state, enable_tts, voice, enable_image, enable_tools],
            outputs=outputs,
        )

        btn_transcribe_and_ask.click(
            on_transcribe_and_ask,
            inputs=[mic_in, state, enable_tts, voice, enable_image, enable_tools],
            outputs=outputs,
        )

        btn_clear.click(
            on_clear,
            inputs=[],
            outputs=outputs,
        )

    return demo

In [13]:
# Build the interface and start the local server. Any failure while building
# or launching is reported as a single line rather than raised.
try:
    demo = build_ui()
    demo.launch()
except Exception as launch_error:
    print("Failed to launch UI:", launch_error)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.
