# Install required Libraries

In [106]:
!pip install openai gradio pandas networkx matplotlib pypdf speechrecognition pydub python-dotenv beautifulsoup4 seaborn


# Imports

In [1]:
import os
import io
import json
import sys
import tempfile
from contextlib import redirect_stdout
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import numpy as np
import gradio as gr
from gradio.themes import Default
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns


#  Initialization 

In [2]:
# Load environment variables via python-dotenv before reading the API key.
load_dotenv()

API_KEY = os.environ.get("OPENAI_API_KEY")
if not API_KEY:
    # Fail fast: the OpenAI client below cannot be used without a key.
    raise ValueError("OPENAI_API_KEY not set in environment")

# Model name used for every chat-completion request in this notebook.
MODEL = "gpt-4o"
client = OpenAI(api_key=API_KEY)


# Global state 

In [3]:
# Module-level state shared by the tool functions and the chat handler.
#   df_state              -- the currently loaded DataFrame; None until
#                            ingest_csv has run successfully.
#   current_system_prompt -- system prompt rendered with the loaded column
#                            names; None until ingest_csv has run.
df_state = current_system_prompt = None


# System prompt template 

In [4]:
# Template for the system message sent to the model. `{columns}` is the only
# placeholder: ingest_csv fills it with the loaded column list, and chat
# falls back to "(none yet)" while no CSV has been loaded.
# NOTE(review): summarize_csv is only mentioned in the prose sentence below
# and is missing from the "Available functions" list -- consider listing it
# next to the other tools so the prompt matches get_function_schemas().
SYSTEM_PROMPT_TEMPLATE = """
You are DataBot, an expert data-analysis assistant.  
The dataframe columns are: {columns}.  
Use only these exact names when calling functions; never invent or guess new ones.

Available functions:
 • ingest_csv(file_path) → load the CSV and refresh column list  
 • execute_code(code) → run Python code with `df` as the dataframe, return stdout and any plot  
 

when required you can call summarize_csv function to provide summary.

Whenever you need to execute any kind of code or plot, generate a Python snippet and call execute_code.
"""

# Tools

In [5]:
# — Tool: ingest_csv —
def ingest_csv(file_path: str) -> dict:
    """Load a CSV file into the shared dataframe and refresh the system prompt.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A dict with ``"rows"`` (row count), ``"columns"`` (list of column
        names) and ``"dtypes"`` (column name -> dtype rendered as a string).

    Side effects:
        Rebinds the module globals ``df_state`` and ``current_system_prompt``.
    """
    global df_state, current_system_prompt

    frame = pd.read_csv(file_path)
    column_names = frame.columns.tolist()

    # Publish the new dataframe and a prompt that lists its columns.
    df_state = frame
    current_system_prompt = SYSTEM_PROMPT_TEMPLATE.format(columns=column_names)

    dtype_names = {name: str(dtype) for name, dtype in frame.dtypes.items()}
    return {
        "rows": frame.shape[0],
        "columns": column_names,
        "dtypes": dtype_names,
    }
# — Tool: summarize_csv —
def summarize_csv() -> dict:
    """Describe the loaded dataframe and return the table as markdown.

    Returns:
        ``{"markdown": <text>}`` where the text is
        ``describe(include="all")`` rounded to three decimals and rendered
        with ``DataFrame.to_markdown``.

    Raises:
        ValueError: If no dataframe has been loaded yet.
    """
    frame = df_state
    if frame is None:
        raise ValueError("No dataframe loaded.")

    stats = frame.describe(include="all")
    rounded = stats.round(3)
    return {"markdown": rounded.to_markdown()}
    
# — Tool: execute_code —
def execute_code(code: str) -> dict:
    """Run a Python snippet against the loaded dataframe and capture its output.

    The snippet is executed with ``exec`` in a fresh namespace exposing
    ``df`` (the loaded dataframe), ``pd``, ``np``, ``plt`` and ``sns``.
    Anything printed to stdout is captured, and if the snippet left a
    Matplotlib figure with at least one axes, it is rendered to PNG.

    SECURITY NOTE: the namespace is *not* a sandbox. ``exec`` runs the snippet
    with the full privileges of this process (file system, network, imports).
    Only use this with code you are prepared to run locally.

    Args:
        code: Python source to execute.

    Returns:
        A dict with ``"stdout"`` (captured text), ``"has_image"`` (bool) and
        ``"image_bytes"`` (PNG bytes, or ``None`` when no plot was produced).

    Raises:
        ValueError: If no dataframe has been loaded yet.
        RuntimeError: If the snippet (or figure rendering) raises; the
            original exception is attached as ``__cause__``.
    """
    if df_state is None:
        raise ValueError("No DataFrame loaded. Please upload a CSV first.")

    # Names made available to the snippet.
    namespace = {
        "df": df_state,
        "pd": pd,
        "np": np,
        "plt": plt,
        "sns": sns,
    }
    stdout_buffer = io.StringIO()

    # Drop figures left over from earlier (possibly failed) snippets so a
    # stale plot can never be returned as the result of this one.
    plt.close("all")
    try:
        with redirect_stdout(stdout_buffer):
            exec(code, namespace)

        img_bytes = None
        # plt.gcf() would create an empty figure when none exists, so only
        # ask for the current figure if the snippet actually opened one.
        if plt.get_fignums():
            fig = plt.gcf()
            if fig.axes:
                buf = io.BytesIO()
                fig.tight_layout()
                fig.savefig(buf, format="png")
                img_bytes = buf.getvalue()

        return {
            "stdout": stdout_buffer.getvalue(),
            "has_image": bool(img_bytes),
            "image_bytes": img_bytes,
        }
    except Exception as e:
        # Keep the original traceback reachable for debugging.
        raise RuntimeError(f"Error executing code: {e}") from e
    finally:
        # Always release figures, whether the snippet succeeded or failed.
        plt.close("all")


# Function schemas for LLM 

In [6]:
def get_function_schemas():
    """Return the function definitions advertised to the model.

    A new list is built on every call. Each entry has a ``name``, a
    ``description`` and a JSON-schema ``parameters`` object describing the
    arguments of one tool: ``ingest_csv``, ``execute_code``, ``summarize_csv``.
    """

    def _schema(name, description, string_args=()):
        # Every argument of every tool is a required string, so one tuple of
        # argument names is enough to build the JSON-schema object.
        return {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {arg: {"type": "string"} for arg in string_args},
                "required": list(string_args),
            },
        }

    ingest = _schema(
        "ingest_csv",
        "Load a CSV file into a DataFrame.",
        ("file_path",),
    )
    execute = _schema(
        "execute_code",
        "Execute Python code snippet with `df` loaded, returning stdout and plot if any.",
        ("code",),
    )
    summarize = _schema(
        "summarize_csv",
        "Return descriptive statistics for numeric and categorical columns as markdown.",
    )
    return [ingest, execute, summarize]


# Chat logic

In [7]:
def chat(message, history, file):
    """Handle one chat turn: query the model, run a requested tool, reply.

    Flow:
      1. Build the message list from the system prompt and prior history.
      2. If a file is uploaded and no dataframe is loaded yet, force the
         model to call ``ingest_csv``; otherwise let it choose freely.
      3. If the model answered in plain text, return that.
      4. Otherwise run the requested tool, send its result (or error) back
         to the model and return the model's follow-up answer.

    Args:
        message: Text typed by the user.
        history: Chatbot history as a sequence of ``(user, assistant)``
            pairs; ``None`` is treated as empty.
        file: Uploaded file object exposing ``.name`` (a path), or ``None``.

    Returns:
        ``(history, image, "")`` -- the updated history, a PIL image when
        ``execute_code`` produced a plot (else ``None``), and an empty string
        used to clear the input textbox.

    NOTE(review): on the turn that ingests the upload, ``message`` is not
    sent to the model (only the file path is). Also, a second upload is
    ignored while a dataframe is already loaded. Both match the original
    behaviour and are left unchanged here.
    """
    history = history or []

    # Build system prompt
    system_msg = current_system_prompt or SYSTEM_PROMPT_TEMPLATE.format(columns="(none yet)")
    msgs = [{"role": "system", "content": system_msg}]

    # Replay history; the API rejects null content, so coerce to "".
    for user_turn, bot_turn in history:
        msgs.append({"role": "user", "content": user_turn or ""})
        msgs.append({"role": "assistant", "content": bot_turn or ""})

    # Decide whether to ingest CSV
    force_ingest = file is not None and df_state is None
    if force_ingest:
        msgs.append({"role": "user", "content": file.name})
        # The forced form of `function_call` only carries the function name;
        # the path is supplied locally below rather than trusted to the model.
        function_call = {"name": "ingest_csv"}
    else:
        msgs.append({"role": "user", "content": message})
        function_call = "auto"

    resp = client.chat.completions.create(
        model=MODEL,
        messages=msgs,
        functions=get_function_schemas(),
        function_call=function_call,
    )
    choice = resp.choices[0].message
    call = choice.function_call

    # Plain text reply (also covers the case of neither text nor tool call).
    if choice.content or call is None:
        history.append((message, choice.content or ""))
        return history, None, ""

    # Function call: record the assistant's request so the function result
    # that follows has the turn it is answering.
    fn = call.name
    msgs.append({
        "role": "assistant",
        "content": None,
        "function_call": {"name": fn, "arguments": call.arguments},
    })

    img = None
    label = message
    try:
        # Parsed inside the try so malformed JSON is reported to the model
        # like any other tool failure instead of crashing the handler.
        args = json.loads(call.arguments or "{}")
        if fn == "ingest_csv":
            path = file.name if force_ingest else args["file_path"]
            result = ingest_csv(path)
            tool_result = {"message": f"Loaded CSV with {result['rows']} rows and {len(result['columns'])} columns."}
            # Use the path that was actually loaded; `file` may be None when
            # the model chose to call ingest_csv on its own.
            label = f"[Uploaded {path}]"
        elif fn == "execute_code":
            exec_res = execute_code(args["code"])
            # Only text goes back to the model; the image goes to the UI.
            tool_result = {"stdout": exec_res["stdout"], "has_image": exec_res["has_image"]}
            if exec_res["has_image"]:
                img = Image.open(io.BytesIO(exec_res["image_bytes"]))
        elif fn == "summarize_csv":
            tool_result = {"markdown": summarize_csv()["markdown"]}
        else:
            raise ValueError(f"Unknown function {fn}")
    except Exception as e:
        # Boundary handler: send the error back to the LLM so it can explain
        # or recover, rather than surfacing a traceback in the UI.
        tool_result = {"error": str(e)}
        img = None
        label = message

    # Inject function result (or error) and get follow-up from LLM
    msgs.append({"role": "function", "name": fn, "content": json.dumps(tool_result)})
    follow = client.chat.completions.create(
        model=MODEL,
        messages=msgs,
    ).choices[0].message.content

    history.append((label, follow or ""))
    return history, img, ""


# Gradio UI 

In [8]:
# Build the UI: file upload, chat transcript, input row and plot output.
with gr.Blocks(theme=Default()) as app:
    gr.Markdown("## 📊 Data Analysis Chatbot\nUpload a CSV, then ask me to summarize or plot — I’ll run code under the hood and show you the results.")
    file_input = gr.File(label="Upload CSV (.csv)")
    chatbot = gr.Chatbot()
    with gr.Row():
        user_input = gr.Textbox(placeholder="Type your question…", label=None)
        send = gr.Button("Send")
    image_out = gr.Image()

    # Clicking "Send" and pressing Enter in the textbox run the same handler
    # with the same wiring; the third output clears the textbox.
    for trigger in (send.click, user_input.submit):
        trigger(
            chat,
            inputs=[user_input, chatbot, file_input],
            outputs=[chatbot, image_out, user_input],
            queue=True,
        )

    app.launch(inbrowser=True)

  chatbot = gr.Chatbot()


* Running on local URL:  http://127.0.0.1:7865
* To create a public link, set `share=True` in `launch()`.
