# Data Analysis Agent

In this notebook, we will create an agent that, given a DataFrame, answers user questions about it. This example is based on the [Simple Data Analysis Agent](https://github.com/NirDiamant/GenAI_Agents/blob/main/all_agents_tutorials/simple_data_analysis_agent_notebook-pydanticai.ipynb) from the [GenAI Agents](https://github.com/NirDiamant/GenAI_Agents) repository.

In [1]:
import numpy as np
from datetime import datetime, timedelta
import pandas as pd
import gradio as gr

from typing import List,Annotated
from function_schema import Doc
from pydantic import Field

from agente.core.base import BaseAgent
from agente.core.decorators import function_tool

from dotenv import load_dotenv
## Load and set environment variables from .env file
load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

### We start by simulating a dataset of car sales

In [2]:
# Generate sample data
# A fixed seed makes "Restart Kernel -> Run All" regenerate the exact same
# dataset, so results derived from it are reproducible between runs.
SEED = 42
rng = np.random.default_rng(SEED)

n_rows = 1000

# Generate dates: one row per consecutive day, starting on 2022-01-01
start_date = datetime(2022, 1, 1)
dates = [start_date + timedelta(days=i) for i in range(n_rows)]

# Define data categories
makes = ['Toyota', 'Honda', 'Ford', 'Chevrolet', 'Nissan', 'BMW', 'Mercedes', 'Audi', 'Hyundai', 'Kia']
models = ['Sedan', 'SUV', 'Truck', 'Hatchback', 'Coupe', 'Van']
colors = ['Red', 'Blue', 'Black', 'White', 'Silver', 'Gray', 'Green']

# Create the dataset (every column is drawn independently of the others)
data = {
    'Date': dates,
    'Make': rng.choice(makes, n_rows),
    'Model': rng.choice(models, n_rows),
    'Color': rng.choice(colors, n_rows),
    # Upper bound is exclusive, so years fall in 2015..2022
    'Year': rng.integers(2015, 2023, n_rows),
    'Price': rng.uniform(20000, 80000, n_rows).round(2),
    'Mileage': rng.uniform(0, 100000, n_rows).round(0),
    'EngineSize': rng.choice([1.6, 2.0, 2.5, 3.0, 3.5, 4.0], n_rows),
    'FuelEfficiency': rng.uniform(20, 40, n_rows).round(1),
    'SalesPerson': rng.choice(['Alice', 'Bob', 'Charlie', 'David', 'Eva'], n_rows)
}

# Create DataFrame and sort by date
df = pd.DataFrame(data).sort_values('Date')
df

Unnamed: 0,Date,Make,Model,Color,Year,Price,Mileage,EngineSize,FuelEfficiency,SalesPerson
0,2022-01-01,Hyundai,Coupe,Red,2020,79032.69,18911.0,2.5,33.9,Charlie
1,2022-01-02,Ford,Hatchback,Silver,2022,73039.34,9060.0,4.0,30.7,David
2,2022-01-03,Kia,Sedan,Blue,2016,60934.47,59155.0,1.6,25.2,Eva
3,2022-01-04,Kia,Sedan,Blue,2021,56941.75,42754.0,2.5,37.4,Bob
4,2022-01-05,Chevrolet,SUV,Blue,2017,37281.09,66963.0,3.0,38.2,Eva
...,...,...,...,...,...,...,...,...,...,...
995,2024-09-22,Kia,Sedan,Blue,2020,62600.87,82421.0,2.5,29.1,Charlie
996,2024-09-23,Ford,Coupe,White,2015,35216.18,42201.0,3.5,28.6,Bob
997,2024-09-24,Hyundai,Van,Black,2017,62000.08,25283.0,3.0,34.0,Bob
998,2024-09-25,Mercedes,Truck,White,2020,57534.83,20707.0,3.5,39.6,Charlie


In [3]:
SYSTEM_PROMPT="""You are an AI assistant that helps extract information from a pandas DataFrame.
If asked about columns, be sure to check the column names first.
Be concise in your answers."""


# Agent that exposes a single tool, `df_query`, which evaluates a pandas
# expression and hands the stringified result back to the model.
class SimpleAnalysisAgent(BaseAgent):
    agent_name: str = "SimpleAnalysisAgent"
    system_prompt: str = SYSTEM_PROMPT
    # Falls back to an empty DataFrame when no `df` is passed at construction.
    df: pd.DataFrame = Field(default_factory=pd.DataFrame)

    # Presumably forwarded to the LLM completion call -- confirm against BaseAgent.
    completion_kwargs: dict = dict(model="gpt-4o", stream=False)

    @function_tool
    async def df_query(self, query: str):
        """A tool for running queries on the `pandas.DataFrame`. Use this tool to interact with the DataFrame.

        `query` will be executed using `pd.eval(query, target=df)`, so it must contain syntax compatible with
        `pandas.eval`.

        Args:
            query: The query to run on the DataFrame.
        """
        # Print the query for debugging purposes and fun :)
        print(f"Running query: `{query}`")
        # NOTE(review): per the pandas docs, `target` is only the object that
        # receives assignments; a bare `df` inside `query` appears to be looked
        # up in the enclosing Python scope (the notebook-level `df`) rather
        # than in `self.df` -- confirm this is the intended behaviour.
        result = pd.eval(query, target=self.df)
        return str(result)

In [4]:
agent = SimpleAnalysisAgent()
agent.add_message( role = "user", content = "What are the column names in this dataset?")

responses = [r async for r in agent.run()]

Executing tool: df_query from agent SimpleAnalysisAgent
Running query: `list(df.columns)`


In [5]:
# Display the last collected response as a plain dictionary.
last_response = responses[-1]
last_response.dict()

{'call_id': 'chatcmpl-AvBTSBeStEGhGK8w2ONL4e9vHiEhR',
 'agent_name': 'SimpleAnalysisAgent',
 'role': 'assistant',
 'content': "The column names in the dataset are: 'Date', 'Make', 'Model', 'Color', 'Year', 'Price', 'Mileage', 'EngineSize', 'FuelEfficiency', and 'SalesPerson'.",
 'tool_calls': [],
 'usage': {'completion_tokens': 44, 'prompt_tokens': 207, 'total_tokens': 251}}

In [7]:
agent.add_message("user","What is the most sold brand?")
responses = [r async for r in agent.run()]
responses[-1].dict()

Executing tool: df_query from agent SimpleAnalysisAgent
Running query: `df['Make'].mode()[0]`


{'call_id': 'chatcmpl-AvBTsRynhqoXTre9CMJ82rzwfkr8M',
 'agent_name': 'SimpleAnalysisAgent',
 'role': 'assistant',
 'content': 'The most sold brand is BMW.',
 'tool_calls': [],
 'usage': {'completion_tokens': 9, 'prompt_tokens': 296, 'total_tokens': 305}}

In [9]:
agent.add_message("user","What is the average price of the cars?")
responses = [r async for r in agent.run()]
responses[-1].dict()

Executing tool: df_query from agent SimpleAnalysisAgent
Running query: `df['Price'].mean()`


{'call_id': 'chatcmpl-AvBUACbIMd3MYvhvUkLyvZCtcBnv0',
 'agent_name': 'SimpleAnalysisAgent',
 'role': 'assistant',
 'content': 'The average price of the cars is approximately 50,202.30.',
 'tool_calls': [],
 'usage': {'completion_tokens': 17, 'prompt_tokens': 356, 'total_tokens': 373}}

## Streaming

### Now the same agent, but with streaming through a Gradio interface

In [10]:
import uuid

# Variant of the analysis agent used by the Gradio demo. The DataFrame is bound
# as the class-level default, so instances can be created without arguments.
class SimpleAnalysisAgent(BaseAgent):
    agent_name: str = "SimpleAnalysisAgent"
    # Default value is the notebook's `df` as it exists when this cell runs.
    df: pd.DataFrame = df
    # NOTE(review): this section is about streaming, yet "stream" is False here
    # -- confirm whether it should be True for token-by-token output.
    completion_kwargs: dict = {
        "model": "gpt-4o",
        "stream": False,
    }

    @function_tool
    async def df_query(self,query: Annotated[str, Doc("The query to run on the DataFrame.")]) -> str:
        """A tool for running queries on the `pandas.DataFrame`. Use this tool to interact with the DataFrame.

        `query` will be executed using `pd.eval(query, target=df)`, so it must contain syntax compatible with
        `pandas.eval`.
        """
        # Print the query for debugging purposes and fun :)
        print(f'Running query: `{query}`')
        # SECURITY NOTE: `query` is model-generated text that gets evaluated;
        # do not expose this tool to untrusted users without sandboxing.
        #
        # `target` only designates where assignments inside the expression go.
        # Binding the name `df` through `local_dict` makes the query run against
        # this agent's own DataFrame instead of whatever `df` happens to be
        # visible in the surrounding Python scope.
        result = pd.eval(query, local_dict={"df": self.df}, target=self.df)
        return str(result)


In [10]:
def get_new_agent():
    """Return a new SimpleAnalysisAgent with SYSTEM_PROMPT added as a system message."""
    fresh_agent = SimpleAnalysisAgent()
    fresh_agent.add_message("system", SYSTEM_PROMPT)
    return fresh_agent

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    # The state starts empty; the agent is created lazily by `user` on the
    # first submitted message.
    agent_state = gr.State(value=None)

    def user(user_message, history, agent):
        """Record the user's message on the agent and append it to the chat history.

        Args:
            user_message: Text submitted through the textbox.
            history: Current chatbot messages (may be empty or None after a clear).
            agent: The session's agent, or None if none has been created yet.

        Returns:
            A tuple of (cleared textbox value, updated history, agent).
        """
        if agent is None:
            agent = get_new_agent()
        agent.add_message("user", user_message)
        # Append to the existing conversation instead of replacing it, so that
        # earlier turns stay visible in the chatbot.
        updated_history = (history or []) + [{"role": "user", "content": user_message}]
        return "", updated_history, agent

    async def bot(history, agent):
        """Run the agent and progressively fill in the assistant's reply.

        Yields (history, agent) each time a chunk with content arrives, so the
        chatbot is refreshed as the answer grows.
        """
        if not history:
            yield [], agent
            return

        # Placeholder assistant message that the chunks below are appended to.
        history.append({"role": "assistant", "content": ""})
        async for chunk in agent.run():
            # Chunks without content are skipped.
            if chunk.content:
                history[-1]["content"] += chunk.content
                yield history, agent

    def reset_state():
        """Clear the chatbot and replace the session's agent with a new one."""
        return None, get_new_agent()

    # `chatbot` is passed to `user` as an input so the current history can be
    # extended; `bot` then runs as a follow-up step on the updated history.
    msg.submit(user, [msg, chatbot, agent_state], [msg, chatbot, agent_state], queue=False).then(
        bot, [chatbot, agent_state], [chatbot, agent_state]
    )
    clear.click(reset_state, None, [chatbot, agent_state], queue=False)

demo.launch()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Executing tool: df_query from agent SimpleAnalysisAgent
Running query: `columns`
Executing tool: df_query from agent SimpleAnalysisAgent
Running query: `list(df.columns)`
