In [16]:
import os
from typing import Optional

import dotenv
import numpy as np
import pandas as pd
import phitter
from llama_index.core.agent.react import ReActAgent
from llama_index.core.tools import FunctionTool
from llama_index.llms.together import TogetherLLM

In [17]:
# Locate a .env file and load the variables it defines into the environment.
dotenv_path = dotenv.find_dotenv()
dotenv.load_dotenv(dotenv_path)

True

In [18]:
# Fail fast with an actionable message when the key is missing: assigning
# None to os.environ would otherwise raise an opaque
# "TypeError: str expected, not NoneType".
together_api_key = os.getenv("TOGETHER_API_KEY")
if not together_api_key:
    raise RuntimeError(
        "TOGETHER_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
os.environ["TOGETHER_API_KEY"] = together_api_key

In [19]:
# Remote CSV copy of the iris dataset used throughout this notebook.
IRIS_CSV_URL = "https://gist.githubusercontent.com/phitterio/6e1cdb92a5f518c7459484c619185fc5/raw/4072c5233d03204d8999943398b23c2393bcd637/iris.csv"

df = pd.read_csv(IRIS_CSV_URL)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [20]:
# Module-level state shared by the agent tools defined below.

# Most recent PHITTER instance created by the fitting tool; None until a fit
# has been run. The annotation is quoted so it is not evaluated at runtime.
phitter_cont: "Optional[phitter.PHITTER]" = None
# Id of the best distribution from the most recent fit ("" means none yet).
global_id_distribution = ""

In [21]:
def get_column_data(column_name: str) -> list[float]:
    """
    Gets the data from a specific column of the global DataFrame.

    Parameters
    ----------
    column_name : str
        Name of the column to read from the module-level ``df``.

    Returns
    -------
    list
        The column values as a plain Python list, in row order.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``. The message lists the
        available columns so the caller can retry with a valid name.
    """
    if column_name not in df.columns:
        available = ", ".join(map(str, df.columns))
        raise KeyError(f"Unknown column {column_name!r}. Available columns: {available}")
    return df[column_name].tolist()


def fit_distributions_to_data(
    data: list[float],
    fit_type: str = "continuous",
    num_bins: Optional[int] = None,
    confidence_level: float = 0.95,
) -> str:
    """
    Fit the best probability distribution to a dataset

    On success the fitted ``phitter.PHITTER`` instance and the id of its best
    distribution are stored in the module-level ``phitter_cont`` and
    ``global_id_distribution`` so the plotting tools can reuse them.

    Parameters
    ----------
    data : list[float]
        Observations to fit. Must not be empty.
    fit_type : str
        Forwarded unchanged to ``phitter.PHITTER`` (default ``"continuous"``).
    num_bins : int, optional
        Forwarded unchanged to ``phitter.PHITTER`` (default ``None``).
    confidence_level : float
        Forwarded unchanged to ``phitter.PHITTER`` (default ``0.95``).

    Returns
    -------
    str
        A sentence naming the best distribution id and its parameters, each
        parameter formatted with four significant digits.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    global phitter_cont
    global global_id_distribution

    if len(data) == 0:
        raise ValueError("Cannot fit a distribution to an empty dataset")

    # Work on a local instance so a failed fit does not leave the shared
    # globals pointing at a half-initialised object.
    fitter = phitter.PHITTER(
        data=data,
        fit_type=fit_type,
        num_bins=num_bins,
        confidence_level=confidence_level,
    )
    fitter.fit(n_workers=2)

    best = fitter.best_distribution
    id_distribution = best["id"]
    parameters_str = ", ".join(f"{k}: {v:.4g}" for k, v in best["parameters"].items())

    # Publish the new state only once everything above has succeeded.
    phitter_cont = fitter
    global_id_distribution = id_distribution
    return f"The best distribution is {id_distribution} with parameters {parameters_str}"


def plot_histogram() -> str:
    """
    Plot histogram

    Draws the histogram through the module-level ``phitter_cont`` instance,
    which is set by the fitting tool.

    Returns
    -------
    str
        A short status message for the agent. If no fit has been run yet,
        nothing is plotted and the message says to run ``fit_distribution``
        first instead of failing with an AttributeError on ``None``.
    """
    if phitter_cont is None:
        return "No fitted data available. Run fit_distribution first, then plot the histogram."
    phitter_cont.plot_histogram()
    return "showing histogram ..."


def plot_charts_best_distribution() -> str:
    """
    Plot charts best distribution

    Uses the module-level ``phitter_cont`` instance and
    ``global_id_distribution`` (both set by the fitting tool) to draw three
    charts for that distribution: ``plot_distribution``,
    ``plot_ecdf_distribution`` and ``qq_plot_regression``.

    Returns
    -------
    str
        A short status message for the agent. If no fit has been run yet,
        nothing is plotted and the message says to run ``fit_distribution``
        first.
    """
    if phitter_cont is None or not global_id_distribution:
        return "No fitted distribution available. Run fit_distribution first, then plot the charts."
    phitter_cont.plot_distribution(global_id_distribution)
    phitter_cont.plot_ecdf_distribution(global_id_distribution)
    phitter_cont.qq_plot_regression(global_id_distribution)
    # The agent repeats this observation to the user, so it must describe the
    # charts that were actually drawn rather than a histogram.
    return "showing charts of the best distribution ..."

In [22]:
# Wrap each helper as a FunctionTool. The name and description strings are
# passed to FunctionTool as the tool's metadata, so they are kept short and
# spelled correctly. Each tool is defined exactly once.
get_column_tool = FunctionTool.from_defaults(
    fn=get_column_data,
    name="get_column_data",
    description="Gets the data from a specific column of the global DataFrame.",
)
fit_distribution_tool = FunctionTool.from_defaults(
    fn=fit_distributions_to_data,
    name="fit_distribution",
    description="Find the best probability distribution to a dataset and returns the distribution name and parameters. By default fit_type is continuous. By default num_bins is None. By default confidence_level is 0.95",
)
plot_histogram_tool = FunctionTool.from_defaults(
    fn=plot_histogram,
    name="plot_histogram",
    description="Plot histogram of the current phitter process",
)
plot_charts_best_distribution_tool = FunctionTool.from_defaults(
    fn=plot_charts_best_distribution,
    name="plot_charts_best_distribution",
    description="Plot charts best distribution",
)

In [23]:
# Chat model used by the agent, served through Together.
llm = TogetherLLM(
    model="meta-llama/Llama-3-70b-chat-hf",
    temperature=0,
)

In [24]:
# Tools registered with the agent. plot_histogram_tool is defined above but
# is currently not part of this list.
tools = [get_column_tool, fit_distribution_tool, plot_charts_best_distribution_tool]

# verbose=True is what produces the Thought/Action/Observation trace in the cell output.
agent = ReActAgent.from_tools(
    tools,
    llm=llm,
    verbose=True,
)

In [25]:
# Name of the `df` column this notebook analyses; the agent queries below ask
# about this same column.
column_name = "SepalWidthCm"

In [30]:
# Build the prompt from `column_name` so the analysed column is set in one
# place (the f-string previously had no placeholder and a typo, "columna").
query = f"Find the best probability distribution to the {column_name} column and visualize"
response = agent.chat(query)
response.response

[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: get_column_data
Action Input: {'column_name': 'SepalWidthCm'}
[0m[1;3;34mObservation: [3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3.0, 3.0, 4.0, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3.0, 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.1, 3.0, 3.4, 3.5, 2.3, 3.2, 3.5, 3.8, 3.0, 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2.0, 3.0, 2.2, 2.9, 2.9, 3.1, 3.0, 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3.0, 2.8, 3.0, 2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3.0, 3.4, 3.1, 2.3, 3.0, 2.5, 2.6, 3.0, 2.6, 2.3, 2.7, 3.0, 2.9, 2.9, 2.5, 2.8, 3.3, 2.7, 3.0, 2.9, 3.0, 3.0, 2.5, 2.9, 2.5, 3.6, 3.2, 2.7, 3.0, 2.5, 2.8, 3.2, 3.0, 3.8, 2.6, 2.2, 3.2, 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3.0, 2.8, 3.0, 2.8, 3.8, 2.8, 2.8, 2.6, 3.0, 3.4, 3.1, 3.0, 3.1, 3.1, 3.1, 2.7, 3.2, 3.3, 3.0, 2.5, 3.0, 3.4, 3.0]
[0m[1;3;38;5;200mThought: I 

[1;3;34mObservation: showing histogram ...
[0m[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer.
Answer: The best probability distribution to the SepalWidthCm column is burr_4p with parameters A: 3.148, B: 12.14, C: 1.171, loc: -0.06067, and a histogram of the data has been plotted.
[0m

'The best probability distribution to the SepalWidthCm column is burr_4p with parameters A: 3.148, B: 12.14, C: 1.171, loc: -0.06067, and a histogram of the data has been plotted.'

In [27]:
# Same request with an explicit bin count; the column comes from `column_name`
# instead of being hard-coded inside a placeholder-less f-string.
query = f"Find the best probability distribution to the {column_name} column with 14 bins"
response = agent.chat(query)
response.response

[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: get_column_data
Action Input: {'column_name': 'SepalWidthCm'}
[0m[1;3;34mObservation: [3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3.0, 3.0, 4.0, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3.0, 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.1, 3.0, 3.4, 3.5, 2.3, 3.2, 3.5, 3.8, 3.0, 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2.0, 3.0, 2.2, 2.9, 2.9, 3.1, 3.0, 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3.0, 2.8, 3.0, 2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3.0, 3.4, 3.1, 2.3, 3.0, 2.5, 2.6, 3.0, 2.6, 2.3, 2.7, 3.0, 2.9, 2.9, 2.5, 2.8, 3.3, 2.7, 3.0, 2.9, 3.0, 3.0, 2.5, 2.9, 2.5, 3.6, 3.2, 2.7, 3.0, 2.5, 2.8, 3.2, 3.0, 3.8, 2.6, 2.2, 3.2, 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3.0, 2.8, 3.0, 2.8, 3.8, 2.8, 2.8, 2.6, 3.0, 3.4, 3.1, 3.0, 3.1, 3.1, 3.1, 2.7, 3.2, 3.3, 3.0, 2.5, 3.0, 3.4, 3.0]
[0m[1;3;38;5;200mThought: I 

'The best probability distribution to the SepalWidthCm column with 14 bins is burr_4p with parameters A: 3.148, B: 12.14, C: 1.171, loc: -0.06067.'

In [28]:
# Same request with an explicit bin count and confidence level; the column
# comes from `column_name` instead of being hard-coded in the prompt.
query = f"Find the best probability distribution to the {column_name} column with 14 bins and confidence level of 0.98"
response = agent.chat(query)
response.response

[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: get_column_data
Action Input: {'column_name': 'SepalWidthCm'}
[0m[1;3;34mObservation: [3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3.0, 3.0, 4.0, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3.0, 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.1, 3.0, 3.4, 3.5, 2.3, 3.2, 3.5, 3.8, 3.0, 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2.0, 3.0, 2.2, 2.9, 2.9, 3.1, 3.0, 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3.0, 2.8, 3.0, 2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3.0, 3.4, 3.1, 2.3, 3.0, 2.5, 2.6, 3.0, 2.6, 2.3, 2.7, 3.0, 2.9, 2.9, 2.5, 2.8, 3.3, 2.7, 3.0, 2.9, 3.0, 3.0, 2.5, 2.9, 2.5, 3.6, 3.2, 2.7, 3.0, 2.5, 2.8, 3.2, 3.0, 3.8, 2.6, 2.2, 3.2, 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3.0, 2.8, 3.0, 2.8, 3.8, 2.8, 2.8, 2.6, 3.0, 3.4, 3.1, 3.0, 3.1, 3.1, 3.1, 2.7, 3.2, 3.3, 3.0, 2.5, 3.0, 3.4, 3.0]
[0m[1;3;38;5;200mThought: I 

'The best probability distribution to the SepalWidthCm column with 14 bins and a confidence level of 0.98 is burr_4p with parameters A: 3.148, B: 12.14, C: 1.171, loc: -0.06067.'

In [29]:
# Follow-up question sent to the same `agent` object as the previous queries.
# The prompt names no column or distribution, so the agent has to work out
# which fit is meant.
query = "I want to see charts of the best distribution"
response = agent.chat(query)

[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: get_column_data
Action Input: {'column_name': 'SepalWidthCm'}
[0m[1;3;34mObservation: [3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3.0, 3.0, 4.0, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3.0, 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.1, 3.0, 3.4, 3.5, 2.3, 3.2, 3.5, 3.8, 3.0, 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2.0, 3.0, 2.2, 2.9, 2.9, 3.1, 3.0, 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3.0, 2.8, 3.0, 2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3.0, 3.4, 3.1, 2.3, 3.0, 2.5, 2.6, 3.0, 2.6, 2.3, 2.7, 3.0, 2.9, 2.9, 2.5, 2.8, 3.3, 2.7, 3.0, 2.9, 3.0, 3.0, 2.5, 2.9, 2.5, 3.6, 3.2, 2.7, 3.0, 2.5, 2.8, 3.2, 3.0, 3.8, 2.6, 2.2, 3.2, 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3.0, 2.8, 3.0, 2.8, 3.8, 2.8, 2.8, 2.6, 3.0, 3.4, 3.1, 3.0, 3.1, 3.1, 3.1, 2.7, 3.2, 3.3, 3.0, 2.5, 3.0, 3.4, 3.0]
[0m[1;3;38;5;200mThought: I 

[1;3;34mObservation: showing histogram ...
[0m[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer
Answer: The best probability distribution to the SepalWidthCm column with 14 bins and a confidence level of 0.98 is burr_4p with parameters A: 3.148, B: 12.14, C: 1.171, loc: -0.06067. The chart of this distribution has been plotted.
[0m