In [1]:
from llm_workflow.agents import Tool, OpenAIFunctions

import yaml
with open('/code/source/notebooks/openai_functions.yml') as f:
    tools = yaml.safe_load(f)

tools = {x['name']: Tool.from_dict(x) for x in tools}
print(tools)

{'xxxxx': <llm_workflow.agents.Tool object at 0xffff6330a350>, 'yyyyyy': <llm_workflow.agents.Tool object at 0xffff63308410>}


In [2]:
# tools = OpenAIFunctions(
#     model_name='gpt-3.5-turbo-0613',
#     tools=tools.values(),
# )
# question = "I want to plot a graph of amount which is numeric and amount_2 which is numeric."
# response = tools(question)
# response

```
- name: ask_weather
  description: Use this function to answer questions about the weather for a particular city.
  inputs:
    location:
      description: The city and state, e.g. San Francisco, CA
      type: string
    unit:
      description: The temperature unit to use. The model needs to infer this from the `location`.
      type: string
      enum:
        - celsius
        - fahrenheit
  required:
    - location
    - unit
```

```
 'unit': {
                'type': 'string',
                'enum': ['celsius', 'fahrenheit'],
                'description': "The temperature unit to use. The model needs to infer this from the `location`.",  # noqa
        }
```

In [3]:
column_types = {'checking_balance': 'string', 'months_loan_duration': 'numeric', 'credit_history': 'string', 'purpose': 'string', 'amount': 'numeric', 'savings_balance': 'string', 'employment_duration': 'string', 'percent_of_income': 'numeric', 'years_at_residence': 'numeric', 'age': 'numeric', 'other_credit': 'string', 'housing': 'string', 'existing_loans_count': 'numeric', 'job': 'string', 'dependents': 'numeric', 'phone': 'string', 'default': 'string'}
column_types

{'checking_balance': 'string',
 'months_loan_duration': 'numeric',
 'credit_history': 'string',
 'purpose': 'string',
 'amount': 'numeric',
 'savings_balance': 'string',
 'employment_duration': 'string',
 'percent_of_income': 'numeric',
 'years_at_residence': 'numeric',
 'age': 'numeric',
 'other_credit': 'string',
 'housing': 'string',
 'existing_loans_count': 'numeric',
 'job': 'string',
 'dependents': 'numeric',
 'phone': 'string',
 'default': 'string'}

In [61]:
import uuid

with open('/code/source/config/graphing_configurations.yml') as f:
    configs = yaml.safe_load(f)['configurations']

print(len(configs))

def build_tools_from_graph_configs(configs: dict, column_types: dict) -> list[Tool]:
    # TODO: need to add graph_type in order to support e.g. P( y | x ) chart
    # TODO: i might have to add graph_type as another variable in yaml since it's not always
    # valid. Only certain graphs need to specify the graph_type. Actually i'm not sure that
    # is True.. i think i can just pass graph_type in the same way as x_variable, etc.
    # I just need to add it to the inputs dict or extract it from the tool name. or something
    # BUT there are other variables like `Aggregation:` (sum, avg, etc.) that are not always
    # valid.
    #TODO: would also be nice to specify monthly/weekly/daily etc. for date variables
    tools = []
    for config in configs:
        # config = configs[0]
        variables = {k:v for k, v in config['selected_variables'].items() if v is not None}
        required_variables = list(variables.keys())
        for graph_type in config['graph_types']:
            if 'agent' not in graph_type:
                continue
            # description = graph_type['info']
            if graph_type['agent'] and 'description' in graph_type['agent']:
                agent_description = graph_type['agent']['description']
                agent_description = agent_description.strip()
            else:
                agent_description = ''
            description = f"({graph_type['name']}) {graph_type['description']} {agent_description}"
            for var, types in variables.items():
                replacement = f" axis variable (which can be a column of type {', '.join(types)})"
                description = description.replace(
                    f"{{{{{var}}}}}",
                    f"{var.replace('_variable',replacement)}"
                ).strip()
            # description = f"({graph_type['name']}) "

            # graph_type = config['graph_types'][0]
            if 'optional_variables' in graph_type:
                optional_variables = graph_type['optional_variables']
                variables.update({k:v['types'] for k, v in optional_variables.items() if v is not None})
            # print(f"variables: {variables}")
            # print(f"required_variables: {required_variables}")
            valid_graph = True
            inputs = {}
            for k, valid_column_types in variables.items():
                valid_column_names = [n for n, t in column_types.items() if t in valid_column_types]
                inputs[k] = {
                    'type': 'string',
                    'description': f"{k.replace('_variable', ' axis')} that supports {', '.join(valid_column_types)} columns",   # noqa
                }
                if len(valid_column_names) == 0:
                    if k in required_variables:
                        # if there are no valid columns that support the corresponding types and the
                        # variable is required (e.g. there are no dates columns in the dataset and a
                        # date is required for the graph), then skip this graph/tool altogether
                        valid_graph = False
                        break
                else:
                    inputs[k]['enum'] = valid_column_names

            if graph_type['agent'] and 'variables' in graph_type['agent']:
                for agent_var in graph_type['agent']['variables']:
                    assert len(agent_var) == 1
                    agent_var_name = next(iter(agent_var.keys()))
                    inputs[agent_var_name] = {
                        'type': 'string',
                        'description': agent_var[agent_var_name]['description'],
                        'enum': agent_var[agent_var_name]['options'],
                    }

            if valid_graph:
                # inputs = {'inputs': inputs}
                tool = Tool(
                    name=str(uuid.uuid4()),
                    description=description,
                    inputs=inputs,
                    required=required_variables,
                )
                tool.graph_name = graph_type['name']
                tools.append(tool)
    return tools

tools = build_tools_from_graph_configs(configs, column_types)
print(len(tools))

15
10


In [59]:
for tool in tools:
    print(tool.name)
    print(tool.description)
    print(tool.inputs)
    print(tool.required)
    print()

1c726ae3-4f14-4407-9c8e-1bdef1661764
(box) Shows the distribution of x axis variable (which can be a column of type numeric). Boxplots are useful for understanding the distribution of the data and identifying outliers.
{'x_variable': {'type': 'string', 'description': 'x axis that supports numeric columns', 'enum': ['months_loan_duration', 'amount', 'percent_of_income', 'years_at_residence', 'age', 'existing_loans_count', 'dependents']}, 'color_variable': {'type': 'string', 'description': 'color axis that supports date, boolean, string, categorical columns', 'enum': ['checking_balance', 'credit_history', 'purpose', 'savings_balance', 'employment_duration', 'other_credit', 'housing', 'job', 'phone', 'default']}, 'facet_variable': {'type': 'string', 'description': 'facet axis that supports date, boolean, string, categorical columns', 'enum': ['checking_balance', 'credit_history', 'purpose', 'savings_balance', 'employment_duration', 'other_credit', 'housing', 'job', 'phone', 'default']}}
[

In [14]:
# prompt = "Plot the counts of checking_balance on the x axis and credit_history on the y axis."
# prompt = "Plot the duration of the loan against the amount."
# prompt = "Plot the duration of the loan against the amount of the loan and age."
prompt = "Plot a 3d scatter the duration of the loan against the amount of the loan and age."

In [7]:
agent = OpenAIFunctions(
    model_name='gpt-3.5-turbo-1106',
    tools=tools,
)
formatted_colum_names = '\n'.join([f"{k}: {v}" for k, v in column_types.items()])
template = f"""
The user is asking to create a plot based on the following column names and types. Infer
the correct column names and the correct axes from the users question.
Choose a tool that uses all of the columns listed by the user. Prioritize the required columns.

Valid columns and types: 

{formatted_colum_names}

User question: {prompt}
"""
print(template)
response = agent(template)
response


The user is asking to create a plot based on the following column names and types. Infer
the correct column names and the correct axes from the users question.
Choose a tool that uses all of the columns listed by the user. Prioritize the required columns.

Valid columns and types: 

checking_balance: string
months_loan_duration: numeric
credit_history: string
purpose: string
amount: numeric
savings_balance: string
employment_duration: string
percent_of_income: numeric
years_at_residence: numeric
age: numeric
other_credit: string
housing: string
existing_loans_count: numeric
job: string
dependents: numeric
phone: string
default: string

User question: Plot a 3d scatter the duration of the loan against the amount of the loan and age.



[(<llm_workflow.agents.Tool at 0xffff632a4750>,
  {'x_variable': 'months_loan_duration',
   'y_variable': 'amount',
   'z_variable': 'age'})]

In [8]:
tool, args = response[0]
print(tool.description)
print(tool.required)
print(args)
# tool.inputs|

(scatter-3d) Shows the relationship between x axis variable (which can be a column of type numeric), y axis variable (which can be a column of type numeric), and z axis variable (which can be a column of type numeric). A 3D scatter plot is the best option when there are three numeric variables.
['x_variable', 'y_variable', 'z_variable']
{'x_variable': 'months_loan_duration', 'y_variable': 'amount', 'z_variable': 'age'}


In [11]:
print(f"Total Cost:           ${agent.history()[0].cost:.5f}")
print(f"Total Tokens:          {agent.history()[0].total_tokens:,}")
print(f"Total Prompt Tokens:   {agent.history()[0].input_tokens:,}")
print(f"Total Response Tokens: {agent.history()[0].response_tokens:,}")

Total Cost:           $0.00356
Total Tokens:          3,507
Total Prompt Tokens:   3,457
Total Response Tokens: 50


In [10]:
tool, args = response[1]
print(tool.description)
print(args)

IndexError: list index out of range