### Initialization:

In [35]:
# Install packages

# %pip install openai==0.28
# %pip install --upgrade langchain
# %pip install --upgrade pydantic
# %pip install --upgrade typing

In [36]:
# Import packages

from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
import langchain
import os
import openai

In [37]:
# Load the OpenAI API key from a local JSON config file into the OPENAI_API_KEY environment variable
import json
import os

# Location of the JSON file that holds the key. The OPENAI_CONFIG_PATH environment
# variable overrides the default so the notebook is not tied to one machine's
# home directory; when it is unset the original path is used.
config_path = os.environ.get(
    'OPENAI_CONFIG_PATH',
    '/Users/robertford/Documents/OpenAI-API/config.json',
)

with open(config_path, 'r') as f:
    config = json.load(f)

if 'OPENAI_API_KEY' in config:
    # Prefer the explicitly named entry so unrelated entries in the file can
    # never end up being used as the API key.
    os.environ['OPENAI_API_KEY'] = config['OPENAI_API_KEY']
else:
    # Fallback for files that store the key under a different name: every value
    # is assigned in turn, so the last entry of the file is the one that sticks.
    for key in config:
        os.environ['OPENAI_API_KEY'] = config[key]

# Raises KeyError if the file was empty and the variable was not already set.
api_key = os.environ['OPENAI_API_KEY']

# print(os.environ['OPENAI_API_KEY']) # Check that it imported the key ok

### Extract information into a structured output using a Pydantic (data validation) function:

In [38]:
# create a class inheriting from the BaseModel class and define all the fields (arguments):

from pydantic.v1 import BaseModel, Field
from typing import Optional

class RequestStructure(BaseModel):
    """extracts information"""
    # The class docstring and every Field description below are exported verbatim
    # into the OpenAI function specification, so they are prompt text for the model.

    # The only field declared without Optional.
    metric: str = Field(
        description="main metric we need to calculate, for example, 'number of users' or 'number of sessions'",
    )

    # Optional details of the request.
    filters: Optional[str] = Field(
        description="filters to apply to the calculation (do not include filters on dates here)",
    )
    dimensions: Optional[str] = Field(
        description="parameters to split you metric by",
    )
    period_start: Optional[str] = Field(
        description="start day of the period for report",
    )
    period_end: Optional[str] = Field(
        description="end day of the period for report",
    )

    # `enum` is passed through to the generated schema to list the allowed values.
    output_type: Optional[str] = Field(
        description="the desired output",
        enum=["number", "visualisation"],
    )

In [39]:
# Use LangChain to convert this Pydantic class into an OpenAI function:

from langchain.utils.openai_functions import convert_pydantic_to_openai_function

# Build the OpenAI function spec from the Pydantic class, published under the
# name 'extract_information'.
extract_info_function = convert_pydantic_to_openai_function(
    RequestStructure,
    name='extract_information',
)

In [40]:
# Inspect the OpenAI function specification that LangChain generated from the Pydantic class:

extract_info_function

{'name': 'extract_information',
 'description': 'extracts information',
 'parameters': {'title': 'RequestStructure',
  'description': 'extracts information',
  'type': 'object',
  'properties': {'metric': {'title': 'Metric',
    'description': "main metric we need to calculate, for example, 'number of users' or 'number of sessions'",
    'type': 'string'},
   'filters': {'title': 'Filters',
    'description': 'filters to apply to the calculation (do not include filters on dates here)',
    'type': 'string'},
   'dimensions': {'title': 'Dimensions',
    'description': 'parameters to split you metric by',
    'type': 'string'},
   'period_start': {'title': 'Period Start',
    'description': 'start day of the period for report',
    'type': 'string'},
   'period_end': {'title': 'Period End',
    'description': 'end day of the period for report',
    'type': 'string'},
   'output_type': {'title': 'Output Type',
    'description': 'the desired output',
    'enum': ['number', 'visualisation'

In [41]:
# Defining a LangChain chain

from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

# Prompt: a fixed system instruction followed by the user's text, which is
# substituted for the {request} variable.
# NOTE: the backslash continuations sit inside the string literal, so the leading
# spaces of the continuation lines are part of the system message text.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information from the provided request. \
            Extract ONLY the information presented in the initial request. \
            Don't add anything else. \
            Return partial information if something is missing."),
    ("human", "{request}")
])

# Chat model with the extract_information function spec attached through
# .bind(functions=...).
model = ChatOpenAI(temperature=0.1, model = 'gpt-3.5-turbo-1106')\
  .bind(functions = [extract_info_function])

# Pipe the prompt into the bound model to form the extraction chain.
extraction_chain = prompt | model

In [42]:
# Invoke the chain with a 'request' to extract it into a structured output:

# Last expression of the cell, so the message returned by the model is displayed.
extraction_chain.invoke(
    {
        'request': "How many customers visited our site on iOS in April 2023 from different countries?",
    }
)

AIMessage(content='', additional_kwargs={'function_call': {'name': 'extract_information', 'arguments': '{"metric":"number of customers","filters":"iOS","dimensions":"country","period_start":"April 1, 2023","period_end":"April 30, 2023","output_type":"number"}'}})

### Define a tool that performs an explicit calculation:

In [43]:
# Teach the LLM data analyst to calculate the difference between two metrics using an explicit calculation.

from langchain.agents import tool

@tool
def percentage_difference(metric1: float, metric2: float) -> float:
    """Calculates the percentage difference between metrics"""
    # The signature and docstring above are what the tool reports as its
    # description, so they are left exactly as they are.
    # metric1 is the baseline: the change is expressed relative to it, which
    # means a zero metric1 raises ZeroDivisionError.
    change = metric2 - metric1
    relative_change = change / metric1
    return relative_change * 100

In [44]:
# Now the function has the name and description parameters to be passed to the LLM:
# Show, one per line, the name, argument schema and description of the tool.
for attribute in ("name", "args", "description"):
    print(getattr(percentage_difference, attribute))

percentage_difference
{'metric1': {'title': 'Metric1', 'type': 'number'}, 'metric2': {'title': 'Metric2', 'type': 'number'}}
percentage_difference(metric1: float, metric2: float) -> float - Calculates the percentage difference between metrics


In [45]:
# Use the above parameters to create an OpenAI function spec:

from langchain.tools.render import format_tool_to_openai_function

# format_tool_to_openai_function(percentage_difference)
format_tool_to_openai_function(percentage_difference)

{'name': 'percentage_difference',
 'description': 'percentage_difference(metric1: float, metric2: float) -> float - Calculates the percentage difference between metrics',
 'parameters': {'title': 'percentage_differenceSchemaSchema',
  'type': 'object',
  'properties': {'metric1': {'title': 'Metric1', 'type': 'number'},
   'metric2': {'title': 'Metric2', 'type': 'number'}},
  'required': ['metric1', 'metric2']}}

In [46]:
# Use Pydantic to specify a schema for the arguments:
import pydantic.v1
from typing import Type

pydantic.__version__

'2.5.3'

In [47]:
# Import from pydantic.v1: the installed pydantic is version 2, and importing from plain pydantic raises an error here.
# Use Pydantic to specify a schema for the tool arguments:
from pydantic.v1 import BaseModel

class Metrics(BaseModel):
    # Argument schema for the percentage_difference tool. Comments are used
    # instead of a docstring so that nothing is added to the generated schema.
    metric1: float = Field(
        description="Base metric value to calculate the difference",
    )
    metric2: float = Field(
        description="New metric value that we compare with the baseline",
    )

@tool(args_schema=Metrics)
def percentage_difference(metric1: float, metric2: float) -> float:
    """Calculates the percentage difference between metrics"""
    # Arguments are described by the Metrics schema: metric1 is the base value,
    # metric2 the new value compared against it.
    delta = metric2 - metric1
    # Relative to the base value; a base of zero raises ZeroDivisionError.
    fraction_of_base = delta / metric1
    return fraction_of_base * 100

In [48]:
percentage_difference

StructuredTool(name='percentage_difference', description='percentage_difference(metric1: float, metric2: float) -> float - Calculates the percentage difference between metrics', args_schema=<class '__main__.Metrics'>, func=<function percentage_difference at 0x14406b920>)

In [49]:
# Use the tool in practice: define a chain and pass the tool to the model as an OpenAI function
# Prompt: analyst persona as the system message, the user's question in {request}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a product analyst willing to help your product team. You are very strict to the point and accurate. You use only facts."),
        ("user", "{request}"),
    ]
)

# Chat model that is told about percentage_difference as an OpenAI function.
percentage_difference_spec = format_tool_to_openai_function(percentage_difference)
model = ChatOpenAI(temperature=0.1, model='gpt-3.5-turbo-1106').bind(
    functions=[percentage_difference_spec]
)

analyst_chain = prompt | model

# Last expression of the cell, so the returned message is displayed.
analyst_chain.invoke(
    {'request': "In April we had 100 users and in May only 95. What is difference in percent?"}
)

AIMessage(content='', additional_kwargs={'function_call': {'name': 'percentage_difference', 'arguments': '{"metric1":100,"metric2":95}'}})

In [50]:
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
# Append the output parser so the model's function call is returned as a
# parsed agent action instead of a raw message.
function_call_parser = OpenAIFunctionsAgentOutputParser()
analyst_chain = prompt | model | function_call_parser

result = analyst_chain.invoke(
    {
        'request': "There were 100 users in April and 110 users in May. How did the number of users changed?",
    }
)

# Display the parsed result.
result

AgentActionMessageLog(tool='percentage_difference', tool_input={'metric1': 100, 'metric2': 110}, log="\nInvoking: `percentage_difference` with `{'metric1': 100, 'metric2': 110}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'percentage_difference', 'arguments': '{"metric1":100,"metric2":110}'}})])

In [51]:
# Execute the function as the LLM requested like this

# The parsed result carries the arguments chosen by the model; hand them to the tool.
requested_arguments = result.tool_input
observation = percentage_difference(requested_arguments)
print(observation)
# 10

10.0


In [52]:
# Define a message list to pass to the model observations to get a final answer from the model:

from langchain.prompts import MessagesPlaceholder

# Prompt with an extra "observations" slot that receives earlier tool calls and
# their results as messages.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a product analyst willing to help your product team. You are very strict to the point and accurate. You use only facts, not inventing information."),
        ("user", "{request}"),
        MessagesPlaceholder(variable_name="observations"),
    ]
)

# Chat model that knows about the percentage_difference tool.
percentage_difference_spec = format_tool_to_openai_function(percentage_difference)
model = ChatOpenAI(temperature=0.1, model='gpt-3.5-turbo-1106').bind(
    functions=[percentage_difference_spec]
)

analyst_chain = prompt | model | OpenAIFunctionsAgentOutputParser()

# First pass: nothing has been observed yet, so the placeholder gets an empty list.
result1 = analyst_chain.invoke(
    {
        'request': "There were 100 users in April and 110 users in May. How did the number of users changed?",
        "observations": [],
    }
)

# Run the tool with the arguments chosen by the model.
observation = percentage_difference(result1.tool_input)
print(observation)
# 10

10.0


In [53]:
# Then, we need to add the observation to our observations variable. We could use format_to_openai_functions function to format our results in an expected way for the model.

from langchain.agents.format_scratchpad import format_to_openai_functions
format_to_openai_functions([(result1, observation), ])

[AIMessage(content='', additional_kwargs={'function_call': {'name': 'percentage_difference', 'arguments': '{"metric1":100,"metric2":110}'}}),
 FunctionMessage(content='10.0', name='percentage_difference')]

In [54]:
# Let’s invoke our chain one more time, passing the function execution result as an observation.

import langchain
# Switch on LangChain's debug output for the following runs.
langchain.debug = True

# Second pass: feed the first tool call and its result back as observations.
completed_steps = [(result1, observation)]
result2 = analyst_chain.invoke(
    {
        'request': "There were 100 users in April and 110 users in May. How did the number of users changed?",
        "observations": format_to_openai_functions(completed_steps),
    }
)

print(result2)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:
[0m[inputs]
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:
[0m{
  "lc": 1,
  "type": "constructor",
  "id": [
    "langchain",
    "prompts",
    "chat",
    "ChatPromptValue"
  ],
  "kwargs": {
    "messages": [
      {
        "lc": 1,
        "type": "constructor",
        "id": [
          "langchain",
          "schema",
          "messages",
          "SystemMessage"
        ],
        "kwargs": {
          "content": "You are a product analyst willing to help your product team. You are very strict to the point and accurate. You use only facts, not inventing information.",
          "additional_kwargs": {}
        }
      },
      {
        "lc": 1,
        "type": "constr

In [55]:
# Let’s add a couple more tools to our analyst’s toolkit,
# using Pydantic to specify the input arguments for our function.

import datetime
import random

class Filters(BaseModel):
    # Argument schema for get_monthly_active_users. Comments are used instead of
    # a docstring so that nothing is added to the generated schema.
    month: str = Field(
        description="Month of customer's activity in the format %Y-%m-%d",
    )
    # `enum` is passed through to the schema to list the suggested cities.
    city: Optional[str] = Field(
        description="City of residence for customers (by default no filter)",
        enum=["London", "Berlin", "Amsterdam", "Paris"],
    )

@tool(args_schema=Filters)
def get_monthly_active_users(month: str, city: str = None) -> int:
    """Returns number of active customers for the specified month"""
    # Mock data source: the figure is derived from the date itself, not looked up.
    # strptime raises ValueError when `month` is not in the %Y-%m-%d format.
    parsed_month = datetime.datetime.strptime(month, '%Y-%m-%d')
    overall_count = parsed_month.year + 10 * parsed_month.month

    if city is not None:
        # With a city filter, return a random share of the overall figure, so
        # repeated calls with the same arguments give different numbers.
        return int(overall_count * random.random())

    return overall_count

In [56]:
# Use the Wikipedia package to allow the LLM to look up information it needs:

# !pip install wikipedia
import wikipedia

class Wikipedia(BaseModel):
    # Argument schema for the get_summary tool: a single search term.
    term: str = Field(
        description="Term to search for",
    )

@tool(args_schema=Wikipedia)
def get_summary(term: str) -> str:
    """Returns basic knowledge about the given term provided by Wikipedia"""
    # The model chooses `term` itself, so ambiguous or non-existent titles are
    # expected. Those cases are reported back as text so the model can pick a
    # better term, instead of raising and aborting the agent loop.
    try:
        return wikipedia.summary(term)
    except wikipedia.exceptions.DisambiguationError as error:
        # `options` lists the candidate page titles; cap the list to keep the
        # observation short.
        candidates = ", ".join(str(option) for option in error.options[:10])
        return f"The term '{term}' is ambiguous on Wikipedia. Possible matches: {candidates}"
    except wikipedia.exceptions.PageError:
        return f"No Wikipedia page was found for the term '{term}'."

In [57]:
# Let’s define a dictionary with all the functions our model knows now. This dictionary will help us to do routing later.

# Routing table: function name requested by the model -> tool object to run.
toolkit = {
    'percentage_difference': percentage_difference,
    'get_monthly_active_users': get_monthly_active_users,
    'get_summary': get_summary,
}

# One OpenAI function spec per registered tool, in the order of the table above.
analyst_functions = list(map(format_tool_to_openai_function, toolkit.values()))

In [58]:
# Let the LLM consult Wikipedia when it needs some basic knowledge.
# Changed the model to GPT 4 because it’s better for handling tasks requiring reasoning.

from langchain.prompts import MessagesPlaceholder

# Chat model with every function spec from analyst_functions attached.
model = ChatOpenAI(temperature=0.1, model = 'gpt-4-1106-preview')\
  .bind(functions = analyst_functions)

# Prompt: system instruction, the user's {request}, then a slot for the messages
# describing earlier tool calls and their results.
# NOTE: the backslash continuations sit inside the string literal, so the leading
# spaces of the continuation lines are part of the system message text.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a product analyst willing to help your product team. You are very strict to the point and accurate. \
        You use only information provided in the initial request. \
        If you need to determine some information i.e. what is the name of the capital, you can use Wikipedia."),
    ("user", "{request}"),
    MessagesPlaceholder(variable_name="observations")
])

# Prompt -> model -> parser that turns the model's reply into an agent action or a final answer.
analyst_chain = prompt | model | OpenAIFunctionsAgentOutputParser()

# Print the composed chain to inspect its parts.
print(analyst_chain)

first=ChatPromptTemplate(input_variables=['observations', 'request'], input_types={'observations': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a product analyst willing to help your product team. You are very strict to the point and accurate.         You use only information provided in the initial request.         If you need to determine some information i.e. what is the name of the capital, you can use Wikipedia.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['request'], template='{request}')), MessagesPlaceholder(variable_name='observations')]) middle=[RunnableBinding(bound=ChatOpenAI(client=<class 'openai.api_resources.c

In [59]:
# We can invoke our chain with all the functions. Let’s start with a pretty straightforward query.
# Turn off debug mode

import langchain
# Debug output is no longer needed for the following runs.
langchain.debug = False

# A question that a single tool call can answer; nothing has been observed yet.
result1 = analyst_chain.invoke(
    {
        'request': "How many users were in April 2023 from Berlin?",
        "observations": [],
    }
)
print(result1)

tool='get_monthly_active_users' tool_input={'month': '2023-04-01', 'city': 'Berlin'} log="\nInvoking: `get_monthly_active_users` with `{'month': '2023-04-01', 'city': 'Berlin'}`\n\n\n" message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'get_monthly_active_users', 'arguments': '{"month":"2023-04-01","city":"Berlin"}'}})]


In [60]:
# Let’s make the task a bit more complex by not telling the LLM what the capital of Botswana is:

# First step of the multi-step question: no observations are available yet.
result1 = analyst_chain.invoke(
    {
        'request': "How did the number of users from the capital of Botswana change between April and May 2023?",
        "observations": [],
    }
)

# Display the action chosen by the model.
result1

AgentActionMessageLog(tool='get_summary', tool_input={'term': 'Capital of Botswana'}, log="\nInvoking: `get_summary` with `{'term': 'Capital of Botswana'}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'get_summary', 'arguments': '{"term":"Capital of Botswana"}'}})])

In [61]:
# So now the model has to do a few things:
# call Wikipedia to get the capital of Botswana
# call the get_monthly_active_users function twice to get MAU for April and May
# call percentage_difference to calculate the difference between metrics.

# Look up the tool the model asked for and run it with the model's arguments.
requested_tool = toolkit[result1.tool]
observation1 = requested_tool(result1.tool_input)
print(observation1)

Gaborone (UK:  GAB-ə-ROH-nee, HAB-, US:  GAH-bə-ROH-nee, -⁠nay, Tswana: [χabʊˈrʊnɛ]) is the capital and largest city of Botswana with a population of 246,325 based on the 2022 census, about 10% of the total population of Botswana. Its agglomeration is home to 421,907 inhabitants at the 2011 census.
Gaborone is situated between Kgale Hill and Oodi Hill, near the confluence of the Notwane River and Segoditshane River in the south-eastern corner of Botswana, 15 kilometres (9.3 mi) from the South African border. The city is served by the Sir Seretse Khama International Airport. It is an administrative district in its own right, but is the capital of the surrounding South-East District. Locals often refer to the city as GC or Motse-Mshate.The city of Gaborone is named after Chief Gaborone of the Tlokwa tribe, who once controlled land nearby. Because it had no tribal affiliation and was close to fresh water, the city was planned to be the capital in the mid-1960s when the Bechuanaland Protec

In [62]:
# Second step: pass the first action and its result back to the model.
completed_steps = [(result1, observation1)]
result2 = analyst_chain.invoke(
    {
        'request': "How did the number of users from the capital of Botswana change between April and May 2023?",
        "observations": format_to_openai_functions(completed_steps),
    }
)

print(result2)

tool='get_monthly_active_users' tool_input={'month': '2023-04-01', 'city': 'Gaborone'} log="\nInvoking: `get_monthly_active_users` with `{'month': '2023-04-01', 'city': 'Gaborone'}`\n\n\n" message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'get_monthly_active_users', 'arguments': '{"month":"2023-04-01","city":"Gaborone"}'}})]


In [63]:
# The model wants to execute get_monthly_active_users with arguments {'month': '2023-04-01', 'city': 'Gaborone'}. Let's do it and return the information to the model again:

# Run the tool requested in the second step with the model's arguments.
requested_tool = toolkit[result2.tool]
observation2 = requested_tool(result2.tool_input)

print(observation2)

5


In [64]:
# Third step: the model now sees both earlier actions with their results.
completed_steps = [
    (result1, observation1),
    (result2, observation2),
]
result3 = analyst_chain.invoke(
    {
        'request': "How did the number of users from the capital of Botswana change between April and May 2023?",
        "observations": format_to_openai_functions(completed_steps),
    }
)

print(result3)

tool='get_monthly_active_users' tool_input={'month': '2023-05-01', 'city': 'Gaborone'} log="\nInvoking: `get_monthly_active_users` with `{'month': '2023-05-01', 'city': 'Gaborone'}`\n\n\n" message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'get_monthly_active_users', 'arguments': '{"month":"2023-05-01","city":"Gaborone"}'}})]


In [65]:
# Run the tool requested in the third step with the model's arguments.
requested_tool = toolkit[result3.tool]
observation3 = requested_tool(result3.tool_input)

print(observation3)

533


In [66]:
# Fourth step: all three earlier actions and results are passed back.
completed_steps = [
    (result1, observation1),
    (result2, observation2),
    (result3, observation3),
]
result4 = analyst_chain.invoke(
    {
        'request': "How did the number of users from the capital of Botswana change between April and May 2023?",
        "observations": format_to_openai_functions(completed_steps),
    }
)

print(result4)

tool='percentage_difference' tool_input={'metric1': 5, 'metric2': 533} log="\nInvoking: `percentage_difference` with `{'metric1': 5, 'metric2': 533}`\n\n\n" message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'percentage_difference', 'arguments': '{"metric1":5,"metric2":533}'}})]


In [67]:
# Run the tool requested in the fourth step with the model's arguments.
requested_tool = toolkit[result4.tool]
observation4 = requested_tool(result4.tool_input)
print(observation4)

# Final step: with every action and result available, ask the model once more.
completed_steps = [
    (result1, observation1),
    (result2, observation2),
    (result3, observation3),
    (result4, observation4),
]
result5 = analyst_chain.invoke(
    {
        'request': "How did the number of users from the capital of Botswana change between April and May 2023?",
        "observations": format_to_openai_functions(completed_steps),
    }
)



10560.0


### Output

In [68]:
# The last result carries the model's final answer under the 'output' key of its return values.
final_values = result5.return_values
output = final_values['output']
print(output)

The number of users from Gaborone, the capital of Botswana, increased by 10,560% between April and May 2023.
