In [3]:
import json
from datetime import datetime

import requests
from openai import OpenAI

In [4]:


# API client used by the chat-completion request further down. No credentials are
# passed here — presumably the SDK reads them from the environment (verify).
client = OpenAI()

In [12]:
# JSON-schema description of the arguments the model may pass to `scrape_url`.
# Only `url` is required; `use_cache` is optional.
scrape_url_parameters = {
    "type": "object",
    "required": ["url"],
    "properties": {
        "url": {
            "type": "string",
            "description": "The URL of the webpage to scrape",
        },
        "use_cache": {
            "type": "boolean",
            "description": "Flag to indicate whether to use cached data (default is true). Set to false when fetching a webpage with live info such as espn.com/nba/scoreboard or anything with dynamic content",
        },
    },
    "additionalProperties": False,
}

# Function-tool definition for the scraper; strict mode is left off.
scrape_url_tool = {
    "type": "function",
    "function": {
        "name": "scrape_url",
        "description": "Scrapes a webpage and returns its markdown equivalent",
        "parameters": scrape_url_parameters,
        "strict": False,
    },
}

# Tool list handed to the chat-completion request.
tools = [scrape_url_tool]

In [10]:

# Stamp the prompt with the current local time so the model knows what "today" is.
current_timestamp = datetime.now().strftime("%b %d, %Y %H:%M:%S")
system_prompt = (
    "You are given an objective. "
    "Achieve the objective using the tools and knowledge you have. "
    f"Today's date is {current_timestamp}"
)
print(system_prompt)

You are given an objective. Achieve the objective using the tools and knowledge you have. Today's date is Feb 11, 2025 16:30:45


In [9]:
# Objective sent as the user turn: scrape the Celtics roster page and list birthdates.
user_message = (
    "Extract the roster details for the Boston Celtics from their "
    "Basketball-Reference page "
    "(https://www.basketball-reference.com/teams/BOS/2025.html). "
    "Specifically, retrieve the list of players along with their birthdates "
    "so that later we can identify which players are at least 30 years old "
    "as of October 25, 2023."
)
print(user_message)

Extract the roster details for the Boston Celtics from their Basketball-Reference page (https://www.basketball-reference.com/teams/BOS/2025.html). Specifically, retrieve the list of players along with their birthdates so that later we can identify which players are at least 30 years old as of October 25, 2023.


In [38]:
# First model turn: system prompt + objective, with the scrape tool attached, so the
# reply may be a tool call rather than text.
request_messages = [
    {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
    {"role": "user", "content": [{"type": "text", "text": user_message}]},
]
sampling_options = dict(
    temperature=0.13,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)
response = client.chat.completions.create(
    model="gpt-4o",
    messages=request_messages,
    response_format={"type": "text"},
    tools=tools,
    max_completion_tokens=13790,
    **sampling_options,
)

In [39]:
# Inspect the raw completion object (finish reason, any tool calls, token usage).
response

ChatCompletion(id='chatcmpl-B0DQnazOg8p0GzVXKDaD0Gw9uvbD1', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_y2aklqfh6ihpOOHrzAcFPmTX', function=Function(arguments='{"url":"https://www.basketball-reference.com/teams/BOS/2025.html"}', name='scrape_url'), type='function')]))], created=1739391329, model='gpt-4o-2024-08-06', object='chat.completion', service_tier='default', system_fingerprint='fp_50cad350e4', usage=CompletionUsage(completion_tokens=31, prompt_tokens=204, total_tokens=235, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))

In [None]:
# Execute the scrape_url call(s) the model asked for against the local scraping service.
# The HTTP reply is stored in `scrape_response` (not `response`) so the chat completion
# stays available to later cells. `scraped_content` keeps the parsed JSON payload, and
# the prompt-ready text goes to `formatted_content`.
if response.choices[0].finish_reason == "tool_calls":
    tool_calls = response.choices[0].message.tool_calls

    # If there's only one tool call, use the /scrape endpoint
    if len(tool_calls) == 1:
        tool_call = tool_calls[0]
        if tool_call.function.name == "scrape_url":
            args = json.loads(tool_call.function.arguments)
            params = {
                "url": args["url"],
                # use_cache is optional and defaults to True, so only an explicit
                # false forces a refetch.
                "force_fetch": not args.get("use_cache", True),
            }
            scrape_response = requests.get("http://localhost:8084/scrape", params=params)
            # Fail on an HTTP error here rather than on a confusing KeyError below.
            scrape_response.raise_for_status()
            scraped_content = scrape_response.json()
            formatted_content = f"=======\n{scraped_content['metadata']['url']} content:\n=======\n{scraped_content['markdown']}"

    # If there are multiple tool calls, use the batch endpoint
    else:
        urls = []
        force_fetch = False
        for tool_call in tool_calls:
            if tool_call.function.name == "scrape_url":
                args = json.loads(tool_call.function.arguments)
                urls.append(args["url"])
                # If any call requests no caching, set force_fetch to True
                if not args.get("use_cache", True):
                    force_fetch = True

        payload = {"urls": urls, "force_fetch": force_fetch}
        scrape_response = requests.post(
            "http://localhost:8084/batch_scrape_urls", json=payload
        )
        scrape_response.raise_for_status()
        scraped_content = scrape_response.json()
        # NOTE(review): the batch endpoint's response schema is not visible in this
        # notebook; this assumes the same {'metadata', 'markdown'} shape as /scrape —
        # confirm against the service before relying on this branch.
        formatted_content = f"=======\n{scraped_content['metadata']['url']} content:\n=======\n{scraped_content['markdown']}"

In [45]:
# Arguments of the first tool call, as the raw JSON string the model produced
# (it has to go through json.loads before the fields can be read).
response.choices[0].message.tool_calls[0].function.arguments

'{"url":"https://www.basketball-reference.com/teams/BOS/2025.html"}'

In [None]:
# Tool calls returned by the SDK are objects, not dicts, so the JSON argument string
# is read through attributes (same access pattern as the other cells).
tool_call.function.arguments

In [26]:
# Check which top-level fields the scraping service returned; this expects
# `scraped_content` to be the parsed JSON dict.
scraped_content.keys()

dict_keys(['markdown', 'metadata'])

In [31]:
f"=======\n{scraped_content['metadata']['url']} content:\n=======\n{scraped_content['markdown']}"



In [46]:
from openai import OpenAI
from typing import List, Dict, Any
import json


class ConversationManager:
    """Run a tool-using chat loop against the OpenAI chat-completions API.

    The manager keeps the running message history, advertises a single
    ``scrape_url`` function tool to the model, and executes that tool through
    the scraping service at ``http://localhost:8084``.
    """

    def __init__(self):
        self.client = OpenAI()
        # Every message exchanged so far except the system message, oldest first.
        self.conversation_history = []
        self.system_message = {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are given an objective. Achieve the objective using the tools and knowledge you have.",
                }
            ],
        }

        # Define available tools
        self.tools = [
            {
                "type": "function",
                "function": {
                    "name": "scrape_url",
                    "description": "Scrapes a webpage and returns its markdown equivalent",
                    "parameters": {
                        "type": "object",
                        "required": ["url"],
                        "properties": {
                            "url": {
                                "type": "string",
                                "description": "The URL of the webpage to scrape",
                            },
                            "use_cache": {
                                "type": "boolean",
                                "description": "Flag to indicate whether to use cached data",
                            },
                        },
                        "additionalProperties": False,
                    },
                },
            }
        ]

    @staticmethod
    def _tool_message(tool_call_id: str, text: str) -> Dict[str, Any]:
        """Build the ``tool`` role message that answers one tool call."""
        return {
            "role": "tool",
            "content": [{"type": "text", "text": text}],
            "tool_call_id": tool_call_id,
        }

    def execute_tool(self, tool_call: Any) -> Dict[str, Any]:
        """
        Execute the tool based on the tool call from the assistant

        Args:
            tool_call: Tool-call object from the assistant message; it is read
                through ``.id``, ``.function.name`` and ``.function.arguments``
                (a JSON string).

        Returns:
            A ``tool`` role message for ``tool_call.id``. On success it holds the
            scraped page as a URL banner followed by its markdown. For an unknown
            tool, unusable arguments, or a failed scrape it holds an error text
            instead, so the model always gets a reply to each call.
        """
        if tool_call.function.name != "scrape_url":
            # Always answer the call: a missing reply would put None in the history.
            return self._tool_message(
                tool_call.id, f"Error: unknown tool '{tool_call.function.name}'"
            )

        try:
            args = json.loads(tool_call.function.arguments)
            params = {
                "url": args["url"],
                # use_cache defaults to True; only an explicit false forces a refetch.
                "force_fetch": not args.get("use_cache", True),
            }
        except (ValueError, KeyError, TypeError, AttributeError) as e:
            # Model-produced arguments may be invalid JSON or lack "url".
            return self._tool_message(
                tool_call.id, f"Error parsing scrape_url arguments: {e}"
            )

        # Local import: only the actual scrape path needs the HTTP client.
        import requests

        try:
            # Make request to local scraping service
            response = requests.get("http://localhost:8084/scrape", params=params)
            response.raise_for_status()

            scraped_data = response.json()
            formatted_content = f"=======\n{scraped_data['metadata']['url']} content:\n=======\n{scraped_data['markdown']}"
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as e:
            # Handle errors gracefully: report the failure to the model instead of
            # aborting the conversation loop.
            return self._tool_message(tool_call.id, f"Error scraping URL: {str(e)}")

        return self._tool_message(tool_call.id, formatted_content)

    def process_conversation(self, user_input: str) -> str:
        """
        Process the conversation with the LLM and execute any tools

        Appends ``user_input`` to the history, then keeps querying the model and
        running the tools it requests until it replies without tool calls.

        Args:
            user_input: Text of the user turn that starts this exchange.

        Returns:
            The content of the first assistant message that has no tool calls.
        """
        # Add user message to conversation history
        self.conversation_history.append(
            {"role": "user", "content": [{"type": "text", "text": user_input}]}
        )

        while True:
            # Get response from LLM
            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[self.system_message] + self.conversation_history,
                tools=self.tools,
                temperature=0.13,
                max_completion_tokens=13790,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
            )

            assistant_message = response.choices[0].message

            if not assistant_message.tool_calls:
                # No tool calls, just add the response to history
                self.conversation_history.append(
                    {"role": "assistant", "content": assistant_message.content}
                )
                # Return final response
                return assistant_message.content

            # Record the assistant turn once, carrying all of its tool calls, and
            # follow it with one tool message per call.
            tool_calls = list(assistant_message.tool_calls)
            self.conversation_history.append(
                {"role": "assistant", "content": [], "tool_calls": tool_calls}
            )
            for tool_call in tool_calls:
                self.conversation_history.append(self.execute_tool(tool_call))


def main():
    """Send the Celtics roster objective through a new ConversationManager and print the reply."""
    # Example usage
    objective = (
        "Extract the roster details for the Boston Celtics from their "
        "Basketball-Reference page "
        "(https://www.basketball-reference.com/teams/BOS/2025.html). "
        "Specifically, retrieve the list of players along with their birthdates "
        "so that later we can identify which players are at least 30 years old "
        "as of October 25, 2023."
    )
    final_answer = ConversationManager().process_conversation(objective)
    print(f"Final Response: {final_answer}")


# Run the demo end to end: this calls the OpenAI API and, when the model requests a
# scrape, the local scraping service.
main()

Final Response: Here are the roster details for the Boston Celtics, including the players' birthdates:

1. **Payton Pritchard** - Birthdate: January 28, 1998
2. **Jayson Tatum** - Birthdate: March 3, 1998
3. **Derrick White** - Birthdate: July 2, 1994
4. **Luke Kornet** - Birthdate: July 15, 1995
5. **Jaylen Brown** - Birthdate: October 24, 1996
6. **Sam Hauser** - Birthdate: December 8, 1997
7. **Jrue Holiday** - Birthdate: June 12, 1990
8. **Al Horford** - Birthdate: June 3, 1986
9. **Neemias Queta** - Birthdate: July 13, 1999
10. **Jordan Walsh** - Birthdate: March 3, 2004
11. **Kristaps Porziņģis** - Birthdate: August 2, 1995
12. **Xavier Tillman Sr.** - Birthdate: January 12, 1999
13. **Drew Peterson** - Birthdate: November 9, 1999
14. **Baylor Scheierman** - Birthdate: September 26, 2000
15. **JD Davison** - Birthdate: October 3, 2002
16. **Anton Watson** - Birthdate: October 6, 2000
17. **Torrey Craig** - Birthdate: December 19, 1990

Next, you can identify which players are at 