In [1]:
! pip install -q litellm tqdm PyMuPDF Pillow instructor openai python-dotenv weaviate-client pandas

In [2]:
import os 
from dotenv import load_dotenv
from openai import OpenAI
import json

load_dotenv()

True

In [3]:
# Shared OpenAI client used by call_o1 and call_gpt4o.
# NOTE(review): presumably picks up its API key from the environment populated
# by load_dotenv() above — confirm.
client = OpenAI()
# Model that drafts the research plan (used in call_o1).
O1_MODEL = 'o1-mini'
# Model that executes the plan with tool calls (used in call_gpt4o).
GPT_MODEL = 'gpt-4o-mini'

# Idea 
Implement a deep research engine using an agentic framework. It should generate a research report on a given topic.

# Version 1 : 
1. Take the research question as the input and generate a Plan for the LLM to execute.
2. Loop through the Plan and execute each step.
3. Generate a report on the results of each step.



# The first version needs a web research engine, so I am bringing in Perplexity AI

In [4]:
import os
import re
from openai import OpenAI

def search_web(query):
    """
    Search the web through the Perplexity AI chat-completions API and return
    the answer with numbered citation markers replaced by their source URLs.

    Args:
        query (str): The search query to be processed

    Returns:
        str: The response text in which each marker such as [1] or [2] is
            replaced by "(<url>)" taken from the response's citation list.
            Markers without a matching citation are left untouched. An empty
            string is returned when the response carries no text content.

    Raises:
        KeyError: If the PERPLEXITYAI_API_KEY environment variable is not set.
    """
    perplexity_client = OpenAI(
        api_key=os.environ["PERPLEXITYAI_API_KEY"],
        base_url="https://api.perplexity.ai",
    )

    messages = [
        {
            "role": "system",
            "content": (
                "You are an artificial intelligence assistant and you need to "
                "answer like a Data collection engine with as much information as possible."
            ),
        },
        {
            "role": "user",
            "content": query,
        },
    ]

    # Chat completion without streaming.
    response = perplexity_client.chat.completions.create(
        model="sonar",
        messages=messages,
    )

    # The message content can be None; normalise it so re.sub never sees None.
    content = response.choices[0].message.content or ""
    # `citations` is an extra field on the response object rather than part of
    # the OpenAI response schema, so read it defensively and fall back to "no
    # citations" (markers are then left as-is) instead of raising AttributeError.
    citations = getattr(response, "citations", None) or []

    def replace_citation_with_link(match):
        # Citation markers are 1-based; the citations list is 0-based.
        citation_idx = int(match.group(1)) - 1
        if 0 <= citation_idx < len(citations):
            return f"({citations[citation_idx]})"
        # Out-of-range marker: keep the original text.
        return match.group(0)

    # re.sub visits every non-overlapping match, so adjacent markers such as
    # [1][2] are each replaced in this single pass.
    return re.sub(r'\[(\d+)\]', replace_citation_with_link, content)

# Example usage:

# result = search_web("Tata Consultancy Services Limited threats")
# print(result)

def HumanInput(question: str) -> str:
    """
    Ask the user a question interactively and hand back what they typed.

    Args:
        question (str): Text shown to the user as the prompt

    Returns:
        str: Whatever the built-in ``input`` returned for that prompt

    Example:
        >>> response = HumanInput("Do you want to proceed? (yes/no): ")
        >>> stock_price = HumanInput("Enter the stock price: ")
    """
    answer = input(question)
    return answer

def calculate(num1: float, num2: float, operator: str) -> float:
    """
    Perform a basic arithmetic calculation between two numbers.

    Args:
        num1 (float): The first number in the calculation
        num2 (float): The second number in the calculation
        operator (str): The arithmetic operator to use ('+', '-', '*', '/')

    Returns:
        float: The result of the arithmetic operation. For '+', '-' and '*'
            the result keeps Python's usual numeric type (two ints give an
            int); '/' always performs true division.

    Raises:
        ValueError: If an invalid operator is provided
        ZeroDivisionError: If attempting to divide by zero

    Examples:
        >>> calculate(10, 5, '+')
        15
        >>> calculate(10, 2, '/')
        5.0
    """
    # Imported locally under an alias because the `operator` parameter shadows
    # the standard-library module name inside this function.
    import operator as _operator

    operations = {
        '+': _operator.add,
        '-': _operator.sub,
        '*': _operator.mul,
        '/': _operator.truediv,
    }

    if operator not in operations:
        raise ValueError("Invalid operator. Expected one of '+', '-', '*', '/'.")

    # Explicit check so the error message is the same for int and float zero.
    if operator == '/' and num2 == 0:
        raise ZeroDivisionError("Cannot divide by zero.")

    return operations[operator](num1, num2)

def raise_(ex: BaseException):
    """Raise ``ex``; lets expression-only contexts such as lambdas raise an exception."""
    raise ex


In [5]:
# Instruction preamble for the planning model; call_o1 prepends it to the
# scenario. The function names listed here must match the tool names exposed to
# the executing model in TOOLS (search_web, calculate, human_input,
# instructions_complete), otherwise the plan refers to tools the executor
# cannot call.
o1_prompt = """
You are an expert researcher with deep expertise in finance, legal, and tax matters.
The first input you will receive will be a complex research task that needs to be carefully reasoned through to solve.
Your task is to review the challenge, conduct thorough research, and create a detailed plan to analyze information, assess implications, and provide comprehensive insights.


You will have access to an LLM agent that is responsible for executing the plan that you create and will return results.


The LLM agent has access to the following functions:
    - search_web(query)
        - This function performs a web search and returns relevant information based on the provided query.
    - calculate(num1, num2, operator)
        - This function performs basic arithmetic between two numbers. The operator must be one of '+', '-', '*', '/'.
    - human_input(question)
        - This function gets input from the user with a specific question or prompt. Use it only if it is absolutely necessary.

When creating a plan for the LLM to execute, break your instructions into a logical, step-by-step order, using the specified format:
    - **Main actions are numbered** (e.g., 1, 2, 3).
    - **Sub-actions are lettered** under their relevant main actions (e.g., 1a, 1b).
        - **Sub-actions should start on new lines**
    - **Specify conditions using clear 'if...then...else' statements** (e.g., 'If the financial statement shows a profit, then...').
    - **For actions that require using one of the above functions defined**, write a step to call a function using backticks for the function name (e.g., `call the search_web function`).
        - Ensure that the proper input arguments are given to the model for instruction. There should not be any ambiguity in the inputs.
    - **The last step** in the instructions should always be calling the `instructions_complete` function. This is necessary so we know the LLM has completed all of the instructions you have given it.
    - **Detailed steps** The plan generated must be extremely detailed and thorough with explanations at every step.
Use markdown format when generating the plan with each step and sub-step.

Please find the scenario below.
"""

In [6]:
def call_o1(scenario):
    """
    Ask the planning model (O1_MODEL) for a research plan.

    The scenario is wrapped in the ``o1_prompt`` preamble and sent as a single
    user message through the shared ``client``.

    Args:
        scenario (str): The research question or task to plan for

    Returns:
        The text content of the first choice in the model's response.
    """
    planner_input = f"""
    {o1_prompt}
        
    Scenario:
    {scenario}

    Please provide the next steps in your plan.
    """

    completion = client.chat.completions.create(
        model=O1_MODEL,
        messages=[{'role': 'user', 'content': planner_input}],
    )
    return completion.choices[0].message.content

In [7]:
# Tool definitions passed to the executing model in call_gpt4o.
# "strict" belongs on the function object, next to "parameters"; placed inside
# the JSON schema it does not turn strict mode on. Strict mode requires every
# property to be listed in "required" and "additionalProperties" to be False,
# which all three parameterised tools below satisfy.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "search_web",
            "description": "function performs a web search and returns relevant information based on the provided query",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The search query to be processed"
                    }
                },
                "required": ["query"],
                "additionalProperties": False,
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "calculate",
            "description": "Perform basic arithmetic calculations between two numbers",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "num1": {
                        "type": "number",
                        "description": "The first number in the calculation"
                    },
                    "num2": {
                        "type": "number",
                        "description": "The second number in the calculation"
                    },
                    "operator": {
                        "type": "string",
                        "description": "The arithmetic operator to use",
                        "enum": ["+", "-", "*", "/"]
                    }
                },
                "required": ["num1", "num2", "operator"],
                "additionalProperties": False,
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "human_input",
            "description": "Get input from the user with a specific question or prompt",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "question": {
                        "type": "string",
                        "description": "The question or prompt to display to the user"
                    }
                },
                "required": ["question"],
                "additionalProperties": False,
            },
        },
    },
    {
        # Argument-less sentinel tool: call_gpt4o stops its loop when the model calls it.
        "type": "function",
        "function": {
            "name": "instructions_complete",
            "description": "Function should be called when we have completed ALL of the instructions.",
        },
    }
]

# Dispatch table from tool name (as declared in TOOLS) to the Python callable
# that implements it. `instructions_complete` has no entry: call_gpt4o treats
# that tool as its stop signal rather than executing it.
function_mapping = dict(
    search_web=search_web,
    human_input=HumanInput,
    calculate=calculate,
)

In [8]:
def append_message(message_list, message):
    """
    Record a log entry and echo the human-readable ones to stdout.

    Args:
        message_list (list): Log that ``message`` is appended to (mutated in place)
        message (dict): Entry with a 'type' key. 'status' entries are printed
            from their 'message' field; 'plan' and 'assistant' entries are
            printed from their 'content' field. Any other type is stored
            without being printed.

    Returns:
        None
    """
    message_list.append(message)

    message_type = message.get('type', '')
    if message_type == 'status':
        print(message['message'])
    elif message_type == 'plan':
        print("\nPlan:\n", message['content'])
    elif message_type == 'assistant':
        # Assistant turns that only carry tool calls have no text content
        # (None or ''); skip them instead of printing a literal "None".
        if message.get('content'):
            print("\nAssistant:\n", message['content'])

In [9]:
# System prompt template for the executing model. The literal "{policy}" token
# is a placeholder that call_gpt4o fills with the generated plan via
# str.replace (not str.format), so other braces in the plan text are harmless.
gpt4o_system_prompt = """
You are a helpful assistant responsible for executing the policy on handling Deep research Tasks. 
Your task is to follow the policy exactly as it is written and perform the necessary actions.

You must explain your decision-making process across various steps.

# Steps

1. **Read and Understand Policy**: Carefully read and fully understand the given policy on Deep research Task.
2. **Identify the exact step in the policy**: Determine which step in the policy you are at, and execute the instructions according to the policy.
3. **Decision Making**: Briefly explain your actions and why you are performing them.
4. **Action Execution**: Perform the actions required by calling any relevant functions and input parameters. 

POLICY:
{policy}
"""

In [10]:
def _run_tool(function_name, arguments_str):
    """Run one requested tool call and return its result serialized as a string."""
    try:
        # Guard against an empty arguments string by treating it as "{}".
        input_arguments = json.loads(arguments_str or "{}")
    except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
        # Report the problem back to the model instead of dropping the call.
        function_response = {'error': f"Could not parse arguments as JSON: {exc}"}
    else:
        if function_name in function_mapping:
            print("## Function Call", function_name)
            try:
                function_response = function_mapping[function_name](**input_arguments)
            except Exception as e:
                # Best effort: surface tool failures to the model as data.
                function_response = {'error': str(e)}
        else:
            function_response = {'error': f"Function '{function_name}' not implemented."}

    try:
        return json.dumps(function_response)
    except (TypeError, ValueError):
        return str(function_response)


def call_gpt4o(message_list, plan, max_turns=100):
    """
    Execute a plan with the tool-calling model (GPT_MODEL) until it signals completion.

    The plan is substituted into ``gpt4o_system_prompt`` and the model is
    called repeatedly. Each requested tool call is run through
    ``function_mapping`` and its result is fed back as a ``tool`` message. The
    loop ends when the first tool call of a turn is ``instructions_complete``,
    or after ``max_turns`` model calls.

    Args:
        message_list (list): Log that receives 'assistant', 'tool_call',
            'tool_response' and 'status' entries via ``append_message``
        plan (str): The plan text inserted in place of "{policy}"
        max_turns (int): Upper bound on model calls, so a model that never
            calls ``instructions_complete`` cannot loop forever

    Returns:
        list: The chat transcript (system prompt, assistant messages and tool
            messages) accumulated during execution.
    """
    gpt4o_policy_prompt = gpt4o_system_prompt.replace("{policy}", plan)
    messages = [
        {'role': 'system', 'content': gpt4o_policy_prompt},
    ]

    for _ in range(max_turns):
        response = client.chat.completions.create(
            model=GPT_MODEL,
            messages=messages,
            tools=TOOLS,
            parallel_tool_calls=False
        )

        reply = response.choices[0].message
        assistant_message = reply.to_dict()
        print(assistant_message)
        messages.append(assistant_message)

        # Content is None on tool-call-only turns; log it as an empty string.
        append_message(message_list, {'type': 'assistant', 'content': assistant_message.get('content') or ''})

        tool_calls = reply.tool_calls or []
        if tool_calls and tool_calls[0].function.name == 'instructions_complete':
            break

        for tool in tool_calls:
            function_name = tool.function.name
            input_arguments_str = tool.function.arguments

            append_message(message_list, {'type': 'tool_call', 'function_name': function_name, 'arguments': input_arguments_str})

            serialized_output = _run_tool(function_name, input_arguments_str)

            # Every tool_call_id in the assistant message must be answered by a
            # tool message before the next request, including failed calls.
            messages.append({
                "role": "tool",
                "tool_call_id": tool.id,
                "content": serialized_output
            })

            append_message(message_list, {'type': 'tool_response', 'function_name': function_name, 'response': serialized_output})
    else:
        append_message(message_list, {
            'type': 'status',
            'message': f"Stopped after {max_turns} turns without instructions_complete.",
        })

    return messages

In [11]:
def process_scenario(message_list, scenario):
    """
    Run the full pipeline for one scenario: plan with call_o1, then execute with call_gpt4o.

    Args:
        message_list (list): Log that receives status, plan and execution
            entries via ``append_message``
        scenario (str): The research question or task

    Returns:
        Whatever ``call_gpt4o`` returns for the generated plan.
    """
    def announce(text):
        # Status entries are both stored in the log and printed by append_message.
        append_message(message_list, {'type': 'status', 'message': text})

    announce('Generating plan...')
    plan = call_o1(scenario)
    append_message(message_list, {'type': 'plan', 'content': plan})

    announce('Executing plan...')
    transcript = call_gpt4o(message_list, plan)

    announce('Processing complete.')
    return transcript

In [12]:
# Run the whole pipeline on a sample question. The [] is a throwaway log list;
# `logs` receives the value process_scenario returns (the executor's chat
# transcript from call_gpt4o), not that log list.
logs = process_scenario([], "Why Common Law is Sceptical of Philosophy")

Generating plan...

Plan:
 ```markdown
# Research Plan: Why Common Law is Sceptical of Philosophy

1. **Define the Core Concepts**
    a. Identify and define "Common Law" and its fundamental principles.
        - `search_web("Definition and principles of Common Law")`
    b. Identify and define "Philosophy" in the context of legal theory.
        - `search_web("Definition of Philosophy in legal context")`
    c. Establish the relationship between Common Law and Philosophy.
        - `search_web("Relationship between Common Law and Philosophy")`

2. **Historical Contextualization**
    a. Investigate the historical development of Common Law.
        - `search_web("Historical development of Common Law")`
    b. Explore the historical interactions between Common Law and philosophical thought.
        - `search_web("History of Common Law and philosophy")`
    c. Identify key legal thinkers who influenced Common Law's stance on Philosophy.
        - `search_web("Influential legal thinkers o

In [13]:
# Display the transcript returned by process_scenario in the previous cell.
logs

[{'role': 'system',
  'content': '\nYou are a helpful assistant responsible for executing the policy on handling Deep research Tasks. \nYour task is to follow the policy exactly as it is written and perform the necessary actions.\n\nYou must explain your decision-making process across various steps.\n\n# Steps\n\n1. **Read and Understand Policy**: Carefully read and fully understand the given policy on Deep research Task.\n2. **Identify the exact step in the policy**: Determine which step in the policy you are at, and execute the instructions according to the policy.\n3. **Decision Making**: Briefly explain your actions and why you are performing them.\n4. **Action Execution**: Perform the actions required by calling any relevant functions and input parameters. \n\nPOLICY:\n```markdown\n# Research Plan: Why Common Law is Sceptical of Philosophy\n\n1. **Define the Core Concepts**\n    a. Identify and define "Common Law" and its fundamental principles.\n        - `search_web("Definition 