In [None]:
# Load environment variables from a .env file
from dotenv import load_dotenv
load_dotenv()


# Import necessary libraries
import os
import time
import pprint
import logging
import pandas as pd
from typing import Any, Dict
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.datasets import fetch_openml

from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.tools import tool
from langchain_community.tools.file_management.write import WriteFileTool
from langchain.agents import AgentExecutor, create_openai_functions_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents.format_scratchpad.openai_tools import format_to_openai_tool_messages
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser

# Define the language model and logging directories
MODEL = 'gpt-4o'  # model name handed to ChatOpenAI when the LLM is created
LOG_DIR = "logs"
# File path the agent is instructed (via the system prompt) to write its final summary to
EXPERIMENT_LOG_FILE = os.path.join(LOG_DIR, "experiment_logs.txt")
# Default file that log_agent_output appends each LLM response's content to
AGENT_LOG_FILE = os.path.join(LOG_DIR, "agent_log.txt")

# Ensure the log directory exists
os.makedirs(LOG_DIR, exist_ok=True)
# Set an environment variable for the project name in Langsmith
os.environ["LANGCHAIN_PROJECT"] = "Re-Act HPO"

# Configure logging to output to the notebook output area
# (the root logger is set to INFO so the logging.info calls made by the helper
# functions and the tool below are not filtered out)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.info("Notebook started")


# Information about the dataset being used.
# This text is passed to create_agent and substituted into the system prompt as {dataset_info};
# it is read by the LLM only and is not parsed by any code in this notebook.
dataset_information = """
                    Name: Census Income Dataset
                    Task Description: Predict if an individual earns more than $50,000 per year based on census data.
                    Label: Income (binary classification: ">50K" or "<=50K")
                    Key Features:
                    age: Integer (e.g., 25, 42)
                    workclass: Categorical (e.g., Private, State-gov)
                    education-num: Integer (corresponding to educational level)
                    marital-status: Categorical (e.g., Never-married, Married-civ-spouse)
                    occupation: Categorical (e.g., Exec-managerial, Handlers-cleaners)
                    relationship: Categorical (e.g., Husband, Not-in-family)
                    race: Categorical (e.g., White, Asian-Pac-Islander)
                    sex: Categorical (Male, Female)
                    capital-gain: Continuous
                    capital-loss: Continuous
                    hours-per-week: Continuous
                    native-country: Categorical (e.g., United-States, India)
                    Evaluation Metric: Area Under the ROC Curve (AUC)
                    """

# Model information.
# This text is passed to create_agent and substituted into the system prompt as {model_info}.
# The four "Key Parameters" named here match the fields of the Hyperparameters schema
# accepted by the train_random_forest tool.
model_information = """
                    Model Type: Random Forest Classifier
                    Library Used: Scikit-learn (assuming you are using Python)
                    Purpose: To classify individuals based on their income level (>50K or <=50K).
                    Key Parameters to Optimize:
                    n_estimators: Number of trees in the forest (e.g., 100, 200).
                    max_depth: The maximum depth of the tree (e.g., 10, 20, None).
                    min_samples_split: The minimum number of samples required to split an internal node (e.g., 2, 5).
                    min_samples_leaf: The minimum number of samples required to be at a leaf node (e.g., 1, 2).                  
                    """


# Optimization goal for the hyperparameters.
# This text is passed to create_agent and substituted into the system prompt as {optim_goal}.
# It is supplied as a partial value rather than as template text, so the literal
# braces inside it are not treated as prompt placeholders.

optimization_goal = """Maximize the AUC Score on test data by optimizing the following hyperparameters of the model:
{
    'n_estimators': int   # Range for number of trees in the forest
    'max_depth': int      # Maximum depth of each tree
    'min_samples_split': int # Minimum number of samples required to split an internal node
    'min_samples_leaf': int # Minimum number of samples required at a leaf node
}
"""



In [None]:
# utils

# Define a Pydantic model for our input schema.
# It is passed as args_schema to the train_random_forest tool, so these four fields
# and their descriptions define the arguments that tool accepts.
# NOTE(review): max_depth is declared as a plain int, while model_information lists
# None as an example value for it -- confirm whether None should be accepted here.
class Hyperparameters(BaseModel):
    n_estimators: int = Field(description="The number of trees in the forest.")
    max_depth: int = Field(description="The maximum depth of the trees.")
    min_samples_split: int = Field(description="The minimum number of samples required to split an internal node.")
    min_samples_leaf: int = Field(description="The minimum number of samples required to be at a leaf node.")

# Function to load and preprocess the Census Income dataset
def preprocess_data() -> "tuple[pd.DataFrame, pd.Series]":
    """
    Load and preprocess the Census Income dataset.

    The dataset is requested from OpenML by name ('adult', version 2) as pandas
    objects. The target is turned into a 0/1 integer series and every
    categorical feature column is one-hot encoded.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the one-hot encoded feature frame and
        ``y`` is an integer series that is 1 where the label equals '>50K' and
        0 otherwise.
    """
    # Load the Census Income dataset
    census = fetch_openml(name='adult', version=2, as_frame=True)
    X = census.data
    y = (census.target == '>50K').astype(int)  # Convert target to binary classification

    # Preprocess data (convert categorical to numeric)
    X = pd.get_dummies(X)
    return X, y


# Function to log the output of the agent after each cycle of task, thought, action, and observation
def log_agent_output(output, file_path=AGENT_LOG_FILE):
    """
    Append the 'content' of an LLM response to a log file and pass the response through.

    This function sits in the middle of the agent chain (between the LLM and the
    output parser), so it must ALWAYS hand back the object it received: logging
    is best-effort, and a logging failure is reported but never replaces the
    response with None.

    Args:
        output: The response produced by the previous chain step. Its 'content'
            attribute is logged; if the attribute is absent, the placeholder
            text 'No content available' is logged instead.
        file_path (str): The path to the file the content is appended to.

    Returns:
        The same ``output`` object, unchanged.
    """
    try:
        # Extract content from the output
        content = getattr(output, 'content', 'No content available')

        # Content is not guaranteed to be a plain string (it may be None or a
        # list of parts); coerce it so the write below cannot raise a TypeError.
        if not isinstance(content, str):
            content = str(content)

        # Append the content; an explicit encoding keeps non-ASCII model output
        # from failing on platforms whose default encoding is not UTF-8.
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(content + "\n\n")

        # Also log to the main log file
        logging.info(f"Agent's chain of thought logged to {file_path}")
    except Exception as e:
        logging.error(f"Error in log_agent_output: {e}")

    # Return the output unchanged for further processing, even if logging failed
    return output

In [None]:
# tools for the agent

# Custom Tool to train a Random Forest model with given hyperparameters
@tool("train_random_forest",args_schema=Hyperparameters)
def train_random_forest(**hyperparameters) -> dict:
    """
    Train a Random Forest model with given hyperparameters.
    
    Parameters:
        hyperparameters: dict
    
    Returns:
        dict
    """
    # NOTE(review): the docstring above is left verbatim on purpose -- the tool
    # decorator presumably exposes it to the LLM as the tool description, which
    # would make it runtime text rather than documentation. Confirm before editing.
    #
    # What the body does: fits a RandomForestClassifier on 80% of the
    # preprocessed census data and scores it on the held-out 20%. The result is
    # {'auc': <ROC AUC on the held-out split>, 'training_time': <seconds spent in fit>}.
    # Any failure is logged and then re-raised to the caller.
    try:
        logging.info(f"Received hyperparameters by train_random_forest tool: {hyperparameters}")

        # Reject the call on the first absent hyperparameter, checked in this order
        expected = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
        absent = next((name for name in expected if name not in hyperparameters), None)
        if absent is not None:
            raise ValueError(f"Missing required hyperparameter: {absent}")

        # Load the data and carve out a fixed 80/20 train/validation split
        features, target = preprocess_data()
        X_train, X_valid, y_train, y_valid = train_test_split(
            features, target, test_size=0.2, random_state=2
        )

        # Only the four expected settings are forwarded; the seed is fixed
        forest = RandomForestClassifier(
            random_state=42,
            **{name: hyperparameters[name] for name in expected},
        )

        # Time the fit alone (data loading and scoring are excluded)
        started = time.time()
        forest.fit(X_train, y_train)
        elapsed = time.time() - started

        # AUC is computed from the predicted probability of the positive class
        positive_scores = forest.predict_proba(X_valid)[:, 1]
        return {
            'auc': roc_auc_score(y_valid, positive_scores),
            'training_time': elapsed,
        }
    except Exception as e:
        logging.error(f"Error in train_random_forest: {e}")
        raise

# Define tools to be used by the agent, WriteFileTool from langchain_community and train_random_forest
# The same list is bound to the LLM in create_agent and handed to AgentExecutor.
# NOTE(review): confirm that WriteFileTool actually honours a `file_path` constructor
# argument; the agent is separately told the target path through the system prompt.
tools = [WriteFileTool(file_path=EXPERIMENT_LOG_FILE), train_random_forest]



In [None]:

# Function to create an agent for optimizing model hyperparameters
def create_agent(llm, tools, file_path, model_information, dataset_information, optimization_goal):
    """
    Create an agent for optimizing model hyperparameters.

    The agent is a runnable chain: it maps the executor's input to prompt
    variables, renders the prompt, calls the tool-bound LLM, logs the raw LLM
    response with log_agent_output, and parses that response into agent
    actions or a final answer.

    Args:
        llm: The language model instance.
        tools: The tools to be used by the agent (bound to the LLM).
        file_path (str): Path the agent is instructed to write its final
            experiment summary to.
        model_information (str): Information about the model.
        dataset_information (str): Information about the dataset.
        optimization_goal (str): The optimization goal for hyperparameters.

    Returns:
        agent: The composed chain. It expects a mapping with the keys
        "input" and "intermediate_steps".
    """
    # Define the prompt template for the agent.
    # - The tool name in the text matches the registered tool name
    #   (`train_random_forest`), so the instructions and the bound tool agree.
    # - The scratchpad is supplied ONLY through the MessagesPlaceholder below.
    #   Interpolating it into the system text as well would paste the string
    #   form of the message list into the system prompt on every iteration.
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are the machine learning experimenter tasked with optimizing the model’s hyperparameter settings to accomplish the following objective: {optim_goal}.
                To achieve this, propose an initial set of default hyperparameters and test them on the model using the following tool:
                Name: `train_random_forest`
                Description: ”Useful for when you need to train a random forest model with given hyperparameters”
                
                Analyze the outcome of that training and iteratively improve the proposed hyperparameters to methodically reach the final objective.
                Ensure your proposed hyperparameters are distinct from those previously tested.
                Keep iterating until the desired metric has no longer improved for the previous 5 iterations.
                Below is the basic information about the experimental settings:
                Model Info: {model_info}
                Dataset Info: {dataset_info}

                Use the following format:
                Task: the input task you must solve
                Thought: you should always think about what to do
                Action: the action to take
                Action Input: the input to the action
                Observation: the result of the action
                ... (this Thought/Action/Action Input/Observation can repeat N times)
                Thought: I now know the final answer
                Final Answer: the final answer to the original input question

                Finally, analyze all the iterations and make a detailed summary of the entire experiment for a ML beginner.
                Make sure you touch on all of the the following points in your detailed summary:
                - best hyperparameters
                - details of the training trajectory and final training results about this experiment.
                - the thought process behind those adjustments in hyperparameter values and how those parameters impacted the model given the dataset.
                - detailed first principle analysis on what worked and why.

                Once complete, log this summary into {file_path} using the following tool:
                Name: `write_file`
                Description: ”Useful for when you need to write the experiment summary to a file”
                Begin!
                Task: {input}
                """
            ),
            ("user", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ]
    )
    # Pre-fill the variables that stay fixed for the whole experiment; only
    # "input" and "agent_scratchpad" remain to be supplied on each call.
    prompt = prompt.partial(
        file_path=file_path,
        model_info=model_information,
        dataset_info=dataset_information,
        optim_goal=optimization_goal,
    )

    # Bind tools to the language model
    llm_with_tools = llm.bind_tools(tools)

    # Create the agent with the specified components
    agent = (
        {
            "input": lambda x: x["input"],
            # Turn the (action, observation) history into chat messages
            "agent_scratchpad": lambda x: format_to_openai_tool_messages(
                x["intermediate_steps"]
            ),
        }
        | prompt
        | llm_with_tools
        | log_agent_output
        | OpenAIToolsAgentOutputParser()
    )

    return agent

# Initialize the language model and create the agent
# The same `tools` list goes to both create_agent (where it is bound to the LLM)
# and AgentExecutor.
llm = ChatOpenAI(model=MODEL, streaming=True)
agent = create_agent(llm,tools, EXPERIMENT_LOG_FILE, model_information, dataset_information, optimization_goal)
agent_executor = AgentExecutor(agent=agent, tools=tools)


In [None]:

# Define the input task for the agent
input_task = "Tune the hyperparameters of the given model to achieve the highest AUC score."

# Invoke the agent with the input task
result = agent_executor.invoke({"input": input_task})
print(result)


# Stream the output and capture chunks
# NOTE: this is a second, separate call on the executor with the same input; it
# does not replay the invoke() call above. Each yielded chunk is kept in `chunks`
# and pretty-printed as it arrives.
chunks = []
try:
    for chunk in agent_executor.stream({"input": input_task}):
        chunks.append(chunk)
        print("------")
        pprint.pprint(chunk)
except Exception as e:
    # Errors are reported both through logging and on stdout, and are not re-raised
    logging.error(f"Error executing agent: {e}")
    print(f"Error executing agent: {e}")

# NOTE
The following sections of the notebook are written purely to illustrate what some of the code abstractions in LangChain do.
They are useful for building an explicit understanding of those abstractions.

### What does OpenAIToolsAgentOutputParser() do?


https://api.python.langchain.com/en/latest/agents/langchain.agents.output_parsers.openai_tools.OpenAIToolsAgentOutputParser.html


After the LLM generates a response, this parser takes the LLM's output and converts it into actionable steps (AgentAction objects)
or determines if the process has reached its conclusion (AgentFinish object).


In [94]:
#Example of invoking the output parser
# AIMessage is imported here so this cell runs on a fresh kernel (Restart & Run All);
# without it the name is only defined by a later cell.
from langchain_core.messages import AIMessage

# A hand-built LLM response that requests one call to the train_random_forest tool
output = AIMessage(
    content='',
    additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_7RcVoJImJQ2uQbMf3gZL8KxN', 'function': {'arguments': '{"n_estimators": 200, "max_depth": 20, "min_samples_split": 5, "min_samples_leaf": 2}', 'name': 'train_random_forest'}, 'type': 'function'}]},
    response_metadata={'finish_reason': 'tool_calls'},
    id='run-4b2391cf-90a2-4c38-801c-0988f40c9a80-0',
    tool_calls=[{'name': 'train_random_forest', 'args': {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2}, 'id': 'call_7RcVoJImJQ2uQbMf3gZL8KxN'}],
)
parser = OpenAIToolsAgentOutputParser()
print(parser.invoke(output))

[ToolAgentAction(tool='train_random_forest', tool_input={'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2}, log="\nInvoking: `train_random_forest` with `{'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_7RcVoJImJQ2uQbMf3gZL8KxN', 'function': {'arguments': '{"n_estimators": 200, "max_depth": 20, "min_samples_split": 5, "min_samples_leaf": 2}', 'name': 'train_random_forest'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-4b2391cf-90a2-4c38-801c-0988f40c9a80-0', tool_calls=[{'name': 'train_random_forest', 'args': {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2}, 'id': 'call_7RcVoJImJQ2uQbMf3gZL8KxN'}])], tool_call_id='call_7RcVoJImJQ2uQbMf3gZL8KxN')]


### What does format_to_openai_tool_messages do?
Takes the agent's intermediate steps (a list of action/observation pairs) and converts them into a list of base messages that can be fed back to the LLM as context.

https://api.python.langchain.com/en/latest/agents/langchain.agents.format_scratchpad.openai_tools.format_to_openai_tool_messages.html

In [95]:
#Example of format_to_openai_tool_messages
from langchain_core.agents import AgentAction
from langchain_core.messages import ToolMessage, AIMessage, BaseMessage
from langchain.agents.output_parsers.openai_tools import OpenAIToolAgentAction


# First step: an example action for a tool named "database_query", together with
# the string that stands in for that tool's result.
action1 = OpenAIToolAgentAction(
    tool="database_query",
    tool_input={"query": "SELECT * FROM users"},
    log="Querying the database for user information",
    tool_call_id="1",
    message_log=[]
)
observation1 = "[{'id': 1, 'name': 'Alice'}, {'id': 2, 'name': 'Bob'}]"

# Second step: its tool_input is the first step's observation, showing how one
# step's result can become the next step's input.
action2 = OpenAIToolAgentAction(
    tool="data_processing",
    tool_input=observation1,
    log="Processing user data",
    tool_call_id="2",
    message_log=[]
)
observation2 = '{"user_count": 2}'


# Intermediate steps are (action, observation) pairs -- the same value the agent
# chain reads from x["intermediate_steps"].
intermediate_steps = [(action1, observation1), (action2, observation2)]

# Convert the pairs into messages and show the result
messages = format_to_openai_tool_messages(intermediate_steps)
print(messages)