In [8]:
import os

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

import pandas as pd
from langchain_openai import ChatOpenAI
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Import your H2O ML Agent (assuming it's saved as h2o_ml_agent_enhanced.py)
from h20 import H2OMLAgentEnhanced

# Check for OpenAI API key
if not os.getenv("OPENAI_API_KEY"):
    print("❌ Error: OPENAI_API_KEY environment variable is not set!")
    print("Please set your OpenAI API key by running one of these commands:")
    print("  export OPENAI_API_KEY='your-api-key-here'")
    print("  or")
    print("  OPENAI_API_KEY='your-api-key-here' python script.py")
    print("\nYou can get an API key from: https://platform.openai.com/api-keys")
    exit(1)

# 1. Set up the language model
llm = ChatOpenAI(
    model="gpt-4o-mini", temperature=0.1  # or "gpt-4" for better performance
)

# 2. Create or Load your dataset
# Option A: Create sample data
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_classes=2,
    random_state=42,
)

# Convert to DataFrame
feature_names = [f"feature_{i}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)
df["target"] = y

# Option B: Load your own data
# df = pd.read_csv("your_data.csv")
# X = df.drop(columns=["target"])  # Replace "target" with your target column
# y = df["target"]

# 3. Split the data into train/test/calibration sets
X = df.drop(columns=["target"])
y = df["target"]

# First split: separate test set (20%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Second split: separate calibration set from remaining data (25% of temp = 20% of total)
X_train, X_calib, y_train, y_calib = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Calibration set size: {len(X_calib)}")

# 4. Set up directories
LOG_PATH = "logs/"
MODEL_PATH = "models/"
os.makedirs(LOG_PATH, exist_ok=True)
os.makedirs(MODEL_PATH, exist_ok=True)

# 5. Initialize the H2O ML Agent
ml_agent = H2OMLAgentEnhanced(
    model=llm,
    log=True,
    log_path=LOG_PATH,
    model_directory=MODEL_PATH,
    n_samples=30,
    file_name="h2o_automl_enhanced.py",
    function_name="h2o_automl_enhanced",
    overwrite=True,
    human_in_the_loop=False,  # Set to True if you want to review steps
    bypass_recommended_steps=False,  # Set to True to skip recommendation step
    bypass_explain_code=False,  # Set to True to skip code explanation
    enable_mlflow=False,  # Set to True to enable MLflow logging
    mlflow_tracking_uri=None,
    mlflow_experiment_name="H2O AutoML Enhanced Experiment",
    mlflow_run_name="test_run_1",
    enable_optuna=True,  # Enable Optuna optimization
    optuna_n_trials=20,  # Number of optimization trials
    optuna_timeout=300,  # Timeout in seconds
)

# 6. Run the agent
print("Starting H2O ML Agent Enhanced...")

Train set size: 600
Test set size: 200
Calibration set size: 200
Starting H2O ML Agent Enhanced...


In [9]:
ml_agent.invoke_agent(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    X_calib=X_calib,
    y_calib=y_calib,
    user_instructions="""
    Please create an H2O AutoML model for binary classification.
    Focus on maximizing AUC score while maintaining good precision.
    Use the calibration set for model calibration and threshold optimization.
    Optimize hyperparameters using Optuna for best performance.
    """,
    max_retries=3,
)

---H2O ML AGENT ENHANCED----
    * RECOMMEND MACHINE LEARNING STEPS
---H2O ML AGENT ENHANCED----
    * EXPLAINING ENHANCED H2O AUTOML CODE
    * CREATE ENHANCED H2O AUTOML CODE WITH OPTUNA
Syntax error detected: invalid syntax (<unknown>, line 31)
Could not fix syntax errors, using fallback code
      File saved to: logs/h2o_automl_enhanced.py
    * EXECUTING GENERATED CODE
Executing function 'h2o_automl_enhanced' with 5 arguments...
Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,2 hours 5 mins
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,2 months and 22 days
H2O_cluster_name:,H2O_from_python_shankii_5354hz
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.659 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |





21:11:13.647: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |





███████████████████████████████████████████| (done) 100%







In [10]:
ml_agent.get_h2o_train_function(markdown=True)

```python

def h2o_automl_enhanced(train_data, test_data, calib_data, target_variable, feature_columns, enable_optuna=True, optuna_n_trials=50, optuna_timeout=300, model_directory=None, log_path=None, enable_mlflow=False, mlflow_tracking_uri=None, mlflow_experiment_name="H2O AutoML Enhanced", mlflow_run_name=None, **kwargs):
    import h2o
    from h2o.automl import H2OAutoML
    import pandas as pd
    import numpy as np
    from contextlib import nullcontext
    
    # Optional imports
    if enable_optuna:
        import optuna
        from optuna.samplers import TPESampler
    
    if enable_mlflow:
        import mlflow
        import mlflow.h2o
        if mlflow_tracking_uri:
            mlflow.set_tracking_uri(mlflow_tracking_uri)
        mlflow.set_experiment(mlflow_experiment_name)
        run_context = mlflow.start_run(run_name=mlflow_run_name)
    else:
        run_context = nullcontext()

    # Convert data to DataFrames
    train_df = pd.DataFrame(train_data)
    test_df = pd.DataFrame(test_data)
    calib_df = pd.DataFrame(calib_data)

    with run_context as run:
        # Initialize H2O
        h2o.init()

        # Create H2OFrames
        train_h2o = h2o.H2OFrame(train_df)
        test_h2o = h2o.H2OFrame(test_df)
        calib_h2o = h2o.H2OFrame(calib_df)

        # Convert target variable to categorical if it's binary
        # Check if target has only 2 unique values by converting to pandas first
        target_values = train_h2o[target_variable].as_data_frame().values.flatten()
        if len(set(target_values)) == 2:
            train_h2o[target_variable] = train_h2o[target_variable].asfactor()
            test_h2o[target_variable] = test_h2o[target_variable].asfactor()
            calib_h2o[target_variable] = calib_h2o[target_variable].asfactor()

        # Train AutoML model
        aml = H2OAutoML(
            max_runtime_secs=300,
            max_models=20,
            nfolds=5,
            seed=42,
            sort_metric="AUTO"
        )
        
        aml.train(x=feature_columns, y=target_variable, training_frame=train_h2o)
        
        # Evaluate on test set
        test_perf = aml.leader.model_performance(test_h2o)
        test_metrics = {}
        
        # Handle classification metrics
        try:
            if hasattr(test_perf, 'auc'):
                auc_value = test_perf.auc()
                test_metrics['auc'] = auc_value[0][0] if hasattr(auc_value, '__getitem__') else auc_value
        except:
            pass
            
        try:
            if hasattr(test_perf, 'logloss'):
                logloss_value = test_perf.logloss()
                test_metrics['logloss'] = logloss_value[0][0] if hasattr(logloss_value, '__getitem__') else logloss_value
        except:
            pass
            
        # Calculate Brier Score for binary classification
        try:
            if len(set(target_values)) == 2:  # Binary classification
                # Get predicted probabilities
                test_pred = aml.leader.predict(test_h2o)
                test_probs = test_pred['p1'].as_data_frame().values.flatten()  # Probability of positive class
                test_actual = test_h2o[target_variable].as_data_frame().values.flatten()
                
                # Convert to numeric if categorical
                if test_actual.dtype == 'object':
                    test_actual = (test_actual == test_actual[0]).astype(int)
                
                # Calculate Brier Score
                brier_score = np.mean((test_probs - test_actual) ** 2)
                test_metrics['brier_score'] = brier_score
        except Exception as e:
            print(f"Could not calculate Brier score: {{e}}")
            
        # Handle regression metrics
        try:
            if hasattr(test_perf, 'rmse'):
                rmse_value = test_perf.rmse()
                test_metrics['rmse'] = rmse_value[0][0] if hasattr(rmse_value, '__getitem__') else rmse_value
        except:
            pass
            
        try:
            if hasattr(test_perf, 'mae'):
                mae_value = test_perf.mae()
                test_metrics['mae'] = mae_value[0][0] if hasattr(mae_value, '__getitem__') else mae_value
        except:
            pass
        
        # Evaluate on calibration set
        calib_perf = aml.leader.model_performance(calib_h2o)
        calib_metrics = {}
        
        # Handle classification metrics
        try:
            if hasattr(calib_perf, 'auc'):
                auc_value = calib_perf.auc()
                calib_metrics['auc'] = auc_value[0][0] if hasattr(auc_value, '__getitem__') else auc_value
        except:
            pass
            
        try:
            if hasattr(calib_perf, 'logloss'):
                logloss_value = calib_perf.logloss()
                calib_metrics['logloss'] = logloss_value[0][0] if hasattr(logloss_value, '__getitem__') else logloss_value
        except:
            pass
            
        # Calculate Brier Score for calibration set
        try:
            if len(set(target_values)) == 2:  # Binary classification
                # Get predicted probabilities
                calib_pred = aml.leader.predict(calib_h2o)
                calib_probs = calib_pred['p1'].as_data_frame().values.flatten()  # Probability of positive class
                calib_actual = calib_h2o[target_variable].as_data_frame().values.flatten()
                
                # Convert to numeric if categorical
                if calib_actual.dtype == 'object':
                    calib_actual = (calib_actual == calib_actual[0]).astype(int)
                
                # Calculate Brier Score
                brier_score = np.mean((calib_probs - calib_actual) ** 2)
                calib_metrics['brier_score'] = brier_score
        except Exception as e:
            print(f"Could not calculate Brier score for calibration set: {{e}}")
            
        # Handle regression metrics
        try:
            if hasattr(calib_perf, 'rmse'):
                rmse_value = calib_perf.rmse()
                calib_metrics['rmse'] = rmse_value[0][0] if hasattr(rmse_value, '__getitem__') else rmse_value
        except:
            pass
            
        try:
            if hasattr(calib_perf, 'mae'):
                mae_value = calib_perf.mae()
                calib_metrics['mae'] = mae_value[0][0] if hasattr(mae_value, '__getitem__') else mae_value
        except:
            pass

        # Save model if directory provided
        model_path = None
        if model_directory or log_path:
            save_path = model_directory if model_directory else log_path
            model_path = h2o.save_model(model=aml.leader, path=save_path, force=True)

        # Get leaderboard
        leaderboard_df = aml.leaderboard.as_data_frame()
        leaderboard_dict = leaderboard_df.to_dict()

        # Prepare results
        results = {
            'leaderboard': leaderboard_dict,
            'best_model_id': aml.leader.model_id,
            'model_path': model_path,
            'test_metrics': test_metrics,
            'calibration_metrics': calib_metrics,
            'optimization_results': None,
            'model_results': {
                'model_flavor': 'H2O AutoML Enhanced',
                'model_path': model_path,
                'best_model_id': aml.leader.model_id,
                'test_performance': test_metrics,
                'calibration_performance': calib_metrics
            }
        }

        return results

```

In [11]:
ml_agent.get_leaderboard()

Unnamed: 0,model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
0,StackedEnsemble_AllModels_1_AutoML_15_20250618...,0.98705,0.140601,0.988422,0.044995,0.192473,0.037046
1,StackedEnsemble_BestOfFamily_1_AutoML_15_20250...,0.985761,0.149925,0.987831,0.050001,0.204413,0.041785
2,DeepLearning_grid_3_AutoML_15_20250618_211113_...,0.981105,0.239573,0.984165,0.061762,0.227422,0.051721
3,DeepLearning_grid_1_AutoML_15_20250618_211113_...,0.978416,0.25645,0.979667,0.070056,0.240887,0.058027
4,GBM_grid_1_AutoML_15_20250618_211113_model_1,0.976472,0.19232,0.976863,0.068384,0.235069,0.055257
5,DeepLearning_grid_2_AutoML_15_20250618_211113_...,0.974344,0.25873,0.97462,0.066701,0.24416,0.059614
6,DeepLearning_grid_2_AutoML_15_20250618_211113_...,0.973389,0.261822,0.974758,0.071717,0.253221,0.064121
7,GBM_2_AutoML_15_20250618_211113,0.972811,0.202992,0.972011,0.066723,0.238417,0.056843
8,GBM_4_AutoML_15_20250618_211113,0.972772,0.20411,0.969011,0.070067,0.239119,0.057178
9,GBM_grid_1_AutoML_15_20250618_211113_model_3,0.972705,0.209922,0.97355,0.076779,0.24944,0.06222


In [12]:
ml_agent.get_optimization_results()

In [13]:
import h2o

model = h2o.get_model(ml_agent.get_best_model_id())

model

key,value
Stacking strategy,cross_validation
Number of base models (used / total),6/20
# GBM base models (used / total),2/10
# DeepLearning base models (used / total),4/7
# DRF base models (used / total),0/2
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,0,1,Error,Rate
0,299.0,0.0,0.0,(0.0/299.0)
1,0.0,301.0,0.0,(0.0/301.0)
Total,299.0,301.0,0.0,(0.0/600.0)

metric,threshold,value,idx
max f1,0.6225677,1.0,199.0
max f2,0.6225677,1.0,199.0
max f0point5,0.6225677,1.0,199.0
max accuracy,0.6225677,1.0,199.0
max precision,0.9999879,1.0,0.0
max recall,0.6225677,1.0,199.0
max specificity,0.9999879,1.0,0.0
max absolute_mcc,0.6225677,1.0,199.0
max min_per_class_accuracy,0.6225677,1.0,199.0
max mean_per_class_accuracy,0.6225677,1.0,199.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.999981,1.9933555,1.9933555,1.0,0.9999847,1.0,0.9999847,0.0199336,0.0199336,99.3355482,99.3355482,0.0199336
2,0.02,0.9999755,1.9933555,1.9933555,1.0,0.999979,1.0,0.9999819,0.0199336,0.0398671,99.3355482,99.3355482,0.0398671
3,0.03,0.9999664,1.9933555,1.9933555,1.0,0.9999725,1.0,0.9999787,0.0199336,0.0598007,99.3355482,99.3355482,0.0598007
4,0.04,0.9999603,1.9933555,1.9933555,1.0,0.9999636,1.0,0.999975,0.0199336,0.0797342,99.3355482,99.3355482,0.0797342
5,0.05,0.9999479,1.9933555,1.9933555,1.0,0.9999556,1.0,0.9999711,0.0199336,0.0996678,99.3355482,99.3355482,0.0996678
6,0.1,0.9998672,1.9933555,1.9933555,1.0,0.999913,1.0,0.9999421,0.0996678,0.1993355,99.3355482,99.3355482,0.1993355
7,0.15,0.9997815,1.9933555,1.9933555,1.0,0.9998236,1.0,0.9999026,0.0996678,0.2990033,99.3355482,99.3355482,0.2990033
8,0.2,0.9995294,1.9933555,1.9933555,1.0,0.999662,1.0,0.9998424,0.0996678,0.3986711,99.3355482,99.3355482,0.3986711
9,0.3,0.9985948,1.9933555,1.9933555,1.0,0.9991903,1.0,0.9996251,0.1993355,0.5980066,99.3355482,99.3355482,0.5980066
10,0.4,0.9960654,1.9933555,1.9933555,1.0,0.9974445,1.0,0.9990799,0.1993355,0.7973422,99.3355482,99.3355482,0.7973422

Unnamed: 0,0,1,Error,Rate
0,286.0,13.0,0.0435,(13.0/299.0)
1,14.0,287.0,0.0465,(14.0/301.0)
Total,300.0,300.0,0.045,(27.0/600.0)

metric,threshold,value,idx
max f1,0.506915,0.9550749,204.0
max f2,0.211083,0.9681611,239.0
max f0point5,0.6915716,0.9661836,191.0
max accuracy,0.6538221,0.955,194.0
max precision,0.9999802,1.0,0.0
max recall,0.0008596,1.0,378.0
max specificity,0.9999802,1.0,0.0
max absolute_mcc,0.6538221,0.9106222,194.0
max min_per_class_accuracy,0.506915,0.9534884,204.0
max mean_per_class_accuracy,0.6538221,0.9550606,194.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.9999671,1.9933555,1.9933555,1.0,0.9999784,1.0,0.9999784,0.0199336,0.0199336,99.3355482,99.3355482,0.0199336
2,0.02,0.9999538,1.9933555,1.9933555,1.0,0.999962,1.0,0.9999702,0.0199336,0.0398671,99.3355482,99.3355482,0.0398671
3,0.03,0.9999403,1.9933555,1.9933555,1.0,0.9999458,1.0,0.999962,0.0199336,0.0598007,99.3355482,99.3355482,0.0598007
4,0.04,0.9999156,1.9933555,1.9933555,1.0,0.9999337,1.0,0.9999549,0.0199336,0.0797342,99.3355482,99.3355482,0.0797342
5,0.05,0.9999025,1.9933555,1.9933555,1.0,0.9999085,1.0,0.9999456,0.0199336,0.0996678,99.3355482,99.3355482,0.0996678
6,0.1,0.9994029,1.9933555,1.9933555,1.0,0.9997548,1.0,0.9998502,0.0996678,0.1993355,99.3355482,99.3355482,0.1993355
7,0.15,0.9988232,1.9933555,1.9933555,1.0,0.9991313,1.0,0.9996106,0.0996678,0.2990033,99.3355482,99.3355482,0.2990033
8,0.2,0.9971398,1.9933555,1.9933555,1.0,0.9981808,1.0,0.9992531,0.0996678,0.3986711,99.3355482,99.3355482,0.3986711
9,0.3,0.9842873,1.9601329,1.9822813,0.9833333,0.9923271,0.9944444,0.9969444,0.1960133,0.5946844,96.013289,98.2281285,0.5913399
10,0.4,0.9260341,1.9601329,1.9767442,0.9833333,0.9624659,0.9916667,0.9883248,0.1960133,0.7906977,96.013289,97.6744186,0.7840087

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.9600298,0.0088387,0.95,0.968,0.9655172,0.9508197,0.9658120
aic,47.587803,7.066701,52.61064,42.08517,44.799408,57.319103,41.12469
auc,0.9881887,0.0078822,0.9855395,0.9961597,0.9913381,0.9758242,0.9920821
err,0.0399702,0.0088387,0.05,0.032,0.0344828,0.0491803,0.0341880
err_count,4.8,1.0954452,6.0,4.0,4.0,6.0,4.0
f0point5,0.9529287,0.0193423,0.9355828,0.9516617,0.9677419,0.9328358,0.9768212
f1,0.9601414,0.0114020,0.953125,0.9692308,0.9677419,0.9433962,0.9672131
f2,0.9677062,0.0130784,0.9713376,0.9874608,0.9677419,0.9541985,0.9577922
lift_top_group,2.0047657,0.1959079,1.9354838,1.9841269,1.8709677,2.3461537,1.8870968
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
