# HIRM Performance Evaluation

This notebook runs the `baseline_benchmark` experiment to evaluate the HIRM model's performance (profit and loss, PnL) and compare it against the baseline methods (ERM, IRM, GroupDRO, V-REx, Risk Parity).

In [None]:
# Setup Environment
import os

# Ensure we start from the base directory in Colab
if os.path.exists("/content"):
    %cd /content

REPO_NAME = "hirm-research"
REPO_URL = "https://github.com/raei-2748/hirm-research.git"

if os.path.exists(REPO_NAME):
    print(f"Updating existing repository in {REPO_NAME}...")
    %cd {REPO_NAME}
    !git pull
else:
    print(f"Cloning repository to {REPO_NAME}...")
    !git clone {REPO_URL} {REPO_NAME}
    %cd {REPO_NAME}

print(f"Current working directory: {os.getcwd()}")
%pip install -q -r requirements-colab.txt
%pip install -q -e .

In [None]:
# Run Baseline Benchmark
# This runs multiple methods on synthetic and real datasets.
# It may take 30-60 minutes depending on the hardware.
!python scripts/run_grid.py --config configs/experiments/baseline_benchmark.yaml --device cuda

In [None]:
# Summarize Results
!python scripts/summarize_baseline_results.py --config configs/experiments/baseline_benchmark.yaml

In [None]:
# Display Results
#
# Renders every per-dataset summary CSV found under
# results/baseline_benchmark/summary/ as a rich table.
import glob
import os

import pandas as pd

# glob returns matches in arbitrary order; sort so the tables are shown in the
# same order on every run.
summary_files = sorted(glob.glob("results/baseline_benchmark/summary/*_summary.csv"))

if not summary_files:
    # Make a missing or failed summarize step visible instead of silently
    # rendering nothing.
    print(
        "No summary files found under results/baseline_benchmark/summary/. "
        "Run the benchmark and summarize cells first."
    )

for summary_path in summary_files:
    print(f"\n=== {os.path.basename(summary_path)} ===")
    display(pd.read_csv(summary_path))