In [None]:
# Auto-reload modules when they change (no need to restart kernel)
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

import sys
import os

from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from IPython.display import Image, display

# Root directory of the final project. The `from src...` imports that follow
# rely on this directory being importable.
# To run from a different checkout, set the FINAL_PROJECT_DIR environment
# variable before starting the kernel; when it is unset (or empty) the original
# location is used. Note that load_dotenv() runs in a later cell, so a value
# that only exists in a .env file is not visible at this point.
FINAL_PROJECT_DIR = (
    os.environ.get("FINAL_PROJECT_DIR") or "/Users/sam/Desktop/cee322/final"
)

# Prepend the project root to sys.path once; the membership check keeps
# re-running this cell from adding duplicate entries.
if FINAL_PROJECT_DIR not in sys.path:
    sys.path.insert(0, FINAL_PROJECT_DIR)

from src.state import DataEngState
from src.agent import create_agent

In [None]:
# Pull settings from a local .env file (if present) into the environment
# before any of them are read below.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
# The LangSmith values are read here but not used by any cell visible in this
# notebook. NOTE(review): presumably kept for inspection or because tracing is
# configured from the environment -- confirm before removing.
langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
langsmith_tracing = os.getenv("LANGSMITH_TRACING")

# Fail fast with a clear message instead of letting the model call fail later.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY environment variable is not set; please configure it to run this example."
    )

chat_model = init_chat_model(model="gpt-4o-mini", api_key=api_key)

# Input CSV for the agent. Built from FINAL_PROJECT_DIR (defined in the first
# cell) so the project root is spelled out in one place only; with the default
# root this is the same path as before.
DATA_FILEPATH = os.path.join(FINAL_PROJECT_DIR, "data", "hard.csv")

In [None]:
# Create the agent.
# `create_agent` is imported from src.agent and receives the chat model
# configured in the previous cell. The returned object is used by the cells
# below through `.get_graph()` (visualisation) and `.invoke()` (execution).
clean_impute_agent = create_agent(chat_model)

In [None]:
# Draw the agent's graph (Mermaid rendered to PNG bytes) and show the image
# inline in the notebook.
agent_graph = clean_impute_agent.get_graph()
agent_graph_png = agent_graph.draw_mermaid_png()
display(Image(agent_graph_png))

In [None]:
# Run the agent on the configured CSV, print a report of what it produced, and
# write the resulting DataFrame next to the input data as "<name>_cleaned.csv".
initial_clean_impute_state: DataEngState = {
    "filepath": DATA_FILEPATH,
}

# How many imputation records to list before collapsing the rest into a count.
MAX_IMPUTATION_RECORDS_SHOWN = 5

final_clean_impute_state = clean_impute_agent.invoke(initial_clean_impute_state)
print("\n=== 4-NODE AGENT COMPLETE (clean -> reflect_clean -> impute -> reflect_impute) ===")
print("Errors:", final_clean_impute_state.get("errors"))
if final_clean_impute_state.get("data") is not None:
    data = final_clean_impute_state["data"]

    # Fetch each report field once. The `or` fallback also covers a key that is
    # present but set to None, which `.get(key, default)` would pass through
    # and which would then break len() / iteration below.
    cleaning_records = final_clean_impute_state.get("cleaning_records") or []
    imputation_records = final_clean_impute_state.get("imputation_records") or []
    missing_info = final_clean_impute_state.get("missing_info") or {}
    summary_stats = final_clean_impute_state.get("summary_stats") or {}
    cleaning_assumptions = final_clean_impute_state.get("cleaning_assumptions") or {}
    imputation_assumptions = final_clean_impute_state.get("imputation_assumptions") or {}

    print(f"Final data shape: {len(data)} rows × {len(data.columns)} columns")
    print(f"Missing values remaining: {data.isna().sum().sum()}")

    print(f"\nCleaning records ({len(cleaning_records)}):")
    for record in cleaning_records:
        print(f"  - {record}")

    # Only the first few imputation records are listed; the rest are counted.
    print(f"\nImputation records ({len(imputation_records)}):")
    for record in imputation_records[:MAX_IMPUTATION_RECORDS_SHOWN]:
        print(f"  - {record}")
    if len(imputation_records) > MAX_IMPUTATION_RECORDS_SHOWN:
        print(f"  ... and {len(imputation_records) - MAX_IMPUTATION_RECORDS_SHOWN} more")

    print(f"\nMissing info: {len(missing_info)} columns with missing values")
    print(f"Summary stats: {len(summary_stats)} numeric columns analyzed")

    # Both assumption mappings are printed in the same "column: rules" layout.
    for label, assumptions in (
        ("Cleaning", cleaning_assumptions),
        ("Imputation", imputation_assumptions),
    ):
        print(f"\n{label} assumptions ({len(assumptions)}):")
        for col, rules in assumptions.items():
            print(f"  - {col}: {rules}")

    # Save cleaned DataFrame to data folder
    filepath = final_clean_impute_state.get("filepath", "")
    if filepath:
        base_name = os.path.basename(filepath)
        name_without_ext = os.path.splitext(base_name)[0]
        # Same folder as before, derived from the project root defined in the
        # first cell instead of repeating the absolute path.
        data_dir = os.path.join(FINAL_PROJECT_DIR, "data")
        output_filepath = os.path.join(data_dir, f"{name_without_ext}_cleaned.csv")

        # Saving is best-effort: a failure is reported but does not abort the
        # cell, so the report above stays visible.
        try:
            os.makedirs(data_dir, exist_ok=True)
            data.to_csv(output_filepath, index=False)
            print(f"\n✓ Cleaned data saved to: {output_filepath}")
            print(f"  Saved {len(data)} rows × {len(data.columns)} columns")
        except Exception as e:
            print(f"\n⚠ Error saving cleaned data: {e}")

    print("\nNote: cleaning_assumptions and imputation_assumptions can be cached and reused in future runs for reproducibility")
