In [1]:
import sys
import os
import pandas as pd

# Add the project root (the parent of the current working directory) to the
# module search path so that the `src` package imported below can be found.
# The membership check keeps the cell idempotent: re-running it does not
# append the same directory to sys.path again.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.saturation_utils import SaturationAnalyst

# Regenerate the Scarcity Simulation dataframes needed for this analysis.
# Ideally they would be loaded from disk, but the previous step only
# printed/plotted its results and never saved them, so we quickly rebuild
# them here using the same logic.

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from src.cold_start_engine import ColdStartEngine
from sklearn.model_selection import train_test_split

# Load Data
# Look for the CSV relative to the current working directory: first one level
# up (../data/raw), then directly underneath it (data/raw).
data_path = os.path.join('..', 'data', 'raw', 'housing.csv')
if not os.path.exists(data_path):
    data_path = os.path.join('data', 'raw', 'housing.csv') # Fallback

if not os.path.exists(data_path):
    # Fail fast. Merely printing a message and carrying on would leave
    # df_forest / df_linear undefined, and the next cell would then die with
    # a NameError that hides the real cause.
    raise FileNotFoundError(
        "Error: Could not load data to regenerate results. "
        f"No housing.csv found under '../data/raw' or 'data/raw' "
        f"(working directory: {os.getcwd()})."
    )

# Drop rows with missing values, then one-hot encode the non-numeric columns
# (drop_first=True omits the first level of each encoded column).
df = pd.read_csv(data_path)
df = df.dropna()
df = pd.get_dummies(df, drop_first=True)

# Features / target split, followed by an 80/20 train/test split with a fixed seed.
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Factories
# ColdStartEngine is handed a zero-argument callable rather than a model
# instance -- presumably so it can build a fresh model per milestone
# (NOTE(review): confirm against src.cold_start_engine).
def forest_factory():
    """Return a new, unfitted random forest (100 trees, fixed seed)."""
    return RandomForestRegressor(n_estimators=100, random_state=42)


def linear_factory():
    """Return a new, unfitted ordinary least-squares regressor."""
    return LinearRegression()


# Milestones: the sample counts passed to each scarcity audit.
milestones = [50, 100, 250, 500, 1000, 2500, 5000, 10000]

# Engines
engine_forest = ColdStartEngine(model_factory=forest_factory)
engine_linear = ColdStartEngine(model_factory=linear_factory)

print("Regenerating Scarcity Data for Analysis...")
# The results are consumed by later cells, which read the 'n_samples' and
# 'mae' entries of df_forest.
df_forest = engine_forest.run_scarcity_audit(X_train, y_train, X_test, y_test, milestones)
df_linear = engine_linear.run_scarcity_audit(X_train, y_train, X_test, y_test, milestones)


Regenerating Scarcity Data for Analysis...
Starting Scarcity Audit across 8 milestones...


   [n=   50] MAE: $61,719.95


   [n=  100] MAE: $54,045.92


   [n=  250] MAE: $49,723.33


   [n=  500] MAE: $45,381.36


   [n= 1000] MAE: $43,776.96


   [n= 2500] MAE: $40,104.44


   [n= 5000] MAE: $36,618.78


   [n=10000] MAE: $33,969.36
Starting Scarcity Audit across 8 milestones...
   [n=   50] MAE: $58,956.25
   [n=  100] MAE: $52,578.34
   [n=  250] MAE: $52,028.58
   [n=  500] MAE: $51,838.70
   [n= 1000] MAE: $51,181.69
   [n= 2500] MAE: $50,998.69
   [n= 5000] MAE: $50,818.50
   [n=10000] MAE: $50,581.14


In [2]:
# Analyst object used for the crossover calculation here and the
# learning-curve fit in the following cell.
analyst = SaturationAnalyst()

# Finding the exact 'n' where the Forest takes the lead
crossover_n = analyst.calculate_crossover(df_linear, df_forest)

# The leading emoji had been stored as mojibake (UTF-8 bytes decoded as
# cp1252); it is restored to the intended character here.
print(f"🎯 The Research Verdict: The Crossover Point is at n = {crossover_n}")
print(f"Interpretation: Before {crossover_n} samples, the 'Golden Model' is actually a liability.")

🎯 The Research Verdict: The Crossover Point is at n = 250.0
Interpretation: Before 250.0 samples, the 'Golden Model' is actually a liability.


In [3]:
# Extract n and MAE values for the Forest
n_vals = df_forest['n_samples'].values
mae_vals = df_forest['mae'].values

# Fit the Power Law: E(n) = a * n^-alpha + b
params = analyst.fit_learning_trajectory(n_vals, mae_vals)
# The fit result unpacks into exactly three values: scale a, exponent alpha
# and offset b of the formula above.
a, alpha, b = params

# The header is a plain string (it has no placeholders, so no f-prefix), and
# its emoji is restored from mojibake (UTF-8 bytes decoded as cp1252).
print("📊 Model DNA Analysis:")
print(f"   - Learning Rate (alpha): {alpha:.4f}")
print(f"   - Irreducible Error (b): ${b:,.2f}")

📊 Model DNA Analysis:
   - Learning Rate (alpha): 0.2010
   - Irreducible Error (b): $21,036.48
