In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from src.cold_start_engine import ColdStartEngine

# The 'Golden Model' Factory
def forest_factory():
    """Build a fresh, unfitted random-forest regressor.

    Every call returns a new estimator configured with 100 trees and a
    fixed seed (42), so each instance is configured identically.

    Returns:
        RandomForestRegressor: a new estimator instance.
    """
    hyperparams = {"n_estimators": 100, "random_state": 42}
    return RandomForestRegressor(**hyperparams)

# The 'Baseline Challenger' Factory
def linear_factory():
    """Build a fresh, unfitted linear regressor with default settings.

    Returns:
        LinearRegression: a new estimator instance.
    """
    challenger = LinearRegression()
    return challenger

ModuleNotFoundError: No module named 'src'

In [None]:
# Strategic Milestones
# Passed to run_scarcity_audit below; presumably the training-set sizes at
# which each model is evaluated -- confirm against ColdStartEngine.
milestones = [50, 100, 250, 500, 1000, 2500, 5000, 10000]

# Initialize Engine
# One engine per model family; each receives a zero-argument factory.
engine_forest = ColdStartEngine(model_factory=forest_factory)
engine_linear = ColdStartEngine(model_factory=linear_factory)

# Run Audits (Assume X_train, y_train, X_test, y_test are pre-loaded)
# NOTE(review): these four names are not defined anywhere in the visible cells;
# on a fresh kernel this cell raises NameError unless a loader cell runs first.
# The frames produced here are read later via their 'n_samples' and 'mae' columns.
print("🌲 Auditing Golden Forest...")
df_forest = engine_forest.run_scarcity_audit(X_train, y_train, X_test, y_test, milestones)

print("\n📈 Auditing Linear Challenger...")
df_linear = engine_linear.run_scarcity_audit(X_train, y_train, X_test, y_test, milestones)

In [None]:
import matplotlib.pyplot as plt

# Learning curves: test MAE versus training-set size for both models,
# drawn on a single explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(
    df_forest['n_samples'],
    df_forest['mae'],
    label='Golden Forest (High Variance)',
    marker='o',
)
ax.plot(
    df_linear['n_samples'],
    df_linear['mae'],
    label='Linear Model (High Bias)',
    marker='s',
)

# Logarithmic x-axis keeps the small-sample region readable.
ax.set_xscale('log')

ax.set_title("The Complexity Tax: Random Forest vs. Linear Regression")
ax.set_xlabel("Number of Training Samples (n)")
ax.set_ylabel("Mean Absolute Error ($)")

ax.legend()
ax.grid(True)
plt.show()