In [1]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:

# ============================================================
# 0. PATH HANDLING
# ============================================================
# For each root, use the absolute directory when it already exists; otherwise
# fall back to the sibling directory relative to the working directory.
DATA_BASE = "/data" if os.path.isdir("/data") else "../data"

# Processed tables are read from a sub-directory of the data root.
PROCESSED_DIR = os.path.join(DATA_BASE, "processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

MODELS_DIR = "/models" if os.path.isdir("/models") else "../models"
os.makedirs(MODELS_DIR, exist_ok=True)

OUTPUT_DIR = "/output" if os.path.isdir("/output") else "../output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def pick_path(sample, full):
    """Return the sample file's path when it exists, else the full file's path.

    Both file names are resolved inside ``PROCESSED_DIR``. Only the sample
    path is checked for existence; the full path is returned unchecked.
    """
    preferred = os.path.join(PROCESSED_DIR, sample)
    fallback = os.path.join(PROCESSED_DIR, full)
    if os.path.exists(preferred):
        return preferred
    return fallback

# Each input table is resolved on its own: the "*_sample" file is used when it
# is present, otherwise the full-size file name is used.
X_train_path, X_test_path, y_train_path, y_test_path = (
    pick_path(sample_name, full_name)
    for sample_name, full_name in (
        ("X_train_sample.parquet", "X_train.parquet"),
        ("X_test_sample.parquet", "X_test.parquet"),
        ("y_train_sample.parquet", "y_train.parquet"),
        ("y_test_sample.parquet", "y_test.parquet"),
    )
)

# Echo the resolved locations so the run log shows which files were used.
print("Using paths:")
for label, chosen in (
    ("X_train:", X_train_path),
    ("X_test :", X_test_path),
    ("y_train:", y_train_path),
    ("y_test :", y_test_path),
):
    print(label, chosen)

# The serialized model goes to the models directory ...
rf_model_out = os.path.join(MODELS_DIR, "random_forest_model.pkl")

# ... and every report/figure artifact goes to the output directory.
(
    rf_feature_importance_out,
    rf_evaluation_out,
    rf_actual_vs_pred,
    rf_residual_hist,
    rf_residual_vs_pred,
) = (
    os.path.join(OUTPUT_DIR, artifact)
    for artifact in (
        "random_forest_feature_importance.csv",
        "random_forest_evaluation.txt",
        "rf_actual_vs_pred.png",
        "rf_residual_hist.png",
        "rf_residual_vs_pred.png",
    )
)

# ============================================================
# 1. LOAD DATA
# ============================================================
# Feature tables are read as-is from parquet.
X_train, X_test = (
    pd.read_parquet(table_path) for table_path in (X_train_path, X_test_path)
)

# Target tables are read the same way, keeping only their first column so the
# targets are Series rather than DataFrames.
y_train, y_test = (
    pd.read_parquet(table_path).iloc[:, 0]
    for table_path in (y_train_path, y_test_path)
)

# Column order of the training features, reused later to label importances.
feature_names = X_train.columns.tolist()

print("\nLoaded shapes:")
for label, table in (
    ("X_train:", X_train),
    ("X_test :", X_test),
    ("y_train:", y_train),
    ("y_test :", y_test),
):
    print(label, table.shape)

# ============================================================
# 2. MODEL INITIALIZATION & TRAINING
# ============================================================
# Forest configuration. The depth and split/leaf minimums bound how far each
# tree can grow; the fixed random_state keeps repeated runs comparable.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
    n_jobs=-1
)

# Time the fit with perf_counter(): time.time() follows the wall clock, which
# can be adjusted while a multi-minute fit is running, whereas perf_counter()
# is intended for measuring elapsed intervals.
fit_started = time.perf_counter()
rf.fit(X_train, y_train)
train_time = time.perf_counter() - fit_started

print(f"\nRandom Forest training completed in {train_time:.2f} seconds.")

# ============================================================
# 3. PREDICTIONS
# ============================================================
# Predict on both splits with the fitted forest (train first, then test) so
# the two sets of metrics can be compared further down.
y_pred_train, y_pred_test = (
    rf.predict(features) for features in (X_train, X_test)
)

# ============================================================
# 4. METRICS
# ============================================================
def rmse(a, b):
    """Return the root mean squared error between ``a`` and ``b``.

    Computed as the square root of ``mean_squared_error(a, b)``.
    """
    squared_error = mean_squared_error(a, b)
    return np.sqrt(squared_error)
def mae(a, b):
    """Return the mean absolute error between ``a`` and ``b``.

    Thin wrapper that forwards both arguments to ``mean_absolute_error``.
    """
    absolute_error = mean_absolute_error(a, b)
    return absolute_error
def mape(a, b):
    """Return the mean absolute percentage error of ``b`` relative to ``a``.

    Parameters
    ----------
    a : array-like
        Reference values; they form the denominator of each relative error.
    b : array-like
        Values compared against ``a``, position by position.

    Returns
    -------
    float
        Mean of ``|a - b| / |a|`` over the positions where ``a`` is non-zero,
        multiplied by 100. ``nan`` when ``a`` has no non-zero entry (which
        includes empty input).
    """
    actual = np.asarray(a)
    predicted = np.asarray(b)

    # Positions with a zero reference value would divide by zero; skip them.
    nonzero = actual != 0
    if not nonzero.any():
        # Nothing left to average. Return NaN explicitly instead of letting
        # np.mean() emit a "Mean of empty slice" RuntimeWarning.
        return float("nan")

    relative_error = np.abs(
        (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    )
    return np.mean(relative_error) * 100

def rmsle(a, b):
    """Return the root mean squared logarithmic error between ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        Values compared position by position. Negative entries are clipped
        to 0 before ``log1p`` is applied.

    Returns
    -------
    float
        ``sqrt(mean((log1p(a) - log1p(b)) ** 2))`` on the clipped values.
    """
    # Convert to plain arrays first (as mape does). Without this, two pandas
    # Series with different indexes would be subtracted by index label rather
    # than by position, silently producing a wrong score.
    actual = np.clip(np.asarray(a), 0, None)
    predicted = np.clip(np.asarray(b), 0, None)

    log_gap = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(log_gap ** 2))

def evaluate(name, y_true, y_pred):
    """Compute the regression metrics for one set of targets and predictions.

    ``name`` is not used in the body. The result is a dict with the keys
    "RMSE", "MAE", "MAPE", "R2" and "RMSLE", in that order.
    """
    scores = {}
    scores["RMSE"] = rmse(y_true, y_pred)
    scores["MAE"] = mae(y_true, y_pred)
    scores["MAPE"] = mape(y_true, y_pred)
    scores["R2"] = r2_score(y_true, y_pred)
    scores["RMSLE"] = rmsle(y_true, y_pred)
    return scores

# Score each split against its own predictions; train is evaluated first.
metrics = {}
metrics["train"] = evaluate("train", y_train, y_pred_train)
metrics["test"] = evaluate("test", y_test, y_pred_test)

print("\n=== RANDOM FOREST METRICS ===")
for split in ("train", "test"):
    m = metrics[split]
    # One print per split; sep="\n" puts each metric on its own line.
    print(
        f"\n{split.upper()} METRICS:",
        f"  RMSE : {m['RMSE']:.4f}",
        f"  MAE  : {m['MAE']:.4f}",
        f"  MAPE : {m['MAPE']:.2f}%",
        f"  R²   : {m['R2']:.4f}",
        f"  RMSLE: {m['RMSLE']:.4f}",
        sep="\n",
    )

# ============================================================
# 5. FEATURE IMPORTANCE
# ============================================================
# Importances reported by the fitted forest, paired with the training column
# names and ranked from most to least important.
importances = rf.feature_importances_

fi_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

fi_df.to_csv(rf_feature_importance_out, index=False)
print("\nFeature importance saved to:", rf_feature_importance_out)

top20 = fi_df.head(20)

# barh() draws its first row at the bottom, so reverse the ranking to put the
# most important feature at the top of the chart.
top20_bottom_up = top20.iloc[::-1]

# Explicit figure/axes objects avoid relying on pyplot's implicit "current
# figure", and axis labels let the saved image be read on its own.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top20_bottom_up["feature"], top20_bottom_up["importance"])
ax.set_title("Top 20 Random Forest Feature Importances")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "rf_top20_features.png"))
plt.close(fig)  # release the figure; nothing is displayed inline

# ============================================================
# 6. RESIDUAL ANALYSIS
# ============================================================
# Residual = actual minus predicted on the test split, so a positive residual
# means the model under-predicted.
residuals = y_test - y_pred_test

# Distribution of the residuals. Axis labels are set so the saved image can be
# read without this notebook next to it.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(residuals, bins=60)
ax.set_title("Residual Histogram (RF)")
ax.set_xlabel("Residual (actual - predicted)")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(rf_residual_hist)
plt.close(fig)

# Residuals against the predicted value, with a dashed reference line at zero.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_pred_test, residuals, s=5)
ax.axhline(0, color='black', linestyle='--')
ax.set_title("Residuals vs Predicted (RF)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Residual (actual - predicted)")
fig.tight_layout()
fig.savefig(rf_residual_vs_pred)
plt.close(fig)

print("Residual plots saved.")

# ============================================================
# 7. OVERFITTING CHECK
# ============================================================
# Print train and test RMSE / R² side by side at full precision so the gap
# between the two splits is easy to read off.
print("\n=== OVERFITTING CHECK ===")
for label, split_name, metric_key in (
    ("Train RMSE:", "train", "RMSE"),
    ("Test RMSE :", "test", "RMSE"),
    ("Train R²  :", "train", "R2"),
    ("Test R²   :", "test", "R2"),
):
    print(label, metrics[split_name][metric_key])

# ============================================================
# 8. SAVE MODEL
# ============================================================
# Serialize the fitted forest to the models directory with joblib.
joblib.dump(rf, rf_model_out)
print(f"\nModel saved to: {rf_model_out}")

# ============================================================
# 9. PREDICTION SCATTER
# ============================================================
# Actual vs predicted on the test split. Explicit figure/axes objects avoid
# pyplot's implicit state, and axis labels make the saved image self-contained.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_test, y_pred_test, s=5)

# Dashed y = x reference line spanning the combined range of actual and
# predicted values; points on it are perfect predictions.
min_v = min(y_test.min(), y_pred_test.min())
max_v = max(y_test.max(), y_pred_test.max())
ax.plot([min_v, max_v], [min_v, max_v], "--")

ax.set_title("Actual vs Predicted (Random Forest)")
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
fig.tight_layout()
fig.savefig(rf_actual_vs_pred)
plt.close(fig)

print("Actual vs Predicted plot saved:", rf_actual_vs_pred)

# ============================================================
# 10. PERFORMANCE REPORT SAVE
# ============================================================
# The report title contains a non-ASCII dash. Without an explicit encoding,
# open() uses the platform's locale encoding, so the bytes written would
# differ between machines; pin UTF-8 so the file is the same everywhere.
with open(rf_evaluation_out, "w", encoding="utf-8") as f:
    f.write("Random Forest Regression – Evaluation Report\n")
    f.write("===========================================\n\n")
    f.write(f"Training time: {train_time:.2f} seconds\n\n")

    # Same metrics, in the same order, as the console summary printed earlier.
    for split in ["train", "test"]:
        m = metrics[split]
        f.write(f"{split.upper()} METRICS:\n")
        f.write(f"  RMSE : {m['RMSE']:.4f}\n")
        f.write(f"  MAE  : {m['MAE']:.4f}\n")
        f.write(f"  MAPE : {m['MAPE']:.2f}%\n")
        f.write(f"  R2   : {m['R2']:.4f}\n")
        f.write(f"  RMSLE: {m['RMSLE']:.4f}\n\n")

    # top20 is already ranked by descending importance; write it in that order.
    f.write("Top 20 Feature Importances:\n")
    for feature, importance in zip(top20["feature"], top20["importance"]):
        f.write(f"  {feature}: {importance:.6f}\n")

print("\nEvaluation written to:", rf_evaluation_out)
print("\n Random Forest Model Training Completed Successfully!")


Using paths:
X_train: ../data/processed/X_train.parquet
X_test : ../data/processed/X_test.parquet
y_train: ../data/processed/y_train.parquet
y_test : ../data/processed/y_test.parquet

Loaded shapes:
X_train: (1568133, 57)
X_test : (392034, 57)
y_train: (1568133,)
y_test : (392034,)

Random Forest training completed in 301.99 seconds.

=== RANDOM FOREST METRICS ===

TRAIN METRICS:
  RMSE : 1.6569
  MAE  : 0.4032
  MAPE : 12.44%
  R²   : 0.9912
  RMSLE: 0.0460

TEST METRICS:
  RMSE : 2.1165
  MAE  : 0.5590
  MAPE : 11.53%
  R²   : 0.9825
  RMSLE: 0.0579

Feature importance saved to: ../output/random_forest_feature_importance.csv
Residual plots saved.

=== OVERFITTING CHECK ===
Train RMSE: 1.656947692054232
Test RMSE : 2.116527021118823
Train R²  : 0.9912384019358911
Test R²   : 0.9824618057553178

Model saved to: ../models/random_forest_model.pkl
Actual vs Predicted plot saved: ../output/rf_actual_vs_pred.png

Evaluation written to: ../output/random_forest_evaluation.txt

 Random Forest 