# Random Forest

In [16]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Feature matrices are stored as SciPy sparse .npz files; read the three
# splits in order (train, valid, test).
X_train, X_valid, X_test = (
    sparse.load_npz(npz_path)
    for npz_path in (
        "data/processed/X_train.npz",
        "data/processed/X_valid.npz",
        "data/processed/X_test.npz",
    )
)

# Targets are read from CSV and each frame's values are flattened to 1-D.
# NOTE(review): assumes each CSV holds a single target column — if an index
# column was saved alongside it, ravel() would interleave it with the prices.
y_train, y_valid = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in (
        "data/processed/y_train.csv",
        "data/processed/y_valid.csv",
    )
)

# The Kaggle metric is an RMSE on log prices, so the model is trained on
# log1p(target).
y_train_log, y_valid_log = np.log1p(y_train), np.log1p(y_valid)

# Random forest fitted on the log-transformed target: 800 trees,
# max_features="sqrt", fixed seed for reproducibility, n_jobs=-1 for
# parallel training.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestRegressor(
    n_estimators=800,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train_log)

# Validation predictions come out in log1p(price) space, matching the
# target the forest was trained on.
pred_valid_log = rf.predict(X_valid)

# Log RMSE (Kaggle metric)
rmse_log = np.sqrt(mean_squared_error(y_valid_log, pred_valid_log))

# Convert predictions back to dollars.
pred_valid = np.expm1(pred_valid_log)

# Score against the targets exactly as loaded instead of re-deriving them
# with expm1(log1p(y)), which only reproduces them up to floating-point
# round-off. Cast to float so the array dtype matches what expm1 produced.
y_valid_raw = np.asarray(y_valid, dtype=float)

# Dollar-space metrics: easier to interpret than the log RMSE, but dominated
# by the most expensive houses.
rmse_raw = np.sqrt(mean_squared_error(y_valid_raw, pred_valid))
mae = mean_absolute_error(y_valid_raw, pred_valid)
r2 = r2_score(y_valid_raw, pred_valid)

print(f"Log RMSE (Kaggle metric): {rmse_log:.5f}")
print(f"Raw RMSE ($): {rmse_raw:,.2f}")
print(f"MAE ($): {mae:,.2f}")
print(f"R2: {r2:.4f}")

Log RMSE (Kaggle metric): 0.14895
Raw RMSE ($): 32,128.59
MAE ($): 17,803.85
R2: 0.8521


# Train on the full data (train + valid)

In [17]:
# Stack the train and validation splits row-wise so the final model is
# fitted on every labelled example.
X_full = sparse.vstack((X_train, X_valid))
y_full = np.concatenate((y_train, y_valid))

# Same log1p target transform as in the validation run.
y_full_log = np.log1p(y_full)

# Refit the existing estimator object; from here on `rf` is the full-data
# model, not the one the validation metrics were computed with.
rf.fit(X_full, y_full_log)

# Predict the test set

In [18]:
# Assuming the cells run in order, `rf` was last fitted on train + valid,
# so these are full-data predictions, still in log1p(price) space.
pred_test_log = rf.predict(X_test)
pred_test = np.expm1(pred_test_log)  # invert the log1p transform to get prices in dollars

## For Kaggle Submission

In [20]:
# Take the Id column from the raw test file and attach the dollar-space
# predictions as SalePrice.
# NOTE(review): assumes the rows of X_test are in the same order as
# data/raw/test.csv — confirm against the preprocessing step.
test_ids = pd.read_csv("data/raw/test.csv")["Id"]
submission = test_ids.to_frame(name="Id").assign(SalePrice=pred_test)

# Write only the two columns; the DataFrame index is left out.
submission.to_csv("submission01.csv", index=False)

# Quick sanity check: first rows, then (n_rows, n_cols).
print(submission.head(), submission.shape, sep="\n")

     Id      SalePrice
0  1461  126571.652886
1  1462  152127.302290
2  1463  181646.359813
3  1464  191586.852343
4  1465  188779.744244
(1459, 2)
