# Data Loading

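We load the processed training dataset (24 columns: 23 features plus the `median_house_value` target) from the project's `data/train` directory.  
Since the data is already preprocessed, we can use it directly for model training; any remaining missing values are filled with the column median as a safeguard.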


In [44]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV

# Paths
# The project root defaults to the original author's checkout. Set the
# CA_HOUSING_PROJECT_DIR environment variable to run this notebook from a
# different machine or location without editing the cell. An unset or empty
# variable falls back to the default.
PROJECT_DIR = Path(
    os.environ.get("CA_HOUSING_PROJECT_DIR")
    or "/Users/sukainaalkhalidy/Desktop/CMSE492/ca_housing_project"
)
TRAIN_PROCESSED_FP = PROJECT_DIR / "data" / "train" / "housing_train_processed.csv"
MODEL_FP = PROJECT_DIR / "models" / "random_forest_model.pkl"

# Load processed dataset
housing = pd.read_csv(TRAIN_PROCESSED_FP)
print("Processed train shape:", housing.shape)

# Split the frame into the feature matrix and the regression target.
X = housing.drop("median_house_value", axis=1)
y = housing["median_house_value"]

# Handle missing values: fill NaNs in numeric feature columns with that
# column's median (the medians are computed over numeric columns only).
X = X.fillna(X.median(numeric_only=True))


Processed train shape: (16512, 24)


# Model Fitting

We initialize and train a Random Forest model on the processed training data.  
The training RMSE is reported as a baseline measure; because it is computed on the same data the model was fit to, it underestimates the true error.


In [45]:
# Build a Random Forest with default hyperparameters (fixed seed, all CPU
# cores) and train it on the full training set. `fit` returns the estimator
# itself, so construction and training are chained.
forest_reg = RandomForestRegressor(random_state=42, n_jobs=-1).fit(X, y)

# Score the model on the same rows it was trained on to get the training RMSE.
predictions = forest_reg.predict(X)
mse = mean_squared_error(y_true=y, y_pred=predictions)
rmse = np.sqrt(mse)
print("Training RMSE:", rmse)


Training RMSE: 18530.863136665324


# Cross-Validation
We evaluate the Random Forest model using 5-fold cross-validation.  
This provides a more reliable estimate of model performance.


In [49]:
# 5-fold cross-validation of the forest. Folds are evaluated one at a time
# (n_jobs=1 here).
scores = cross_val_score(
    estimator=forest_reg,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=1,
)

# The scorer yields negated MSE values, so flip the sign before taking the
# square root to obtain one RMSE per fold.
rmse_scores = np.sqrt(-scores)
print("Cross-validation RMSE scores:", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))
print("Std deviation:", np.std(rmse_scores))


Cross-validation RMSE scores: [49615.18640867 49452.76855645 50979.50362713 49706.50915409
 50745.22988605]
Mean RMSE: 50099.839526477226
Std deviation: 632.2398141373344


# Hyperparameter Tuning

We use GridSearchCV to tune the Random Forest model’s hyperparameters.  
The best parameters are reported, and the best estimator is kept as `best_forest`.


In [56]:
# Candidate hyperparameter values: a single forest size and two depth
# settings (capped at 10 levels, or unlimited).
param_grid = dict(n_estimators=[50], max_depth=[10, None])

# Exhaustive search over the grid, scored by negated MSE.
grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42, n_jobs=1),
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=2,  # fewer folds
    n_jobs=1,
)
grid.fit(X, y)

# Keep the winning estimator and report the hyperparameters that produced it.
best_forest = grid.best_estimator_
print("Best parameters:", grid.best_params_)


Best parameters: {'max_depth': None, 'n_estimators': 50}


# Model Saving

We save the tuned Random Forest model to the project's `models/` directory  
so it can be reused later without retraining.


In [57]:
# Create the output directory (and any missing parents) if it does not exist.
MODEL_FP.parent.mkdir(parents=True, exist_ok=True)

# Persist the tuned estimator selected by the grid search (`best_forest`),
# not the untuned baseline `forest_reg`. With GridSearchCV's default
# refit=True, `best_estimator_` has already been refit on all of X, y.
joblib.dump(best_forest, MODEL_FP)
print(f"Model saved to {MODEL_FP}")


Model saved to /Users/sukainaalkhalidy/Desktop/CMSE492/ca_housing_project/models/random_forest_model.pkl
