# Decision Tree Model – California Housing

This notebook trains and evaluates a **Decision Tree Regressor** on the processed California Housing dataset.  

It includes:  
- Loading the processed dataset  
- Fitting a Decision Tree model  
- Cross-validation evaluation  
- Hyperparameter tuning  
- Saving the trained model  


In [50]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import joblib

import os

# Paths
# The project root may be overridden with the CA_HOUSING_PROJECT_DIR environment
# variable so the notebook can run on machines other than the author's.
# When the variable is unset (or empty) the original location is used unchanged.
PROJECT_DIR = Path(
    os.environ.get("CA_HOUSING_PROJECT_DIR")
    or "/Users/sukainaalkhalidy/Desktop/CMSE492/ca_housing_project"
)
TRAIN_PROCESSED_FP = PROJECT_DIR / "data" / "train" / "housing_train_processed.csv"
MODEL_FP = PROJECT_DIR / "models" / "decision_tree_model.pkl"

# Load processed dataset
housing = pd.read_csv(TRAIN_PROCESSED_FP)
print("Processed train shape:", housing.shape)

# Features: every column except the target. Target: median_house_value.
X = housing.drop("median_house_value", axis=1)
y = housing["median_house_value"]

# Handle missing values just in case: fill gaps in each numeric column with
# that column's median.
X = X.fillna(X.median(numeric_only=True))


Processed train shape: (16512, 24)


# Model Fitting
We initialize and train a **DecisionTreeRegressor** on the dataset.


In [53]:
from sklearn.tree import DecisionTreeRegressor

# Build the regressor and train it on the processed training data in one step:
# `fit` returns the estimator itself, so `tree_reg` is bound to the fitted model.
# The fixed random_state (42) keeps the result reproducible between runs.
tree_reg = DecisionTreeRegressor(random_state=42).fit(X, y)

print("Decision Tree model trained successfully!")


Decision Tree model trained successfully!


# Cross-Validation
We use **5-fold cross-validation**, scored with RMSE, to evaluate the Decision Tree.


In [54]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validation of the Decision Tree.
# The scorer is *negative* mean squared error, so each fold's value is <= 0.
scores = cross_val_score(
    tree_reg,
    X,
    y,
    cv=5,
    scoring="neg_mean_squared_error",
)

# Flip the sign back to MSE, then take the square root to get one RMSE per fold
rmse_scores = np.sqrt(np.negative(scores))

print("Cross-validation RMSE scores:", rmse_scores)
print("Mean:", np.mean(rmse_scores))
print("Standard deviation:", np.std(rmse_scores))


Cross-validation RMSE scores: [72726.66170184 72872.57193526 71809.74015932 68975.78235878
 71718.24150741]
Mean: 71620.59953252101
Standard deviation: 1402.22497724948


In [57]:
# Re-display the per-fold RMSE values currently held in `rmse_scores`,
# together with their mean and standard deviation.
print("RMSE scores:", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))
print("Standard deviation:", np.std(rmse_scores))


RMSE scores: [72726.66170184 72872.57193526 71809.74015932 68975.78235878
 71718.24150741]
Mean RMSE: 71620.59953252101
Standard deviation: 1402.22497724948


In [59]:
# Sanity checks on the feature matrix: the total number of missing cells,
# and the distinct column dtypes.
missing_total = X.isna().sum().sum()
print(missing_total)  # should be 0

feature_dtypes = X.dtypes.unique()
print(feature_dtypes)


0
[dtype('float64')]


# Hyperparameter Tuning
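We use **GridSearchCV** to find the best tree depth (`max_depth`) and the minimum number of samples required to split a node (`min_samples_split`) or to form a leaf (`min_samples_leaf`).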


In [61]:
from sklearn.model_selection import GridSearchCV

# Deliberately small search space so the search runs quickly:
# 3 depths x 2 split thresholds x 2 leaf sizes = 12 candidate settings.
param_grid = dict(
    max_depth=[5, 10, None],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 5],
)

grid = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=3,      # fewer folds, faster
    n_jobs=1,  # single core, more stable
)
grid.fit(X, y)

best_tree = grid.best_estimator_
print("Best parameters:", grid.best_params_)

# Evaluate tuned tree with 5-fold cross-validation, converting the negated
# MSE of each fold into an RMSE.
# NOTE(review): this reuses the same X, y that the grid search used to pick the
# hyperparameters, so the reported RMSE may be somewhat optimistic.
scores = cross_val_score(
    best_tree,
    X,
    y,
    cv=5,
    scoring="neg_mean_squared_error",
)
rmse_scores = np.sqrt(np.negative(scores))
print("Mean RMSE (tuned):", np.mean(rmse_scores))


Best parameters: {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2}
Mean RMSE (tuned): 61339.120597379006


# Model Saving
We save the tuned Decision Tree model (the best estimator found by the grid search) to the project's `models/` directory.


In [65]:
import joblib

# Use the MODEL_FP path defined with the other project paths at the top of the
# notebook, so the save location is configured in exactly one place.
model_path = MODEL_FP
models_dir = model_path.parent

# Create the models directory (and any missing parents) if it does not exist yet.
models_dir.mkdir(parents=True, exist_ok=True)

# Save tuned model (the best estimator selected by the grid search)
joblib.dump(best_tree, model_path)
print(f"Model saved to {model_path}")



Model saved to /Users/sukainaalkhalidy/Desktop/CMSE492/ca_housing_project/models/decision_tree_model.pkl
