In [2]:
# Machine learning models

In [21]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [22]:

# --- Configuration ---
# One seed shared by the train/test split and every seeded estimator below,
# so the split and the model fits are repeatable across re-runs.
RANDOM_STATE = 43
# Fraction of the rows held out as the test split.
TEST_SIZE = 0.1

# --- Load and split data ---
housing = fetch_california_housing()
X, y = housing['data'], housing['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, shuffle=True, random_state=RANDOM_STATE
)

# --- Define models ---
# Display name -> estimator instance. The name becomes the row label of the
# results table; LinearRegression and SVR are constructed with defaults and
# take no seed here.
models = {
    "Linear Regression": LinearRegression(),
    "SVM": SVR(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE)
}

# --- Evaluate each model ---
# Metric label -> scoring function. The insertion order fixes the column
# order of the results: R2, then MSE, then MAE (each as Train, Test).
METRICS = {
    'R2': r2_score,
    'MSE': mean_squared_error,
    'MAE': mean_absolute_error,
}


def score_splits(y_true_by_split, y_pred_by_split):
    """Score every (metric, split) combination.

    Parameters
    ----------
    y_true_by_split : dict
        Split label (e.g. 'Train', 'Test') -> true target values.
    y_pred_by_split : dict
        The same split labels -> predicted target values.

    Returns
    -------
    dict
        '<metric> <split>' -> score, grouped metric by metric and, within a
        metric, in the iteration order of ``y_true_by_split``.
    """
    return {
        f'{metric} {split}': scorer(y_true, y_pred_by_split[split])
        for metric, scorer in METRICS.items()
        for split, y_true in y_true_by_split.items()
    }


results = {}

for name, model in models.items():
    # Same preprocessing for every estimator; the imputer and scaler are
    # fitted together with the model on the training split only.
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)

    # Predict on both splits so train and test scores can be compared.
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    # One row of the results table per model.
    results[name] = score_splits(
        {'Train': y_train, 'Test': y_test},
        {'Train': y_train_pred, 'Test': y_test_pred},
    )

# --- Print results in a nice table ---
# `results` maps model name -> {metric label: score}. Building a DataFrame
# from it puts models in the columns, so transpose to get one row per model,
# then round to 4 decimals for display.
df_results = pd.DataFrame(results).T.round(4)
print(df_results)

                   R2 Train  R2 Test  MSE Train  MSE Test  MAE Train  MAE Test
Linear Regression    0.6054   0.6129     0.5274    0.4976     0.5331    0.5196
SVM                  0.7496   0.7295     0.3346    0.3477     0.3836    0.3898
Decision Tree        1.0000   0.6228     0.0000    0.4849     0.0000    0.4403
Random Forest        0.9741   0.8120     0.0346    0.2417     0.1200    0.3194
Gradient Boosting    0.8042   0.7895     0.2617    0.2706     0.3566    0.3646


In [23]:
# Interpretation
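#     Linear Regression: Simple, interpretable baseline. Moderate performance (test R² ≈ 0.61).
#     SVM: Better fit than the linear baseline (test R² ≈ 0.73), but sensitive to scaling and kernel choice.
#     Decision Tree: Perfect on training (R²=1.0) but much worse on test (R² ≈ 0.62) → overfitting.
#     Random Forest: Best test score here (R² ≈ 0.81); the train/test gap (0.97 vs 0.81) shows it still overfits, though far less than a single tree.
#     Gradient Boosting: Close second on test (R² ≈ 0.79) with almost no train/test gap (0.80 vs 0.79); usually gives the best balance between bias and variance.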

# Key Takeaways
#     Ensemble methods (Random Forest, Gradient Boosting) usually perform best on tabular data.
#     Tree-based models don’t need scaling but we keep it in the pipeline for uniformity.
#     Comparing both train and test metrics helps detect overfitting or underfitting.