In [18]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
from sklearn.preprocessing import StandardScaler

data = fetch_california_housing(as_frame=True)
df = data.frame

missing_values = df.isnull().sum()
print(f"Missing values per column:\n{missing_values}")

df.fillna(df.median(), inplace=True)

X = df.drop("MedHouseVal", axis=1)   
y = df["MedHouseVal"]   

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


Missing values per column:
MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [None]:
#Missing Values: Ensuring no missing values avoids errors in training and evaluation.
#Feature Scaling: Standardization centers the data around 0 with unit variance, making algorithms like SVR and Gradient Boosting perform optimally since these algorithms are sensitive to feature magnitude.

In [4]:
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()
linear_model.fit(X_scaled, y)
#A baseline model to compare against, works well if the relationships are linear.
#Linear regression models the relationship between independent variables and the target variable as a linear equation.
#It assumes that the target variable is a weighted sum of the input features

In [None]:
#Linear regression is simple, interpretable, and computationally efficient.
#However, it assumes a linear relationship between features and the target, which may not hold for complex datasets like California Housing, potentially leading to suboptimal performance.

In [6]:
from sklearn.tree import DecisionTreeRegressor

dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_scaled, y)
#Captures non-linear relationships and interprets feature importance.
#Decision trees split the data into smaller subsets based on feature thresholds, creating a tree-like structure.
#Predictions are made by averaging the values at the leaf nodes.

In [None]:
#Decision trees handle non-linear relationships well and do not require feature scaling. 
#They can capture complex interactions between features, making them suitable for housing data with non-linear patterns.

In [8]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_scaled, y)
#Handles high-dimensional data, captures complex relationships, and is robust to noise.
# Random Forest combines multiple decision trees (using bagging) and averages their predictions. 
#This reduces overfitting and improves generalization.

In [None]:
#Random Forests are robust, handle non-linear relationships, and reduce overfitting compared to individual decision trees. This makes them effective for datasets like California Housing, where relationships between features and the target are complex

In [9]:
from sklearn.ensemble import GradientBoostingRegressor

gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_scaled, y)
#Great for structured data and capturing complex patterns.
# Gradient Boosting builds an ensemble of decision trees sequentially, where each tree corrects the errors of the previous ones.
#It optimizes the loss function iteratively

In [None]:
# Gradient Boosting is powerful for datasets with non-linear relationships and can handle complex feature interactions.
#However, it may require more hyperparameter tuning than Random Forests to avoid overfitting.

In [13]:
from sklearn.svm import SVR

svr_model = SVR(kernel='rbf')
svr_model.fit(X_scaled, y)
#Effective in high-dimensional spaces and works well for small-to-moderate datasets.
# SVR finds a hyperplane that best fits the data within a margin of tolerance (epsilon). It uses kernels to capture non-linear relationships

In [None]:
# SVR can model non-linear relationships effectively, especially with the use of kernels like RBF.
#However, it is computationally intensive for large datasets, which could make it less suitable for California Housing.

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

models = {
    "Linear Regression": linear_model,
    "Decision Tree": dt_model,
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
    "Support Vector Regressor": svr_model,
}

results = []

for name, model in models.items():
    y_pred = model.predict(X_scaled)
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    results.append((name, mse, mae, r2))

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results, columns=["Model", "MSE", "MAE", "R²"])
print(results_df)


                      Model           MSE           MAE        R²
0         Linear Regression  5.243210e-01  5.311644e-01  0.606233
1             Decision Tree  9.555001e-32  4.156882e-17  1.000000
2             Random Forest  3.464143e-02  1.193314e-01  0.973984
3         Gradient Boosting  2.618843e-01  3.562491e-01  0.803324
4  Support Vector Regressor  3.325683e-01  3.816318e-01  0.750240


In [None]:
#The best-performing algorithm is likely the Gradient Boosting Regressor or Random Forest Regressor:
#Justification: These algorithms excel at handling non-linear relationships and feature interactions, which are common in housing datasets. Gradient Boosting often performs slightly better than Random Forest due to its iterative error-correcting mechanism, but it requires careful tuning. Random Forest is more robust to overfitting and may perform competitively with less effort.
#The worst-performing algorithm is likely the Linear Regression:
#Justification: Linear regression assumes a linear relationship between features and the target variable. This assumption is often unrealistic for complex datasets like California Housing, which may include non-linear and interactive feature effects. Consequently, its predictions are less accurate.
