In [16]:
# Data Handling & Analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Model Evaluation Metrics
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error
)

# Pipeline & Cross Validation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV

# Miscellaneous Settings
import warnings
warnings.filterwarnings('ignore')

In [17]:
df = pd.read_csv('housing.csv')

In [18]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [19]:
df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)


## Preparing X and Y variables

In [20]:
X = df.drop(columns=['median_house_value'],axis=1)
y = df['median_house_value']

In [21]:
X,y

(       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
 0        -122.23     37.88                41.0        880.0           129.0   
 1        -122.22     37.86                21.0       7099.0          1106.0   
 2        -122.24     37.85                52.0       1467.0           190.0   
 3        -122.25     37.85                52.0       1274.0           235.0   
 4        -122.25     37.85                52.0       1627.0           280.0   
 ...          ...       ...                 ...          ...             ...   
 20635    -121.09     39.48                25.0       1665.0           374.0   
 20636    -121.21     39.49                18.0        697.0           150.0   
 20637    -121.22     39.43                17.0       2254.0           485.0   
 20638    -121.32     39.43                18.0       1860.0           409.0   
 20639    -121.24     39.37                16.0       2785.0           616.0   
 
        population  households  median

In [22]:
# Apply OHE to the Categorical Column.
X = pd.get_dummies(X, drop_first=True)

In [23]:
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,True,False,False,False
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,True,False,False,False
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,True,False,False,False
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,True,False,False,False


In [24]:
# We Scaling our numerical columns.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [25]:
X

array([[-1.32783522,  1.05254828,  0.98214266, ..., -0.01556621,
         2.83074203, -0.38446649],
       [-1.32284391,  1.04318455, -0.60701891, ..., -0.01556621,
         2.83074203, -0.38446649],
       [-1.33282653,  1.03850269,  1.85618152, ..., -0.01556621,
         2.83074203, -0.38446649],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ..., -0.01556621,
        -0.35326426, -0.38446649],
       [-0.87362627,  1.77823747, -0.84539315, ..., -0.01556621,
        -0.35326426, -0.38446649],
       [-0.83369581,  1.75014627, -1.00430931, ..., -0.01556621,
        -0.35326426, -0.38446649]])

In [26]:
# We split the data in training and testing part.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [27]:
print(np.isnan(X_train).sum())
print(np.isnan(X_test).sum())

0
0


#### Create an Evaluate Function to give all metrics after model Training

In [28]:
# Evaluate All Regression Models


# Function to calculate metrics
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2


# Dictionary of Models
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGB Regressor": XGBRegressor(),
    "CatBoost Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

# Lists to store results
model_list = []
r2_list = []

# Loop through all models
for name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train & Test
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    # Store model name & R² for comparison
    model_list.append(name)
    r2_list.append(model_test_r2)

    #  Display results
    print(f" {name}")
    print('Model performance for Training set')
    print(f"- Root Mean Squared Error: {model_train_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_train_mae:.4f}")
    print(f"- R2 Score: {model_train_r2:.4f}")

    print('----------------------------------')
    print('Model performance for Test set')
    print(f"- Root Mean Squared Error: {model_test_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_test_mae:.4f}")
    print(f"- R2 Score: {model_test_r2:.4f}")
    print('='*50)
    print('\n')

# Create a comparison DataFrame
results_df = pd.DataFrame({"Model": model_list, "R2_Score": r2_list})
results_df = results_df.sort_values(by="R2_Score", ascending=False).reset_index(drop=True)

# Display model comparison table
print("Model Comparison Summary:")
display(results_df)


 Linear Regression
Model performance for Training set
- Root Mean Squared Error: 68433.9374
- Mean Absolute Error: 49594.8421
- R2 Score: 0.6497
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 70060.5218
- Mean Absolute Error: 50670.7382
- R2 Score: 0.6254


 Lasso
Model performance for Training set
- Root Mean Squared Error: 68433.9382
- Mean Absolute Error: 49594.7904
- R2 Score: 0.6497
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 70059.8535
- Mean Absolute Error: 50670.1345
- R2 Score: 0.6254


 Ridge
Model performance for Training set
- Root Mean Squared Error: 68433.9447
- Mean Absolute Error: 49594.2204
- R2 Score: 0.6497
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 70057.4191
- Mean Absolute Error: 50668.1259
- R2 Score: 0.6255


 K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 49513.4605
- Mean Absolute 

Unnamed: 0,Model,R2_Score
0,CatBoost Regressor,0.839502
1,XGB Regressor,0.829202
2,Random Forest Regressor,0.816526
3,Gradient Boosting,0.761625
4,K-Neighbors Regressor,0.712651
5,Decision Tree,0.643358
6,Ridge,0.625457
7,Lasso,0.625431
8,Linear Regression,0.625424
9,AdaBoost Regressor,0.317137


##  Model Analysis and Insights

###  Overview
After training and evaluating multiple regression models on the **California Housing Dataset**, the performance of each model was compared using key metrics such as **R² Score**, **RMSE**, and **MAE** on both training and testing data.

The main objective was to identify which model can best predict the `median_house_value` based on various housing and demographic features.

---

###  Model Performance Summary

| Rank | Model | Test R² | Observations |
|------|--------|----------|--------------|
| 1️⃣ | **CatBoost Regressor** | **0.8395** | Best performing model with strong generalization and smooth handling of non-linear relationships. |
| 2️⃣ | **XGBoost Regressor** | **0.8292** | Performs very close to CatBoost, powerful gradient boosting model. |
| 3️⃣ | **Random Forest Regressor** | **0.8165** | Excellent accuracy but shows mild overfitting (train R² much higher than test R²). |
| 4️⃣ | Gradient Boosting | 0.7616 | Good model but could be improved through hyperparameter tuning. |
| 5️⃣ | K-Neighbors Regressor | 0.7127 | Performs moderately well but sensitive to feature scaling and K-value. |
| 6️⃣ | Decision Tree | 0.6434 | Overfitted — train R² = 1.0 but test R² dropped significantly. |
| 7️⃣ | Ridge / Lasso / Linear Regression | ≈ 0.625 | Underfits the data — linear models cannot capture complex non-linear relations. |
| 8️⃣ | AdaBoost Regressor | 0.3171 | Weakest model, underfitting the dataset severely. |

---

###  Key Insights

1. **Top Performers:**  
   - `CatBoost`, `XGBoost`, and `Random Forest` models achieved the best performance with **R² > 0.80**.  
   - These models are robust for real-world prediction tasks.

2. **Overfitting Observation:**  
   - `Decision Tree` and `Random Forest` show signs of overfitting.  
   - Boosting-based models (CatBoost, XGBoost, Gradient Boosting) handle overfitting better due to their regularization mechanisms.

3. **Underfitting Models:**  
   - Linear models (Linear, Ridge, Lasso) are too simple for this dataset.  
   - AdaBoost performed poorly, possibly due to limited weak learner depth.

4. **CatBoost Regressor — The Winner**  
   - Provides the best **balance between accuracy and generalization**.  
   - Handles categorical data efficiently.  
   - Performs well even with minimal preprocessing.

---

---

###  Conclusion
The **CatBoost Regressor** emerged as the most reliable and accurate model for predicting California house prices, achieving an R² score of approximately **0.84**.  
This indicates that the model successfully explains around **84% of the variance** in house prices, making it an excellent choice for deployment.
