<a href="https://colab.research.google.com/github/stanymj56/Python-Capstone-Project/blob/main/Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the California housing dataset. `as_frame=True` makes the returned
# container hold pandas objects (its `frame` and `target` members are used
# by the following cells).
data = fetch_california_housing(as_frame=True)
# Bare last expression: the notebook renders the whole returned container.
data

{'data':        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
 0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
 1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
 2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
 3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
 4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
 ...       ...       ...       ...        ...         ...       ...       ...   
 20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
 20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
 20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
 20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
 20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   
 
        Longitude 

In [None]:
# Work on a private copy so the loaded dataset's own frame is never modified.
df = data.frame.copy()

# The target column's name is read from the target Series rather than hard-coded.
target_col = data.target.name

# Features: every column except the target. Target: a plain NumPy array.
X = df.drop(columns=target_col)
y = df[target_col].to_numpy()

# Preview the first rows of the combined frame.
df.head()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# Count missing values per column (`isna` is the same method as `isnull`).
df.isna().sum()

Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0
MedHouseVal,0


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Report the shape of each feature split.
for label, part in (('X_train shape:', X_train), ('X_test shape:', X_test)):
    print(label, part.shape)


X_train shape: (16512, 8)
X_test shape: (4128, 8)


In [None]:
# Fit the scaler on the training split only, so no statistics from the test
# split influence the transformation, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))


In [None]:
# Registry of candidate regressors, keyed by the display name used in the
# results table. Insertion order determines the training order.
models = {}
models["Linear Regression"] = LinearRegression()
models["Decision Tree"] = DecisionTreeRegressor(random_state=42)
models["Random Forest"] = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,  # use all available cores
    random_state=42,
)
models["Gradient Boosting"] = GradientBoostingRegressor(
    n_estimators=200,
    random_state=42,
)
models["SVR (RBF)"] = SVR(kernel='rbf', C=100.0, gamma='scale', epsilon=0.1)

In [None]:
# Train every registered model on the scaled training split and score it on
# the scaled test split, collecting one metrics record per model.
results = []
for name, model in models.items():
    # scikit-learn estimators return themselves from fit(), so fit and predict chain.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Evaluate all three metrics against the held-out targets.
    r2, mse, mae = (
        metric(y_test, y_pred)
        for metric in (r2_score, mean_squared_error, mean_absolute_error)
    )

    print(f"Trained {name} - R2: {r2:.4f}, MSE: {mse:.4f}, MAE: {mae:.4f}")
    results.append({'model': name, 'R2': r2, 'MSE': mse, 'MAE': mae})

Trained Linear Regression - R2: 0.5758, MSE: 0.5559, MAE: 0.5332
Trained Decision Tree - R2: 0.6230, MSE: 0.4940, MAE: 0.4539
Trained Random Forest - R2: 0.8053, MSE: 0.2552, MAE: 0.3274
Trained Gradient Boosting - R2: 0.8004, MSE: 0.2615, MAE: 0.3484
Trained SVR (RBF) - R2: 0.7557, MSE: 0.3201, MAE: 0.3717


In [None]:
# Rank the models from highest to lowest R2 and renumber the rows 0..n-1.
results_df = pd.DataFrame(results).sort_values(
    by="R2", ascending=False, ignore_index=True
)

print("\nModel comparison (sorted by R2):")
print(results_df)

# After sorting, the first row is the best model and the last row the worst.
best, worst = results_df.iloc[0], results_df.iloc[-1]
print(f"\nBest model: {best['model']}")
print(f"Worst model: {worst['model']}")


Model comparison (sorted by R2):
               model        R2       MSE       MAE
0      Random Forest  0.805275  0.255170  0.327425
1  Gradient Boosting  0.800444  0.261500  0.348351
2          SVR (RBF)  0.755722  0.320104  0.371703
3      Decision Tree  0.623042  0.493969  0.453904
4  Linear Regression  0.575788  0.555892  0.533200

Best model: Random Forest
Worst model: Linear Regression
