<a href="https://colab.research.google.com/github/sonjoy1s/ML/blob/main/Module_22_XGBoost_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Module 22: XGBoost (Practice Notebook)

### Instructions for Students
- This is a **practice notebook**.
- Complete all **TODO** sections.
- Read the markdown explanations carefully.
- Do not skip evaluation and reflection questions.

The dataset used here is **California Housing (Regression)**, loaded from OpenML.



## 1. Import Required Libraries


In [204]:
# TODO: Import necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor


## 2. Load Dataset (California Housing)


In [205]:
# Load dataset: fetch the California Housing table from OpenML as a pandas frame.
from sklearn.datasets import fetch_openml

data = fetch_openml(
    name="california_housing",
    version=1,
    as_frame=True,
)

# Preview the first rows to confirm what was loaded.
housing_preview = data.frame.head()
display(housing_preview)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [206]:
# Work on an independent copy so the fetched frame held by `data` stays untouched.
df = data.frame.copy(deep=True)
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [207]:
# Show the column labels of the working frame.
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [208]:
# Name of the column the model is meant to predict.
target_col = "median_house_value"

# Predictor columns: eight numeric fields plus the categorical `ocean_proximity`.
features_cols = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "ocean_proximity",
]

In [209]:

# One-hot encode the categorical column into integer indicator columns.
category_col = "ocean_proximity"
df_h = pd.get_dummies(
    data=df,
    columns=[category_col],
    dtype=int,
)
df_h.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,1,0,0,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,1,0,0,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,1,0,0,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,1,0,0,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0,1,0,0,0


In [210]:
# Separate the encoded frame into the raw target series and the feature matrix.
y = df_h[target_col]
X = df_h.drop(columns=[target_col])

In [216]:
# Make column labels model-friendly: spell out '<' as 'less' and replace
# spaces with '_'.
# NOTE(review): presumably needed because XGBoost has rejected feature names
# containing '[', ']' or '<' - confirm against the installed version.
#
# `X` was derived from `df_h` earlier via `DataFrame.drop`, which returns a new
# frame with its own column index. Renaming only `df_h` would therefore never
# reach the features that are split and modelled, so `X` is renamed as well.
def _sanitize_columns(columns):
    """Return `columns` with '<' replaced by 'less' and spaces replaced by '_'."""
    return (
        columns
        .str.replace('<', 'less', regex=False)
        .str.replace(' ', '_', regex=False)
    )


df_h.columns = _sanitize_columns(df_h.columns)
X.columns = _sanitize_columns(X.columns)

In [217]:
# Model the log1p-transformed house value rather than the raw value.
y_log = df['median_house_value'].pipe(np.log1p)

In [218]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.25,
    random_state=42,
)

# Report the (rows, columns) shape of each feature partition.
for features_part in (X_train, X_test):
    print(features_part.shape)


(15480, 13)
(5160, 13)


In [214]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
scale = RobustScaler()
scale.fit(X_train)
X_train_scale = scale.transform(X_train)
X_test_scale = scale.transform(X_test)


## 4. Baseline XGBoost Regressor


In [220]:
# Baseline XGBoost regressor; its hyperparameters are gathered in one mapping
# so they can be read at a glance.
baseline_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42,
)
model = XGBRegressor(**baseline_params)

# Train on the scaled training features against the log1p-transformed target.
model.fit(X_train_scale, y_train)


## 5. Evaluate Baseline Model


In [221]:
# Evaluate baseline on the held-out rows.
# `y_test` comes from the log1p-transformed target, so every metric below is
# expressed in log1p units, not in the original house-value units.
pred = model.predict(X_test_scale)

mse = mean_squared_error(y_test, pred)
r2 = r2_score(y_test, pred)
rmse = np.sqrt(mse)

# Print in the order MSE, R2, RMSE.
for metric_label, metric_value in (("MSE :", mse), ("R2 :", r2), ("RMSE :", rmse)):
    print(metric_label, metric_value)


MSE : 0.053263550633036054
R2 : 0.8372481411867371
RMSE : 0.2307889742449497



## 6. Hyperparameter Tuning with GridSearchCV


In [224]:
# Parameter grid for the hyperparameter search: 3 * 3 * 2 * 2 = 36 combinations.
# GridSearchCV is already imported in the notebook's import cell, so it is not
# re-imported here.
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}




### Base Model for Grid Search


In [None]:
# TODO: Base model



### Run GridSearchCV


In [None]:
# TODO: Run GridSearchCV



## 7. Evaluate Tuned Model


In [None]:
# TODO: Evaluate tuned model



## 8. Reflection Questions

1. Did GridSearch improve performance?
2. Which parameter had the biggest effect?
3. What happens if learning_rate is too high?
4. Would you deploy this model? Why?
