<a href="https://colab.research.google.com/github/sonali6062/Machine_learning_fundamentals/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries for data manipulation, analysis, and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import machine learning libraries and functions
import xgboost as xgb
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor # Corrected imports
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

LOAD THE DATASET AND SPLIT INTO TRAIN
AND TEST SETS

In [3]:
# Load the California housing dataset
california_housing=fetch_california_housing()
x,y=california_housing.data,california_housing.target
# Splitting the data into train and test set with a 80/20 split and a fixed random state
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [4]:
# Display the loaded dataset object
california_housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

DATA MODELLING

Initialising the models

In [5]:
# Initialize the XGBoost, RandomForest, and GradientBoosting regressors with a fixed random state
xgb_regressor=xgb.XGBRegressor(objective='reg:squarederror',random_state=42)
rf_regressor=RandomForestRegressor(random_state=42)
gb_regressor=GradientBoostingRegressor(random_state=42) # Corrected typo

Data Modelling

In [6]:
# Train the initialized models on the training data
model_xgb=xgb_regressor.fit(x_train,y_train)
model_rf=rf_regressor.fit(x_train,y_train)
model_gb=gb_regressor.fit(x_train,y_train)

Model Prediction

In [7]:
# Make predictions on the test data using the trained models
y_pred_xgb=model_xgb.predict(x_test)
y_pred_rf=model_rf.predict(x_test)
y_pred_gb=model_gb.predict(x_test)

Model Evaluation

In [8]:
# Calculate the mean squared error for each model's predictions
mse_xgb=mean_squared_error(y_test,y_pred_xgb)
mse_rf=mean_squared_error(y_test,y_pred_rf)
mse_gb=mean_squared_error(y_test,y_pred_gb)
# Print the calculated mean squared errors
print("Mean squared error for XGBoostRegressor: ",mse_xgb)
print("Mean squared error for RandomForestRegressor: ",mse_rf)
print("Mean squared error for GradientBoostingRegressor: ",mse_gb)

Mean squared error for XGBoostRegressor:  0.2225899267544737
Mean squared error for RandomForestRegressor:  0.2553684927247781
Mean squared error for GradientBoostingRegressor:  0.2939973248643864


# Hyper Parameter Tuning

In [9]:
# Define the hyperparameter grids for each model
param_grid_xgb={
    'lambda':[0.01,0.1,1,10],
    'gamma':[0,0.1,1,10],
    'learning_rate':[0.01,0.1,0.2],
    'n_estimators':[100,200,300]
}
# param_grid_rf={
#     'n_estimators':[100,200,300],
#     'max_depth':[None,10,20,30],
#     'min_samples_split':[2,5,10],
#     'min_samples_leaf':[1,2,4]
# }
# param_grid_gb={
#     'n_estimators':[100,200,300],
#     'learning_rate':[0.01,0.1,0.2],
#     'max_depth':[3,5,7],
#     'min_samples_split':[2,5,10],
#     'min_samples_leaf':[1,2,4]
# }
# Set up GridSearchCV for each model with 5-fold cross-validation and negative mean squared error as the scoring metric
grid_search_xgb = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid_xgb, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
# grid_search_rf = GridSearchCV(estimator=rf_regressor, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
# grid_search_gb = GridSearchCV(estimator=gb_regressor, param_grid=param_grid_gb, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
# Fit the models to the training data to perform hyperparameter tuning
print("Tuning XGBoost...")
grid_search_xgb.fit(x_train, y_train)
# print("Tuning RandomForest...")
# grid_search_rf.fit(x_train, y_train)
# print("Tuning GradientBoosting...")
# grid_search_gb.fit(x_train, y_train)

# Get the best performing models after hyperparameter tuning
best_xgb = grid_search_xgb.best_estimator_
# best_rf = grid_search_rf.best_estimator_
# best_gb = grid_search_gb.best_estimator_


# Make predictions on the test data using the best models
y_pred_xgb = best_xgb.predict(x_test)
# y_pred_rf = best_rf.predict(x_test)
# y_pred_gb = best_gb.predict(x_test)

Tuning XGBoost...
Fitting 5 folds for each of 144 candidates, totalling 720 fits


Model Prediction

In [10]:
# Make predictions on the test data using the initial models (before hyperparameter tuning)
y_pred_xgb = model_xgb.predict(x_test)
# y_pred_rf = model_rf.predict(x_test)
# y_pred_gb = model_gb.predict(x_test)

Model Evaluation

In [11]:
# Calculate Mean Squared Error for the predictions made by the initial models
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
# mse_rf = mean_squared_error(y_test, y_pred_rf)
# mse_gb = mean_squared_error(y_test, y_pred_gb)

# Print the best hyperparameters found for XGBoostRegressor and its corresponding Mean Squared Error
print("Best parameters for XGBRegressor: ", grid_search_xgb.best_params_)
print("Mean Squared Error for XGBRegressor: ", mse_xgb)

# print("Best parameters for RandomForestRegressor: ", grid_search_rf.best_params_)
# print("Mean Squared Error for RandomForestRegressor: ", mse_rf)

# print("Best parameters for GradientBoostingRegressor: ", grid_search_gb.best_params_)
# print("Mean Squared Error for GradientBoostingRegressor: ", mse_gb)

Best parameters for XGBRegressor:  {'gamma': 0, 'lambda': 10, 'learning_rate': 0.1, 'n_estimators': 300}
Mean Squared Error for XGBRegressor:  0.2225899267544737
