<a href="https://colab.research.google.com/github/ramesitexp/DataScience_Zone/blob/main/xgboostClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

from xgboost import XGBClassifier

# Model saving
import joblib


In [2]:
pip install xgboost



In [3]:
np.random.seed(42)
n = 1000

data = pd.DataFrame({
    "age": np.random.randint(21, 60, n),
    "income": np.random.randint(15000, 120000, n),
    "loan_amount": np.random.randint(50000, 500000, n),
    "credit_score": np.random.randint(300, 850, n),
    "years_employed": np.random.randint(0, 20, n)
})

data["emi"] = data["loan_amount"] / np.random.randint(12, 60, n)

# Target logic (realistic rule-based + noise)
data["default"] = (
    (data["credit_score"] < 600) &
    (data["emi"] > data["income"] * 0.4)
).astype(int)

# Add noise
data.loc[np.random.choice(n, 100, replace=False), "default"] = 1


In [4]:
data.head()

Unnamed: 0,age,income,loan_amount,credit_score,years_employed,emi,default
0,59,72761,86059,793,5,1870.847826,0
1,49,51368,153409,497,0,5113.633333,0
2,35,21783,460303,816,4,14848.483871,0
3,28,56914,55704,813,13,1185.191489,0
4,41,85507,254456,321,4,5531.652174,1


In [5]:
x = data.drop('default', axis=1)
Y = data['default']

In [6]:
x_train, x_test, Y_train, Y_test = train_test_split(x, Y, test_size=0.2, random_state=42)


In [7]:
Xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42,
                    max_depth=5, subsample=0.8, colsample_bytree=0.8,
                    objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

In [8]:
Xgb.fit(x_train, Y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
print(classification_report(Y_test, Xgb.predict(x_test)))

              precision    recall  f1-score   support

           0       0.85      0.98      0.91       165
           1       0.64      0.20      0.30        35

    accuracy                           0.84       200
   macro avg       0.74      0.59      0.61       200
weighted avg       0.81      0.84      0.80       200



In [10]:
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.7, 0.8],
    "colsample_bytree": [0.7, 0.8]
}

grid = GridSearchCV(
    XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42
    ),
    param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1
)

grid.fit(x_train, Y_train)


In [12]:
grid.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 3,
 'n_estimators': 100,
 'subsample': 0.8}

In [13]:
Xgb1 = XGBClassifier(n_estimators=100, learning_rate=0.05, random_state=42,
                    max_depth=3, subsample=0.8, colsample_bytree=0.7,
                    objective='binary:logistic', eval_metric='logloss')
Xgb1.fit(x_train, Y_train)

In [14]:
print(classification_report(Y_test, Xgb1.predict(x_test)))

              precision    recall  f1-score   support

           0       0.84      0.99      0.91       165
           1       0.71      0.14      0.24        35

    accuracy                           0.84       200
   macro avg       0.78      0.57      0.57       200
weighted avg       0.82      0.84      0.79       200

