- Gradient Boosting Classifier classification algorithms
- It comes under ensemble methods 
- ensemble refers the using of multiple models
- Gradient Boosting Classifier uses the boosting principle (creates multiple datasets and the each dataset is dependent on previous dataset, it cannot repeat the errors in new dataset) 
- Random forest classifier uses the baggy principle ( it is parallel and independent)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
df=pd.read_csv("diabetes.csv")
df

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   preg    768 non-null    int64  
 1   plas    768 non-null    int64  
 2   pres    768 non-null    int64  
 3   skin    768 non-null    int64  
 4   test    768 non-null    int64  
 5   mass    768 non-null    float64
 6   pedi    768 non-null    float64
 7   age     768 non-null    int64  
 8   class   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
df.describe()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
X=df.drop('class',axis=1)
y=df['class']
#standardize features
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
X_scaled

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.8, random_state = 42)

In [15]:
#instantiate the model and define the parameters
gbc = GradientBoostingClassifier(random_state=42)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth':[3, 4, 5],
    'subsample':[0.8, 1.0]
}
grid_search = GridSearchCV(estimator=gbc,
                           param_grid=param_grid,
                           scoring='recall',
                           n_jobs=-1,
                           verbose=1)

In [16]:
#Fit the model
grid_search.fit(X_train,y_train)
#best parameters and score
print("Best Parameters:",grid_search.best_params_)
print("Best Cross-validated Recall:",grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Best Cross-validated Recall: 0.6772727272727274


In [17]:
best_model=grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("\nConfusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("\nClassification Report:\n",classification_report(y_test,y_pred))


Confusion Matrix:
 [[322  83]
 [ 85 125]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.80      0.79       405
           1       0.60      0.60      0.60       210

    accuracy                           0.73       615
   macro avg       0.70      0.70      0.70       615
weighted avg       0.73      0.73      0.73       615



#### Identify feature importance scores using XGBClassifier

In [18]:
best_model.feature_importances_

array([0.09036343, 0.26593681, 0.0561719 , 0.05357565, 0.08945071,
       0.16298641, 0.17214491, 0.10937018])