In [1]:
# Load libraries 
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import time

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier

In [3]:
# Load US income_data data 
income_data = pd.read_csv("https://github.com/breno-madruga/machine_learning_exercises/raw/master/XGBoost%20with%20GPU%20support/income_data.csv")
income_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
income_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:

# convert the numerical income data to categorical data so that it can be used in XGBoost 
temp_col = pd.Categorical(income_data.high_income)

# assign the category codes to income column 
income_data["high_income"] = temp_col.codes

In [6]:

# Transformer the columns provided as input 
class FeatureSelector(BaseEstimator, TransformerMixin):

    def __init__( self, features ):
        self.features = features

    # return the same value as it is 
    def fit( self, X, y = None):
        return self

    # return the feature names for the columns
    def transform( self, X, y = None):
        return X[self.features]

#### 1.3.1. Categorical Pipeline

In [7]:

# convert the features to categorical features 
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes a boolean as its argument.
    def __init__(self, features_new=True):
        self.features_new = features_new

    # return the same object 
    def fit( self, X, y = None):
        return self

    # transformer method 
    def transform(self, X , y = None):
        df = X.copy()
        if self.features_new:
            # if work class is missing, replace unknown 
            df['workclass']= df['workclass'].replace('?','Unknown')
            # determine the native country as US and non US 
            df.loc[df['native_country'] != ' United-States', 'native_country'] = 'non_usa'

        # create categorical features 
        for name in df.columns.to_list():
            temp_col = pd.Categorical(df[name])
            df[name] = temp_col.codes

        # return array 
        return df

In [8]:
# define the parameters 
seed = 42
folds = 10
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

In [9]:
# create training and testing dataset 
X_train, X_test, y_train, y_test = train_test_split(income_data.drop(labels="high_income",axis=1),
                                                    income_data["high_income"],
                                                    test_size=0.20,
                                                    random_state=seed,
                                                    shuffle=True,
                                                    stratify=income_data["high_income"])

In [10]:
# extract the categorical features 
categoricalfeatures = income_data.select_dtypes("object").columns.to_list()

# extract the numerical features 
numericalfeatures = income_data.select_dtypes("int64").columns.to_list()

# setup the categorical pipeline 
categoricalpipeline = Pipeline(steps = [('cat_selector', FeatureSelector(categoricalfeatures)),
                                         ('cat_transformer', CategoricalTransformer())])

# setup the numerical pipeline
numericalpipeline = Pipeline(steps = [('num_selector', FeatureSelector(numericalfeatures)),
                                       ('std_scaler', MinMaxScaler())])

# combine the pipeline 
full_pipeline_preprocessing = FeatureUnion(transformer_list = [('categorical_pipeline', categoricalpipeline),
                                                               ('numerical_pipeline', numericalpipeline)])

In [11]:
# setup the full pipeline as an estimator 
pipe = Pipeline(steps = [("full_pipeline", full_pipeline_preprocessing),
                         ("fs", SelectKBest()),
                         ("clf", XGBClassifier())])

# define the hyperparameters 
searchspace = [
                {"clf": [RandomForestClassifier()],
                 "clf__n_estimators": [800],
                 "clf__criterion": ["gini", "entropy"],
                 "clf__max_leaf_nodes": [300],
                 "clf__random_state": [seed],
                 "clf__oob_score": [True],
                 "fs__score_func": [chi2],
                 "fs__k": [10]},
                {"clf": [XGBClassifier()],
                 "clf__n_estimators": [300],
                 "clf__max_depth": [4],
                 "clf__learning_rate": [0.1],
                 "clf__random_state": [seed],
                 "clf__subsample": [1],
                 "clf__colsample_bytree": [1],
                 "fs__score_func":[chi2],
                 "fs__k":[13]}
]

# define kfold 
kfold = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)

# define the GridSearch object 
grid = GridSearchCV(estimator=pipe, 
                    param_grid=searchspace,
                    cv=kfold,
                    scoring=scoring,
                    return_train_score=True,
                    n_jobs=-1,
                    refit="AUC")

tmp = time.time()

# fit the model 
bestmodel = grid.fit(X_train, y_train)

# time elpased 
print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))  


CPU Training Time: 268.29627561569214 seconds


In [12]:
print("Best: %f using %s" % (bestmodel.best_score_,bestmodel.best_params_))  

Best: 0.923599 using {'clf': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1), 'clf__colsample_bytree': 1, 'clf__learning_rate': 0.1, 'clf__max_depth': 4, 'clf__n_estimators': 300, 'clf__random_state': 42, 'clf__subsample': 1, 'fs__k': 13, 'fs__score_func': <function chi2 at 0x7f7d69009ae8>}


In [13]:
# analyse the results 
result = pd.DataFrame(bestmodel.cv_results_)
result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__criterion,param_clf__max_leaf_nodes,param_clf__n_estimators,param_clf__oob_score,param_clf__random_state,param_fs__k,param_fs__score_func,param_clf__colsample_bytree,param_clf__learning_rate,param_clf__max_depth,param_clf__subsample,params,split0_test_AUC,split1_test_AUC,split2_test_AUC,split3_test_AUC,split4_test_AUC,split5_test_AUC,split6_test_AUC,split7_test_AUC,split8_test_AUC,split9_test_AUC,mean_test_AUC,std_test_AUC,rank_test_AUC,split0_train_AUC,split1_train_AUC,split2_train_AUC,split3_train_AUC,split4_train_AUC,split5_train_AUC,split6_train_AUC,split7_train_AUC,split8_train_AUC,split9_train_AUC,mean_train_AUC,std_train_AUC,split0_test_Accuracy,split1_test_Accuracy,split2_test_Accuracy,split3_test_Accuracy,split4_test_Accuracy,split5_test_Accuracy,split6_test_Accuracy,split7_test_Accuracy,split8_test_Accuracy,split9_test_Accuracy,mean_test_Accuracy,std_test_Accuracy,rank_test_Accuracy,split0_train_Accuracy,split1_train_Accuracy,split2_train_Accuracy,split3_train_Accuracy,split4_train_Accuracy,split5_train_Accuracy,split6_train_Accuracy,split7_train_Accuracy,split8_train_Accuracy,split9_train_Accuracy,mean_train_Accuracy,std_train_Accuracy
0,16.496329,0.164926,0.787073,0.00854,"RandomForestClassifier(bootstrap=True, ccp_alp...",gini,300.0,800,True,42,10,<function chi2 at 0x7f7d69009ae8>,,,,,"{'clf': RandomForestClassifier(bootstrap=True,...",0.909508,0.913253,0.919141,0.918046,0.91899,0.916836,0.90872,0.914634,0.914643,0.915146,0.914892,0.003443,3,0.940532,0.939881,0.939487,0.93993,0.939755,0.939492,0.940409,0.93962,0.939633,0.939696,0.939843,0.000343,0.857198,0.857582,0.859501,0.872553,0.864875,0.857198,0.855662,0.860653,0.857143,0.855223,0.859759,0.005028,2,0.884486,0.886747,0.885893,0.885254,0.88568,0.885296,0.884528,0.885126,0.885472,0.884533,0.885301,0.000671
1,18.480956,0.123196,0.826283,0.01586,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,300.0,800,True,42,10,<function chi2 at 0x7f7d69009ae8>,,,,,"{'clf': RandomForestClassifier(bootstrap=True,...",0.909143,0.913503,0.921321,0.919094,0.919162,0.9178,0.90916,0.915895,0.914637,0.915794,0.915551,0.003887,2,0.941352,0.940948,0.940702,0.940749,0.940924,0.940845,0.941429,0.940772,0.94079,0.940961,0.940947,0.000237,0.857965,0.855662,0.859501,0.873704,0.862956,0.856046,0.855662,0.859117,0.856759,0.856759,0.859413,0.005221,3,0.878215,0.87941,0.880049,0.877874,0.879111,0.878983,0.878514,0.877959,0.879202,0.878007,0.878732,0.000692
2,5.560473,0.198178,0.09921,0.001732,"XGBClassifier(base_score=0.5, booster='gbtree'...",,,300,,42,13,<function chi2 at 0x7f7d69009ae8>,1.0,0.1,4.0,1.0,"{'clf': XGBClassifier(base_score=0.5, booster=...",0.923436,0.926076,0.933418,0.929958,0.927379,0.92717,0.916369,0.916896,0.917993,0.917296,0.923599,0.005817,1,0.942508,0.942237,0.941621,0.941728,0.941654,0.942172,0.942529,0.941495,0.941915,0.941105,0.941896,0.000438,0.867179,0.869098,0.876008,0.878695,0.866027,0.869866,0.856046,0.85643,0.853303,0.852919,0.864557,0.008885,1,0.884486,0.885552,0.885595,0.88295,0.884614,0.886363,0.884656,0.884017,0.885386,0.884064,0.884768,0.000933


In [14]:
# ROC 
result[result.rank_test_AUC == 1][['mean_train_AUC', 'std_train_AUC','mean_test_AUC', 'std_test_AUC']]

Unnamed: 0,mean_train_AUC,std_train_AUC,mean_test_AUC,std_test_AUC
2,0.941896,0.000438,0.923599,0.005817


In [15]:
# AUC results 
result_auc = result[['mean_train_AUC', 'std_train_AUC','mean_test_AUC', 'std_test_AUC']]
result_auc

Unnamed: 0,mean_train_AUC,std_train_AUC,mean_test_AUC,std_test_AUC
0,0.939843,0.000343,0.914892,0.003443
1,0.940947,0.000237,0.915551,0.003887
2,0.941896,0.000438,0.923599,0.005817


In [16]:
# accuracy results 
result_acc = result[['mean_train_Accuracy', 'std_train_Accuracy','mean_test_Accuracy', 'std_test_Accuracy']]
result_acc

Unnamed: 0,mean_train_Accuracy,std_train_Accuracy,mean_test_Accuracy,std_test_Accuracy
0,0.885301,0.000671,0.859759,0.005028
1,0.878732,0.000692,0.859413,0.005221
2,0.884768,0.000933,0.864557,0.008885


In [17]:
# selecting the best model.
with open('pipe.pkl', 'wb') as file:
    pickle.dump(bestmodel, file)

In [18]:
# restore the best model.
with open("pipe.pkl", "rb") as file:
    bestmodel = pickle.load(file)

In [19]:
# test the model 
predict = bestmodel.predict(X_test)
print("Accuracy of testing: ", accuracy_score(y_test, predict), "\n")  # 0.8765545831414094 (GPU) | 0.8747121142330723 (CPU)
print("Confusion Matrix:\n", confusion_matrix(y_test,predict), "\n")
print("Classification report:\n", classification_report(y_test,predict))

Accuracy of testing:  0.8747121142330723 

Confusion Matrix:
 [[4652  293]
 [ 523 1045]] 

Classification report:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92      4945
           1       0.78      0.67      0.72      1568

    accuracy                           0.87      6513
   macro avg       0.84      0.80      0.82      6513
weighted avg       0.87      0.87      0.87      6513

