In [None]:
##Importing Packages for Data Manipulation and Analysis##
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import SelectFromModel
import statsmodels.api as sm
import statsmodels.stats.api as sms

##Importing Packages for Data Visualization##
import seaborn as sns
sns.set()  #Apply seaborn's default theme to every figure drawn below.
sns.set_style('whitegrid')  #Then switch the axes style to 'whitegrid'.
import matplotlib.pyplot as plt

%matplotlib inline

-- Data Project Goal: Predict the onset of diabetes based on diagnostic measures --

In [None]:
##Loading the PIMA Population Diabetes Dataset - obtained from https://www.kaggle.com/uciml/pima-indians-diabetes-database##
pima_csv_path = r'../input/pima-indians-diabetes-database/diabetes.csv'
pima_diabetes_data = pd.read_csv(pima_csv_path)

##General overview of the columns (dtypes and non-null counts)##
#9 columns (8 predictors, 1 response (Outcome)); 768 non-null observations for all columns.
pima_diabetes_data.info()

In [None]:
##The train-test split is applied before the exploratory data analysis segment, so the test rows are not inspected##

##Separating y (response) from the X variables (predictors)
y = pima_diabetes_data[['Outcome']]
X = pima_diabetes_data.drop(columns=['Outcome'])

##Training (80%) and test (20%) sets, stratified on y so both keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    test_size=0.20,
    stratify=y,
    random_state=2021,
)

Step 1: Reviewing Training Dataset and Applying Cleaning if Necessary

In [None]:
##Overview of Columns of Training Dataset##
##X_train Column Characteristics
X_train.info() #614 Observations, 7 columns.

##y_train Column Characteristics
y_train.info() #614 Observations, 1 columns.

#Therefore, the test data portion comprises of 154 observations.

In [None]:
##Cross-checking for any null values (double checking##
print(X_train.isnull().sum()) #No null values in training predictor set.
print(y_train.isnull().sum()) #No null values in training target set.

Step 2. Performing Exploratory Data Analysis

In [None]:
##Determining Descriptive Statistics for Training Predictor Set##
X_train.describe().round(1) ##Rounding descriptive statistics to 1 decimal place.

In [None]:
##Determining Descriptive Statistics for Training Target Set##
y_train_cat = y_train.astype('category') #Creating a variable within which y_train is converted to a categorical datatype.

y_train_cat.describe() #Describing count split; The top occurence is 0, which corresponds to patients with no onset of diabetes. 400 out of 614 patients (65%) had no indicated onset of diabetes in this dataset.

In [None]:
##Creating distribution plots for all predictor variables (all are continuous numerical variables)##
print(X_train.columns) #Printing column names for reference.

In [None]:
##Histogram for 'Pregnancies' Predictor Variable.
plt.hist(X_train['Pregnancies'], color='green');
plt.xlabel('Number of Times Pregnant')
plt.ylabel('Patient Count')
plt.title("Distribution of 'Pregnancies' Predictor Variable");

In [None]:
##Histogram for 'Glucose' Predictor Variable.
plt.hist(X_train['Glucose'], color='green');
plt.xlabel('Plasma glucose concentration after tolerance test')
plt.ylabel('Patient Count')
plt.title("Distribution of 'Glucose' Predictor Variable");

In [None]:
##Histogram for 'BloodPressure' Predictor Variable.
plt.hist(X_train['BloodPressure'], color='green');
plt.xlabel('Diastolic blood pressure (mm Hg)')
plt.ylabel('Patient Count')
plt.title("Distribution of 'BloodPressure' Predictor Variable");

In [None]:
##Histogram for 'SkinThickness' Predictor Variable.
plt.hist(X_train['SkinThickness'], color='green');
plt.xlabel('Triceps skin fold thickness (mm)')
plt.ylabel('Patient Count')
plt.title("Distribution of 'SkinThickness' Predictor Variable");

In [None]:
##Histogram for 'Insulin' Predictor Variable.
plt.hist(X_train['Insulin'], color='green');
plt.xlabel('2-Hour serum insulin (mu U/ml)')
plt.ylabel('Patient Count')
plt.title("Distribution of 'Insulin' Predictor Variable");

In [None]:
##Histogram for 'BMI' Predictor Variable.
plt.hist(X_train['BMI'], color='green');
plt.xlabel('Body mass index (weight in kg/(height in m)^2)')
plt.ylabel('Patient Count')
plt.title("Distribution of 'BMI' Predictor Variable");

In [None]:
##Histogram for 'Diabetes pedigree function' Predictor Variable.
plt.hist(X_train['DiabetesPedigreeFunction'], color='green');
plt.xlabel('Diabetes pedigree function')
plt.ylabel('Patient Count')
plt.title("Distribution of 'Diabetespedigreefunction' Predictor Variable");

In [None]:
##Histogram for 'Age' Predictor Variable.
plt.hist(X_train['Age'], color='green');
plt.xlabel('Age(years)')
plt.ylabel('Patient Count')
plt.title("Distribution of 'Age' Predictor Variable");

In [None]:
##Creating Boxplots to display relative difference in means of each variable between the Occurrence (1) group and Non-Ocurrence (0) groups

##Creating a combined training dataframe dedicated to making these subplots.
combined_training_data = pd.concat([X_train, y_train], axis=1)
combined_training_data.head()

In [None]:
##Creating Series of Subplots
f, axes = plt.subplots(4,2,figsize=(20,15))

sns.boxplot(x='Outcome', y='Pregnancies', data=combined_training_data, orient='v', ax=axes[0,0])
sns.boxplot(x='Outcome', y='Glucose', data=combined_training_data, orient='v', ax=axes[0,1])
sns.boxplot(x='Outcome', y='BloodPressure', data=combined_training_data, orient='v', ax=axes[1,0])
sns.boxplot(x='Outcome', y='SkinThickness', data=combined_training_data, orient='v', ax=axes[1,1])
sns.boxplot(x='Outcome', y='Insulin', data=combined_training_data, orient='v', ax=axes[2,0])
sns.boxplot(x='Outcome', y='BMI', data=combined_training_data, orient='v', ax=axes[2,1])
sns.boxplot(x='Outcome', y='DiabetesPedigreeFunction', data=combined_training_data, orient='v', ax=axes[3,0])
sns.boxplot(x='Outcome', y='Age', data=combined_training_data, orient='v', ax=axes[3,1]);

In [None]:
##Building a Pearson/Point Biserial Correlation Matrix for the Training Data##
corr_combined = combined_training_data
act_corr = corr_combined.corr()

#Boolean mask that hides the lower triangle and the diagonal, leaving each pair shown once.
#It is built from the matrix shape rather than from the correlation values themselves,
#so a coefficient that happens to be exactly 0 is still masked.
matrix = np.tril(np.ones_like(act_corr, dtype=bool))

f, ax = plt.subplots(figsize=(15,12))
sns.heatmap(act_corr, vmax=0.8, annot=True, mask=matrix, ax=ax);

Step 3: Performing Data Modelling. 
    
    The following models will be attempted: Random Forest Classification, Gradient Boost Classification, Extreme Gradient Boost Classification.

Step 3a: Random Forest Classification Modelling

In [None]:
##Importing Random Forest Classifier##
from sklearn.ensemble import RandomForestClassifier

In [None]:
##Applying Recursive Feature Elimination (RFE) with cross-validation for Random Forest Classification feature selection.##
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=2021)

##Identification of the optimal number of features with the RFECV approach (3 stratified folds, scored by balanced accuracy).
opt_feat_num_rfecv = RFECV(estimator = rf_classifier, step=1, cv=StratifiedKFold(3), scoring='balanced_accuracy', min_features_to_select=1)

opt_feat_num_rfecv.fit(X_train, np.ravel(y_train))
print("Optimal number of features selected using RFECV: %d"%opt_feat_num_rfecv.n_features_) #6 out of 8 selected as important.

#Mean cross-validation score for each candidate number of features.
#scikit-learn >= 1.0 exposes it as cv_results_['mean_test_score']; the older grid_scores_
#attribute was removed in 1.2 and is only used as a fallback for old installations.
if hasattr(opt_feat_num_rfecv, 'cv_results_'):
    rf_rfecv_mean_scores = opt_feat_num_rfecv.cv_results_['mean_test_score']
else:
    rf_rfecv_mean_scores = opt_feat_num_rfecv.grid_scores_

#Plot reference cited from: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html
#The x-axis starts at 1 because min_features_to_select=1 and step=1.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (balanced accuracy)")
plt.plot(range(1, len(rf_rfecv_mean_scores) + 1), rf_rfecv_mean_scores)
plt.show()

In [None]:
##Training the random forest classifier with optimal number of features already identified.
rfe_classifier = RFE(estimator=rf_classifier, n_features_to_select=6, step=1)
rfe_classifier.fit(X_train, np.ravel(y_train))

In [None]:
##Determining features of highest importance for the random forest model.
rf_feat = pd.DataFrame()
rf_feat['feature_name'] = X_train.columns
rf_feat['importance'] = rfe_classifier.support_
print(rfe_classifier.ranking_)
rf_feat

In [None]:
##Only columns found to have  importance to the random forest model via RFECV.
X_train_reduced = X_train.filter(['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'])
X_test_reduced = X_test.filter(['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'])

In [None]:
##Building Random Forest Classification Model with Selected Variables
model1_varimp = RandomForestClassifier(n_estimators=100, random_state=2021).fit(X_train_reduced, np.ravel(y_train))

In [None]:
##Cross-Validation Accuracy Score
model1_cvs= cross_val_score(model1_varimp, X_train_reduced, np.ravel(y_train), cv=StratifiedKFold(3))
model1_cvs.mean() #0.7622588872947552

In [None]:
#Response Prediction
y_pred = model1_varimp.predict(X_test_reduced)

In [None]:
#Creating classification report for random forest.
print(classification_report(y_test, y_pred))

In [None]:
#Creating confusion matrix for logistic regression model. True negatives (TN) are in the upper-left position, False Negatives (FN) are in the lower-left position, False Positives (FP) are in the upper-right position, True Positives (TP) are in the lower-right position.
confusion_matrix(y_test, y_pred)

In [None]:
#Determining AUC score for random forest model.
roc_auc_score(y_test, y_pred) #0.7077777777777777

In [None]:
#Determining F1 score for the random forest model
f1_score(y_test, y_pred,average='binary') #0.6122448979591836; a poor F1 score, is close to 0.0. Best F1 score is close to 1.

In [None]:
##Checking for multicollinearity with variance inflation factors (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

#A constant column is prepended to the reduced training predictors; this frame is for evaluating VIF only.
X_train_constant_vif = sm.add_constant(X_train_reduced)
vif_design_matrix = X_train_constant_vif.values

#One VIF per column of the design matrix, the constant included.
vif = []
for column_position in range(vif_design_matrix.shape[1]):
    vif.append(variance_inflation_factor(vif_design_matrix, column_position))

#Entry 0 belongs to the constant and is left out of the table.
#Multicollinearity is interpreted as high when VIF > 5. All were found to be below 5 (no multicollinearity issues indicated).
pd.DataFrame({'vif': vif[1:]}, index=X_train_reduced.columns).T


In [None]:
##Generating values for feature importance plot.
reduced_list_rf = list(['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'])

#Numerical Importance of Predictors
rfr_importance_rf = list(model1_varimp.feature_importances_)

#Merged and Sorted with Predictors of importance
var_importance_merge_rf = [(predictor,round(importance,2)) for predictor, importance in zip(reduced_list_rf,rfr_importance_rf)]

var_importance_merge_rf = sorted(var_importance_merge_rf, key = lambda x: x[1], reverse = True)

print(var_importance_merge_rf)

In [None]:
##Plotting feature importance.
df_importance_rf = pd.DataFrame(var_importance_merge_rf, columns = ['PREDICTOR','IMPORTANCE_LEVEL'])

#Predictor Rank Plot
sns.catplot(x="IMPORTANCE_LEVEL", y='PREDICTOR', data = df_importance_rf, kind = "bar", height =14)



In [None]:
##Hypertuning with GridSearchCV
param_grid = {
    'max_depth':[6,9,12],
    'min_samples_split':[5,10,15],
    'n_estimators':[80,100,120]
}

rf_gscv = GridSearchCV(estimator = rf_classifier, param_grid = param_grid, cv=StratifiedKFold(3), n_jobs=-1, verbose = 2)

In [None]:
##Fitting GSCV with training data 
rf_gscv.fit(X_train_reduced, np.ravel(y_train))

In [None]:
##Extracting best params from GSCV
rf_gscv.best_params_ #{'max_depth': 6, 'min_samples_split': 5, 'n_estimators': 80}

In [None]:
##Re-fitting a second random forest classification model with hypertuned parameters
model_rf_final= RandomForestClassifier(n_estimators=80, max_depth=6, min_samples_split=5, random_state=2021).fit(X_train_reduced, np.ravel(y_train))

In [None]:
##Cross-Validation Accuracy Score
model_rf_final_cvs = cross_val_score(model_rf_final, X_train_reduced, np.ravel(y_train), cv=StratifiedKFold(3))
model_rf_final_cvs.mean() #0.7720149848557308

In [None]:
##Response Prediction
y_pred_rf_final = model_rf_final.predict(X_test_reduced)

In [None]:
##Determining Test Accuracy Score
accuracy_score(y_test, y_pred_rf_final)#0.7857142857142857

In [None]:
##Creating classification report for random forest classification.
print(classification_report(y_test, y_pred_rf_final))

In [None]:
##Creating confusion matrix for random forest classification model. True negatives (TN) are in the upper-left position, False Negatives (FN) are in the lower-left position, False Positives (FP) are in the upper-right position, True Positives (TP) are in the lower-right position.
confusion_matrix(y_test, y_pred_rf_final)

In [None]:
##Determining AUC score for random forest classification model.
roc_auc_score(y_test, y_pred_rf_final) #0.7370370370370369

In [None]:
#Determining F1 score for the for random forest classification model.
f1_score(y_test, y_pred_rf_final,average='binary') #0.6526315789473683; a poor F1 score, is close to 0.0. Best F1 score is close to 1.

In [None]:
##Generating values for feature importance plot.
reduced_list_rf_final = list(['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'])

#Numerical Importance of Predictors
rfr_importance_rf_final = list(model_rf_final.feature_importances_)

#Merged and Sorted with Predictors of importance
var_importance_merge_rf_final= [(predictor,round(importance,2)) for predictor, importance in zip(reduced_list_rf_final,rfr_importance_rf_final)]

var_importance_merge_rf_final = sorted(var_importance_merge_rf_final, key = lambda x: x[1], reverse = True)

print(var_importance_merge_rf_final)

In [None]:
##Plotting feature importance.
df_importance_rf_final = pd.DataFrame(var_importance_merge_rf_final, columns = ['PREDICTOR','IMPORTANCE_LEVEL'])

#Predictor Rank Plot
sns.catplot(x="IMPORTANCE_LEVEL", y='PREDICTOR', data = df_importance_rf_final, kind = "bar", height =14)

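**The final metrics obtained for this Random Forest Classification Model (test-set confusion matrix: TN = 90, FP = 10, FN = 23, TP = 31) were:
    1. Final Test Score (accuracy): (90+31)/154 = 0.79;
    2. Sensitivity: TP/(TP+FN) = (31/(31+23)) = 0.57 (the ratio 90/(90+23) = 0.80 is the negative predictive value, TN/(TN+FN));
    3. Specificity: TN/(TN+FP) = (90/(90+10)) = 0.90 (the ratio 31/(31+10) = 0.76 is the precision, TP/(TP+FP));
    4. AUC Score (computed from the hard class predictions): 0.74;
    5. F1 Score: 0.65;**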

Step 3b:Gradient Boost Classification Modelling

In [None]:
##Importing Gradient Boosting Classifier##
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
#Building GB Classification Model for Sklearn Prediction
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=2021).fit(X_train, np.ravel(y_train))

##Identification of the optimal number of features with the RFECV approach (3 stratified folds, scored by balanced accuracy).
opt_gb_rfecv = RFECV(estimator = gb_classifier, step=1, cv=StratifiedKFold(3), scoring='balanced_accuracy', min_features_to_select=1)

opt_gb_rfecv.fit(X_train, np.ravel(y_train))
print("Optimal number of features selected using RFECV: %d"%opt_gb_rfecv.n_features_) #6 out of 8 selected as important.

#Mean cross-validation score for each candidate number of features.
#scikit-learn >= 1.0 exposes it as cv_results_['mean_test_score']; the older grid_scores_
#attribute was removed in 1.2 and is only used as a fallback for old installations.
if hasattr(opt_gb_rfecv, 'cv_results_'):
    gb_rfecv_mean_scores = opt_gb_rfecv.cv_results_['mean_test_score']
else:
    gb_rfecv_mean_scores = opt_gb_rfecv.grid_scores_

#Plot reference cited from: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html
#The x-axis starts at 1 because min_features_to_select=1 and step=1.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (balanced accuracy)")
plt.plot(range(1, len(gb_rfecv_mean_scores) + 1), gb_rfecv_mean_scores)
plt.show()

In [None]:
##Training the gradient boost classifier with optimal number of features already identified.
gb_rfe_classifier = RFE(estimator=gb_classifier, n_features_to_select=6, step=1)
gb_rfe_classifier.fit(X_train, np.ravel(y_train))

In [None]:
#Determining features of highest importance for the gradient boost model.
gb_feat = pd.DataFrame()
gb_feat['feature_name'] = X_train.columns
gb_feat['importance'] = gb_rfe_classifier.support_
print(gb_rfe_classifier.ranking_)
gb_feat

In [None]:
##Only columns found to have  importance to the gradient boost model via RFECV.
X_train_reduced_gb = X_train.filter(['Pregnancies', 'Glucose','Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'])
X_test_reduced_gb = X_test.filter(['Pregnancies', 'Glucose','Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'])

In [None]:
##Building Gradient Boost Classification Model with Selected Variables
model2_varimp = GradientBoostingClassifier(n_estimators=100, random_state=2021).fit(X_train_reduced_gb, np.ravel(y_train))

In [None]:
##Cross-Validation Accuracy Score
gb_model_cvs = cross_val_score(model2_varimp, X_train_reduced_gb, np.ravel(y_train), cv=StratifiedKFold(3))
gb_model_cvs.mean() #0.7704128805993943

In [None]:
##Response Prediction
y_pred_gb = model2_varimp.predict(X_test_reduced_gb)

In [None]:
##Determining test accuracy score
accuracy_score(y_test, y_pred_gb) #0.7857142857142857

In [None]:
##Creating classification report for GB Classification Model 
print(classification_report(y_test, y_pred_gb))

In [None]:
##Creating confusion matrix for GB Classification Model. True negatives (TN) are in the upper-left position, False Negatives (FN) are in the lower-left position, False Positives (FP) are in the upper-right position, True Positives (TP) are in the lower-right position.
confusion_matrix(y_test, y_pred_gb)

In [None]:
##Determining AUC score for the GB Classification Model.
roc_auc_score(y_test, y_pred_gb) #0.7412962962962963

In [None]:
##Determining F1 score for the GB Classification Model.
f1_score(y_test, y_pred_gb,average='binary') #0.6597938144329897; a poor F1 score, is close to 0.0. Best F1 score is close to 1.

In [None]:
##Checking for multicollinearity with variance inflation factors (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

#A constant column is prepended to the reduced training predictors; this frame is for evaluating VIF only.
X_train_constant_vif = sm.add_constant(X_train_reduced_gb)
vif_design_matrix = X_train_constant_vif.values

#One VIF per column of the design matrix, the constant included.
vif = []
for column_position in range(vif_design_matrix.shape[1]):
    vif.append(variance_inflation_factor(vif_design_matrix, column_position))

#Entry 0 belongs to the constant and is left out of the table.
#Multicollinearity is interpreted as high when VIF > 5. All were found to be below 5 (no multicollinearity issues indicated).
pd.DataFrame({'vif': vif[1:]}, index=X_train_reduced_gb.columns).T


In [None]:
##Generating values for feature importance plot.
reduced_list_gb = list(['Pregnancies', 'Glucose','Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'])

#Numerical Importance of Predictors
importance_gb = list(model2_varimp.feature_importances_)

#Merged and Sorted with Predictors of importance
var_importance_merge_gb = [(predictor,round(importance,2)) for predictor, importance in zip(reduced_list_gb,importance_gb)]

var_importance_merge_gb = sorted(var_importance_merge_gb, key = lambda x: x[1], reverse = True)

print(var_importance_merge_gb)

In [None]:
##Plotting feature importance.
df_importance_gb = pd.DataFrame(var_importance_merge_gb, columns = ['PREDICTOR','IMPORTANCE_LEVEL'])

#Predictor Rank Plot
sns.catplot(x="IMPORTANCE_LEVEL", y='PREDICTOR', data = df_importance_gb, kind = "bar", height =14)


In [None]:
##Hypertuning with GridSearchCV
param_grid = {
    'max_depth':[6,9,12],
    'min_samples_split':[5,10,15],
    'n_estimators':[80,100,120]
}

gb_gscv = GridSearchCV(estimator = gb_classifier, param_grid = param_grid, cv=StratifiedKFold(3), n_jobs=-1, verbose = 2)

In [None]:
##Fitting GSCV with training data 
gb_gscv.fit(X_train_reduced_gb, np.ravel(y_train))

In [None]:
##Extracting best params from GSCV
gb_gscv.best_params_ #{'max_depth': 6, 'min_samples_split': 10, 'n_estimators': 120}

In [None]:
##Re-fitting a second gradient boosting classification model with hypertuned parameters
model_gb_final= GradientBoostingClassifier(n_estimators=120, max_depth=6, min_samples_split=10, random_state=2021).fit(X_train_reduced_gb, np.ravel(y_train))

In [None]:
##Cross-Validation Accuracy Score
model_gb_final_cvs = cross_val_score(model_gb_final, X_train_reduced_gb, np.ravel(y_train), cv=StratifiedKFold(3))
model_gb_final_cvs.mean() #0.7573489558424996

In [None]:
##Response Prediction
y_pred_gb_final = model_gb_final.predict(X_test_reduced_gb)

In [None]:
##Determining Test Score
accuracy_score(y_test, y_pred_gb_final)#0.7922077922077922

In [None]:
##Creating classification report for gradient boosting classification.
print(classification_report(y_test, y_pred_gb_final))

In [None]:
##Creating confusion matrix for gradient boosting model. True negatives (TN) are in the upper-left position, False Negatives (FN) are in the lower-left position, False Positives (FP) are in the upper-right position, True Positives (TP) are in the lower-right position.
confusion_matrix(y_test, y_pred_gb_final)

In [None]:
##Determining AUC score for gradient boosting classification model.
roc_auc_score(y_test, y_pred_gb_final) #0.7633333333333332

In [None]:
#Determining F1 score for the for random forest classification model.
f1_score(y_test, y_pred_gb_final,average='binary') #0.6923076923076923; a poor F1 score, is close to 0.0. Best F1 score is close to 1.

In [None]:
##Generating values for feature importance plot.
reduced_list_gb_final = list(['Pregnancies', 'Glucose','Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'])

#Numerical Importance of Predictors
importance_gb_final = list(model_gb_final.feature_importances_)

#Merged and Sorted with Predictors of importance
var_importance_merge_gb_final = [(predictor,round(importance,2)) for predictor, importance in zip(reduced_list_gb_final,importance_gb_final)]

var_importance_merge_gb_final = sorted(var_importance_merge_gb_final, key = lambda x: x[1], reverse = True)

print(var_importance_merge_gb_final)

In [None]:
##Plotting feature importance.
df_importance_gb_final= pd.DataFrame(var_importance_merge_gb_final, columns = ['PREDICTOR','IMPORTANCE_LEVEL'])

#Predictor Rank Plot
sns.catplot(x="IMPORTANCE_LEVEL", y='PREDICTOR', data = df_importance_gb_final, kind = "bar", height =14)


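**The final metrics obtained for this Gradient Boosting Classification Model (test-set confusion matrix: TN = 86, FP = 14, FN = 18, TP = 36) were: 1. Final Test Score (accuracy): (86+36)/154 = 0.79; 2. Sensitivity: TP/(TP+FN) = (36/(36+18)) = 0.67 (the ratio 86/(86+18) = 0.83 is the negative predictive value, TN/(TN+FN)); 3. Specificity: TN/(TN+FP) = (86/(86+14)) = 0.86 (the ratio 36/(36+14) = 0.72 is the precision, TP/(TP+FP)); 4. AUC Score (computed from the hard class predictions): 0.76; 5. F1 Score: 0.69**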

Step 3c: Extreme Gradient Boost Classification Modelling (similar to Gradient Boosting, but with a more computationally efficient implementation and additional regularization to combat overfitting while still aiming to reduce bias)

In [None]:
##Importing xgboost Classifier##
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
#Building XGB Classification Model for Sklearn Prediction
xgb_classifier = XGBClassifier(objective='binary:logistic',  eval_metric='logloss', use_label_encoder=False, n_estimators=100, random_state=2021).fit(X_train, np.ravel(y_train))

##Identification of the optimal number of features with the RFECV approach (3 stratified folds, scored by balanced accuracy).
opt_xgb_rfecv = RFECV(estimator = xgb_classifier, step=1, cv=StratifiedKFold(3), scoring='balanced_accuracy', min_features_to_select=1)

opt_xgb_rfecv.fit(X_train, np.ravel(y_train))
print("Optimal number of features selected using RFECV: %d"%opt_xgb_rfecv.n_features_) #All 8 selected as important.

#Mean cross-validation score for each candidate number of features.
#scikit-learn >= 1.0 exposes it as cv_results_['mean_test_score']; the older grid_scores_
#attribute was removed in 1.2 and is only used as a fallback for old installations.
if hasattr(opt_xgb_rfecv, 'cv_results_'):
    xgb_rfecv_mean_scores = opt_xgb_rfecv.cv_results_['mean_test_score']
else:
    xgb_rfecv_mean_scores = opt_xgb_rfecv.grid_scores_

#Plot reference cited from: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html
#The x-axis starts at 1 because min_features_to_select=1 and step=1.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (balanced accuracy)")
plt.plot(range(1, len(xgb_rfecv_mean_scores) + 1), xgb_rfecv_mean_scores)
plt.show()

In [None]:
##Training the gradient boost classifier with optimal number of features already identified.
xgb_rfe_classifier = RFE(estimator=xgb_classifier, n_features_to_select=8, step=1)
xgb_rfe_classifier.fit(X_train, np.ravel(y_train))

In [None]:
#Determining features of highest importance for the extreme gradient boost model.
xgb_feat = pd.DataFrame()
xgb_feat['feature_name'] = X_train.columns
xgb_feat['importance'] = xgb_rfe_classifier.support_
print(xgb_rfe_classifier.ranking_)
xgb_feat

In [None]:
##Building Extreme Gradient Boost Classification Model with all Variables
model3_varimp = XGBClassifier(objective='binary:logistic',  eval_metric='logloss', use_label_encoder=False, n_estimators=100, random_state=2021).fit(X_train, np.ravel(y_train))

In [None]:
##Cross-Validation Accuracy Score
xgb_model_cvs = cross_val_score(model3_varimp, X_train, np.ravel(y_train), cv=StratifiedKFold(3))
xgb_model_cvs.mean() #0.7736170891120676

In [None]:
##Response Prediction
y_pred_xgb = model3_varimp.predict(X_test)

In [None]:
##Determining test accuracy score
accuracy_score(y_test, y_pred_xgb) #0.7987012987012987

In [None]:
#Creating classification report for XGB classification model.
print(classification_report(y_test, y_pred_xgb))

In [None]:
#Creating confusion matrix for XGB classification model. True negatives (TN) are in the upper-left position, False Negatives (FN) are in the lower-left position, False Positives (FP) are in the upper-right position, True Positives (TP) are in the lower-right position.
confusion_matrix(y_test, y_pred_xgb)

In [None]:
#Determining AUC score for the XGB classification model.
roc_auc_score(y_test, y_pred_xgb) #0.764074074074074

In [None]:
#Determining F1 score for the XGB classification model
f1_score(y_test, y_pred_xgb,average='binary') #0.6930693069306931; a poor F1 score, is close to 0.0. Best F1 score is close to 1.

In [None]:
##Hypertuning parameters: max_depth, min_child_weight, eta.
#Code referenced as guide for tuning procedure: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
param_grid = {
    'max_depth':[6,8,10],
    'n_estimators':range(80,120,20),
    'learning_rate': [0.10,0.15,0.20]
}

In [None]:
##Employing GridSearchCV to identify optimal parameters based on specified range.
xgb_optmodel = xgb.XGBClassifier(random_state=2021)
optimal_params = GridSearchCV(xgb_optmodel, param_grid, verbose=0,n_jobs=-1, cv=StratifiedKFold(3))

In [None]:
##Determing optimal parameters
optimal_params.fit(X_train,np.ravel(y_train))
print(optimal_params.best_params_) #{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 80}

In [None]:
##Final Optimized Gradient Boosting Model Object
xgb_final = xgb.XGBClassifier(objective='binary:logistic',  eval_metric='logloss', use_label_encoder=False, n_estimators=80, max_depth=10,learning_rate=0.1, n_jobs=-1, random_state=2021)

In [None]:
##Fitting xgb_final on training data and predicting on test data
xgb_final.fit(X_train,np.ravel(y_train))

In [None]:
#Cross-Validation Accuracy Score
xgb_cvs = cross_val_score(xgb_final, X_train, np.ravel(y_train), cv=StratifiedKFold(3))
xgb_cvs.mean() #0.7524868483978957

In [None]:
##Response Prediction
xgb_pred_final = xgb_final.predict(X_test)

In [None]:
##Determining accuracy scores
accuracy_score(y_test, xgb_pred_final) #0.7597402597402597

In [None]:
##Creating classification report for XGB classification.
print(classification_report(y_test, xgb_pred_final))

In [None]:
##Creating confusion matrix for XGB classification model. True negatives (TN) are in the upper-left position, False Negatives (FN) are in the lower-left position, False Positives (FP) are in the upper-right position, True Positives (TP) are in the lower-right position.
confusion_matrix(y_test, xgb_pred_final)

In [None]:
##Determining AUC score for XGB classification model.
roc_auc_score(y_test, xgb_pred_final) #0.7383333333333333

In [None]:
#Determining F1 score for the for XGB classification model.
f1_score(y_test, xgb_pred_final,average='binary') #0.6605504587155963; a poor F1 score, is close to 0.0. Best F1 score is close to 1.

In [None]:
##Checking for multicollinearity with variance inflation factors (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

#A constant column is prepended to the full training predictors; this frame is for evaluating VIF only.
X_train_constant_vif = sm.add_constant(X_train)
vif_design_matrix = X_train_constant_vif.values

#One VIF per column of the design matrix, the constant included.
vif = []
for column_position in range(vif_design_matrix.shape[1]):
    vif.append(variance_inflation_factor(vif_design_matrix, column_position))

#Entry 0 belongs to the constant and is left out of the table.
#Multicollinearity is interpreted as high when VIF > 5. All were found to be below 5 (no multicollinearity issues indicated).
pd.DataFrame({'vif': vif[1:]}, index=X_train.columns).T


In [None]:
##Plotting features by ranked importance (training data), using importance_type='gain'.

#The booster is taken from the fitted xgb_final classifier, so the plot describes the model evaluated above.
#Previously a separate booster was trained through xgb.train with only learning_rate and max_depth set:
#no 'objective' (so XGBoost's default regression objective was used) and no seed. The importances
#plotted from it therefore did not belong to the tuned binary classifier.
xgb_finalimportance = xgb_final.get_booster()

xgb.plot_importance(xgb_finalimportance, importance_type='gain')

plt.show()

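**The final metrics obtained for this Extreme Gradient Boosting Classification Model (test-set confusion matrix: TN = 81, FP = 19, FN = 18, TP = 36) were: 1. Final Test Score (accuracy): (81+36)/154 = 0.76; 2. Sensitivity: TP/(TP+FN) = (36/(36+18)) = 0.67 (the ratio 81/(81+18) = 0.82 is the negative predictive value, TN/(TN+FN)); 3. Specificity: TN/(TN+FP) = (81/(81+19)) = 0.81 (the ratio 36/(36+19) = 0.65 is the precision, TP/(TP+FP)); 4. AUC Score (computed from the hard class predictions): 0.74; 5. F1 Score: 0.66**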

**Ultimately, among the tuned models, Gradient Boosting (not extreme) seems to have the strongest overall performance: it had the highest test accuracy score (0.79) and the highest sensitivity (ability to detect true positive occurrences, i.e. onset of diabetes; TP/(TP+FN) = 36/54 ≈ 0.67, tied with Extreme Gradient Boosting), whereas the random forest model had the strongest performance in terms of specificity (ability to determine a true negative occurrence, i.e. when onset of diabetes would not occur; TN/(TN+FP) = 90/100 = 0.90).**