### Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import RocCurveDisplay
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from xgboost import plot_tree
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
# Load the diabetes dataset; the path is relative, pointing at the Data
# directory one level above the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../Data/diabetes.csv')

### EDA

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

In [None]:
# Split the rows by label so the two groups can be inspected separately:
#   qqq -> rows where Outcome is 0
#   zzz -> rows where Outcome is 1
qqq = df.loc[df['Outcome'].eq(0)]
zzz = df.loc[df['Outcome'].eq(1)]

In [None]:
# plt.hist(qqq['Age']);

In [None]:
# plt.hist(zzz['Age']);

In [None]:
# Feature matrix: the three predictor columns used by every model below.
X = df.loc[:, ['Glucose', 'Age', 'Pregnancies']]
# Target vector: the Outcome label.
y = df.loc[:, 'Outcome']

### Running train_test_split

In [None]:
# Hold out 25% of the rows as the final test partition.  The split is seeded
# so the partition -- and every accuracy score and ROC curve derived from
# it -- is identical on each run, in line with the seed already used for the
# classifier and the KFold splitter.
X_train_whole, X_test_whole, y_train_whole, y_test_whole = train_test_split(
    X, y, test_size=.25, random_state=42
)
# DMatrix wrappers of the two partitions, for use with XGBoost's native API.
train = xgb.DMatrix(X_train_whole, label=y_train_whole)
test = xgb.DMatrix(X_test_whole, label=y_test_whole)

### Using XGBClassifier and fitting the model

In [None]:
# Control model: a single classifier fitted on the whole training partition.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
).fit(X_train_whole, y_train_whole)

# Report accuracy on the data the model was fitted on and on the held-out
# partition; a large gap between the two suggests overfitting.
print(
    "Training set accuracy score:",
    xgb_model.score(X_train_whole, y_train_whole),
)
print(
    "Test set accuracy score:",
    xgb_model.score(X_test_whole, y_test_whole),
)

# Per-class probabilities for the held-out rows, consumed by the ROC plot.
y_pred = xgb_model.predict_proba(X_test_whole)

### Graphing the ROC Curve

In [None]:
# Keep only the second column of the predict_proba output, i.e. the predicted
# probability of the positive class (presumably Outcome == 1 -- confirm via
# xgb_model.classes_).
pos_probs = y_pred[:,1]

In [None]:
# Draw the ROC curve for the control model from the true held-out labels and
# the positive-class probabilities.
RocCurveDisplay.from_predictions(y_test_whole, pos_probs)
# Replace the default axis labels and add a title on the axes just drawn.
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - Control',
)
plt.legend()
plt.show()

# ========================================================

### KFold

In [None]:
# 7-fold cross-validation over the training partition only; the held-out
# test partition (X_test_whole / y_test_whole) is not used in this cell.
kfold = KFold(n_splits=7, shuffle=True, random_state=42)

# Validation accuracy of each fold's model, in fold order.
scores = []

for train_index, test_index in kfold.split(X_train_whole):
    # The splitter yields positional indices, hence .iloc.
    X_train, X_test = X_train_whole.iloc[train_index], X_train_whole.iloc[test_index]
    y_train, y_test = y_train_whole.iloc[train_index], y_train_whole.iloc[test_index]

    # Fresh classifier per fold, seeded exactly like the control model so the
    # two are configured identically and differ only in their training rows.
    xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    xgb_model.fit(X_train, y_train)

    # Overwritten on every iteration; only the last fold's value survives.
    y_pred = xgb_model.predict_proba(X_test)

    scores.append(xgb_model.score(X_test, y_test))

# NOTE: after the loop, xgb_model and y_pred belong to the LAST fold only;
# nothing here averages or ensembles the seven fold models.
scores

In [None]:
# Score the held-out test partition with the current xgb_model.  When the
# cells run in order, that is the classifier fitted in the last KFold
# iteration, not an aggregate of all folds.
y_pred = xgb_model.predict_proba(X_test_whole)

### Graphing the ROC Curve for the KFold model

In [None]:
# Keep only the second column of the predict_proba output, i.e. the predicted
# probability of the positive class (presumably Outcome == 1 -- confirm via
# xgb_model.classes_).
pos_probs = y_pred[:,1]

In [None]:
# Draw the ROC curve from the true held-out labels and the positive-class
# probabilities currently stored in pos_probs.
RocCurveDisplay.from_predictions(y_test_whole, pos_probs)
# Replace the default axis labels and add a title on the axes just drawn.
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - KFold',
)
plt.legend()
plt.show()

In [None]:
# Render a tree of the current xgb_model (plot_tree's default tree selection).
plot_tree(xgb_model)
# Enlarge the figure that plot_tree drew on so the node labels are legible;
# the size is given in inches as (width, height).
fig = plt.gcf()
fig.set_size_inches((250, 100))