# Import Libraries and Dataset

In [1]:
import pandas as pd

In [2]:
data=pd.read_csv('../data/encoded_data.csv')

In [3]:
data

Unnamed: 0,Age,Gender,Prakriti,Diet,Food Preferences,Meal Timing,Exercise,Sleep Hours/Night,Sleep Quality,Daily Routine,Stress Level,Diabetes Diagnosis,Family History of Diabetes
0,0.063492,0,2,3,475,0,0,0.285714,1,1,0.50,0,1
1,0.285714,0,2,1,705,3,1,0.428571,0,1,0.00,1,1
2,0.126984,0,0,1,242,2,0,0.428571,0,1,0.25,0,0
3,0.317460,1,0,3,643,3,0,0.428571,1,1,0.50,0,0
4,0.714286,1,4,3,705,3,0,0.714286,1,1,0.00,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3558,0.523810,0,4,3,701,2,0,0.285714,1,0,0.25,0,0
3559,0.761905,1,4,3,605,2,1,0.428571,2,0,0.50,0,1
3560,0.190476,1,1,3,693,1,1,0.428571,1,1,0.25,0,0
3561,0.809524,1,3,0,705,2,1,0.142857,2,0,0.00,0,1


# Splitting Data into Training and Testing set

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X=data.drop(columns=['Diabetes Diagnosis'])

In [6]:
X

Unnamed: 0,Age,Gender,Prakriti,Diet,Food Preferences,Meal Timing,Exercise,Sleep Hours/Night,Sleep Quality,Daily Routine,Stress Level,Family History of Diabetes
0,0.063492,0,2,3,475,0,0,0.285714,1,1,0.50,1
1,0.285714,0,2,1,705,3,1,0.428571,0,1,0.00,1
2,0.126984,0,0,1,242,2,0,0.428571,0,1,0.25,0
3,0.317460,1,0,3,643,3,0,0.428571,1,1,0.50,0
4,0.714286,1,4,3,705,3,0,0.714286,1,1,0.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3558,0.523810,0,4,3,701,2,0,0.285714,1,0,0.25,0
3559,0.761905,1,4,3,605,2,1,0.428571,2,0,0.50,1
3560,0.190476,1,1,3,693,1,1,0.428571,1,1,0.25,0
3561,0.809524,1,3,0,705,2,1,0.142857,2,0,0.00,1


In [7]:
y=data['Diabetes Diagnosis']

In [8]:
y

0       0
1       1
2       0
3       0
4       1
       ..
3558    0
3559    0
3560    0
3561    0
3562    0
Name: Diabetes Diagnosis, Length: 3563, dtype: int64

In [9]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

# Build and Train the Gradient Boosting Model

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

In [11]:
gb_model=GradientBoostingClassifier()

In [12]:
gb_model.fit(X_train,y_train)

In [13]:
y_pred=gb_model.predict(X_test)

In [14]:
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score,roc_auc_score

In [15]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.7194950911640954
Precision: 0.7315914489311164
Recall: 0.779746835443038
F1 Score: 0.7549019607843137


In [16]:
y_prob = gb_model.predict_proba(X_test)[:, 1]

In [17]:
print("AUC-ROC:", roc_auc_score(y_test, y_prob))

AUC-ROC: 0.7894753602420189


# Hyperparameter Tuning

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10]
}

In [20]:
grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5, scoring='f1', verbose=1)

In [21]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [22]:
print("Best Parameters:", grid_search.best_params_)
optimized_gb_model = grid_search.best_estimator_

Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 100}


# Evaluate the Optimized Model

In [23]:
y_pred_opt = optimized_gb_model.predict(X_test)

In [24]:
y_prob_opt = optimized_gb_model.predict_proba(X_test)[:, 1]

In [25]:
print("Accuracy:", accuracy_score(y_test, y_pred_opt))
print("Precision:", precision_score(y_test, y_pred_opt))
print("Recall:", recall_score(y_test, y_pred_opt))
print("F1 Score:", f1_score(y_test, y_pred_opt))
print("AUC-ROC:", roc_auc_score(y_test, y_prob_opt))

Accuracy: 0.7012622720897616
Precision: 0.7166666666666667
Recall: 0.7620253164556962
F1 Score: 0.7386503067484662
AUC-ROC: 0.7832656635618183
