In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from google.colab import drive

In [4]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
cd /content/gdrive/My Drive/DiabetesPrediction

/content/gdrive/My Drive/DiabetesPrediction


In [6]:
data=pd.read_csv('Diabetes_dataset.csv')

In [7]:
data.drop(columns=['Income','Education','NoDocbcCost','AnyHealthcare'],inplace=True)

Splitting the data

In [8]:
y = data.iloc[:,0:1] # dependent variable
x = data.iloc[:,1:] # dependent variables

In [9]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=0)

**Model Building**

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

**Random Forest Classifier**

In [12]:
rf=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)

In [13]:
rf.fit(x_train,y_train)

In [14]:
pred=rf.predict(x_test)

In [16]:
accuracy_score(y_test,pred)*100

82.59355618627143

In [17]:
confusion_matrix(y_test,pred)

array([[60618,   181,  3193],
       [ 1184,    11,   163],
       [ 8475,    51,  2228]])

Cross Validation

In [18]:
from sklearn.model_selection import RandomizedSearchCV
rf = RandomForestClassifier()
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, cv=3, n_iter=10)
random_search.fit(x_train,y_train)
print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_*100)

Best parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 10, 'criterion': 'gini'}
Best score: 0.8488703428391223


**Logistic Regression**


In [21]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

In [22]:
y_pred = logreg.predict(x_test)

In [23]:
accuracy_score(y_test,y_pred)*100

84.38846841164722

In [24]:
confusion_matrix(y_test,y_pred)

array([[62288,     0,  1704],
       [ 1251,     0,   107],
       [ 8819,     0,  1935]])

**Adaboost Classifier**

In [54]:
adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)

In [55]:
adaboost.fit(x_train, y_train)

In [56]:
y_pred = adaboost.predict(x_test)

In [57]:
accuracy_score(y_test,y_pred)*100

84.94034479133818

In [58]:
confusion_matrix(y_test,y_pred)

array([[62477,     0,  1515],
       [ 1228,     0,   130],
       [ 8588,     0,  2166]])

In [59]:
import pickle
model_filename = 'model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(adaboost, file)
print("Model saved as", model_filename)

Model saved as model.pkl


Cross Validation

In [53]:
adaboost = AdaBoostClassifier()

In [33]:
param_dist = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

In [36]:
random_search = RandomizedSearchCV(
    adaboost,        # Estimator (AdaBoost)
    param_distributions=param_dist, # Hyperparameter grid
    n_iter=10,        # Number of random combinations to try
    cv=5,             # Number of cross-validation folds
    scoring='accuracy', # Scoring metric
    random_state=42   # Random seed for reproducibility
)
random_search.fit(x_train, y_train)

In [38]:
print("Best Hyperparameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_*100)

Best Hyperparameters: {'n_estimators': 100, 'learning_rate': 1.0, 'algorithm': 'SAMME.R'}
Best Score: 84.79974859332148


**Decision Tree**

In [40]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train,y_train)

In [41]:
y_pred=dt.predict(x_test)

In [42]:
accuracy_score(y_test,y_pred)*100

78.3033743298644

In [43]:
confusion_matrix(y_test,y_pred)

array([[56330,  1107,  6555],
       [ 1012,    46,   300],
       [ 7173,   365,  3216]])

Cross Validation

In [45]:
dt = DecisionTreeClassifier()

param_dist = {
    'criterion': ['gini', 'entropy'],        # Splitting criterion
    'max_depth': np.arange(1, 11),           # Max depth of the tree
    'min_samples_split': np.arange(2, 11),   # Minimum samples required to split an internal node
    'min_samples_leaf': np.arange(1, 11)     # Minimum samples required for a leaf node
}
random_search = RandomizedSearchCV(
    dt,    # Estimator (DecisionTreeClassifier)
    param_distributions=param_dist, # Hyperparameter grid
    n_iter=10,           # Number of random combinations to try
    cv=5,                # Number of cross-validation folds
    scoring='accuracy',  # Scoring metric
    random_state=42      # Random seed for reproducibility
)
random_search.fit(x, y)

In [46]:
print("Best Hyperparameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Best Hyperparameters: {'min_samples_split': 9, 'min_samples_leaf': 4, 'max_depth': 5, 'criterion': 'entropy'}
Best Score: 0.8477688426363923


**XGBoost Classifier**

In [49]:
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=len(set(y)))
xgb_classifier.fit(x_train, y_train)

In [50]:
y_pred = xgb_classifier.predict(x_test)

In [51]:
accuracy_score(y_test,y_pred)*100

84.92851886891623

In [52]:
confusion_matrix(y_test,y_pred)

array([[62748,     0,  1244],
       [ 1242,     0,   116],
       [ 8868,     0,  1886]])

The adaboost model has more accuracy compared to other models, and thus that model is saved and used for further processing.

In [61]:
#x_test