<a href="https://colab.research.google.com/github/selvamani1992/DiabetesPrediction/blob/master/Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
# import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import make_scorer, r2_score, roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

In [53]:
df = pd.read_csv("/content/diabetes.csv")

In [54]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [56]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [57]:
X, y = df.drop('Outcome', axis=1), df['Outcome']

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20)

In [59]:
lgr = LogisticRegression()
lgr.fit(X_train, y_train)
y_pred = lgr.predict(X_test)
print('Accuracy:',accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7597402597402597
[[91 14]
 [23 26]]


In [60]:
scoring = {
    'AUC_ROC': make_scorer(roc_auc_score),
    'f1_score': make_scorer(f1_score),
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score)
}

In [61]:
score_list= []

In [62]:
best_depth = best_score = 0
for depth in [1,2,3,4,5,6,7,8,9,10,20,30]:
  dt = DecisionTreeClassifier(max_depth=depth)
  score = np.mean(cross_val_score(dt,X_train, y_train, cv = 10, scoring = 'accuracy'))
  print("Depth Level : ", depth, "test score : ", score)
  if best_score < score:
    best_score = score
    best_depth = depth
print(f'\n\n\n best Depth : {best_depth} with the score of {best_score}')

Depth Level :  1 test score :  0.7250132205182444
Depth Level :  2 test score :  0.7495505023796933
Depth Level :  3 test score :  0.7479111581173983
Depth Level :  4 test score :  0.7414331041776838
Depth Level :  5 test score :  0.7282125859333687
Depth Level :  6 test score :  0.711872025383395
Depth Level :  7 test score :  0.6955314648334217
Depth Level :  8 test score :  0.6808038075092544
Depth Level :  9 test score :  0.6939714436805923
Depth Level :  10 test score :  0.6956107879428874
Depth Level :  20 test score :  0.6875198307773664
Depth Level :  30 test score :  0.6890005288207297



 best Depth : 2 with the score of 0.7495505023796933


In [63]:
model = DecisionTreeClassifier(max_depth=best_depth)
eval = cross_validate(model, X_train, y_train, cv=10, scoring=scoring)
a = (model.__class__.__name__,np.mean(eval['test_AUC_ROC']),np.mean(eval['test_f1_score']),np.mean(eval['test_Accuracy']),np.mean(eval['test_Precision']),np.mean(eval['test_Recall']))
score_list.append(a)

In [64]:
score_list

[('DecisionTreeClassifier',
  0.699521312021312,
  0.5921563859055189,
  0.7495505023796933,
  0.689979341264171,
  0.5253246753246753)]

In [65]:
best_k = best_score = 0
for k in [1,2,3,4,5,6,7,8,9,10,20,30]:
  knn = KNeighborsClassifier(n_neighbors=k)
  score = cross_val_score(knn,X_train, y_train, cv = 10).mean()
  print("K value : ", k, "test score : ", score)
  if best_score < score:
    best_score = score
    best_k = k
print(f'\n\n\n best_k : {best_k} with the score of {best_score}')

K value :  1 test score :  0.7020888418826019
K value :  2 test score :  0.7019566367001586
K value :  3 test score :  0.7135113696456902
K value :  4 test score :  0.7151771549444739
K value :  5 test score :  0.738048651507139
K value :  6 test score :  0.741274457958752
K value :  7 test score :  0.7396615547329455
K value :  8 test score :  0.7607879428873613
K value :  9 test score :  0.7623479640401903
K value :  10 test score :  0.7445795875198308
K value :  20 test score :  0.7478318350079324
K value :  30 test score :  0.7413273400317293



 best_k : 9 with the score of 0.7623479640401903


In [66]:
model = KNeighborsClassifier(n_neighbors=best_k)
eval = cross_validate(model, X_train, y_train, cv=10, scoring=scoring)
a = (model.__class__.__name__,np.mean(eval['test_AUC_ROC']),np.mean(eval['test_f1_score']),np.mean(eval['test_Accuracy']),np.mean(eval['test_Precision']),np.mean(eval['test_Recall']))
score_list.append(a)

In [67]:
model = LogisticRegression()
eval = cross_validate(model, X_train, y_train, cv=10, scoring=scoring)
a = (model.__class__.__name__,np.mean(eval['test_AUC_ROC']),np.mean(eval['test_f1_score']),np.mean(eval['test_Accuracy']),np.mean(eval['test_Precision']),np.mean(eval['test_Recall']))
score_list.append(a)

In [68]:
model = RandomForestClassifier(n_estimators= 100, max_depth = 5, max_features='sqrt')
eval = cross_validate(model, X_train, y_train, cv=10, scoring=scoring)
a = (model.__class__.__name__,np.mean(eval['test_AUC_ROC']),np.mean(eval['test_f1_score']),np.mean(eval['test_Accuracy']),np.mean(eval['test_Precision']),np.mean(eval['test_Recall']))
score_list.append(a)

In [69]:
best_score = best_lr = 0
for lr in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]:
  xgbc = xgb.XGBClassifier(learning_rate = lr, n_estimators=100, verbosity = 0)
  score = np.mean(cross_val_score(xgbc,X_train, y_train, cv = 10, scoring = 'accuracy'))
  print(f'learning rate:{lr}, score:{score}')
  if best_score < score:
    best_score = score
    best_lr = lr
print(f'\n\n\n best learning rate : {best_lr} with the score of {best_score}')

learning rate:0.01, score:0.7477525118984664
learning rate:0.02, score:0.7478847170809095
learning rate:0.03, score:0.756002115282919
learning rate:0.04, score:0.7477260708619778
learning rate:0.05, score:0.7526705446853517
learning rate:0.06, score:0.751004759386568
learning rate:0.07, score:0.7412215758857748
learning rate:0.08, score:0.7412480169222634
learning rate:0.09, score:0.7445531464833423
learning rate:0.1, score:0.7395822316234797
learning rate:0.11, score:0.7558963511369645
learning rate:0.12, score:0.749365415124273
learning rate:0.13, score:0.7525912215758856
learning rate:0.14, score:0.7509518773135906
learning rate:0.15, score:0.7379693283976733
learning rate:0.2, score:0.7412215758857748
learning rate:0.3, score:0.7395029085140138
learning rate:0.4, score:0.7509254362771021
learning rate:0.5, score:0.747778952934955
learning rate:0.6, score:0.7380750925436278
learning rate:0.7, score:0.7396351136964571
learning rate:0.8, score:0.7348757271285035
learning rate:0.9, sco

In [70]:
model = xgb.XGBClassifier(learning_rate = best_lr, n_estimators=100, verbosity = 0)
eval = cross_validate(model, X_train, y_train, cv=10, scoring=scoring)
a = (model.__class__.__name__,np.mean(eval['test_AUC_ROC']),np.mean(eval['test_f1_score']),np.mean(eval['test_Accuracy']),np.mean(eval['test_Precision']),np.mean(eval['test_Recall']))
score_list.append(a)

In [71]:
score_list

[('DecisionTreeClassifier',
  0.699521312021312,
  0.5921563859055189,
  0.7495505023796933,
  0.689979341264171,
  0.5253246753246753),
 ('KNeighborsClassifier',
  0.7288369963369964,
  0.6469523104208299,
  0.7623479640401903,
  0.6922258263266474,
  0.6119047619047618),
 ('LogisticRegression',
  0.7172073759573759,
  0.6214497294231626,
  0.7607086197778953,
  0.707004093295317,
  0.5660173160173161),
 ('RandomForestClassifier',
  0.7221736596736597,
  0.6299761202398751,
  0.7641195134849287,
  0.7066281379439274,
  0.5757575757575758),
 ('XGBClassifier',
  0.7260227272727272,
  0.6411940488443609,
  0.756002115282919,
  0.6694040892028508,
  0.6212121212121212)]

In [72]:
score_df = pd.DataFrame(score_list,columns=['Model','Auc_Roc','F1_Score','Accuracy','Precision','Recall'])
score_df.set_index('Model',inplace=True)
score_df

Unnamed: 0_level_0,Auc_Roc,F1_Score,Accuracy,Precision,Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DecisionTreeClassifier,0.699521,0.592156,0.749551,0.689979,0.525325
KNeighborsClassifier,0.728837,0.646952,0.762348,0.692226,0.611905
LogisticRegression,0.717207,0.62145,0.760709,0.707004,0.566017
RandomForestClassifier,0.722174,0.629976,0.76412,0.706628,0.575758
XGBClassifier,0.726023,0.641194,0.756002,0.669404,0.621212


In [73]:
final_model = LogisticRegression()
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
print('Accuracy:',accuracy_score(y_test, y_pred))

Accuracy: 0.7597402597402597


In [74]:
import pickle

In [80]:
with open('final_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [76]:
with open('final_model.pkl', 'rb') as model_file:
    xgb_model = pickle.load(model_file)

In [81]:
finalized_model = LogisticRegression()
finalized_model.fit(X_train, y_train)

In [82]:
import pickle

with open('finalized_model.pkl', 'wb') as model_file:
    pickle.dump(finalized_model, model_file)

In [None]:
y_pred = finalized_model.predict(X_test)
print('Accuracy:',accuracy_score(y_test, y_pred))

In [79]:
X_test.iloc[25]

Pregnancies                  1.000
Glucose                      0.000
BloodPressure               68.000
SkinThickness               35.000
Insulin                      0.000
BMI                         32.000
DiabetesPedigreeFunction     0.389
Age                         22.000
Name: 342, dtype: float64

In [83]:
y_test.iloc[25]

0