# IMPORTING REQUIRED LIBRARIES

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np

# READING THE DATASET

In [None]:
# Load the diabetes dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# CHECKING THE SHAPE OF DATASET

In [None]:
# (rows, columns) of the loaded frame
df.shape

(768, 9)

# DESCRIBING THE DATASET

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


# CHECKING THE NULL VALUES IN DATA

In [None]:
# Count missing (NaN) entries per column.
# NOTE(review): the describe() output recorded in this notebook shows a minimum of 0 for
# Glucose, BloodPressure, SkinThickness, Insulin and BMI; if those zeros stand for
# missing measurements they are not counted here — confirm against the data source.
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# PRINTING COLUMN NAMES

In [None]:
# Column labels of the frame (the feature columns plus the 'Outcome' target)
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

# CHECKING CORRELATION BETWEEN DEPENDENT AND INDEPENDENT VARIABLES

In [None]:
# Correlation of every feature column with the target column 'Outcome'.
# The target is removed by label instead of breaking out of the loop when it is
# reached, so columns positioned after 'Outcome' are no longer skipped silently
# and the report does not depend on the column order.
print("CORRELATIONS")
print("")
target = df['Outcome']
for col in df.columns.drop('Outcome'):
    print(col, "------->", df[col].corr(target))
    print("")

CORRELATIONS

Pregnancies -------> 0.22189815303398613

Glucose -------> 0.46658139830687295

BloodPressure -------> 0.0650683595503327

SkinThickness -------> 0.0747522319183194

Insulin -------> 0.13054795488404794

BMI -------> 0.29269466264444494

DiabetesPedigreeFunction -------> 0.1738440656529596

Age -------> 0.23835598302719754



# IMPORTING THE REQUIRED MODELS AND METRICS

In [None]:
# pipeline and preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Train Test Split
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import confusion_matrix, accuracy_score

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# DEPENDENT AND INDEPENDENT VARIABLES

In [None]:
# Feature matrix X: every column except the label.
X = df.drop(columns=['Outcome'])
# Label vector y: the 'Outcome' column.
y = df['Outcome']

# DIVIDING THE DATA INTO TRAIN AND TEST DATASETS

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the dimensions of each split, in the order x_train, x_test, y_train, y_test.
lst = [x_train, x_test, y_train, y_test]
for part in lst:
    print(f"shape ----> {part.shape}")

shape ----> (614, 8)
shape ----> (154, 8)
shape ----> (614,)
shape ----> (154,)


# MAKING PIPELINE FOR EVERY MODEL

In [None]:
# One pipeline per candidate classifier; each one standardises the features
# before they reach the estimator.
# The tree-based estimators are randomised, so they are seeded (with the same
# value used for the train/test split) to make the reported accuracies, and
# therefore the choice of best model, repeatable across runs.
mdl_1 = make_pipeline(StandardScaler(), KNeighborsClassifier())
mdl_2 = make_pipeline(StandardScaler(), LogisticRegression())
mdl_3 = make_pipeline(StandardScaler(), SVC())
mdl_4 = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=42))
mdl_5 = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

# FITTING AND PREDICTING ACCURACY OF MODELS

In [None]:
# Fit every pipeline on the training split, evaluate it on the held-out split and
# collect the accuracies in the same order as mdl_lst, so an index into
# accuracy_lst identifies the corresponding pipeline.
mdl_lst = [mdl_1, mdl_2, mdl_3, mdl_4, mdl_5]
accuracy_lst = []
# Numbered from 1 so that "MODEL n" in the report corresponds to mdl_n.
for idx, mdl in enumerate(mdl_lst, start=1):
    mdl.fit(x_train, y_train)
    y_pred = mdl.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)  # computed once, then printed and stored

    # The last pipeline step is the classifier; its class name labels the report.
    estimator_name = type(mdl.steps[-1][1]).__name__

    print("")
    print(f"******CLASSIFICATION MODEL {idx}: {estimator_name}******")
    print("")
    print("CONFUSION MATRIX")
    print(confusion_matrix(y_test, y_pred))
    print("")
    print("ACCURACY SCORE")
    print(accuracy)
    print("----------------------------")
    accuracy_lst.append(accuracy)
 


******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[79 20]
 [27 28]]

ACCURACY SCORE
0.6948051948051948
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[79 20]
 [18 37]]

ACCURACY SCORE
0.7532467532467533
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[82 17]
 [24 31]]

ACCURACY SCORE
0.7337662337662337
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[77 22]
 [16 39]]

ACCURACY SCORE
0.7532467532467533
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[75 24]
 [20 35]]

ACCURACY SCORE
0.7142857142857143
----------------------------


# CHECKING FOR THE HIGHEST ACCURACY SCORE AMONG THE MODELS

In [None]:
# 0-based position in accuracy_lst of the highest accuracy; ties resolve to the earliest entry.
print(max(range(len(accuracy_lst)), key=accuracy_lst.__getitem__))

1


# SAVING THE MODEL IN PICKLE FILE

In [None]:
import pickle

: 

In [None]:
# Persist the pipeline that achieved the highest test accuracy.
# Previously mdl_1 was written unconditionally, although the best-accuracy index
# printed earlier in the notebook points to a different entry of mdl_lst.
filename = 'diabetes_model.pkl'
best_model = mdl_lst[accuracy_lst.index(max(accuracy_lst))]
# The context manager closes the file handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)
 