In [1]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
warnings.filterwarnings('ignore')

In [3]:
dataframe = pd.read_csv('diabetes.csv')

In [4]:
dataframe.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
dataframe.shape

(768, 9)

In [6]:
round(dataframe.describe(),2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.85,120.89,69.11,20.54,79.8,31.99,0.47,33.24,0.35
std,3.37,31.97,19.36,15.95,115.24,7.88,0.33,11.76,0.48
min,0.0,0.0,0.0,0.0,0.0,0.0,0.08,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.37,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.63,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [8]:
dataframe.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
dataframe.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [10]:
X = dataframe[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']]
y = dataframe['Outcome']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=142)

In [12]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((576, 8), (192, 8), (576,), (192,))

# Logistic Regression

In [13]:
model_log_reg = LogisticRegression()
model_log_reg.fit(X_train.values, y_train.values)

LogisticRegression()

In [14]:
model_log_reg.score(X_test, y_test)

0.8697916666666666

In [15]:
confusion_matrix(y_test,model_log_reg.predict(X_test))

array([[115,   8],
       [ 17,  52]], dtype=int64)

In [16]:
accuracy_score(y_test,model_log_reg.predict(X_test))

0.8697916666666666

In [17]:
precision_score(y_test,model_log_reg.predict(X_test))

0.8666666666666667

In [18]:
recall_score(y_test,model_log_reg.predict(X_test))

0.7536231884057971

In [19]:
f1_score(y_test,model_log_reg.predict(X_test))

0.8062015503875969

# KNN

In [20]:
model_knn = KNeighborsClassifier(n_neighbors=13)
model_knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=13)

In [21]:
model_knn.score(X_test, y_test)

0.8020833333333334

In [22]:
confusion_matrix(y_test,model_knn.predict(X_test))

array([[108,  15],
       [ 23,  46]], dtype=int64)

In [23]:
accuracy_score(y_test,model_knn.predict(X_test))

0.8020833333333334

In [24]:
precision_score(y_test, model_knn.predict(X_test))

0.7540983606557377

In [25]:
recall_score(y_test,model_knn.predict(X_test))

0.6666666666666666

In [26]:
f1_score(y_test,model_knn.predict(X_test))

0.7076923076923076

# SVM - Linear

In [27]:
model_svc_lin = SVC(kernel='linear')
model_svc_lin.fit(X_train, y_train)

SVC(kernel='linear')

In [28]:
model_svc_lin.score(X_test, y_test)

0.84375

In [29]:
confusion_matrix(y_test,model_svc_lin.predict(X_test))

array([[114,   9],
       [ 21,  48]], dtype=int64)

In [30]:
accuracy_score(y_test,model_svc_lin.predict(X_test))

0.84375

In [31]:
precision_score(y_test, model_svc_lin.predict(X_test))

0.8421052631578947

In [32]:
recall_score(y_test,model_svc_lin.predict(X_test))

0.6956521739130435

In [33]:
f1_score(y_test,model_svc_lin.predict(X_test))

0.761904761904762

# SVM - Poly

In [34]:
model_svc_poly = SVC(kernel='poly')
model_svc_poly.fit(X_train, y_train)

SVC(kernel='poly')

In [35]:
model_svc_poly.score(X_test, y_test)

0.796875

In [36]:
confusion_matrix(y_test,model_svc_poly.predict(X_test))

array([[116,   7],
       [ 32,  37]], dtype=int64)

In [37]:
accuracy_score(y_test,model_svc_poly.predict(X_test))

0.796875

In [38]:
precision_score(y_test, model_svc_poly.predict(X_test))

0.8409090909090909

In [39]:
recall_score(y_test,model_svc_poly.predict(X_test))

0.5362318840579711

In [40]:
f1_score(y_test,model_svc_poly.predict(X_test))

0.6548672566371683

# SVM - RBF

In [41]:
model_svc_rbf = SVC(kernel='rbf')
model_svc_rbf.fit(X_train, y_train)

SVC()

In [42]:
model_svc_rbf.score(X_test, y_test)

0.7916666666666666

In [43]:
confusion_matrix(y_test,model_svc_rbf.predict(X_test))

array([[114,   9],
       [ 31,  38]], dtype=int64)

In [44]:
accuracy_score(y_test,model_svc_rbf.predict(X_test))

0.7916666666666666

In [45]:
precision_score(y_test, model_svc_rbf.predict(X_test))

0.8085106382978723

In [46]:
recall_score(y_test,model_svc_rbf.predict(X_test))

0.5507246376811594

In [47]:
f1_score(y_test,model_svc_rbf.predict(X_test))

0.6551724137931035

# Decision Tree

In [48]:
model_dt = DecisionTreeClassifier(max_depth=2, min_samples_leaf=1)
model_dt.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=2)

In [49]:
model_dt.score(X_test,y_test)

0.7916666666666666

In [50]:
confusion_matrix(y_test,model_dt.predict(X_test))

array([[118,   5],
       [ 35,  34]], dtype=int64)

In [51]:
accuracy_score(y_test,model_dt.predict(X_test))

0.7916666666666666

In [52]:
precision_score(y_test,model_dt.predict(X_test))

0.8717948717948718

In [53]:
recall_score(y_test,model_dt.predict(X_test))

0.4927536231884058

In [54]:
f1_score(y_test,model_dt.predict(X_test))

0.6296296296296297

# Random Forest

In [55]:
model_rf = RandomForestClassifier(n_estimators=400)

In [56]:
model_rf.fit(X_train,y_train)

RandomForestClassifier(n_estimators=400)

In [57]:
model_rf.score(X_test,y_test)

0.8177083333333334

In [58]:
confusion_matrix(y_test,model_rf.predict(X_test))

array([[109,  14],
       [ 21,  48]], dtype=int64)

In [59]:
accuracy_score(y_test,model_rf.predict(X_test))

0.8177083333333334

In [60]:
precision_score(y_test,model_rf.predict(X_test))

0.7741935483870968

In [61]:
recall_score(y_test,model_rf.predict(X_test))

0.6956521739130435

In [62]:
f1_score(y_test,model_rf.predict(X_test))

0.732824427480916