# Diabetes Prediction

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score

## Importing the dataset

In [2]:
data = pd.read_csv('diabetes_prediction_dataset.csv')
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


## Cleaning the dataset

In [4]:
# check null
data.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [8]:
# check duplicate
data.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
99995     True
99996    False
99997    False
99998    False
99999    False
Length: 100000, dtype: bool

In [9]:
# drop duplicate
data.drop_duplicates(inplace=True)

In [10]:
data.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
99994    False
99996    False
99997    False
99998    False
99999    False
Length: 96146, dtype: bool

## Encoding gender data

In [11]:
data['gender'].value_counts()

Female    56161
Male      39967
Other        18
Name: gender, dtype: int64

In [12]:
# do label encoding
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
data['gender'] = label_encoder.fit_transform(data['gender'])
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,never,25.19,6.6,140,0
1,0,54.0,0,0,No Info,27.32,6.6,80,0
2,1,28.0,0,0,never,27.32,5.7,158,0
3,0,36.0,0,0,current,23.45,5.0,155,0
4,1,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99994,0,36.0,0,0,No Info,24.60,4.8,145,0
99996,0,2.0,0,0,No Info,17.37,6.5,100,0
99997,1,66.0,0,0,former,27.83,5.7,155,0
99998,0,24.0,0,0,never,35.42,4.0,100,0


## Mapping smoking history

In [16]:
# mapping untuk smoking_history
data['smoking_history'].value_counts()

never          34398
No Info        32887
former          9299
current         9197
not current     6367
ever            3998
Name: smoking_history, dtype: int64

In [17]:
smoking_history_maping = {'never': 0, 'No Info': -1, 'current':2, 'not current': 0, 'former' : 1, 'ever': 2}
data['smoking_history'] = data['smoking_history'].map(smoking_history_maping)
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,0,25.19,6.6,140,0
1,0,54.0,0,0,-1,27.32,6.6,80,0
2,1,28.0,0,0,0,27.32,5.7,158,0
3,0,36.0,0,0,2,23.45,5.0,155,0
4,1,76.0,1,1,2,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99994,0,36.0,0,0,-1,24.60,4.8,145,0
99996,0,2.0,0,0,-1,17.37,6.5,100,0
99997,1,66.0,0,0,1,27.83,5.7,155,0
99998,0,24.0,0,0,0,35.42,4.0,100,0


## Checking age data

In [18]:
data.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
count,96146.0,96146.0,96146.0,96146.0,96146.0,96146.0,96146.0,96146.0,96146.0
mean,0.416065,41.794326,0.077601,0.040803,0.029143,27.321461,5.532609,138.218231,0.08822
std,0.493287,22.462948,0.267544,0.197833,0.993422,6.767716,1.073232,40.909771,0.283616
min,0.0,0.08,0.0,0.0,-1.0,10.01,3.5,80.0,0.0
25%,0.0,24.0,0.0,0.0,-1.0,23.4,4.8,100.0,0.0
50%,0.0,43.0,0.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,1.0,59.0,0.0,0.0,0.0,29.86,6.2,159.0,0.0
max,2.0,80.0,1.0,1.0,2.0,95.69,9.0,300.0,1.0


In [26]:
data[data['age'] < 1]

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
155,0,0.08,0,0,-1,14.43,6.5,160,0
218,0,0.56,0,0,-1,16.85,5.0,140,0
241,1,0.88,0,0,-1,17.49,6.0,140,0
268,0,0.16,0,0,-1,12.15,6.6,100,0
396,1,0.16,0,0,-1,14.35,6.5,126,0
...,...,...,...,...,...,...,...,...,...
99452,1,0.32,0,0,-1,15.93,5.7,100,0
99536,0,0.40,0,0,-1,16.66,3.5,140,0
99629,0,0.64,0,0,-1,17.58,6.1,140,0
99778,0,0.32,0,0,-1,12.26,5.8,126,0


In [29]:
data[data['age'] > 1]

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,0,25.19,6.6,140,0
1,0,54.0,0,0,-1,27.32,6.6,80,0
2,1,28.0,0,0,0,27.32,5.7,158,0
3,0,36.0,0,0,2,23.45,5.0,155,0
4,1,76.0,1,1,2,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99994,0,36.0,0,0,-1,24.60,4.8,145,0
99996,0,2.0,0,0,-1,17.37,6.5,100,0
99997,1,66.0,0,0,1,27.83,5.7,155,0
99998,0,24.0,0,0,0,35.42,4.0,100,0


In [23]:
data['age'].value_counts()

80.00    4932
51.00    1566
47.00    1511
48.00    1508
53.00    1498
         ... 
0.48       83
1.00       83
0.40       66
0.16       59
0.08       36
Name: age, Length: 102, dtype: int64

In [30]:
data = data[data['age'] > 1]

In [33]:
# convert age to int
data['age'] = data['age'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['age'] = data['age'].astype(int)


In [34]:
data.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
count,95153.0,95153.0,95153.0,95153.0,95153.0,95153.0,95153.0,95153.0,95153.0
mean,0.415037,42.218932,0.078411,0.041228,0.039326,27.424858,5.534414,138.279077,0.089141
std,0.493115,22.189801,0.268818,0.198819,0.993256,6.714516,1.073989,40.970627,0.284948
min,0.0,1.0,0.0,0.0,-1.0,10.01,3.5,80.0,0.0
25%,0.0,24.0,0.0,0.0,-1.0,23.56,4.8,100.0,0.0
50%,0.0,43.0,0.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,1.0,60.0,0.0,0.0,0.0,29.93,6.2,159.0,0.0
max,2.0,80.0,1.0,1.0,2.0,95.69,9.0,300.0,1.0


In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95153 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               95153 non-null  int64  
 1   age                  95153 non-null  int64  
 2   hypertension         95153 non-null  int64  
 3   heart_disease        95153 non-null  int64  
 4   smoking_history      95153 non-null  int64  
 5   bmi                  95153 non-null  float64
 6   HbA1c_level          95153 non-null  float64
 7   blood_glucose_level  95153 non-null  int64  
 8   diabetes             95153 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 7.3 MB


## Split train and test set

In [36]:
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

## Feature scaling

In [38]:
sc = preprocessing.StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [40]:
print(X_train)

[[-0.84224601  0.57535041 -0.29280621 ... -0.01618012 -0.49797338
   0.40482047]
 [ 1.18563389  0.25991685 -0.29280621 ...  1.1067195  -1.42874349
  -1.18178923]
 [-0.84224601  1.70189886 -0.29280621 ... -0.66784961  0.43279673
  -1.30383613]
 ...
 [-0.84224601 -1.31725098 -0.29280621 ... -1.0600443   0.52587374
   1.50324257]
 [-0.84224601  0.12473103 -0.29280621 ...  1.44672097  0.61895075
  -0.20541403]
 [ 1.18563389 -0.91169354 -0.29280621 ... -0.01618012  0.89818179
   1.50324257]]


## Training and predict the model using Logistic regression

In [39]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [42]:
y_pred_lr = lr.predict(X_test)

## Training and predict the model using K-NN

In [44]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [45]:
y_pred_knn = knn.predict(X_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## Linear Linear SVM

In [46]:
from sklearn.svm import SVC
linear_svm = SVC(kernel = 'linear')
linear_svm.fit(X_train, y_train)

SVC(kernel='linear')

In [49]:
y_pred_svm = linear_svm.predict(X_test)

## Accuracy Score

In [47]:
# logistic regression
accuracy_score(y_test, y_pred_lr)

0.959014239924334

In [48]:
# K-NN
accuracy_score(y_test, y_pred_knn)

0.9603804319268562

In [50]:
# Kernel SVM
accuracy_score(y_test, y_pred_svm)

0.9598549734643477