In [1]:
%matplotlib inline

In [2]:
import warnings
# Install an 'ignore' filter so that no warnings are shown for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn — consider narrowing the filter.
warnings.simplefilter('ignore')

In [None]:
# Dependencies
import pandas as pd
import numpy as np
import csv
import os
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

### Loading the dataset

In [None]:
# Importing csv file
# The data directory can be overridden through the DIABETES_DATA_DIR environment
# variable; when it is unset the original author's local path is used.
DATA_DIR = os.environ.get('DIABETES_DATA_DIR', '/Users/ruddysimonpour/Desktop/Diabetes/source/')
# The working directory is still switched to the data directory, as before,
# so the CSV is read by its bare file name.
os.chdir(DATA_DIR)
diabetes_file = pd.read_csv("pima-data.csv", low_memory = False)

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
diabetes_file.info()

In [None]:
# Descriptive statistics, transposed so that each column of the data becomes a row.
diabetes_file.describe().T

In [None]:
# Display the frame as loaded, before any cleaning.
diabetes_file

In [None]:
# Fix the column names: every '-' in a column name is replaced with '_'
def clean_diabetes_data(diabetes_file):
    """Replace '-' with '_' in the column labels of ``diabetes_file``.

    The frame is modified in place and the same object is returned, so the
    result can be re-assigned or ignored by the caller.

    Parameters
    ----------
    diabetes_file : pandas.DataFrame
        Frame whose column labels should be normalised.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with renamed columns.
    """
    # Non-string labels (e.g. integer column names) have no '-' to replace and
    # are passed through unchanged instead of raising AttributeError.
    diabetes_file.columns = [
        label.replace('-', '_') if isinstance(label, str) else label
        for label in diabetes_file.columns
    ]
    return diabetes_file
# Apply the column-name clean-up and print the resulting column labels.
diabetes_file = clean_diabetes_data(diabetes_file)
print(diabetes_file.columns)

In [None]:
# Check the null values: count them per column, so that a result of all zeros
# confirms there are no missing entries (the element-wise boolean mask alone is
# truncated on display and cannot show that).
diabetes_file.isnull().sum()

We checked for null values and found no missing values in the dataset. However, according to this article:
- https://www.sciencedirect.com/science/article/pii/S2352914816300016
"In 2001, 376 of 786 observations in the PID dataset were shown to lack experimental validity [65] because for some attributes, the value of zero was recorded in place of missing experimental observations [66]. It was also shown that if the instances with zero values were removed, performance could be dramatically improved [65]."

We therefore treat the zero values in these attributes as missing values.
Next, we replace the zeros with NaN and then fill them with the mean of each column.

In [None]:
# Replace zeros with NaN in the columns where a zero stands for a missing measurement
zero_as_missing_cols = [
    "glucose_conc",
    "diastolic_bp",
    "thickness",
    "insulin",
    "bmi",
    "skin",
]
# `data` holds the converted sub-frame; the same values are written back into the main frame.
data = diabetes_file[zero_as_missing_cols].replace(0, np.nan)
diabetes_file[zero_as_missing_cols] = data
data

In [None]:
# Replace the NaN values with the mean of each column.
# The previous version filled 'skin' from the 'bmi' column, which overwrote the
# skin values with bmi values; every column is now imputed from its own data.
# NOTE(review): the skin/bmi correlation remark further down was written against
# the old behaviour — re-check it after re-running the notebook.
for column in ['glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi', 'skin']:
    diabetes_file[column] = diabetes_file[column].fillna(diabetes_file[column].mean())

In [None]:
# Display the frame after the missing values have been filled in.
diabetes_file

### Matrix of correlation between attributes

In [None]:
# Finding correlation between different trends
# `corr` holds the pairwise correlation matrix of the frame's columns.
corr = diabetes_file.corr()
# Render the matrix with a 'coolwarm' colour gradient applied to the cell backgrounds.
corr.style.background_gradient(cmap='coolwarm')

In this dataset we found that skin and bmi are strongly correlated with each other, so we need to remove one of them.


In [None]:
# removing correlated columns
# errors='ignore' keeps the cell idempotent: re-running it after 'skin' has
# already been dropped no longer raises KeyError.
diabetes_file = diabetes_file.drop(columns=['skin'], errors='ignore')

In [None]:
# Display the frame after the 'skin' column has been removed.
diabetes_file

### Changing diabetes values to 0 and 1

In [91]:
# Encode the boolean outcome as integers: True -> 1, False -> 0.
mapping = {flag: int(flag) for flag in (True, False)}
outcome_encoded = diabetes_file['diabetes'].map(mapping)
diabetes_file['diabetes'] = outcome_encoded
diabetes_file

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148.0,72.0,35.00000,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.00000,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.00000,94.000000,28.1,0.167,21,0
4,0,137.0,40.0,35.00000,168.000000,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.00000,180.000000,32.9,0.171,63,0
764,2,122.0,70.0,27.00000,155.548223,36.8,0.340,27,0
765,5,121.0,72.0,23.00000,112.000000,26.2,0.245,30,0
766,1,126.0,60.0,29.15342,155.548223,30.1,0.349,47,1


Now we need to look at the class distribution to see whether the two outcome classes are balanced.

In [115]:
# Class balance: number of rows in each outcome class and their share of the total.
total_numbers = len(diabetes_file)
is_positive = diabetes_file['diabetes'] == 1
is_negative = diabetes_file['diabetes'] == 0
diabetes_true = int(is_positive.sum())
diabetes_false = int(is_negative.sum())
pct_true = diabetes_true / total_numbers * 100
pct_false = diabetes_false / total_numbers * 100
print('Number of diabates people: {0} ({1:2.2f}%)'.format(diabetes_true, pct_true))
print('Number of non-diabetes people: {0} ({1:2.2f}%)'.format(diabetes_false, pct_false))

Number of diabates people: 268 (34.90%)
Number of non-diabetes people: 500 (65.10%)


In [None]:
distribution = diaba