In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the diabetes dataset from a CSV file located in the working directory.
csv_path = 'diabetes.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Columns in which a value of 0 is treated as a missing measurement
# (the next cell replaces those zeros with a fill value).
zero_not_accepted = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'BMI',
    'Insulin',
]

In [5]:
# Impute zeros in the columns listed in `zero_not_accepted`: mark every 0 as
# missing, then fill the gaps with the mean of the remaining observed values.
# NOTE(review): the fill value is computed on the full dataset, before the
# train/test split performed later, so test rows contribute to it.
for column in zero_not_accepted:
    # Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0 and raises
    # AttributeError there.
    data[column] = data[column].replace(0, np.nan)
    # The mean is truncated to an int, exactly as before, so the imputed
    # values (and every downstream result) stay unchanged.
    mean = int(data[column].mean(skipna=True))
    data[column] = data[column].fillna(mean)

In [6]:
# Preview the first five rows again to check the result of the imputation.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [7]:
# Feature matrix: every row, the first eight columns (positions 0-7).
x = data.iloc[:, :8]
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2,122.0,70.0,27.0,155.0,36.8,0.340,27
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1,126.0,60.0,29.0,155.0,30.1,0.349,47


In [8]:
# Target vector: every row of the column at position 8.
target_position = 8
y = data.iloc[:, target_position]
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [11]:
# Split the dataset: rebuild features/target, then hold out 20% for testing.
x = data.iloc[:, :8]
y = data.iloc[:, 8]
# random_state is fixed so the split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)


In [12]:
# Print the number of rows in each split, in the order:
# x_train, x_test, y_train, y_test.
for split_part in (x_train, x_test, y_train, y_test):
    print(len(split_part))

614
154
614
154


In [27]:
# Display the training features.
# NOTE(review): the recorded output below is a scaled NumPy array and the cell's
# execution count (27) is higher than the scaling cell's (20), so it was run
# out of order; in a top-to-bottom run x_train is still the unscaled split here.
x_train

array([[ 0.90832902,  0.93641795,  0.44764174, ...,  0.36863635,
         0.67740401,  1.69955804],
       [ 0.03644676, -0.81630913, -1.05200558, ..., -0.63294341,
        -0.07049698, -0.96569189],
       [-1.12606292,  1.43247278,  1.44740662, ...,  2.81535261,
        -0.11855487, -0.88240283],
       ...,
       [ 0.03644676, -0.91552009, -0.63543688, ..., -1.13373329,
        -0.95656442, -1.04898095],
       [ 2.0708387 , -1.21315299,  0.11438678, ..., -0.36108605,
        -0.50001442,  0.11706589],
       [ 0.32707418,  0.47343344,  0.7808967 , ..., -0.08922869,
         0.52121586,  2.94889395]])

In [20]:
# Standardize the features: the scaler is fitted on the training split only,
# and the same fitted transform is then applied to both splits.
# Note: x_train / x_test are overwritten, so re-running this cell would scale
# the already-scaled arrays again.
sd = StandardScaler()
sd.fit(x_train)
x_train = sd.transform(x_train)
x_test = sd.transform(x_test)


In [22]:
# k-nearest-neighbours classifier: 11 neighbours, Euclidean distance
# (p=2 is the Minkowski power parameter, i.e. the Euclidean case).
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)
# Fit on the standardized training features and their labels.
classifier.fit(x_train, y_train)


In [23]:
# Predict the class label for every row of the standardized test split.
y_pred = classifier.predict(X=x_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [24]:
from sklearn.metrics import accuracy_score


In [25]:
# Accuracy on the training split (the data the classifier was fitted on).
x_train_prediction = classifier.predict(x_train)
# accuracy_score's signature is (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric, so the reported value is unchanged.
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
training_data_accuracy

0.7947882736156352

In [26]:
# Accuracy on the held-out test split.
x_test_prediction = classifier.predict(x_test)
# accuracy_score's signature is (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric, so the reported value is unchanged.
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
test_data_accuracy

0.8181818181818182

In [28]:
# Display the training features after standardization (a NumPy array, as the
# recorded output shows).
x_train

array([[ 0.90832902,  0.93641795,  0.44764174, ...,  0.36863635,
         0.67740401,  1.69955804],
       [ 0.03644676, -0.81630913, -1.05200558, ..., -0.63294341,
        -0.07049698, -0.96569189],
       [-1.12606292,  1.43247278,  1.44740662, ...,  2.81535261,
        -0.11855487, -0.88240283],
       ...,
       [ 0.03644676, -0.91552009, -0.63543688, ..., -1.13373329,
        -0.95656442, -1.04898095],
       [ 2.0708387 , -1.21315299,  0.11438678, ..., -0.36108605,
        -0.50001442,  0.11706589],
       [ 0.32707418,  0.47343344,  0.7808967 , ..., -0.08922869,
         0.52121586,  2.94889395]])

In [36]:
# Predict the outcome for a single, hand-entered patient record.
# Values follow the order of the feature columns in `x`
# (Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
#  DiabetesPedigreeFunction, Age).
input_data = (10, 115, 0, 0, 0, 35.3, 0.134, 29)

input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Wrap the row in a DataFrame carrying the training column names: the scaler
# was fitted on a DataFrame, and a bare array triggers scikit-learn's
# "X does not have valid feature names" warning.
input_frame = pd.DataFrame(input_data_reshaped, columns=x.columns)

# Apply the same rule used on the training data: a 0 in these columns means
# "missing" and is replaced by the truncated column mean. Assuming the
# imputation cell has already run, `data` holds the imputed columns, and
# truncating their mean again reproduces the fill value used there.
for column in zero_not_accepted:
    fill_value = int(data[column].mean())
    input_frame[column] = input_frame[column].replace(0, fill_value)

std_data = sd.transform(input_frame)
prediction = classifier.predict(std_data)

# `prediction` is a one-element array; compare the scalar label explicitly.
if prediction[0] == 0:
    print("The person is not diabetic")
else:
    print("The person is diabetic")

The person is not diabetic


