In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


In [None]:
# Load the dataset from the working directory.
data = pd.read_csv('diabetes.csv')

# Keep the column names as an array; later cells slice it into
# feature names (all but the last) and the target name (the last).
headers = np.array(data.columns)
print(headers)

# Preview the first rows. This must be the cell's last expression:
# a bare expression in the middle of a cell is evaluated and discarded,
# so the preview was never displayed in the original position.
data.head()


In [None]:
# Count missing entries per column.
# DataFrame.isnull() is an alias of DataFrame.isna(), so one call is enough;
# printing both only repeats the same table.
# Note: this does not flag 0 values, which a later cell treats as missing
# for some of the columns.
print(data.isna().sum())

In [None]:
# Columns in which a value of 0 is treated as a missing measurement.
zero_not_accepted = ['Glucose', 'BloodPressure',
                     'SkinThickness', 'BMI', 'Insulin']

for column in zero_not_accepted:
    # Mark zeros as missing. Use np.nan: the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    data[column] = data[column].replace(0, np.nan)

    # Mean of the remaining values, truncated to an integer as in the
    # original code.
    # NOTE(review): int() drops the fractional part of the mean, and the
    # mean is computed over all rows, i.e. before the train/test split done
    # in a later cell -- confirm both are intended.
    column_mean = int(data[column].mean(skipna=True))

    # Fill the entries marked as missing with that value.
    data[column] = data[column].replace(np.nan, column_mean)


In [None]:
# The last header names the target column; all preceding headers are features.
features, target = headers[:-1], headers[-1]

X, Y = data[features], data[target]

# Hold out a test fraction of 0.2; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Build a k-nearest-neighbours classifier (k = 11, Euclidean distance)
# and fit it on the scaled training split.
model = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
    metric='euclidean',
)
model.fit(X_train, Y_train)

In [None]:
# Fit an ordinary least-squares regression on the same training split and
# predict on the test split. The predictions are continuous values, not
# class labels.
linear_model = LinearRegression().fit(X_train, Y_train)
Y_pred_lin = linear_model.predict(X_test)

In [None]:
# Class predictions of the fitted KNN classifier for the held-out test split.
Y_pred = model.predict(X_test)


In [None]:
# Evaluate the KNN classifier on the held-out test split.
cm = confusion_matrix(Y_test, Y_pred)
print('confusion_matrix: ', cm)

print('f1_score: ', f1_score(Y_test, Y_pred))

print('knn accuracy_score: ', accuracy_score(Y_test, Y_pred))

# Evaluate the linear regression model.
# LinearRegression.score() returns the coefficient of determination (R^2),
# not a classification accuracy, so it is reported under its own label.
print('linear r2_score: ', linear_model.score(X_test, Y_test))

# To get an accuracy comparable with the KNN figure, turn the continuous
# regression outputs into class labels by thresholding at 0.5.
# NOTE(review): this assumes the target is encoded as 0/1 -- confirm.
Y_pred_lin_labels = [int(value >= 0.5) for value in Y_pred_lin]
print('linear accuracy: ', accuracy_score(Y_test, Y_pred_lin_labels))