In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# Load the dataset from the CSV file in the working directory.
data = pd.read_csv('diabetes.csv')

# In these columns a 0 is treated as a missing measurement rather than a
# real reading, so zeros are converted to NaN before imputation.
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols] = data[cols].replace({0: np.nan})

# Fill each column's NaNs with that column's mean.
# The result is assigned back instead of calling
# `data[col].fillna(..., inplace=True)`: that form modifies a temporary
# column selection, triggers a pandas FutureWarning, and leaves `data`
# untouched when copy-on-write is enabled.
data[cols] = data[cols].fillna(data[cols].mean())

# Standardize the imputed columns (zero mean, unit variance).
# Only `cols` are scaled; every other feature column keeps its original scale.
# NOTE(review): the imputation means and the scaler are fit on the full
# dataset before it is split into folds, so each validation fold has already
# influenced the preprocessing. Consider moving both steps into a
# scikit-learn Pipeline passed to cross_val_score so they are re-fit per fold.
scaler = StandardScaler()
data[cols] = scaler.fit_transform(data[cols])

# Features are every column except 'Outcome'; 'Outcome' is the target.
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Shared 5-fold splitter: shuffled with a fixed seed so both models are
# evaluated on exactly the same train/validation partitions.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# --- k-nearest neighbours (library default hyper-parameters) ---
knn = KNeighborsClassifier()
knn_scores = cross_val_score(estimator=knn, X=X, y=y, cv=kf)
print("KNN Cross-validation scores:", knn_scores)
print("KNN Mean score:", np.mean(knn_scores))

# --- decision tree (seeded so repeated runs give the same tree) ---
dt = DecisionTreeClassifier(random_state=1)
dt_scores = cross_val_score(estimator=dt, X=X, y=y, cv=kf)
print("Decision Tree Cross-validation scores:", dt_scores)
print("Decision Tree Mean score:", np.mean(dt_scores))


KNN Cross-validation scores: [0.75974026 0.72727273 0.68831169 0.76470588 0.69934641]
KNN Mean score: 0.7278753925812749
Decision Tree Cross-validation scores: [0.69480519 0.66883117 0.61688312 0.7254902  0.70588235]
Decision Tree Mean score: 0.6823784059078177
