In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:


diabetes = pd.read_csv("diabetes.csv")
diabetes.head(3)

# Datensplit aufsetzen
X = diabetes.loc[:, diabetes.columns != "Outcome"]
y = diabetes["Outcome"]

# Split in Training- und Testdatensatz
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

# Erstes KNN mit K=2 + Accuracy berechnen
model = KNeighborsClassifier(n_neighbors=2)
model.fit(X_train, y_train)
print(f"accuracy: {model.score(X_test, y_test):.4f}")


# b)
# Funktion, um in die drei Datensätze zu splitten
def train_val_test_split(X, y, train_ratio=0.70, val_ratio=0.20):
	# Split in Training und temp = Validierung + Test
	X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=train_ratio)
	# Splitte temp in Validierung und Test
	# zuerst das Verhältnis ziwschen val und test berechnen
	test_val_ratio = (1 - train_ratio - val_ratio) / (1 - train_ratio)
	X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_val_ratio)
	return X_train, X_val, X_test, y_train, y_val, y_test


X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, y)

best = {"k": None, "accuracy": 0, "model": None}
for k in range(3, 31):
	model_knn = KNeighborsClassifier(n_neighbors=k)
	model_knn.fit(X_train, y_train)
	accuracy = model_knn.score(X=X_val, y=y_val)
	print(f"k:{k}, accuracy:{accuracy:.4f}")
	# aktuell bestes Ergebnis speichern
	if accuracy > best["accuracy"]:
		print("besser")
		best["k"] = k
		best["accuracy"] = accuracy
		best["model"] = model

# echte accuracy bestimmen
test_accuracy = best["model"].score(X=X_test, y=y_test)
print(f"Bestes Ergebnis: k={best['k']} mit accuracy: {test_accuracy:.4f} auf dem Test-Datensatz")

# c)
from sklearn.model_selection import GridSearchCV

param_grid = dict(n_neighbors=list(range(3, 31)))

clf = GridSearchCV(model, param_grid)
clf.fit(X_train, y_train)
print(clf.best_params_)

accuracy: 0.6719
k:3, accuracy:0.6340
besser
k:4, accuracy:0.6536
besser
k:5, accuracy:0.6601
besser
k:6, accuracy:0.6536
k:7, accuracy:0.6471
k:8, accuracy:0.6797
besser
k:9, accuracy:0.6732
k:10, accuracy:0.6732
k:11, accuracy:0.6732
k:12, accuracy:0.6732
k:13, accuracy:0.6732
k:14, accuracy:0.6928
besser
k:15, accuracy:0.6536
k:16, accuracy:0.6732
k:17, accuracy:0.6797
k:18, accuracy:0.6732
k:19, accuracy:0.6601
k:20, accuracy:0.6797
k:21, accuracy:0.6667
k:22, accuracy:0.6797
k:23, accuracy:0.6732
k:24, accuracy:0.6797
k:25, accuracy:0.6732
k:26, accuracy:0.6863
k:27, accuracy:0.6667
k:28, accuracy:0.6797
k:29, accuracy:0.6863
k:30, accuracy:0.6797
Bestes Ergebnis: k=14 mit accuracy: 0.8590 auf dem Test-Datensatz
{'n_neighbors': 29}
