In [3]:
# imports
import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Read the raw CSV into a DataFrame.
breast_cancer_data = pd.read_csv("breast_cancer.csv")
breast_cancer_data
# Count the missing values in each column before any cleaning.
breast_cancer_data.isna().sum()
# Remove the "Unnamed: 32" and "id" columns in place.
# NOTE(review): presumably an empty trailing column and a row identifier --
# confirm against the CSV.
breast_cancer_data.drop(columns=["Unnamed: 32", "id"], inplace=True)
# Re-count the missing values after the drop.
breast_cancer_data.isna().sum()
# Target is the "diagnosis" column; features are all numeric columns.
y = breast_cancer_data["diagnosis"]
X = breast_cancer_data.select_dtypes(include="number")
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Relative class frequencies of "diagnosis" in the full data set ...
y.value_counts(normalize=True)
# ... in the training part ...
y_train.value_counts(normalize=True)
# ... and in the test part.
y_test.value_counts(normalize=True)
# Baseline: k-nearest-neighbours classifier with its default settings.
knn = KNeighborsClassifier()
# 5-fold cross-validated accuracy on the training split; verbose=3 logs the
# score of every fold.
knn_cv = cross_val_score(knn, X_train, y_train, cv=5, scoring="accuracy", verbose=3)
# Per-fold accuracies.
knn_cv
# Average the fold accuracies into one baseline figure.
knn_cv_mean = np.mean(knn_cv)
print(f"Mean accuracy for 5 folds: {knn_cv_mean:.3f}")
# Sweep k = 1..50: cross-validate one KNN per k and record its error rate
# (1 - mean accuracy over the 5 folds). `trainings` tallies the fold scores
# produced, i.e. the number of individual fits.
error_rates = []
trainings = 0

for i in range(1, 51):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn_cv = cross_val_score(
        knn,
        X_train,
        y_train,
        cv=5,
        scoring="accuracy",
        verbose=1,
    )
    error_rate = 1 - knn_cv.mean()
    error_rates.append(error_rate)
    trainings += len(knn_cv)
# Error rate per k in sweep order (position 0 holds k=1).
error_rates
# Optional plot of the sweep (currently disabled):
# sns.lineplot(x=range(1, 51), y=error_rates, marker='o')
# plt.xticks(range(51))
# plt.title('Error Rate KNN -> k')
# plt.xlabel('k Values')
# plt.ylabel('Error Rate');
# Smallest cross-validated error rate found in the sweep.
min_error = min(error_rates)
print(f"Min error_rate: {min_error:.3f}")
# The sweep starts at k=1, so list position + 1 gives the matching k.
# index() returns the first position, i.e. the smallest k on ties.
best_k = error_rates.index(min_error) + 1
print(f"Best k with min error rate ({min_error:.3f}) is k-Nr.: {best_k}")
# Hyperparameter search for KNN over neighbour count, weighting and metric.
new_knn = KNeighborsClassifier()
# Show which hyperparameters the estimator exposes.
new_knn.get_params()
# Search space: 50 values of k x 2 weightings x 2 metrics = 200 candidates.
new_knn_range = np.arange(1, 51)
grid_params = {
    'n_neighbors': new_knn_range,
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}
# Exhaustive 5-fold grid search. Accuracy is requested explicitly (it is also
# the classifier's default score) so the result is directly comparable with
# the cross_val_score calls, which use scoring="accuracy".
grid = GridSearchCV(new_knn, grid_params, scoring="accuracy", verbose=1, cv=5)
grid_results = grid.fit(X_train, y_train)
# Best mean cross-validated score. The number of evaluated parameter
# combinations is read from the fitted search, so the message always matches
# the actual grid instead of relying on a hard-coded figure.
n_candidates = len(grid_results.cv_results_["params"])
print(f"Best result out of {n_candidates} runs: {grid_results.best_score_:.3f}")
# For comparison: best accuracy of the plain k sweep (1 - its minimum error rate).
print(f"For compare the accuracy: {1 - min_error:.3f}")
# Parameter combination that achieved the best score.
print(f"Best params: {grid_results.best_params_}")
# Final model: build a KNN from the best grid-search parameters and fit it
# on the training split.
best_params = grid_results.best_params_
final_knn = KNeighborsClassifier(**best_params)
final_knn.fit(X_train, y_train)
# Accuracy on the held-out test split.
# NOTE(review): the value is discarded unless this is the last expression of
# a notebook cell -- assign or print it if it should be reported.
final_knn.score(X_test, y_test)
# Persist the fitted model to disk.
with open("breast_cancer_diagnosis_model.pkl", "wb") as f:
    pickle.dump(final_knn, f)
# Reload the model inside a context manager so the file handle is closed
# deterministically instead of being left open.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were created locally or come from a trusted source.
with open("breast_cancer_diagnosis_model.pkl", "rb") as f:
    bc_diagnosis_model = pickle.load(f)
# Predict the labels of the test split with the reloaded model.
pred_results = bc_diagnosis_model.predict(X_test)
print(f"Predictions: {pred_results}")

[CV] END ................................ score: (test=0.945) total time=   0.0s
[CV] END ................................ score: (test=0.945) total time=   0.0s
[CV] END ................................ score: (test=0.934) total time=   0.0s
[CV] END ................................ score: (test=0.901) total time=   0.0s
[CV] END ................................ score: (test=0.846) total time=   0.0s
Mean accuracy for 5 folds: 0.914
Min error_rate: 0.073
Best k with min error rate (0.073) is k-Nr.: 14
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best result out of 120 runs: 0.932
For compare the accuracy: 0.927
Best params: {'metric': 'manhattan', 'n_neighbors': np.int64(11), 'weights': 'distance'}
Predictions: ['B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'B'
 'B' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'M' 'M' 'M' 'M' 'B' 'M' 'M' 'B' 'B'
 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'B' 'M' 'M' 'B' 'B'
 'B' 'M' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'B'