# Lesson 5B: K-Nearest Neighbors Practical

Production KNN with scikit-learn, optimization, and practical tips.

In [None]:
import time

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

np.random.seed(42)
print('✅ Libraries loaded')

## Scikit-learn KNN

In [None]:
# Load the breast-cancer dataset and hold out 20% of the rows for testing.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# IMPORTANT: Normalize features!
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to both splits, so the test rows never influence scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline classifier: 5 neighbours, every other setting left at its default.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {test_accuracy:.3f}')
print('\n✅ Scikit-learn KNN trained!')

## Finding Optimal K

In [None]:
# Candidate neighbourhood sizes to evaluate.
k_values = range(1, 31)

# Mean 5-fold cross-validation accuracy for every K.
# The scaler lives inside the pipeline, so it is re-fit on the training part of
# each fold. Cross-validating on the already-scaled training matrix would let
# every validation fold contribute to the scaling statistics (data leakage),
# which biases the scores used to pick K.
scores = []
for k in k_values:
    knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    cv_scores = cross_val_score(knn_pipeline, X_train, y_train, cv=5)
    scores.append(cv_scores.mean())

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, scores, 'o-', linewidth=2)
ax.set_xlabel('K (number of neighbors)', fontsize=12)
ax.set_ylabel('Cross-validation accuracy', fontsize=12)
ax.set_title('Finding Optimal K', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)
plt.show()

# np.argmax returns the first maximum, so ties resolve to the smallest K.
best_k = k_values[int(np.argmax(scores))]
print(f'Best K: {best_k} (accuracy: {max(scores):.3f})')

## Distance Metrics Comparison

In [None]:
# Compare distance metrics at a fixed K=5 on the scaled features.
# NOTE: these are test-set accuracies, shown for illustration only. To actually
# *choose* a metric, compare cross-validation scores on the training data.
metrics = ['euclidean', 'manhattan', 'minkowski', 'chebyshev']

# scikit-learn's 'minkowski' metric defaults to p=2, which is exactly the
# Euclidean distance. Without an explicit p, the 'euclidean' and 'minkowski'
# rows would be the same model. p=3 makes that row a genuinely different metric.
MINKOWSKI_P = 3

for metric in metrics:
    # Only the Minkowski metric takes the power parameter p.
    metric_kwargs = {'p': MINKOWSKI_P} if metric == 'minkowski' else {}
    knn = KNeighborsClassifier(n_neighbors=5, metric=metric, **metric_kwargs)
    knn.fit(X_train_scaled, y_train)
    acc = accuracy_score(y_test, knn.predict(X_test_scaled))
    label = f'{metric} (p={MINKOWSKI_P})' if metric_kwargs else metric
    print(f'{label:15s}: {acc:.3f}')

## Algorithm Comparison

In [None]:
# KNN has different search algorithms.
# All three options perform an exact nearest-neighbour search, so accuracy
# should match across rows (up to tie-breaking). What differs is the cost of
# building the index (fit) and of querying it (predict), so both are timed.
algorithms = ['ball_tree', 'kd_tree', 'brute']

for algo in algorithms:
    knn = KNeighborsClassifier(n_neighbors=5, algorithm=algo)

    start = time.perf_counter()
    knn.fit(X_train_scaled, y_train)
    fit_ms = (time.perf_counter() - start) * 1e3

    start = time.perf_counter()
    algo_pred = knn.predict(X_test_scaled)
    predict_ms = (time.perf_counter() - start) * 1e3

    acc = accuracy_score(y_test, algo_pred)
    # Single-run wall-clock timings are noisy; use %timeit for careful numbers.
    # On a dataset this small, brute force may be as fast as the tree indexes.
    print(f'{algo:12s}: {acc:.3f}  (fit {fit_ms:6.1f} ms, predict {predict_ms:6.1f} ms)')

## Conclusion

**Key Takeaways:**

- **Always normalize features** for KNN
- Use cross-validation to find optimal K
- Euclidean distance works well for most cases
- Ball tree/KD tree speed up searches (most noticeably on larger, low-dimensional datasets)

**Best practices:**

- Start with K=√n, where n is the number of training samples, then tune K with cross-validation
- Normalize features with StandardScaler
- Use KNN for small-medium datasets (<100k samples)
- Consider approximate nearest neighbors for large datasets