# KNN Classifier
### Siva.Jasthi@metrostate.edu
Machine Learning and Data Mining

# KNN Classifier using basic python

In [None]:
#KNN using Basic python
def euclidean_distance(x1, x2):
    """Return the straight-line (L2) distance between two points.

    Coordinates are paired by position for every index of ``x1``; entries of
    ``x2`` beyond ``len(x1)`` are never read.
    """
    squared_gaps = ((x1[axis] - x2[axis]) ** 2 for axis in range(len(x1)))
    return sum(squared_gaps) ** 0.5



def knn(data, query, k):
    """Return the labels of the ``k`` rows of ``data`` closest to ``query``.

    Each row is read as ``[feature_1, feature_2, label]``: the first two
    entries are compared to ``query`` with ``euclidean_distance`` and the
    third entry is the class label.  Labels come back nearest-first.  Rows at
    the same distance are ordered by label, because the (distance, label)
    pairs are sorted as tuples.  Fewer than ``k`` labels are returned when
    ``data`` has fewer than ``k`` rows.
    """
    scored = [(euclidean_distance(row[:2], query), row[2]) for row in data]
    ranked = sorted(scored)

    # Keep only the entries whose rank falls inside the first k positions.
    return [label for rank, (_, label) in enumerate(ranked) if rank < k]



def majority_vote(labels):
    """Return the most frequent label in ``labels``.

    When several labels share the highest count, the one that appears first
    in ``labels`` wins.

    Raises:
        IndexError: if ``labels`` is empty.
    """
    from collections import Counter

    label_counts = Counter(labels)
    if not label_counts:
        # Same exception type as indexing an empty result, but with a message
        # that says what actually went wrong.
        raise IndexError("majority_vote() needs at least one label")
    # most_common() keeps equal counts in first-encountered order.
    return label_counts.most_common(1)[0][0]


def best_k(data, query, k_values):
    """Pick the k whose neighbours agree most strongly on a single label.

    For every candidate in ``k_values`` the neighbours of ``query`` are
    fetched with ``knn``, and the share of them carrying the majority label
    is printed and compared.  This "accuracy" is the vote share among the
    neighbours of one query point, not accuracy measured on held-out data.
    A candidate replaces the current best only on a strictly higher share, so
    the earliest of tied candidates is kept.

    Returns:
        ``(k, share)`` for the winning candidate; ``(None, 0)`` when
        ``k_values`` is empty.
    """
    chosen_k, top_share = None, 0
    for candidate in k_values:
        votes = knn(data, query, candidate)
        winner = majority_vote(votes)
        share = votes.count(winner) / len(votes)
        print(candidate, " = ", share)
        if share <= top_share:
            continue
        chosen_k, top_share = candidate, share

    return chosen_k, top_share


# Test the classifier
# Each row is [feature_1, feature_2, label].
data = [[2, 3, 'A'], [4, 2, 'B'], [1, 5, 'A'], [4, 4, 'B'], [6, 8, 'A'], [9, 1, 'B'], [3, 7, 'A'], [5, 6, 'B'], [2, 9, 'A'], [7, 3, 'B'], [8, 5, 'A'], [3, 2, 'B'], [1, 7, 'A'], [5, 4, 'B'], [6, 2, 'A'], [8, 7, 'B'], [2, 6, 'A'], [7, 1, 'B'], [3, 5, 'A'], [9, 3, 'B'], [4, 7, 'A']]


# We want to classify the point [2, 9].
# Note: this point also appears in `data` as [2, 9, 'A'], so its nearest
# neighbour is itself at distance 0.
query = [2, 9]

# Try out different k values and identify the best K
k_values = [3, 5, 7, 9, 11, 13, 15]
# Store the result under its own name so the best_k() function is not
# overwritten by the integer it returns.
selected_k, best_accuracy = best_k(data, query, k_values)
print('best k:', selected_k)
print('best accuracy:', best_accuracy)



# Rebuild the classifier with the best K
neighbors = knn(data, query, selected_k)
label = majority_vote(neighbors)
print('prediction for', query, '=', label)



## 📊 Visualization 1: Basic KNN - Data Points and Query Point

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Number of nearest neighbours to highlight.  The connecting lines, the
# prediction, the title and the printed summary all use this one value so
# they cannot drift apart.
k_viz = 3

# Extract coordinates and labels
class_A = [point for point in data if point[2] == 'A']
class_B = [point for point in data if point[2] == 'B']

# Separate x and y coordinates
A_x = [point[0] for point in class_A]
A_y = [point[1] for point in class_A]
B_x = [point[0] for point in class_B]
B_y = [point[1] for point in class_B]

# Create the plot
plt.figure(figsize=(10, 8))
plt.scatter(A_x, A_y, c='blue', s=100, alpha=0.6, edgecolors='black', label='Class A', marker='o')
plt.scatter(B_x, B_y, c='red', s=100, alpha=0.6, edgecolors='black', label='Class B', marker='s')

# Plot query point
plt.scatter(query[0], query[1], c='green', s=300, alpha=0.8, edgecolors='black',
            marker='*', label=f'Query Point {query}', linewidths=2)

# Rank every training point by its distance to the query
distances_viz = []
for point in data:
    dist = euclidean_distance(point[:2], query)
    distances_viz.append((dist, point))
distances_viz.sort()

# The k_viz closest points (fewer if the data set is smaller than k_viz)
nearest_viz = distances_viz[:k_viz]

# Predict from exactly the neighbours that are drawn, so the title always
# matches the picture.
label_viz = majority_vote([neighbor[2] for _, neighbor in nearest_viz])

# Connect the query to each highlighted neighbour with a dashed line
for dist, neighbor in nearest_viz:
    plt.plot([query[0], neighbor[0]], [query[1], neighbor[1]],
             'g--', alpha=0.5, linewidth=1.5)

    # Annotate the midpoint of the line with the distance
    mid_x = (query[0] + neighbor[0]) / 2
    mid_y = (query[1] + neighbor[1]) / 2
    plt.annotate(f'd={dist:.2f}',
                 xy=(mid_x, mid_y), fontsize=8,
                 bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.5))

plt.xlabel('Feature 1', fontsize=12)
plt.ylabel('Feature 2', fontsize=12)
plt.title(f'KNN Classification (K={k_viz})\nQuery Point {query} → Predicted Class: {label_viz}',
          fontsize=14, fontweight='bold')
plt.legend(loc='best', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\n✅ The query point {query} is classified as Class '{label_viz}'")
print(f"\nThe {k_viz} nearest neighbors are:")
for rank, (dist, neighbor) in enumerate(nearest_viz, start=1):
    print(f"  {rank}. Point {neighbor[:2]} (Class {neighbor[2]}) - Distance: {dist:.2f}")

# KNN Classifier for Breast Cancer dataset

In [None]:
#@title import the libraries
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
#@title Load the Breast Cancer Wisconsin dataset
# Load the dataset object and show its feature matrix (the .data attribute).
breast_cancer = load_breast_cancer()
# NOTE(review): display() is neither defined nor imported in this file —
# presumably the IPython/Jupyter built-in; confirm before running as a script.
display(breast_cancer.data)

In [None]:
#@title KNN on Breast Cancer data set (N = 5)

#@title Load the Breast Cancer Wisconsin dataset
breast_cancer = load_breast_cancer()

#Split the dataset into training and testing sets
# 30% of the rows are held out for testing; random_state=42 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(breast_cancer.data, breast_cancer.target, test_size=0.3, random_state=42)

# Create a KNN classifier object
# NOTE(review): this rebinds the name `knn`, which earlier in the file is the
# pure-Python knn() function.
knn = KNeighborsClassifier(n_neighbors = 5)

# Train the classifier on the training set
knn.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = knn.predict(X_test)

# Calculate the accuracy of the classifier
# `accuracy`, `y_test` and `y_pred` are read again by the confusion-matrix
# cell that follows.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


## 📊 Visualization 2: Confusion Matrix for K=5

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Create confusion matrix
# Uses y_test / y_pred left behind by the K=5 cell above.
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
# NOTE(review): the tick labels assume target 0 = malignant and 1 = benign —
# confirm against breast_cancer.target_names.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
            xticklabels=['Malignant', 'Benign'],
            yticklabels=['Malignant', 'Benign'])
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
# "K=5" is hard-coded in the title; `accuracy` comes from the K=5 cell above.
plt.title(f'Confusion Matrix - KNN (K=5)\nAccuracy: {accuracy:.4f}', 
          fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, 
                          target_names=['Malignant', 'Benign']))

In [None]:
#@title Finding the optimal N value for K-NN Classifier using cross-validation
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Load the breast cancer dataset
# NOTE: `data` is rebound here; earlier in the file it is the toy list of
# [x, y, label] rows used by the pure-Python classifier.
data = load_breast_cancer()
X = data.data
y = data.target

# Create a range of K values to test
# Odd values 3, 5, ..., 21.
k_values = range(3, 22, 2)

# Create an empty list to store the mean cross-validation scores
mean_scores = []

# Loop over the K values and calculate the mean cross-validation score
# (5 folds per K, scored by accuracy)
for k in k_values:
  knn = KNeighborsClassifier(n_neighbors=k)
  scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
  mean_scores.append(scores.mean())


#print the k and accuracy
for k, accuracy in zip(k_values, mean_scores):
  print(f"K = {k}, Accuracy = {accuracy}")


# Find the index of the K value with the highest mean accuracy score
best_k_index = np.argmax(mean_scores)

# Find the best K value based on the index
# NOTE: `best_k` now holds this K value, replacing whatever the name was
# bound to by earlier cells.
best_k = k_values[best_k_index]

# Print the best K value and its corresponding mean accuracy score
print("Best K value: ", best_k)
print("Mean accuracy score: ", mean_scores[best_k_index])


## 📊 Visualization 3: K Value vs Accuracy (Cross-Validation)

In [None]:
# Plot K vs Accuracy
# Reads k_values, mean_scores, best_k and best_k_index from the
# cross-validation cell above.
plt.figure(figsize=(12, 6))

plt.plot(k_values, mean_scores, marker='o', linewidth=2, markersize=8, color='blue')
# Dashed guides through the best K and its score
plt.axvline(x=best_k, color='red', linestyle='--', linewidth=2,
            label=f'Best K = {best_k}')
plt.axhline(y=mean_scores[best_k_index], color='green', linestyle='--',
            linewidth=2, alpha=0.5, label=f'Best Accuracy = {mean_scores[best_k_index]:.4f}')

# Highlight the best point
plt.scatter(best_k, mean_scores[best_k_index], color='red', s=300,
            zorder=5, edgecolors='black', linewidths=2)

plt.xlabel('K Value (Number of Neighbors)', fontsize=12)
plt.ylabel('Cross-Validation Accuracy', fontsize=12)
plt.title('K-NN Hyperparameter Tuning: K Value vs Accuracy\nBreast Cancer Dataset (5-Fold CV)',
          fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.legend(loc='best', fontsize=10)
plt.xticks(k_values)
plt.tight_layout()
plt.show()

# The emoji below were mis-encoded in the original strings and are restored.
print(f"\n📈 Accuracy Range: {min(mean_scores):.4f} to {max(mean_scores):.4f}")
print(f"📊 Accuracy Variation: {max(mean_scores) - min(mean_scores):.4f}")

In [None]:
#@title Which distance metric is giving the best accuracy?
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Breast Cancer Wisconsin dataset
data = load_breast_cancer()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.3, random_state=42)

# Store results (metric name -> test accuracy); read by the bar-chart cell
metric_results = {}

# Metric name -> extra KNeighborsClassifier arguments.
# Minkowski with the default p=2 is exactly the Euclidean distance (and p=1
# is Manhattan), so it is given p=3 here; otherwise the "three-way"
# comparison would contain only two distinct metrics.
metric_settings = {
    'euclidean': {},
    'manhattan': {},
    'minkowski': {'p': 3},
}

# Try different distance metrics
for metric, extra_args in metric_settings.items():
    # Train the KNN model
    knn = KNeighborsClassifier(n_neighbors=13, metric=metric, **extra_args)
    knn.fit(X_train, y_train)

    # Predict the classes of the test set
    y_pred = knn.predict(X_test)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    metric_results[metric] = accuracy
    print("Accuracy ({}): {}".format(metric, accuracy))


## 📊 Visualization 4: Distance Metric Comparison

In [None]:
# Bar plot comparing distance metrics
# Reads metric_results (metric name -> accuracy) from the cell above.
plt.figure(figsize=(10, 6))

metrics = list(metric_results.keys())
accuracies = list(metric_results.values())

colors = ['#3498db', '#e74c3c', '#2ecc71']
bars = plt.bar(metrics, accuracies, color=colors, alpha=0.7, edgecolor='black', linewidth=2)

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{acc:.4f}',
             ha='center', va='bottom', fontsize=12, fontweight='bold')

# Highlight the best metric with a thick gold outline
best_metric = max(metric_results, key=metric_results.get)
best_idx = metrics.index(best_metric)
bars[best_idx].set_edgecolor('gold')
bars[best_idx].set_linewidth(4)

plt.xlabel('Distance Metric', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title(f'Comparison of Distance Metrics (K=13)\nBest Metric: {best_metric.capitalize()} ({metric_results[best_metric]:.4f})',
          fontsize=14, fontweight='bold')
# The y-axis is zoomed to the observed range (not starting at 0) so that
# small differences between metrics are visible.
plt.ylim(min(accuracies) - 0.01, max(accuracies) + 0.01)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# The emoji below were mis-encoded in the original strings and are restored.
print(f"\n🏆 Best Distance Metric: {best_metric.upper()}")
print(f"📊 Accuracy Difference: {max(accuracies) - min(accuracies):.4f}")

## 📊 Visualization 5: Decision Boundary using PCA (2D)

In [None]:
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap

# Reduce to 2 dimensions using PCA so the decision regions can be drawn.
# NOTE: PCA is fitted on the full data set before the train/test split below,
# so the projection has already seen the test rows.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(data.data)

# Split the PCA-transformed data
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, data.target, test_size=0.3, random_state=42)

# Train KNN on PCA data
knn_pca = KNeighborsClassifier(n_neighbors=13, metric='manhattan')
knn_pca.fit(X_train_pca, y_train_pca)

# Create mesh for decision boundary
h = 0.5  # step size in the mesh
x_min, x_max = X_pca[:, 0].min() - 1, X_pca[:, 0].max() + 1
y_min, y_max = X_pca[:, 1].min() - 1, X_pca[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Predict for each point in the mesh
Z = knn_pca.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot
plt.figure(figsize=(12, 8))
cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#0000FF'])

plt.contourf(xx, yy, Z, alpha=0.4, cmap=cmap_light)
class_scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=data.target, cmap=cmap_bold,
                            edgecolor='black', s=50, alpha=0.7)

plt.xlabel(f'First Principal Component ({pca.explained_variance_ratio_[0]:.2%} variance)', fontsize=12)
plt.ylabel(f'Second Principal Component ({pca.explained_variance_ratio_[1]:.2%} variance)', fontsize=12)
plt.title('KNN Decision Boundary (K=13, Manhattan Distance)\nBreast Cancer Dataset - PCA Visualization',
          fontsize=14, fontweight='bold')
# A single scatter call is one artist, so passing two bare label strings to
# legend() cannot produce one correctly coloured entry per class.  Build one
# handle per distinct target value from the scatter's colour mapping instead.
# NOTE(review): the label order assumes target 0 = malignant, 1 = benign —
# confirm against data.target_names.
class_handles, _ = class_scatter.legend_elements()
plt.legend(class_handles, ['Malignant', 'Benign'], loc='best')
plt.tight_layout()
plt.show()

# The emoji below were mis-encoded in the original strings and are restored.
print(f"\n📊 Total variance explained by 2 components: {sum(pca.explained_variance_ratio_):.2%}")
print(f"✅ Test accuracy on PCA data: {knn_pca.score(X_test_pca, y_test_pca):.4f}")

# (Lab 10) KNN Classifier for Wine dataset

In [None]:
#@title KNN Classifier using scikit learn (Wine Dataset)
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold

# Load the wine dataset
wine = load_wine()

# Define the features and target variable
X = wine.data
y = wine.target

# Set up K-fold cross-validation with K=10
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Candidate values of K: 3 to 21 with step size 2.  Kept in one place so the
# "best K" lookup below cannot get out of step with the loop.
k_candidates = range(3, 22, 2)

# Initialize a list to store the accuracy for each value of K
accuracy_list = []

# Test the KNN classifier for every candidate K using K-fold cross-validation
for k in k_candidates:
    knn = KNeighborsClassifier(n_neighbors=k)
    accuracy_sum = 0
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        knn.fit(X_train, y_train)
        accuracy_sum += knn.score(X_test, y_test)
    # Average over however many folds the splitter produces
    accuracy = accuracy_sum / kfold.get_n_splits()
    accuracy_list.append(accuracy)
    print(f"K = {k}, Accuracy = {accuracy}")

# Find the best value of K based on the highest accuracy
# (the first candidate wins when several share the maximum)
best_k = k_candidates[accuracy_list.index(max(accuracy_list))]
print(f"\nBest K = {best_k} with accuracy {max(accuracy_list)}")



## 📊 Visualization 6: K Value vs Accuracy (Wine Dataset)

In [None]:
# Must list the same K values, in the same order, as the wine
# cross-validation cell that filled accuracy_list.
k_range = range(3, 22, 2)

# Plot K vs Accuracy for Wine dataset
plt.figure(figsize=(12, 6))

plt.plot(k_range, accuracy_list, marker='o', linewidth=2, markersize=8,
         color='purple', label='Accuracy')
# Dashed guides through the best K and its score
plt.axvline(x=best_k, color='red', linestyle='--', linewidth=2,
            label=f'Best K = {best_k}')
plt.axhline(y=max(accuracy_list), color='green', linestyle='--',
            linewidth=2, alpha=0.5, label=f'Best Accuracy = {max(accuracy_list):.4f}')

# Highlight the best point
best_idx = accuracy_list.index(max(accuracy_list))
plt.scatter(best_k, accuracy_list[best_idx], color='red', s=300,
            zorder=5, edgecolors='black', linewidths=2)

plt.xlabel('K Value (Number of Neighbors)', fontsize=12)
plt.ylabel('Cross-Validation Accuracy', fontsize=12)
plt.title('K-NN Hyperparameter Tuning: K Value vs Accuracy\nWine Dataset (10-Fold CV)',
          fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.legend(loc='best', fontsize=10)
plt.xticks(k_range)
plt.tight_layout()
plt.show()

# The emoji below were mis-encoded in the original strings and are restored.
print(f"\n📈 Accuracy Range: {min(accuracy_list):.4f} to {max(accuracy_list):.4f}")
print(f"📊 Accuracy Variation: {max(accuracy_list) - min(accuracy_list):.4f}")

In [None]:
#@title Test KNN Wine Classifier with these two samples

# Train the KNN classifier on the entire wine dataset using the best value of K
# (best_k, X and y are the values left behind by the wine cross-validation cell)
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X, y)

# Create two new wine samples and test your classifier
# Each row holds 13 values.
# NOTE(review): presumably in the same column order as wine.feature_names — confirm.
new_samples = np.array([[12.0, 2.35, 2.50, 21.5, 85.0, 1.65, 1.59, 0.42, 1.30, 2.80, 0.90, 2.45, 420.0],
                        [13.5, 2.61, 2.48, 20.5, 120.0, 1.75, 0.84, 0.48, 1.56, 2.52, 0.98, 2.85, 1035.0]])

# Predict the class labels of the new wine samples
new_sample_labels = knn.predict(new_samples)

# Print the predicted class labels of the new wine samples
print(f"\nNew sample 1 has class label {new_sample_labels[0]}")
print(f"New sample 2 has class label {new_sample_labels[1]}")

## 📊 Visualization 7: Wine Dataset - PCA Visualization with Predictions

In [None]:
# Imported here so the cell does not depend on the breast-cancer PCA cell
# having been run first.
from sklearn.decomposition import PCA

# PCA for Wine dataset
pca_wine = PCA(n_components=2)
X_wine_pca = pca_wine.fit_transform(wine.data)

# Transform new samples to PCA space
new_samples_pca = pca_wine.transform(new_samples)

# Create the plot
plt.figure(figsize=(12, 8))

# Plot existing wine samples, one colour/marker per class.
# wine.target is used directly (rather than the global `y`) so the class
# masks always match the rows of wine.data projected above.
colors = ['red', 'blue', 'green']
markers = ['o', 's', '^']
for i, (color, marker) in enumerate(zip(colors, markers)):
    class_mask = wine.target == i
    plt.scatter(X_wine_pca[class_mask, 0], X_wine_pca[class_mask, 1],
                c=color, marker=marker, s=80, alpha=0.6,
                edgecolors='black', label=f'Class {i}')

# Plot new samples as large stars coloured by their predicted class.
# The loop variable is named `predicted` so it does not overwrite the global
# `label` set by the basic-KNN cells.
for idx, (sample, predicted) in enumerate(zip(new_samples_pca, new_sample_labels)):
    plt.scatter(sample[0], sample[1], c=colors[predicted], marker='*',
                s=500, edgecolors='gold', linewidths=3,
                label=f'New Sample {idx+1} → Class {predicted}', zorder=5)

    # Add annotation
    plt.annotate(f'Sample {idx+1}\n(Class {predicted})',
                 xy=(sample[0], sample[1]),
                 xytext=(20, 20), textcoords='offset points',
                 bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.7),
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0', lw=2))

plt.xlabel(f'First Principal Component ({pca_wine.explained_variance_ratio_[0]:.2%} variance)', fontsize=12)
plt.ylabel(f'Second Principal Component ({pca_wine.explained_variance_ratio_[1]:.2%} variance)', fontsize=12)
plt.title(f'Wine Dataset - PCA Visualization with New Sample Predictions\nKNN Classifier (K={best_k})',
          fontsize=14, fontweight='bold')
plt.legend(loc='best', fontsize=9)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# The emoji and arrow in this cell were mis-encoded in the original strings
# and are restored.
print(f"\n📊 Total variance explained by 2 components: {sum(pca_wine.explained_variance_ratio_):.2%}")
print(f"\n🍷 Wine Dataset Classes:")
print(f"  Class 0: {wine.target_names[0]}")
print(f"  Class 1: {wine.target_names[1]}")
print(f"  Class 2: {wine.target_names[2]}")

## 📊 Visualization 8: Confusion Matrix for Wine Dataset

In [None]:
from sklearn.model_selection import train_test_split
# accuracy_score and classification_report are used below; import them here
# as well so the cell does not rely on earlier cells' imports.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split wine data for final evaluation
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
    wine.data, wine.target, test_size=0.3, random_state=42)

# Train with best K
# NOTE: best_k was selected by cross-validation over the whole wine data set,
# which includes the rows held out as the test set here.
knn_final = KNeighborsClassifier(n_neighbors=best_k)
knn_final.fit(X_train_wine, y_train_wine)
y_pred_wine = knn_final.predict(X_test_wine)

# Create confusion matrix
cm_wine = confusion_matrix(y_test_wine, y_pred_wine)
accuracy_wine = accuracy_score(y_test_wine, y_pred_wine)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_wine, annot=True, fmt='d', cmap='RdYlGn', cbar=True,
            xticklabels=wine.target_names,
            yticklabels=wine.target_names)
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.title(f'Confusion Matrix - Wine Dataset\nKNN (K={best_k}) - Accuracy: {accuracy_wine:.4f}',
          fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test_wine, y_pred_wine,
                            target_names=wine.target_names))

## 📊 Summary: Key Visualizations Added

### ✅ Visualizations Included:

1. **Basic KNN Visualization** - 2D scatter plot showing training data, query point, and K nearest neighbors with distance annotations

2. **Confusion Matrix (Breast Cancer)** - Heatmap showing model performance with true vs predicted classifications

3. **K Value Optimization** - Line plot showing how accuracy changes with different K values (cross-validation)

4. **Distance Metric Comparison** - Bar chart comparing Euclidean, Manhattan, and Minkowski distances

5. **Decision Boundary (PCA)** - 2D visualization of decision boundaries using Principal Component Analysis

6. **Wine Dataset K Optimization** - Similar to #3 but for wine dataset

7. **Wine PCA with Predictions** - PCA visualization showing new sample predictions in feature space

8. **Wine Confusion Matrix** - Multi-class confusion matrix for wine classification

### 🎯 Learning Outcomes:
- Understanding how K-NN makes decisions based on proximity
- Visualizing the impact of hyperparameter tuning (K value)
- Comparing different distance metrics
- Seeing decision boundaries in reduced dimensions
- Evaluating model performance through confusion matrices
