<a href="https://colab.research.google.com/github/showrin20/Machine-Learning-Learning-Path/blob/main/Knn_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

def process_file(fname):
    '''Load a CSV file and split it into features and labels.

    The file is parsed with ``pandas.read_csv`` using its defaults, then
    converted to a single NumPy array. Column 0 is returned as the label
    vector and every remaining column as the feature matrix.

    Args:
        fname: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is the 2-D array of all
        columns after the first, ``labels`` is the 1-D first column.
    '''
    table = pd.read_csv(fname).to_numpy()
    labels = table[:, 0]
    features = table[:, 1:]
    return features, labels

def process_data(train_file, test_file, train_size=6000, test_size=1000):
    '''Load the train and test CSV files and return leading subsets of each.

    The requested sizes are capped at the number of rows actually present in
    each file, rather than at hard-coded dataset sizes, so the function works
    for datasets of any length.

    Args:
        train_file: Path to the training CSV (passed to ``process_file``).
        test_file: Path to the test CSV (passed to ``process_file``).
        train_size: Maximum number of training rows to keep.
        test_size: Maximum number of test rows to keep.

    Returns:
        A tuple ``(train_feats, train_labs, test_feats, test_labs)`` holding
        the first ``train_size`` / ``test_size`` rows of each file.

    Raises:
        ValueError: If ``train_size`` or ``test_size`` is negative. A negative
            value would otherwise be interpreted as "drop rows from the end"
            by the slice below, which is never what a size means.
    '''
    if train_size < 0 or test_size < 0:
        raise ValueError("train_size and test_size must be non-negative")

    train_feats, train_labs = process_file(train_file)
    test_feats, test_labs = process_file(test_file)

    # Cap against what was really loaded instead of assuming MNIST's sizes.
    train_size = min(train_size, len(train_labs))
    test_size = min(test_size, len(test_labs))

    return (
        train_feats[:train_size],
        train_labs[:train_size],
        test_feats[:test_size],
        test_labs[:test_size],
    )

def euclidean_distance(v1, v2):
    '''Return the Euclidean (L2) distance between two vectors.

    Both inputs are converted to float64 before subtracting. Without this,
    unsigned integer arrays (for example uint8 pixel values) wrap around on
    subtraction (``0 - 1`` becomes ``255``) and yield a wrong distance.
    The conversion also lets plain Python sequences be passed.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same shape as ``v1``.

    Returns:
        The distance as a NumPy float.
    '''
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)
    return np.linalg.norm(a - b)

def knn_predict(train_feats, train_labs, test_feats, k=3):
    '''Predict labels for test data using the k-nearest neighbors algorithm.

    For every test vector, the Euclidean distance to all training vectors is
    computed in one vectorised operation, the ``k`` closest training samples
    are selected, and the most frequent label among them is returned. When
    several labels tie, the smallest label wins (``argmax`` of the counts).

    Args:
        train_feats: 2-D array-like of training features, one row per sample.
        train_labs: 1-D array-like of non-negative integer class labels
            (integer-valued floats are accepted and converted).
        test_feats: 2-D array-like of test features, one row per sample.
        k: Number of neighbours that vote. If it exceeds the number of
            training samples, all of them vote.

    Returns:
        A list with one predicted label per test row.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the training set is empty.
    '''
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Work in float64 so unsigned pixel data cannot wrap around on subtraction.
    train = np.asarray(train_feats, dtype=np.float64)
    queries = np.asarray(test_feats, dtype=np.float64)
    # np.bincount only accepts integer input; labels read from a float CSV
    # array would otherwise raise a TypeError.
    labels = np.asarray(train_labs).astype(np.int64)

    if train.shape[0] == 0:
        raise ValueError("train_feats must contain at least one sample")

    predictions = []
    for query in queries:
        # Distance from this query to every training row at once.
        distances = np.linalg.norm(train - query, axis=1)
        # Stable sort: equidistant neighbours keep their training-set order.
        nearest_indices = np.argsort(distances, kind="stable")[:k]
        vote = np.bincount(labels[nearest_indices]).argmax()
        predictions.append(vote)
    return predictions

def accuracy_score(y_true, y_pred):
    '''Return the fraction of predictions that match the true labels.

    Inputs are converted to NumPy arrays before comparing. Comparing two
    plain Python lists with ``==`` yields a single bool rather than an
    element-wise result, which would make the score meaningless.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of predicted labels, same length as ``y_true``.

    Returns:
        Accuracy as a float in ``[0.0, 1.0]``.

    Raises:
        ValueError: If the inputs differ in shape or are empty (accuracy is
            undefined without any samples).
    '''
    truth = np.asarray(y_true)
    guess = np.asarray(y_pred)
    if truth.shape != guess.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {truth.shape} and {guess.shape}"
        )
    if truth.size == 0:
        raise ValueError("accuracy is undefined for empty input")
    return float(np.mean(truth == guess))

def confusion_matrix(y_true, y_pred, num_classes=10):
    '''Build a confusion matrix from true and predicted labels.

    Entry ``[i, j]`` counts the samples whose true label is ``i`` and whose
    predicted label is ``j``.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of predicted labels, same length as ``y_true``.
        num_classes: Number of classes; labels must lie in
            ``0 .. num_classes - 1``. Defaults to 10.

    Returns:
        An integer array of shape ``(num_classes, num_classes)``.

    Raises:
        ValueError: If the inputs differ in shape (previously the longer one
            was silently truncated).
        IndexError: If any label falls outside ``0 .. num_classes - 1``.
            Negative labels are rejected explicitly because they would
            otherwise wrap around and be counted in the wrong cell.
    '''
    # Cast to integers so labels stored as floats can be used as indices.
    actual = np.asarray(y_true).astype(np.int64)
    predicted = np.asarray(y_pred).astype(np.int64)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {actual.shape} and {predicted.shape}"
        )
    for name, labels in (("y_true", actual), ("y_pred", predicted)):
        if np.any((labels < 0) | (labels >= num_classes)):
            raise IndexError(f"{name} contains labels outside 0..{num_classes - 1}")

    matrix = np.zeros((num_classes, num_classes), dtype=int)
    # np.add.at accumulates correctly when the same (row, col) pair repeats.
    np.add.at(matrix, (actual, predicted), 1)
    return matrix

# Replace these paths with the correct paths to your MNIST dataset files
train_file = '/content/mnist_train.csv'
test_file = '/content/mnist_test.csv'

# Load and process the data
train_feats, train_labs, test_feats, test_labs = process_data(train_file, test_file)

# Evaluate the model for different values of k and determine the best one based on error rates
k_values = [1, 3, 5, 7]
error_rates = {}
# Keep each run's predictions so the winning k does not have to be recomputed;
# a kNN pass over the whole test set is the expensive step of this script.
predictions_by_k = {}

for k in k_values:
    predictions = knn_predict(train_feats, train_labs, test_feats, k)
    predictions_by_k[k] = predictions
    acc = accuracy_score(test_labs, predictions)
    error_rates[k] = 1 - acc

# min() returns the first k with the lowest error, so ties favour the smaller k.
best_k = min(error_rates, key=error_rates.get)
best_predictions = predictions_by_k[best_k]
conf_matrix = confusion_matrix(test_labs, best_predictions)

print("Error Rates:", error_rates)
print("Best k:", best_k)
print("Confusion Matrix for the best k:\n", conf_matrix)


Error Rates: {1: 0.09599999999999997, 3: 0.08699999999999997, 5: 0.08399999999999996, 7: 0.08599999999999997}
Best k: 5
Confusion Matrix for the best k:
 [[ 83   0   0   0   0   0   2   0   0   0]
 [  0 126   0   0   0   0   0   0   0   0]
 [  2   4  98   1   1   0   2   6   2   0]
 [  0   1   0  98   0   2   2   2   0   2]
 [  0   2   0   0  99   0   1   1   0   7]
 [  1   1   0   0   1  81   1   0   1   1]
 [  2   0   0   0   1   0  84   0   0   0]
 [  0   6   0   0   1   1   0  88   0   2]
 [  3   1   1   4   1   3   2   0  71   3]
 [  0   0   0   0   4   0   0   1   2  88]]
