In [6]:
import numpy as np
import pandas as pd

In [7]:
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Parameters
    ----------
    k : int, default 5
        Number of neighbours that vote on each prediction. Must be >= 1.
    distance_metric : {'manhattan', 'euclidean'}, default 'manhattan'
        Metric used to rank training samples by distance to a query point.
    """

    def __init__(self, k=5, distance_metric='manhattan'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix (ndarray, nested list or DataFrame).
        y : array-like of shape (n_samples,)
            Class labels; they are cast to ``int``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` contain a different number of samples.
        """
        # Coerce up front so lists and DataFrames behave like ndarrays later on.
        X_train = np.asarray(X, dtype=float)
        y_train = np.asarray(y).astype(int)
        if len(X_train) != len(y_train):
            raise ValueError("X and y must contain the same number of samples")
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        """Predict a label for every row of ``X`` by majority vote.

        Ties between classes are resolved in favour of the smallest label.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Predicted labels.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or the distance metric is unsupported.
        """
        if self.k < 1:
            # A negative k would otherwise slice "all but the last |k|"
            # neighbours and silently produce meaningless votes.
            raise ValueError("k must be a positive integer")

        # Iterating a DataFrame yields column names, not rows, so convert first.
        X = np.asarray(X, dtype=float)

        predictions = []
        for x in X:
            distances = self.compute_distance(x, self.X_train)
            # A stable sort makes the choice among equidistant neighbours
            # deterministic (lowest training index wins).
            k_indices = np.argsort(distances, kind='stable')[:self.k]
            # np.unique returns labels in ascending order, so argmax picks the
            # smallest label among tied classes; unlike np.bincount it also
            # accepts negative labels.
            labels, counts = np.unique(self.y_train[k_indices], return_counts=True)
            predictions.append(labels[counts.argmax()])
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        """Return the distance from sample ``X1`` to every row of ``X2``.

        Parameters
        ----------
        X1 : numpy.ndarray of shape (n_features,)
        X2 : numpy.ndarray of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not 'manhattan' or 'euclidean'.
        """
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(X2 - X1), axis=1)
        elif self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X2 - X1)**2, axis=1))
        else:
            raise ValueError("Unsupported distance metric")

In [8]:
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and turn them into standardised feature arrays.

    Every transformation (category encoding, missing-value imputation and
    standardisation) is fitted on the training data only and then applied
    unchanged to the test data, so both arrays live in the same feature space.

    Parameters
    ----------
    train_path : str
        CSV containing the feature columns plus 'id' and the target 'Exited'.
    test_path : str
        CSV containing the feature columns plus 'id'.

    Returns
    -------
    X : numpy.ndarray
        Standardised training features.
    y : numpy.ndarray
        Values of the 'Exited' column.
    X_test : numpy.ndarray
        Test features, with the same columns in the same order as ``X``.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    X = train_df.drop(['Exited', 'id'], axis=1)
    y = train_df['Exited'].values
    # Align the test columns with the training columns so the statistics
    # computed below line up label-for-label.
    X_test = test_df.drop('id', axis=1).reindex(columns=X.columns)

    categorical_cols = X.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        train_categorical = pd.Categorical(X[col])
        X[col] = train_categorical.codes
        # Reuse the training categories so a given value maps to the same
        # integer in both sets; values unseen in training become -1.
        X_test[col] = pd.Categorical(
            X_test[col], categories=train_categorical.categories
        ).codes

    X = X.apply(pd.to_numeric, errors='coerce')
    X_test = X_test.apply(pd.to_numeric, errors='coerce')

    # Impute with training means; a column that is entirely missing has a NaN
    # mean, so fall back to 0 for it.
    fill_values = X.mean().fillna(0.0)
    X = X.fillna(fill_values)
    X_test = X_test.fillna(fill_values)

    mean = X.mean()
    std = X.std()
    # Constant columns (std == 0) or a single-row frame (std is NaN) would
    # otherwise turn the whole column into NaN/inf.
    std = std.where(std > 0, 1.0)

    X = (X - mean) / std
    X_test = (X_test - mean) / std

    return X.values, y, X_test.values

In [9]:
def cross_validate(X, y, model, cv=5, random_state=None):
    """Estimate accuracy with shuffled k-fold cross-validation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Target labels.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted on
        every fold and is left fitted on the last training split.
    cv : int, default 5
        Number of folds; must satisfy ``2 <= cv <= n_samples``.
    random_state : int or None, default None
        Seed for the shuffle. With ``None`` the global NumPy random state is
        used, so ``np.random.seed`` still controls the split.

    Returns
    -------
    numpy.ndarray of shape (cv,)
        Accuracy on each held-out fold.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``cv`` is out of range.
    """
    # Positional fancy indexing below needs ndarrays (lists/DataFrames fail).
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = len(y)
    if len(X) != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if cv < 2 or cv > n_samples:
        raise ValueError("cv must be between 2 and the number of samples")

    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.default_rng(random_state).shuffle(indices)

    # array_split spreads any remainder over the first folds, so every sample
    # is held out exactly once even when n_samples is not divisible by cv.
    folds = np.array_split(indices, cv)

    scores = []
    for i, test_indices in enumerate(folds):
        train_indices = np.concatenate(folds[:i] + folds[i + 1:])

        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        scores.append(np.mean(predictions == y_test))

    return np.array(scores)

In [10]:
# Mount Google Drive at '/content/drive' so the CSV paths used by the later
# cell (which live under that directory) can be read and written.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Single place to change where the assignment files live on the mounted Drive.
DATA_DIR = '/content/drive/My Drive/BU/Year 4: Fall 2024/CS506/Assignment 5'
# Seed for NumPy's global RNG, which cross_validate uses to shuffle the folds.
SEED = 42

train_path = DATA_DIR + '/train.csv'
test_path = DATA_DIR + '/test.csv'
submission_path = DATA_DIR + '/submissions.csv'

X, y, X_test = preprocess_data(train_path, test_path)

knn = KNN(k=5, distance_metric='manhattan')

# Seed before cross-validating so the reported scores are reproducible
# across Restart & Run All.
np.random.seed(SEED)
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)

# Refit on the full training set before predicting the test set.
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Only the 'id' column is needed for the submission, so avoid re-parsing the
# whole test file.
test_ids = pd.read_csv(test_path, usecols=['id'])['id']
submission_df = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission_df.to_csv(submission_path, index=False)

Cross-validation scores: [0.86633333 0.86933333 0.88033333 0.86933333 0.86966667]
