In [11]:
import numpy as np
import pandas as pd

In [12]:
# Load and preprocess data
from google.colab import drive
drive.mount('/content/drive')

# Define file paths
train_path = '/content/drive/My Drive/BU/Fall 2024/CS506/Assignments/Assignment 5/train.csv'
test_path = '/content/drive/My Drive/BU/Fall 2024/CS506/Assignments/Assignment 5/test.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean', weighted=False):
        self.k = k
        self.distance_metric = distance_metric
        self.weighted = weighted

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        predictions = []
        for x in X:
            distances = self.compute_distance(self.X_train, x)
            # Get the indices of k nearest neighbors
            nearest_neighbors_indices = np.argsort(distances)[:self.k]
            nearest_neighbors_labels = self.y_train[nearest_neighbors_indices]
            nearest_neighbors_distances = distances[nearest_neighbors_indices]

            # If weighted, compute weighted voting
            if self.weighted:
                inverse_distances = 1 / (nearest_neighbors_distances + 1e-5)  # Avoid division by zero
                weighted_votes = np.zeros(np.max(self.y_train) + 1)
                for idx, label in enumerate(nearest_neighbors_labels):
                    weighted_votes[label] += inverse_distances[idx]
                prediction = np.argmax(weighted_votes)
            else:
                # Unweighted voting: most common label among neighbors
                prediction = np.argmax(np.bincount(nearest_neighbors_labels.astype(int)))

            predictions.append(prediction)
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            # Euclidean distance: sqrt(sum((x1 - x2)^2))
            distances = np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        return distances

In [14]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    # Separate features and labels for training data
    # Assuming 'Exited' is the target variable and 'id' is a non-feature column
    X_train = train_data.drop(columns=['Exited', 'id'])
    y_train = train_data['Exited'].astype(int)  # Convert target labels to integers

    X_test = test_data.drop(columns=['id'])

    # Convert to numpy arrays for use in KNN
    X_train = X_train.select_dtypes(include=[np.number]).values
    X_test = X_test.select_dtypes(include=[np.number]).values
    y_train = y_train.values

    # Normalize/scale the features (mean = 0, std = 1)
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    return X_train, y_train, X_test

In [15]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    n_samples = len(X)
    fold_size = n_samples // n_splits
    scores = []

    for i in range(n_splits):
        # Split the data into training and validation sets
        start_val = i * fold_size
        end_val = (i + 1) * fold_size if i < n_splits - 1 else n_samples

        X_val = X[start_val:end_val]
        y_val = y[start_val:end_val]

        X_train = np.concatenate((X[:start_val], X[end_val:]), axis=0)
        y_train = np.concatenate((y[:start_val], y[end_val:]), axis=0)

        # Train the model
        knn.fit(X_train, y_train)

        # Predict on validation set (labels treated as pseudo-scores)
        y_pred = knn.predict(X_val)

        # Use the predicted labels directly for ROC AUC
        roc_auc = compute_roc_auc(y_val, y_pred)
        scores.append(roc_auc)

    return scores

def compute_roc_auc(y_true, y_pred):
    # Sort by predicted labels (0 or 1 treated as scores)
    sorted_indices = np.argsort(y_pred)[::-1]
    y_true_sorted = y_true[sorted_indices]

    # True positive and false positive rates
    tpr_list = []
    fpr_list = []

    P = np.sum(y_true == 1)  # Total positives
    N = np.sum(y_true == 0)  # Total negatives

    tp = 0
    fp = 0

    prev_label = -1
    for i in range(len(y_pred)):
        if y_pred[i] != prev_label:
            tpr_list.append(tp / P)
            fpr_list.append(fp / N)
            prev_label = y_pred[i]

        if y_true_sorted[i] == 1:
            tp += 1
        else:
            fp += 1

    # Add final (1, 1) point
    tpr_list.append(1)
    fpr_list.append(1)

    # Compute area under the curve (AUC) using the trapezoidal rule
    tpr = np.array(tpr_list)
    fpr = np.array(fpr_list)
    auc = np.trapz(tpr, fpr)

    return auc

In [16]:
X, y, X_test = preprocess_data(train_path, test_path)

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
k_values = [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25]
best_score = -1
best_k = None

# Hyperparameter tuning for k (Euclidean distance only)
for k in k_values:
    knn = KNN(k=k, distance_metric='euclidean', weighted=True)  # Use weighted KNN
    cv_scores = cross_validate(X, y, knn)
    avg_score = np.mean(cv_scores)  # Average ROC AUC score

    # Update the best parameters if the current one is better
    if avg_score > best_score:
        best_score = avg_score
        best_k = k

print(f"Best k found: {best_k}, with score: {best_score}")
# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric='euclidean', weighted=True)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv(test_path)['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: [0.7580419789941601, 0.7635930270797424, 0.7651226520439236, 0.7620617005163131, 0.7435775682165994]
Best k found: 11, with score: 0.7609190470303633
