In [3]:
from pathlib import Path

import numpy as np
import pandas as pd

In [5]:
# Define the KNN class
class KNN:
    """K-nearest-neighbours model that predicts the mean label of the closest training rows.

    For 0/1 labels the prediction is the fraction of positive neighbours,
    i.e. a score between 0 and 1.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours to average over. Must be a positive integer.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Metric used to rank training rows. It is validated when distances are
        computed, so the attribute may be changed after construction.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = int(k)
        self.distance_metric = distance_metric
        # Populated by fit(); None marks the model as not yet fitted.
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric training features.
        y : array-like of shape (n_samples,)
            Numeric training labels.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        # Converting to plain arrays makes later indexing positional; a pandas
        # Series with a non-default index would otherwise be looked up by label.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if len(X) == 0:
            raise ValueError("Cannot fit KNN on an empty training set")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return one score per row of ``X`` as a 1-D NumPy array."""
        X = np.asarray(X, dtype=float)
        return np.array([self._predict_single(x) for x in X])

    def _predict_single(self, x):
        """Return the mean label of the (up to) k nearest training rows to ``x``."""
        if getattr(self, 'X_train', None) is None:
            raise RuntimeError("KNN must be fitted with fit(X, y) before predicting")
        distances = self.compute_distance(x, self.X_train)
        # A stable sort breaks distance ties by training order, so results are
        # reproducible regardless of the sort implementation.
        k_indices = np.argsort(distances, kind='stable')[:self.k]
        # Average over the neighbours actually found: when the training set has
        # fewer than k rows, dividing by k would bias the score towards zero.
        return float(np.mean(self.y_train[k_indices]))

    def compute_distance(self, X1, X2):
        """Compute the distance from one sample to every row of ``X2``.

        Parameters
        ----------
        X1 : array-like of shape (n_features,)
            The query sample.
        X2 : array-like of shape (n_samples, n_features)
            Rows to measure against. A 1-D input is treated as ``n_samples``
            rows with a single feature.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            One distance per row of ``X2``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not 'euclidean' or 'manhattan'.
        """
        if self.distance_metric not in ('euclidean', 'manhattan'):
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

        X1 = np.asarray(X1, dtype=float)
        X2 = np.asarray(X2, dtype=float)
        if X2.ndim == 1:
            X2 = X2.reshape(-1, 1)

        # One broadcasted subtraction replaces a Python loop over training rows.
        diff = X2 - X1
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(diff, axis=1)
        return np.sum(np.abs(diff), axis=1)

In [6]:
# Define data preprocessing function
def _encode_categoricals(frame, source):
    """Return a copy of ``frame`` with 'Geography' and 'Gender' mapped to integer codes."""
    category_codes = {
        'Geography': {'France': 0, 'Spain': 1, 'Germany': 2},
        'Gender': {'Male': 0, 'Female': 1},
    }
    encoded = frame.copy()
    for column, codes in category_codes.items():
        mapped = frame[column].map(codes)
        # Values that were present but have no code would silently become NaN
        # and then turn the whole standardised column into NaN; fail loudly.
        unmapped = frame.loc[mapped.isna() & frame[column].notna(), column].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Unmapped {column} values in {source}: {sorted(map(str, unmapped))}"
            )
        encoded[column] = mapped
    return encoded


def preprocess_data(train_path, test_path):
    """Load the train/test CSVs, encode categoricals and standardise the features.

    Parameters
    ----------
    train_path : str or path-like
        CSV containing the feature columns listed below plus 'Exited'.
    test_path : str or path-like
        CSV containing the same feature columns.

    Returns
    -------
    X_train : numpy.ndarray of shape (n_train, 10)
        Standardised training features.
    y_train : numpy.ndarray of shape (n_train,)
        Values of the 'Exited' column.
    X_test : numpy.ndarray of shape (n_test, 10)
        Test features standardised with the *training* mean and standard deviation.

    Raises
    ------
    ValueError
        If 'Geography' or 'Gender' contains a value with no integer code.
    """
    features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

    train_data = _encode_categoricals(pd.read_csv(train_path), 'train data')
    test_data = _encode_categoricals(pd.read_csv(test_path), 'test data')

    X_train = train_data[features].to_numpy(dtype=float)
    y_train = train_data['Exited'].values
    X_test = test_data[features].to_numpy(dtype=float)

    # Scale both sets with statistics learned from the training set only, so a
    # given raw value maps to the same standardised value in train and test.
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    # A constant column has zero spread; dividing by 1 leaves it at 0 instead of NaN.
    std[std == 0] = 1.0

    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    return X_train, y_train, X_test


In [7]:
def compute_auc(y_true, y_proba):
    """Compute the area under the ROC curve for binary labels.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels, expected to be 0 or 1.
    y_proba : array-like of shape (n_samples,)
        Predicted scores; higher means more likely positive.

    Returns
    -------
    float
        The ROC AUC. Returns 0.0 when ``y_true`` contains only one class
        (or is empty), where the AUC is undefined.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_proba`` differ in length.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_proba = np.asarray(y_proba, dtype=float).ravel()
    if y_true.shape != y_proba.shape:
        raise ValueError(
            f"y_true and y_proba must have the same length, "
            f"got {y_true.shape[0]} and {y_proba.shape[0]}"
        )

    total_positives = np.sum(y_true)
    total_negatives = len(y_true) - total_positives

    if total_positives == 0 or total_negatives == 0:
        return 0.0  # AUC is undefined in this case, so we return 0.0

    sorted_indices = np.argsort(y_proba, kind='stable')[::-1]
    y_true_sorted = y_true[sorted_indices]
    scores_sorted = y_proba[sorted_indices]

    # Samples sharing a score fall on the same side of every threshold, so the
    # curve may only have a point at the END of each run of equal scores.
    # Emitting a point per sample would make the area depend on the arbitrary
    # order of tied samples.
    threshold_idx = np.r_[np.flatnonzero(np.diff(scores_sorted)), len(scores_sorted) - 1]

    tpr = np.cumsum(y_true_sorted)[threshold_idx] / total_positives
    fpr = np.cumsum(1 - y_true_sorted)[threshold_idx] / total_negatives

    # Start the curve at the origin.
    tpr = np.concatenate([[0.0], tpr])
    fpr = np.concatenate([[0.0], fpr])

    # Trapezoidal rule written out explicitly so the result does not rely on
    # np.trapz, which newer NumPy releases deprecate in favour of np.trapezoid.
    auc = float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0))

    return auc


def cross_validate(X, y, knn, n_splits=5, shuffle=False, random_state=None, verbose=True):
    """Score a model with k-fold cross-validation using ROC AUC.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Binary labels.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict(X)``; it is re-fitted on
        every fold.
    n_splits : int, default 5
        Number of folds; must satisfy ``2 <= n_splits <= n_samples``.
    shuffle : bool, default False
        Shuffle sample order before splitting. The default keeps contiguous
        folds in the original row order.
    random_state : int or None, default None
        Seed for the shuffle; only used when ``shuffle`` is True.
    verbose : bool, default True
        Print the AUC of each fold as it is computed.

    Returns
    -------
    list
        One AUC score per fold, in fold order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``n_splits`` is out of range.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}), "
            f"got {n_splits}"
        )

    indices = np.arange(n_samples)
    if shuffle:
        np.random.default_rng(random_state).shuffle(indices)

    auc_scores = []

    # array_split spreads the remainder over the first folds, so every sample
    # is validated exactly once even when n_samples is not divisible by n_splits.
    for val_idx in np.array_split(indices, n_splits):
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[val_idx] = False

        # Train on everything outside the current fold
        knn.fit(X[train_mask], y[train_mask])

        # Score the held-out fold
        y_pred = knn.predict(X[val_idx])
        auc = compute_auc(y[val_idx], y_pred)
        if verbose:
            print("auc: ", auc)

        auc_scores.append(auc)

    return auc_scores


In [None]:
# --- Configuration -----------------------------------------------------------
TRAIN_PATH = Path('data/train.csv')
TEST_PATH = Path('data/test.csv')
SUBMISSION_PATH = Path('output/submissions.csv')

# Hyper-parameter grid explored by cross-validation.
K_VALUES = range(2, 50)
DISTANCE_METRICS = ['euclidean', 'manhattan']

# Settings used for the final model.
# NOTE(review): presumably picked from the grid-search output below — confirm.
FINAL_K = 40
FINAL_METRIC = 'manhattan'

# --- Load data ---------------------------------------------------------------
X, y, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)

# --- Grid search -------------------------------------------------------------
# Every (k, metric) pair is cross-validated; this loop dominates the runtime.
for k in K_VALUES:
    for distance_metric in DISTANCE_METRICS:
        knn = KNN(k=k, distance_metric=distance_metric)
        print(k, distance_metric)
        cv_scores = cross_validate(X, y, knn)
        print("Cross-validation scores:", cv_scores)
        # The mean makes configurations comparable at a glance.
        print("Mean AUC:", np.mean(cv_scores))

# --- Final model and submission ----------------------------------------------
knn = KNN(k=FINAL_K, distance_metric=FINAL_METRIC)

knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Only the id column is needed here, so avoid re-parsing the whole test file.
test_ids = pd.read_csv(TEST_PATH, usecols=['id'])['id']

# Create the output directory first so the write cannot fail on a missing
# folder after the long grid search above.
SUBMISSION_PATH.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame({'id': test_ids, 'Exited': test_predictions}).to_csv(SUBMISSION_PATH, index=False)