In [1]:
import numpy as np
import pandas as pd

In [16]:
import numpy as np
from collections import Counter

class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """
        Store the training data.
        
        Parameters:
        - X: Feature matrix for training data
        - y: Labels for training data
        """
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X1, X2):
        """
        Compute distances between two sets of samples.
        
        Parameters:
        - X1: First sample set (training data)
        - X2: Second sample set (a single test instance)
        
        Returns:
        - distances: Array of distances from each sample in X1 to X2
        """
        if self.distance_metric == 'euclidean':
            # Using numpy for vectorized operations
            distances = np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric")
        return distances

    def predict(self, X):
        """
        Predict the class labels for the provided data.
        
        Parameters:
        - X: Feature matrix for test data
        
        Returns:
        - predictions: Predicted class labels for each sample in X
        """
        predictions = []
        for x in X:
            # Compute distances from the test instance to all training instances
            distances = self.compute_distance(self.X_train, x)
            # Get indices of the k nearest neighbors
            nearest_indices = np.argsort(distances)[:self.k]
            # Get the labels of the k nearest neighbors
            nearest_labels = self.y_train[nearest_indices]
            # Determine the most common label
            majority_label = Counter(nearest_labels).most_common(1)[0][0]
            predictions.append(majority_label)
        return np.array(predictions)
    def get_params(self, deep=True):
        """Get parameters for this estimator."""
        return {'k': self.k, 'distance_metric': self.distance_metric}

    def set_params(self, **params):
        """Set the parameters of this estimator."""
        for key, value in params.items():
            setattr(self, key, value)
        return self

In [17]:
import numpy as np
import pandas as pd

def one_hot_encode(data, columns):
    """Perform one-hot encoding on specified columns."""
    return pd.get_dummies(data, columns=columns, drop_first=True)

def standard_scale(X):
    """Standardize features by removing the mean and scaling to unit variance."""
    X = np.asarray(X, dtype=np.float64)  # Ensure input is a NumPy array of floats
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)

    # Avoid division by zero by replacing std with 1 if std is zero
    std[std == 0] = 1

    return (X - mean) / std

def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop unnecessary columns
    train_data.drop(columns=['id', 'CustomerId', 'Surname'], inplace=True)
    test_data.drop(columns=['id', 'CustomerId', 'Surname'], inplace=True)

    # Handle categorical variables using one-hot encoding
    train_data = one_hot_encode(train_data, ['Geography', 'Gender'])
    test_data = one_hot_encode(test_data, ['Geography', 'Gender'])

    # Align columns of test_data to match train_data
    test_data = test_data.reindex(columns=train_data.columns.drop('Exited'), fill_value=0)

    # Separate features and target in the training data
    X_train = train_data.drop(columns=['Exited']).values
    y_train = train_data['Exited'].values

    # Scale features using custom standard scaling
    X_train = standard_scale(X_train)
    X_test = standard_scale(test_data.values)

    return X_train, y_train, X_test

def calculate_roc_auc(y_true, y_scores):
    """Calculate the ROC AUC score."""
    from sklearn.metrics import roc_auc_score
    return roc_auc_score(y_true, y_scores)

def cross_validate(X, y, knn, n_splits=5):
    """Perform cross-validation."""
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    roc_auc_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        knn.fit(X_train, y_train)
        y_val_pred_proba = knn.predict(X_val)

        # Compute ROC AUC score
        roc_auc = calculate_roc_auc(y_val, y_val_pred_proba)
        roc_auc_scores.append(roc_auc)

    return np.mean(roc_auc_scores), np.std(roc_auc_scores)

def grid_search(knn_class, param_grid, X, y, scoring_func, n_splits=5):
    best_score = -np.inf
    best_params = {}

    for k in param_grid['k']:
        for distance_metric in param_grid['distance_metric']:
            # Create a KNN instance with the current parameters
            knn = knn_class(k=k, distance_metric=distance_metric)
            
            # Perform cross-validation
            mean_cv_score, _ = cross_validate(X, y, knn, n_splits=n_splits)
            print(f'k={k}, distance_metric={distance_metric}, Mean AUC={mean_cv_score:.4f}')
            
            # Check if the current score is the best score
            if mean_cv_score > best_score:
                best_score = mean_cv_score
                best_params = {'k': k, 'distance_metric': distance_metric}

    return best_params

# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model with default hyperparameters
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
mean_cv_score, std_cv_score = cross_validate(X, y, knn)
print("Cross-validation scores: Mean AUC = {:.4f}, Std = {:.4f}".format(mean_cv_score, std_cv_score))

# Hyperparameter tuning using manual grid search
param_grid = {'k': [1, 3, 5, 7, 9], 'distance_metric': ['euclidean', 'manhattan']}
best_params = grid_search(KNN, param_grid, X, y, calculate_roc_auc, n_splits=5)

# Retrieve the best hyperparameters
best_k = best_params['k']
best_distance_metric = best_params['distance_metric']
print(f'Best parameters from grid search: k={best_k}, distance_metric={best_distance_metric}')

# Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_distance_metric)
knn.fit(X, y)

# Make predictions on test set
test_predictions = knn.predict(X_test)

# Save test predictions
submission = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)
print("Submissions saved to 'submissions.csv'")


Cross-validation scores: Mean AUC = 0.7663, Std = 0.0080
k=1, distance_metric=euclidean, Mean AUC=0.7565
k=1, distance_metric=manhattan, Mean AUC=0.7498
k=3, distance_metric=euclidean, Mean AUC=0.7642
k=3, distance_metric=manhattan, Mean AUC=0.7645
k=5, distance_metric=euclidean, Mean AUC=0.7663
k=5, distance_metric=manhattan, Mean AUC=0.7685
k=7, distance_metric=euclidean, Mean AUC=0.7649
k=7, distance_metric=manhattan, Mean AUC=0.7666
k=9, distance_metric=euclidean, Mean AUC=0.7624
k=9, distance_metric=manhattan, Mean AUC=0.7680
Best parameters from grid search: k=5, distance_metric=manhattan
Submissions saved to 'submissions.csv'
