In [76]:
import pandas as pd
import numpy as np

In [77]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X.astype(np.float64)
        self.y_train = y.astype(int)

    def predict(self, X):
        y_pred = []
        for x in X:
            distances = self.compute_distance(x, self.X_train)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            # Majority vote using numpy functions
            counts = np.bincount(k_nearest_labels)
            most_common = np.argmax(counts)
            y_pred.append(most_common)
        return np.array(y_pred)

    def compute_distance(self, x1, X2):
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X2 - x1) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X2 - x1), axis=1)
        else:
            raise ValueError("Unsupported distance metric")
        return distances

In [78]:
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Combine train and test data for consistent preprocessing
    data = pd.concat([train_data, test_data], sort=False)

    # Drop unnecessary columns
    data = data.drop(['id', 'Surname'], axis=1)

    # One-hot encode categorical variables
    data = pd.get_dummies(data, columns=['Geography', 'Gender'], drop_first=True)

    # Convert boolean columns to float64
    bool_columns = data.select_dtypes(include=['bool']).columns
    data[bool_columns] = data[bool_columns].astype(float)

    # Ensure all data is numeric and of type float64
    data = data.astype(np.float64)

    # Feature scaling (mean normalization)
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    data[numerical_features] = (data[numerical_features] - data[numerical_features].mean()) / data[numerical_features].std()
    # Handle missing values
    if data.isnull().values.any():
        data = data.fillna(0)

    # Split the data back into train and test sets
    X_train = data.iloc[:len(train_data)].copy()
    X_test = data.iloc[len(train_data):].copy()

    # Separate features and target variable
    y_train = X_train['Exited'].values.astype(int)
    X_train = X_train.drop('Exited', axis=1).values

    if 'Exited' in X_test.columns:
        X_test = X_test.drop('Exited', axis=1)

    X_test = X_test.values
    return X_train, y_train, X_test

In [79]:
def cross_validate(X, y, knn, n_splits=5):
    fold_size = len(X) // n_splits
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    scores = []
    for i in range(n_splits):
        start = i * fold_size
        if i != n_splits - 1:
            end = start + fold_size
        else:
            end = len(X)
        val_indices = indices[start:end]
        train_indices = np.concatenate((indices[:start], indices[end:]))

        X_train_fold = X[train_indices]
        y_train_fold = y[train_indices]
        X_val_fold = X[val_indices]
        y_val_fold = y[val_indices]

        knn.fit(X_train_fold, y_train_fold)
        predictions = knn.predict(X_val_fold)

        # Calculate accuracy
        accuracy = np.mean(predictions == y_val_fold)
        scores.append(accuracy)
    return scores

In [80]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

# Hyperparameter tuning
# Define the possible values for hyperparameters
k_values = [3, 5, 7, 9, 11]
distance_metrics = ['euclidean', 'manhattan']
best_score = 0
best_params = {}

for k in k_values:
    for distance_metric in distance_metrics:
        # Initialize KNN with current hyperparameters
        knn = KNN(k=k, distance_metric=distance_metric)
        # Perform cross-validation
        cv_scores = cross_validate(X, y, knn)
        mean_score = np.mean(cv_scores)
        print(f'k={k}, distance_metric={distance_metric}, Mean CV Score={mean_score}')
        # Update best hyperparameters if current mean score is better
        if mean_score > best_score:
            best_score = mean_score
            best_params = {'k': k, 'distance_metric': distance_metric}

# Train on full dataset with optimal hyperparameters and make predictions on test set

# Initialize KNN with the best hyperparameters
knn = KNN(k=best_params['k'], distance_metric=best_params['distance_metric'])
# Train the model on the full training data
knn.fit(X, y)
# Make predictions on the test set
test_predictions = knn.predict(X_test)

# Save test predictions
test_ids = pd.read_csv('test.csv')['id']
pd.DataFrame({'id': test_ids, 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: [0.766, 0.7703333333333333, 0.7673333333333333, 0.7693333333333333, 0.774]
k=3, distance_metric=euclidean, Mean CV Score=0.7517333333333334
k=3, distance_metric=manhattan, Mean CV Score=0.7582666666666666
k=5, distance_metric=euclidean, Mean CV Score=0.7703333333333335
k=5, distance_metric=manhattan, Mean CV Score=0.7696000000000001
k=7, distance_metric=euclidean, Mean CV Score=0.7794666666666666
k=7, distance_metric=manhattan, Mean CV Score=0.7834666666666668
k=9, distance_metric=euclidean, Mean CV Score=0.7885333333333333
k=9, distance_metric=manhattan, Mean CV Score=0.7894
k=11, distance_metric=euclidean, Mean CV Score=0.7933333333333333
k=11, distance_metric=manhattan, Mean CV Score=0.7945333333333333
Best hyperparameters: k=11, distance_metric=manhattan
