In [None]:
import numpy as np
import pandas as pd

In [None]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Labels are cast to ``int`` in ``fit`` and must be non-negative because
    ``predict`` votes with ``np.bincount``. ``predict_proba`` additionally
    assumes the labels are binary (0 or 1).

    Parameters
    ----------
    k : int
        Number of nearest training points that vote for each query point.
    distance_metric : {'euclidean', 'manhattan'}
        Metric used by ``compute_distance``.

    Raises
    ------
    ValueError
        If ``distance_metric`` is not one of the supported metrics.
    """

    _SUPPORTED_METRICS = ('euclidean', 'manhattan')

    def __init__(self, k=3, distance_metric='euclidean'):
        # Fail fast instead of silently falling back to Euclidean distance.
        if distance_metric not in self._SUPPORTED_METRICS:
            raise ValueError(
                f"Unsupported distance_metric {distance_metric!r}; "
                f"expected one of {self._SUPPORTED_METRICS}"
            )
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Memorise the training set.

        Parameters
        ----------
        X : array-like, one row per training sample.
        y : array-like of labels, one per row of ``X`` (cast to ``int``).

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X_train = np.array(X)
        y_train = np.array(y).astype(int)
        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError(
                f"X has {X_train.shape[0]} samples but y has {y_train.shape[0]}"
            )
        self.X_train = X_train
        self.y_train = y_train

    def compute_distance(self, X_test):
        """Return the distance from ONE query point to every training point.

        ``X_test`` is a single feature vector; it is broadcast against all
        rows of ``self.X_train`` and reduced over the feature axis, giving one
        distance per training sample.
        """
        diff = self.X_train - np.asarray(X_test)
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=1))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(diff), axis=1)
        # Reached only if the attribute was changed after construction.
        raise ValueError(f"Unsupported distance_metric {self.distance_metric!r}")

    def _k_nearest_labels(self, x_test):
        """Return the labels of the (at most) ``k`` training points nearest to ``x_test``."""
        distances = self.compute_distance(x_test)
        k_indices = np.argsort(distances)[:self.k]
        return self.y_train[k_indices]

    def predict(self, X):
        """Predict a label for every row of ``X`` by majority vote.

        Ties are resolved in favour of the smallest label, because
        ``np.argmax`` returns the first maximum of the ``np.bincount`` tally.
        """
        X = np.asarray(X)
        return np.array(
            [np.argmax(np.bincount(self._k_nearest_labels(x_test))) for x_test in X]
        )

    def predict_proba(self, X):
        """Return ``[P(class 0), P(class 1)]`` for every row of ``X``.

        P(class 1) is the fraction of the selected neighbours labelled 1.
        The denominator is the number of neighbours actually found, not
        ``self.k``, so the two columns still sum to 1 when the training set
        holds fewer than ``k`` points.
        """
        X = np.asarray(X)
        probas = []
        for x_test in X:
            neighbour_labels = self._k_nearest_labels(x_test)
            prob_class_1 = np.sum(neighbour_labels) / len(neighbour_labels)
            probas.append([1 - prob_class_1, prob_class_1])
        return np.array(probas)

In [None]:
# Define data preprocessing function
def preprocess_data(train_path, test_path, feature_weights=None):
    """Load the train/test CSVs and turn them into weighted, scaled feature matrices.

    Steps:
      1. Drop the identifier / unused columns and split off the ``Exited`` target.
      2. Give the test frame exactly the training columns.
      3. One-hot encode ``Geography`` and ``Gender`` using the category levels
         seen in the TRAINING data, so both frames always get the same dummy
         columns and ``drop_first`` removes the same level in both.
      4. Impute missing values with the training medians.
      5. Min-max scale with the training min / max.
      6. Multiply selected features by a per-feature weight.

    Parameters
    ----------
    train_path : str or path-like
        CSV containing the feature columns plus ``Exited``.
    test_path : str or path-like
        CSV containing the feature columns.
    feature_weights : dict[str, float] or None
        Mapping of output column name -> importance weight. The weights are
        normalised by the largest absolute weight before being applied, and
        features missing from the data are skipped. ``None`` uses the built-in
        weights below.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test)``.
    """
    dropped_columns = ['id', 'CustomerId', 'Surname', 'HasCrCard', 'Tenure', 'CreditScore']
    categorical_columns = ['Geography', 'Gender']

    train_data = pd.read_csv(train_path).drop(columns=dropped_columns)
    test_data = pd.read_csv(test_path).drop(columns=dropped_columns)

    y_train = train_data['Exited']
    train_data = train_data.drop(columns=['Exited'])

    # Give the test frame exactly the training columns (missing ones become 0).
    train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)

    # Encode both frames against the training categories. Encoding them
    # independently breaks when a level is absent from the test file: the
    # dummy columns differ and drop_first removes a different level.
    # Levels that only occur in the test file become all-zero dummies.
    for column in categorical_columns:
        levels = sorted(train_data[column].dropna().unique())
        level_dtype = pd.CategoricalDtype(categories=levels)
        train_data[column] = train_data[column].astype(level_dtype)
        test_data[column] = test_data[column].astype(level_dtype)

    train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True).astype(float)
    test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True).astype(float)
    test_data = test_data.reindex(columns=train_data.columns, fill_value=0.0)

    # Impute with training statistics only, so the test set is transformed
    # exactly like the data the model was fitted on.
    train_medians = train_data.median()
    train_data = train_data.fillna(train_medians)
    test_data = test_data.fillna(train_medians)

    # Min-max scaling with training statistics. A constant column has a zero
    # range; divide by 1 instead so it maps to 0 rather than NaN.
    train_min = train_data.min()
    value_range = train_data.max() - train_min
    value_range = value_range.where(value_range != 0, 1.0)
    X_train_scaled = (train_data - train_min) / value_range
    X_test_scaled = (test_data - train_min) / value_range

    if feature_weights is None:
        feature_weights = {
            'Age': 1.148512,
            'Geography_Germany': 0.597631,
            'EstimatedSalary': 0.055690,
            'Geography_Spain': 0.031196,
            'Balance': 0.285098,
            'Gender_Male': 0.406736,
            'IsActiveMember': 0.666314,
            'NumOfProducts': 0.689368
        }

    # Normalise so the largest weight is 1, then stretch each feature axis;
    # features with larger weights contribute more to the KNN distance.
    max_weight = max((abs(weight) for weight in feature_weights.values()), default=0)
    if max_weight > 0:
        for feature, weight in feature_weights.items():
            if feature in X_train_scaled.columns:
                factor = abs(weight) / max_weight
                X_train_scaled[feature] *= factor
                X_test_scaled[feature] *= factor

    # Return the preprocessed and scaled data as numpy arrays
    return X_train_scaled.to_numpy(), y_train.to_numpy(), X_test_scaled.to_numpy()

In [None]:
# Define the ROC AUC helper and the cross-validation function
def roc_auc_score_np(y_true, y_prob):
    """Compute the area under the ROC curve with NumPy only.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_prob : array-like of scores for the positive class (higher = more positive).

    Returns
    -------
    float
        The ROC AUC, or ``nan`` when ``y_true`` contains only one class (the
        curve is undefined in that case).

    Notes
    -----
    ROC points are emitted only where the score changes. Samples that share a
    score are indistinguishable to any threshold, so they must move the curve
    together; stepping through them one at a time makes the result depend on
    their arbitrary order. This matters for KNN, whose probabilities take only
    ``k + 1`` distinct values.
    """
    labels = np.asarray(y_true).astype(float).ravel()
    scores = np.asarray(y_prob, dtype=float).ravel()

    n_pos = labels.sum()
    n_neg = labels.size - n_pos
    if n_pos == 0 or n_neg == 0:
        return float('nan')

    # Sort by decreasing score.
    order = np.argsort(scores, kind='stable')[::-1]
    sorted_scores = scores[order]
    sorted_labels = labels[order]

    # Index of the last sample in every run of equal scores.
    threshold_idx = np.r_[np.flatnonzero(np.diff(sorted_scores)), sorted_labels.size - 1]

    # Cumulative true / false positives at each distinct threshold.
    true_positives = np.cumsum(sorted_labels)[threshold_idx]
    false_positives = np.cumsum(1 - sorted_labels)[threshold_idx]

    # Add (0, 0) at the beginning of the curve
    tpr = np.concatenate([[0.0], true_positives / n_pos])
    fpr = np.concatenate([[0.0], false_positives / n_neg])

    # Trapezoidal rule, written out so it does not depend on np.trapz
    # (deprecated in NumPy 2.0 in favour of np.trapezoid).
    return float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0))

def cross_validate(X, y, knn, n_splits=5, random_state=None):
    """Estimate the mean ROC AUC of ``knn`` with shuffled k-fold cross-validation.

    Parameters
    ----------
    X : array-like, one row per sample (DataFrames are converted to arrays).
    y : array-like of 0/1 labels, one per row of ``X``.
    knn : object exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1 of
        ``predict_proba`` is taken as the positive-class score. The model is
        re-fitted on every fold, so it is left fitted on the last training split.
    n_splits : int
        Number of folds (at least 2 and at most the number of samples).
    random_state : int or None
        Seed for the shuffle. ``None`` uses NumPy's global random state, so a
        prior ``np.random.seed(...)`` call still controls the split.

    Returns
    -------
    float
        Mean ROC AUC over the folds.

    Raises
    ------
    ValueError
        If ``n_splits`` is outside ``[2, n_samples]``.
    """
    # Convert DataFrames to numpy arrays
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = X.shape[0]
    if n_splits < 2 or n_splits > n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}); got {n_splits}"
        )

    # Shuffle the data indices
    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.default_rng(random_state).shuffle(indices)

    # array_split spreads the remainder over the first folds, so every sample
    # is held out exactly once even when n_samples % n_splits != 0.
    folds = np.array_split(indices, n_splits)
    auc_scores = []

    for test_indices in folds:
        # The rest are the training indices
        train_indices = np.setdiff1d(indices, test_indices)

        # Split the data into training and test sets
        X_train, y_train = X[train_indices], y[train_indices]
        X_test, y_test = X[test_indices], y[test_indices]

        # Train the K-NN model
        knn.fit(X_train, y_train)

        # Predict probabilities for the test set (positive class = 1)
        y_prob = knn.predict_proba(X_test)[:, 1]

        # Calculate the ROC AUC score for this fold using the helper function
        auc_scores.append(roc_auc_score_np(y_test, y_prob))

    # Return the average ROC AUC score across all folds
    return np.mean(auc_scores)


In [None]:
# Load and preprocess data
TRAIN_PATH = '/content/train.csv'
TEST_PATH = '/content/test.csv'

# Seed NumPy's global RNG so the shuffled cross-validation folds (and
# therefore the reported AUC values) are reproducible on Restart & Run All.
SEED = 42
np.random.seed(SEED)

X, y, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)

# Create and evaluate a baseline model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation; cross_validate returns the MEAN AUC across folds,
# so a single number is printed here.
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparameter tuning

def hyperparameter_tuning(X, y, n_splits=5, k_values=None):
    """Pick the ``k`` with the best cross-validated ROC AUC.

    Parameters
    ----------
    X, y : training features and labels, passed through to ``cross_validate``.
    n_splits : int
        Number of cross-validation folds.
    k_values : iterable of int or None
        Candidate neighbour counts. ``None`` keeps the single default
        candidate ``[24]``.

    Returns
    -------
    dict
        ``{'k': best_k}``.

    Raises
    ------
    ValueError
        If no candidate produced a comparable score (empty grid, or every
        AUC was NaN), instead of silently returning ``{'k': None}``.
    """
    if k_values is None:
        k_values = [24]  # Different values of k (number of neighbors)

    # Start below any reachable AUC so that a score of exactly 0 still wins.
    best_auc = float('-inf')
    best_params = {'k': None}

    # Iterate over all values of k
    for k in k_values:
        # Initialize a K-NN model with the current value of k and Euclidean distance
        knn = KNN(k=k, distance_metric='euclidean')

        # Perform cross-validation and get the average AUC score
        auc_score = cross_validate(X, y, knn, n_splits=n_splits)
        print(f"k: {k}, AUC: {auc_score}")

        # Check if this is the best score so far (NaN never compares greater)
        if auc_score > best_auc:
            best_auc = auc_score
            best_params = {'k': k}

    if best_params['k'] is None:
        raise ValueError("No value of k produced a valid AUC score")

    print(f"Best hyperparameters: k = {best_params['k']}")
    print(f"Best AUC: {best_auc}")
    return best_params



# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
# Search for the best k, then refit on every training row with that setting.
best_params = hyperparameter_tuning(X, y)

knn = KNN(k=best_params['k'])
knn.fit(X, y)

# Hard class labels for the held-out test set.
# NOTE(review): model selection above scores predicted probabilities (AUC),
# while the submission stores hard labels — confirm which one is expected.
test_predictions = knn.predict(X_test)

# Build the submission: ids come straight from the raw test file, in the same
# row order that preprocess_data read them.
test_ids = pd.read_csv('/content/test.csv')['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions3.csv', index=False)

Cross-validation scores: 0.8873008031725792
k: 24, AUC: 0.9195204345953838
Best hyperparameters: k = 24
Best AUC: 0.9195204345953838
