In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [2]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
    
    def fit(self, X, y):
        # Store the training data
        self.X_train = X
        self.y_train = y
        
    def compute_distance(self, X1, X2):
        # Compute Manhattan distance (L1 norm)
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))
        # Default to Euclidean distance (L2 norm)
        else:
            return np.sqrt(np.sum((X1 - X2) ** 2))

    def predict(self, X):
        predictions = []
        
        for test_point in X:
            # Reshape test_point to (1, n_features) to ensure broadcasting works
            test_point = test_point.reshape(1, -1)
            
            # Compute distances using the selected distance metric (Euclidean or Manhattan)
            if self.distance_metric == 'manhattan':
                distances = np.sum(np.abs(self.X_train - test_point), axis=1)
            else:  # Default is Euclidean
                distances = np.linalg.norm(self.X_train - test_point, axis=1)
            
            # Get the indices of the k nearest neighbors
            k_indices = np.argsort(distances)[:self.k]
            
            # Get the labels of the k nearest neighbors
            k_labels = self.y_train[k_indices]
            
            # Predict the most common label among neighbors
            majority_vote = Counter(k_labels).most_common(1)[0][0]
            predictions.append(majority_vote)
        
        return np.array(predictions)



In [3]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    # Drop unnecessary columns like 'id', 'CustomerId', and 'Surname'
    train_data_cleaned = train_data.drop(columns=['id', 'CustomerId', 'Surname'])
    test_data_cleaned = test_data.drop(columns=['id', 'CustomerId', 'Surname'])
    
    # Combine training and test data for consistent label encoding
    combined_data = pd.concat([train_data_cleaned, test_data_cleaned], axis=0)
    
    # Encode categorical columns like 'Geography' and 'Gender'
    label_encoder = LabelEncoder()
    combined_data['Geography'] = label_encoder.fit_transform(combined_data['Geography'])
    combined_data['Gender'] = label_encoder.fit_transform(combined_data['Gender'])
    
    # Split the combined data back into train and test sets
    train_data_cleaned = combined_data.iloc[:len(train_data_cleaned)]
    test_data_cleaned = combined_data.iloc[len(train_data_cleaned):]
    
    # Create new features (Ratios and Interaction terms)
    train_data_cleaned['Balance_Age'] = train_data_cleaned['Balance'] / (train_data_cleaned['Age'] + 1e-5)
    train_data_cleaned['Salary_Products'] = train_data_cleaned['EstimatedSalary'] / (train_data_cleaned['NumOfProducts'] + 1e-5)
    train_data_cleaned['Credit_Age'] = train_data_cleaned['CreditScore'] * train_data_cleaned['Age']
    train_data_cleaned['Balance_ActiveMember'] = train_data_cleaned['Balance'] * train_data_cleaned['IsActiveMember']
    
    test_data_cleaned['Balance_Age'] = test_data_cleaned['Balance'] / (test_data_cleaned['Age'] + 1e-5)
    test_data_cleaned['Salary_Products'] = test_data_cleaned['EstimatedSalary'] / (test_data_cleaned['NumOfProducts'] + 1e-5)
    test_data_cleaned['Credit_Age'] = test_data_cleaned['CreditScore'] * test_data_cleaned['Age']
    test_data_cleaned['Balance_ActiveMember'] = test_data_cleaned['Balance'] * test_data_cleaned['IsActiveMember']
    
    # Scale the numerical columns
    scaler = StandardScaler()
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 
                          'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 
                          'Balance_Age', 'Salary_Products', 'Credit_Age', 'Balance_ActiveMember']
    
    train_data_cleaned[numerical_features] = scaler.fit_transform(train_data_cleaned[numerical_features])
    test_data_cleaned[numerical_features] = scaler.transform(test_data_cleaned[numerical_features])
    
    # Separate features and target
    X_train = train_data_cleaned.drop(columns=['Exited'])
    y_train = train_data_cleaned['Exited']
    X_test = test_data_cleaned
    
    return X_train, y_train, X_test


In [4]:
def cross_validate(X, y, knn, n_splits=5):
    # Initialize k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    roc_auc_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    accuracy_scores = []
    
    # Loop through each split (train/validation)
    for train_index, val_index in kf.split(X):
        # Use iloc to index the DataFrame using the integer indices from KFold
        X_train_fold = X.iloc[train_index]
        X_val_fold = X.iloc[val_index]
        y_train_fold = y.iloc[train_index]
        y_val_fold = y.iloc[val_index]
        
        # Train the KNN on the train fold
        knn.fit(X_train_fold.values, y_train_fold.values)
        
        # Predict on the validation fold
        y_val_pred = knn.predict(X_val_fold.values)
        
        # Compute metrics for the validation fold
        roc_auc = roc_auc_score(y_val_fold, y_val_pred)
        precision = precision_score(y_val_fold, y_val_pred)
        recall = recall_score(y_val_fold, y_val_pred)
        f1 = f1_score(y_val_fold, y_val_pred)
        accuracy = accuracy_score(y_val_fold, y_val_pred)
        
        # Append the scores
        roc_auc_scores.append(roc_auc)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        accuracy_scores.append(accuracy)
    
    # Return the mean of the scores across all folds
    return {
        'roc_auc': np.mean(roc_auc_scores),
        'precision': np.mean(precision_scores),
        'recall': np.mean(recall_scores),
        'f1': np.mean(f1_scores),
        'accuracy': np.mean(accuracy_scores)
    }


In [5]:
# Preprocess the data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Ensure that 'Exited' is not part of the test set
X_test = X_test.drop(columns=['Exited'])

# # Print the shape of the training and test sets
# print(f"Shape of X_train: {X.shape}")
# print(f"Shape of X_test: {X_test.shape}")

# # Print the columns of X_train and X_test to see the difference
# print(f"Columns in X_train: {X.columns}")
# print(f"Columns in X_test: {X_test.columns}")

# Define a wider range of k values to test
k_values = list(range(1, 51, 2))  # Test odd k values between 1 and 50

best_k = None
best_cv_score = 0

best_metrics = {}

# Perform cross-validation for different values of k
for k in k_values:
    print(f"Evaluating KNN with k={k}...")
    
    # Initialize the KNN with the current k value
    knn = KNN(k=k, distance_metric='manhattan')  # You can switch to 'euclidean' if needed
    
    # Perform cross-validation and get all metrics
    metrics = cross_validate(X, y, knn)
    
    # Print the metrics for this k
    print(f"Cross-validation metrics for k={k}:")
    print(f"ROC AUC: {metrics['roc_auc']}")
    print(f"Precision: {metrics['precision']}")
    print(f"Recall: {metrics['recall']}")
    print(f"F1 Score: {metrics['f1']}")
    print(f"Accuracy: {metrics['accuracy']}")
    
    # Update the best k based on ROC AUC score
    if metrics['roc_auc'] > best_cv_score:
        best_cv_score = metrics['roc_auc']
        best_k = k
        best_metrics = metrics

# Print the best k and its metrics
print(f"\nBest k: {best_k}, with cross-validation metrics:")
print(f"ROC AUC: {best_metrics['roc_auc']}")
print(f"Precision: {best_metrics['precision']}")
print(f"Recall: {best_metrics['recall']}")
print(f"F1 Score: {best_metrics['f1']}")
print(f"Accuracy: {best_metrics['accuracy']}")

print(f"Best k: {best_k}, with cross-validation score: {best_cv_score}")

# Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric='manhattan')  # Use the best k with Manhattan distance
knn.fit(X.values, y.values)
test_predictions = knn.predict(X_test.values)

# Save test predictions
pd.DataFrame({
    'id': pd.read_csv('test.csv')['id'],
    'Exited': test_predictions
}).to_csv('submissions.csv', index=False)

print("Test predictions saved to 'submissions.csv'")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_cleaned['Balance_Age'] = train_data_cleaned['Balance'] / (train_data_cleaned['Age'] + 1e-5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_cleaned['Salary_Products'] = train_data_cleaned['EstimatedSalary'] / (train_data_cleaned['NumOfProducts'] + 1e-5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-

Evaluating KNN with k=1...
Cross-validation metrics for k=1:
ROC AUC: 0.7556550305937402
Precision: 0.6132597914383908
Recall: 0.6087238515665502
F1 Score: 0.6106808290220231
Accuracy: 0.8431333333333335
Evaluating KNN with k=3...
Cross-validation metrics for k=3:
ROC AUC: 0.7712483242943373
Precision: 0.7002797773783923
Recall: 0.6084870810170612
F1 Score: 0.6509316669234559
Accuracy: 0.8681333333333333
Evaluating KNN with k=5...
Cross-validation metrics for k=5:
ROC AUC: 0.7728574861459014
Precision: 0.7384829347163234
Recall: 0.5994334042315053
F1 Score: 0.6616311243201419
Accuracy: 0.8761333333333333
Evaluating KNN with k=7...
Cross-validation metrics for k=7:
ROC AUC: 0.7738903987571335
Precision: 0.7558636798429786
Recall: 0.5965866859225095
F1 Score: 0.666763247042646
Accuracy: 0.8795333333333334
Evaluating KNN with k=9...
Cross-validation metrics for k=9:
ROC AUC: 0.7736163859881394
Precision: 0.7655095469556382
Recall: 0.5932843169554048
F1 Score: 0.668400363799705
Accuracy: 0