In [47]:
import numpy as np
import pandas as pd
from collections import Counter

In [48]:
# Define the KNN class
class KNN:
    """Distance-weighted k-nearest-neighbours scorer for numeric labels.

    For every query row, ``predict`` returns the inverse-distance-weighted
    average of the labels of its ``k`` closest training rows. With 0/1 labels
    that average lies in [0, 1] and can be read as a score for class 1.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote. Must be a positive integer when
        ``predict`` is called; a value larger than the training set simply
        uses every training row.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Metric used by ``compute_distance``.
    """

    # Added to every neighbour distance so that an exact match (distance 0)
    # receives a very large but finite weight instead of dividing by zero.
    _EPSILON = 1e-5

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training data (nothing is computed until ``predict``).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features; converted to float64.
        y : array-like with n_samples entries along its first axis
            Training labels; converted to float64.

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array, or ``y`` does not have one
            entry per row of ``X``.
        """
        X_train = np.asarray(X, dtype=np.float64)
        y_train = np.asarray(y, dtype=np.float64)
        if X_train.ndim != 2 or X_train.shape[0] == 0:
            raise ValueError(
                f'X must be a non-empty 2-D array, got shape {X_train.shape}')
        if y_train.ndim == 0 or y_train.shape[0] != X_train.shape[0]:
            raise ValueError(
                f'X has {X_train.shape[0]} rows but y has shape {y_train.shape}')
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        """Return the distance-weighted mean neighbour label for each row of X.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Query points. A 1-D input is treated as a column of single-feature
            samples, which is only valid for a model fitted on one feature.

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            One float64 score per query row.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer, if the number of feature
            columns differs from the training data, or if the distance metric
            is unknown.
        """
        # Checked here rather than in __init__ because ``k`` is a plain
        # attribute and may have been reassigned after construction.
        if not isinstance(self.k, (int, np.integer)) or self.k < 1:
            raise ValueError(f'k must be a positive integer, got {self.k!r}')

        X = np.asarray(X, dtype=np.float64)
        # An empty batch yields an empty result.
        if X.ndim >= 1 and X.shape[0] == 0:
            return np.empty(0, dtype=np.float64)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        n_features = self.X_train.shape[1]
        # Without this check a single-column query would silently broadcast
        # against every training feature and produce meaningless distances.
        if X.ndim != 2 or X.shape[1] != n_features:
            raise ValueError(
                f'X must have shape (n_queries, {n_features}), got {X.shape}')

        y_pred = np.empty(X.shape[0], dtype=np.float64)
        for i, query in enumerate(X):
            distances = self.compute_distance(query, self.X_train)
            k_indices = np.argsort(distances)[:self.k]
            # Closer neighbours get proportionally larger votes.
            weights = 1 / (distances[k_indices] + self._EPSILON)
            y_pred[i] = np.sum(weights * self.y_train[k_indices]) / np.sum(weights)
        return y_pred

    def compute_distance(self, x1, X2):
        """Distance from one point to every row of a matrix.

        Parameters
        ----------
        x1 : numpy.ndarray
            A single sample; reshaped to one row before broadcasting.
        X2 : numpy.ndarray of shape (n_samples, n_features)
            Points to measure against.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Euclidean or Manhattan distances, depending on
            ``self.distance_metric``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        x1 = x1.reshape(1, -1)
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X2 - x1)**2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X2 - x1), axis=1)
        else:
            raise ValueError('Invalid distance metric')
        return distances

In [49]:
# Oversampling function
def oversample_data(X, y):
    """Balance classes by cyclically repeating each class's rows.

    Every class is grown to the size of the largest class by tiling its rows
    in their existing order and truncating to that size, so the result is
    deterministic (no random sampling). The output is grouped by class, in the
    order the labels are encountered in ``y``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature rows; tiled with ``reps=(r, 1)``, i.e. handled as a 2-D matrix
        whose rows are samples.
    y : numpy.ndarray
        One label per row of ``X``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The resampled features and their labels.
    """
    class_sizes = Counter(y)
    target_size = max(class_sizes.values())

    feature_blocks = []
    label_blocks = []
    for label in class_sizes:
        in_class = y == label
        class_rows = X[in_class]
        class_labels = y[in_class]

        # One repetition more than the integer quotient guarantees at least
        # ``target_size`` entries are available before truncating.
        row_reps = target_size // len(class_rows) + 1
        label_reps = target_size // len(class_labels) + 1

        feature_blocks.append(np.tile(class_rows, (row_reps, 1))[:target_size])
        label_blocks.append(np.tile(class_labels, label_reps)[:target_size])

    return np.vstack(feature_blocks), np.hstack(label_blocks)

# Define data preprocessing function
def preprocess_data(train_path, test_path, oversample=True):
    """Load the train/test CSVs and build standardised feature matrices.

    Steps: drop rows with missing values, encode ``Gender`` and ``Geography``,
    add three ratio features, keep the (up to) nine columns whose absolute
    correlation with ``Exited`` on the training rows is highest, standardise
    them with training-row statistics, and optionally balance the training
    classes.

    Parameters
    ----------
    train_path, test_path : str or path-like
        CSV files. Both must contain ``id``, ``CustomerId``, ``Surname``,
        ``Gender``, ``Geography``, ``Balance``, ``EstimatedSalary``,
        ``Tenure``, ``Age`` and ``CreditScore``; the training file must also
        contain ``Exited``.
    oversample : bool, default True
        When True the training set is balanced by repeating rows. Pass False
        to receive the un-resampled training rows, e.g. when the data will be
        split for validation afterwards (repeating rows before a split puts
        copies of the same row on both sides of it).

    Returns
    -------
    X_train : numpy.ndarray
    y_train : numpy.ndarray of int
    X_test : numpy.ndarray
    test_ids : numpy.ndarray
        The ``id`` values of the test rows that survived ``dropna``.
    """
    target = 'Exited'
    n_top_features = 9
    # Bookkeeping columns that must never become model features. Selecting
    # 'id' would also standardise it and corrupt the returned test ids.
    non_feature_columns = {target, 'id', 'is_train'}

    # NOTE(review): dropna() on the test file removes those rows entirely, so
    # their ids receive no prediction. Confirm the test file has no gaps.
    train_data = pd.read_csv(train_path).dropna().assign(is_train=1)
    test_data = pd.read_csv(test_path).dropna().assign(is_train=0)
    # Train and test are stacked so that encoding and feature engineering are
    # applied identically to both; 'is_train' lets them be separated again.
    full_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

    full_data = full_data.drop(['Surname', 'CustomerId'], axis=1)

    full_data['Gender'] = full_data['Gender'].map({'Male': 0, 'Female': 1})
    geography_dummies = pd.get_dummies(full_data['Geography'], prefix='Geography')
    full_data = pd.concat([full_data.drop('Geography', axis=1), geography_dummies], axis=1)

    # The +1 in each denominator avoids division by zero.
    full_data['BalanceSalaryRatio'] = full_data['Balance'] / (full_data['EstimatedSalary'] + 1)
    full_data['TenureByAge'] = full_data['Tenure'] / (full_data['Age'] + 1)
    full_data['CreditScoreGivenAge'] = full_data['CreditScore'] / (full_data['Age'] + 1)

    is_train = full_data['is_train'] == 1

    # Rank features on training rows only so the test set cannot influence
    # selection. Constant columns have an undefined (NaN) correlation and are
    # discarded instead of being allowed to fill the quota.
    train_corr = full_data[is_train].corr()
    corr_with_target = train_corr[target].abs().sort_values(ascending=False).dropna()
    selected_features = [
        column for column in corr_with_target.index
        if column not in non_feature_columns
    ][:n_top_features]

    # Gender is a 0/1 flag and is left unscaled. Filtering, rather than
    # list.remove, keeps this working when Gender was not selected.
    features_to_scale = [column for column in selected_features if column != 'Gender']

    if features_to_scale:
        # Standardise with training statistics only.
        train_features = full_data.loc[is_train, features_to_scale]
        means = train_features.mean()
        stds = train_features.std()
        full_data[features_to_scale] = (full_data[features_to_scale] - means) / stds

    train_data = full_data[is_train].drop(['is_train'], axis=1)
    test_data = full_data[~is_train].drop(['is_train', target], axis=1)

    X_train = train_data[selected_features].values
    y_train = train_data[target].astype(int).values

    if oversample:
        X_train, y_train = oversample_data(X_train, y_train)

    X_test = test_data[selected_features].values
    test_ids = test_data['id'].values

    return X_train, y_train, X_test, test_ids

In [50]:
# Define cross-validation function
def cross_validate(X, y, k_values, distance_metrics, n_splits=5, resample=None):
    """Grid-search ``k`` and the distance metric for ``KNN`` by K-fold ROC AUC.

    Every (k, metric) pair is scored on the same shuffled folds; the pair with
    the highest mean AUC wins, and the first candidate tried wins ties.

    If ``X`` already contains repeated rows (for example from oversampling
    done before calling this function), copies of one row can fall on both
    sides of a split and inflate the AUC. To avoid that, pass un-resampled
    data together with ``resample``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples,)
        Binary labels.
    k_values : iterable of int
        Candidate neighbour counts.
    distance_metrics : iterable of str
        Candidate metric names understood by ``KNN``.
    n_splits : int, default 5
        Number of folds.
    resample : callable, optional
        ``resample(X_train, y_train) -> (X_train, y_train)``. Applied to the
        training part of each fold only, never to the validation part.

    Returns
    -------
    (dict, float)
        ``{'k': ..., 'distance_metric': ...}`` for the best candidate and its
        mean AUC.

    Raises
    ------
    ValueError
        If either candidate list is empty, or if no candidate produced a
        comparable mean AUC (e.g. every score was NaN).
    """
    k_values = list(k_values)
    distance_metrics = list(distance_metrics)
    # Fail before doing any work: an empty grid previously returned an empty
    # dict that only blew up later, when the caller indexed into it.
    if not k_values or not distance_metrics:
        raise ValueError('k_values and distance_metrics must each contain at least one candidate')

    from sklearn.model_selection import KFold
    from sklearn.metrics import roc_auc_score

    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # With a fixed integer seed the splits are the same on every call, so they
    # are generated once and shared by all candidates.
    folds = list(kf.split(X))

    best_params = {}
    # Start below any attainable score so the first valid candidate is kept.
    best_auc = float('-inf')

    for k in k_values:
        for distance_metric in distance_metrics:
            auc_scores = []
            for train_index, val_index in folds:
                X_train_cv, y_train_cv = X[train_index], y[train_index]
                if resample is not None:
                    X_train_cv, y_train_cv = resample(X_train_cv, y_train_cv)

                knn = KNN(k=k, distance_metric=distance_metric)
                knn.fit(X_train_cv, y_train_cv)
                y_pred_prob = np.clip(knn.predict(X[val_index]), 0, 1)

                auc_scores.append(roc_auc_score(y[val_index], y_pred_prob))

            mean_auc = np.mean(auc_scores)
            print(f"k={k}, distance_metric={distance_metric}, AUC={mean_auc:.4f}")

            if mean_auc > best_auc:
                best_auc = mean_auc
                best_params = {'k': k, 'distance_metric': distance_metric}

    if not best_params:
        raise ValueError('no candidate produced a comparable mean AUC')
    return best_params, best_auc


In [51]:
# Load and preprocess data.
# X / y come back from preprocess_data already class-balanced by row
# repetition, so the cross-validation below splits data that contains exact
# copies of rows. NOTE(review): a validation fold can therefore hold copies of
# training rows, which makes the reported AUC optimistic; treat it as an upper
# bound until folds are built from un-resampled data.
X, y, X_test, test_ids = preprocess_data('./train.csv', './test.csv')

# Hyperparameter tuning.
# range(1, 2077, 2) is the 1038 odd values 1..2075; with 2 metrics and the
# default 5 folds that is 10,380 fit/predict rounds of a pure-Python KNN.
# NOTE(review): the recorded output below lists only k=1, 3, 5 and reports
# k=5 as best, which does not match this grid; it looks like it was produced
# with a smaller k_values or an interrupted run. Confirm before relying on it.
k_values = list(range(1, 2077, 2)) 
distance_metrics = ['euclidean', 'manhattan']

best_params, best_auc = cross_validate(X, y, k_values, distance_metrics)

print(f"\nBest parameters: k={best_params['k']}, distance_metric={best_params['distance_metric']}, AUC={best_auc:.4f}")

# Refit on the full (oversampled) training set with the winning configuration.
knn = KNN(k=best_params['k'], distance_metric=best_params['distance_metric'])
knn.fit(X, y)
test_predictions = knn.predict(X_test)
# Predictions are a positive-weighted average of 0/1 labels, so they should
# already lie in [0, 1]; the clip is only a safety net.
test_predictions = np.clip(test_predictions, 0, 1) 

# Write one row per surviving test id: the id and its predicted score.
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission['id'] = submission['id'].astype(int)
submission = submission[['id', 'Exited']]
submission.to_csv('./jsubmissions.csv', index=False)

k=1, distance_metric=euclidean, AUC=0.9445
k=1, distance_metric=manhattan, AUC=0.9445
k=3, distance_metric=euclidean, AUC=0.9443
k=3, distance_metric=manhattan, AUC=0.9444
k=5, distance_metric=euclidean, AUC=0.9721
k=5, distance_metric=manhattan, AUC=0.9705

Best parameters: k=5, distance_metric=euclidean, AUC=0.9721
