In [27]:
import numpy as np 
import pandas as pd 

In [28]:
# Define the KNN class
class KNN:
    def __init__(self, k, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = np.array(X)
        self.Y_train = np.array(y)
        
    def compute_distance(self, x, X):
        x = np.array(x) 
        X = np.array(X)
        if self.distance_metric == 'euclidean':
            squared_diffs = (X - x) ** 2
            summed_squared_diffs = np.sum(squared_diffs, axis=1)
            distances = np.sqrt(summed_squared_diffs)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")
        return distances
    
    def predict_proba(self, X):
        """Predict continuous probabilities for the test data."""
        probabilities = []
        for x in X:
            distances = self.compute_distance(x, self.X_train)
            k_nearest_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.Y_train[k_nearest_indices]  # Keep floats intact
            # Calculate the mean (or weighted average) of nearest labels
            prob = np.mean(k_nearest_labels)  # Use mean for continuous probabilities
            probabilities.append(prob)  # Assume binary case for simplicity

        return np.array(probabilities)
    def get_exit_probabilities(knn_model, X_test):
        # Ensure X_test is a NumPy array or DataFrame
        if isinstance(X_test, pd.DataFrame):
            X_test = X_test.values  # Convert DataFrame to NumPy array
        return knn_model.predict_proba(X_test)
        

In [29]:
def preprocess_data(train_path, test_path, k=5):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    train_data = train_data.iloc[:, 3:]  # Keep columns from index 3 onwards
    test_data = test_data.iloc[:, 3:]    # Keep columns from index 3 onwards
    
    train_data = train_data.drop(train_data.columns[[1,2,6,7,8]], axis = 1)
    test_data = test_data.drop(test_data.columns[[1,2,6,7,8]], axis = 1)
    
    # Handle missing values and data type conversions
    train_data = handle_missing_values(train_data)
    test_data = handle_missing_values(test_data)
    
    
    # Convert appropriate columns to numeric types
    numeric_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']  # Update with actual numeric column names
    for col in numeric_cols:
        train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
        test_data[col] = pd.to_numeric(test_data[col], errors='coerce')
    # Separate features and labels
    X_train = train_data.drop('Exited', axis=1)  # Assuming 'Exited' is the label
    y_train = train_data['Exited']
    X_test = test_data.copy()  # Test data typically has no labels
    
    #X_train, y_train = smote(X_train.to_numpy(), y_train.to_numpy(), k=5)
    
    # Scale numerical features
    X_train, X_test = scale_features(X_train, X_test)
    
    return X_train, X_test, y_train

def handle_missing_values(df):
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:
            df[col].fillna(df[col].mean(), inplace=True)
        else:
            df[col].fillna(df[col].mode()[0], inplace=True)
    return df

def scale_features(train_df, test_df):
    numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns

    for col in numerical_cols:
        mean = train_df[col].mean()
        std = train_df[col].std()

        # Scale bosth train and test sets using the same statistics
        train_df[col] = (train_df[col] - mean) / std
        test_df[col] = (test_df[col] - mean) / std

    return train_df, test_df

In [30]:
def cross_validate(X, y, knn, n_splits=5):
    fold_size = len(X) // n_splits
    indices = np.arange(len(X))  # Use the natural order of indices
    auc_scores = []
    for i in range(n_splits):
        # Define test indices for the current fold
        if i == n_splits - 1:  # Last fold
            test_idx = indices[i * fold_size:]  # Get remaining samples
        else:
            test_idx = indices[i * fold_size: (i + 1) * fold_size]  # Normal case
        
        # Define train indices
        train_idx = np.concatenate([indices[:i * fold_size], indices[(i + 1) * fold_size:]])
        
        # Use .iloc for DataFrame indexing
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        # Fit the model and make predictions
        knn.fit(X_train.values, y_train.values)
        y_pred = knn.predict_proba(X_test.values)

        # Compute AUC score and append to scores list
        auc = compute_roc_auc(y_test.values, y_pred)
        auc_scores.append(auc)

    return auc_scores

def compute_roc_auc(y_true, y_prob):
    # Sort by predicted probabilities
    sorted_indices = np.argsort(y_prob)[::-1]
    y_true = y_true[sorted_indices]
    y_prob = y_prob[sorted_indices]

    # Compute true positive and false positive rates
    tpr = np.cumsum(y_true) / np.sum(y_true)  # True Positive Rate
    fpr = np.cumsum(1 - y_true) / np.sum(1 - y_true)  # False Positive Rate

    # Compute the AUC using the trapezoidal rule
    auc = np.trapz(tpr, fpr)
    return auc

In [None]:
# Load and preprocess data
X, X_test, y = preprocess_data('train.csv', 'test.csv')

# Hyperparameter tuning
def hyperparameter_tuning(X, y, k_values=[i for i in range(45,100)]):
    best_k = None
    best_score = -np.inf

    for k in k_values:
        knn = KNN(k=k, distance_metric='euclidean')
        scores = cross_validate(X, y, knn, n_splits=5)
        mean_score = np.mean(scores)
        print(f"k={k}, Mean AUC={mean_score:.4f}")

        if mean_score > best_score:
            best_score = mean_score
            best_k = k

    print(f"Best k: {best_k} with AUC: {best_score:.4f}")
    return best_k

# Perform hyperparameter tuning
optimal_k = hyperparameter_tuning(X, y)

# Train final model with optimal hyperparameters
knn = KNN(k=optimal_k, distance_metric='euclidean')
knn.fit(X, y)

# Make predictions on test set
test_predictions = knn.get_exit_probabilities(X_test)

# Save test predictions
test_data = pd.read_csv('test.csv')  # Load the test file to get the 'id' column
submission = pd.DataFrame({
    'id': test_data['id'],
    'Exited': test_predictions
})
submission.to_csv('submissions.csv', index=False)

print("Test predictions saved to 'submissions.csv'")

k=45, Mean AUC=0.8229
k=46, Mean AUC=0.8237
k=47, Mean AUC=0.8237
k=48, Mean AUC=0.8238
k=49, Mean AUC=0.8245
k=50, Mean AUC=0.8248
k=51, Mean AUC=0.8252
k=52, Mean AUC=0.8257
k=53, Mean AUC=0.8258
k=54, Mean AUC=0.8264
k=55, Mean AUC=0.8268
k=56, Mean AUC=0.8271
k=57, Mean AUC=0.8271
k=58, Mean AUC=0.8267
k=59, Mean AUC=0.8274
