In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from scipy.stats import mode
from sklearn.metrics import roc_auc_score
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier with majority voting.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction. If ``k`` is larger
        than the number of training samples, every training sample votes.
    distance_metric : {'euclidean', 'manhattan'}
        Metric used to rank the training samples. An unsupported name is
        reported (``ValueError``) the first time a distance is computed.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    @staticmethod
    def _as_samples(X):
        """Return ``X`` as a 2-D ``(n_samples, n_features)`` NumPy array.

        Axis 0 always indexes samples; any remaining axes are flattened into
        one feature axis, so a 1-D input is treated as single-feature samples.
        """
        X = np.asarray(X)
        if X.ndim == 0:
            raise ValueError("Expected an array of samples, got a scalar")
        # Explicit feature count (instead of -1) so empty inputs reshape cleanly.
        n_features = int(np.prod(X.shape[1:], dtype=int))
        return X.reshape(X.shape[0], n_features)

    def fit(self, X, y):
        """Store the training samples and their labels.

        Inputs are converted to NumPy arrays so that lists, DataFrames and
        Series (whatever their index) are handled positionally.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = self._as_samples(X)
        y = np.asarray(y)
        if y.shape[:1] != X.shape[:1]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {X.shape[0]} and {y.shape[0] if y.ndim else 'a scalar'}"
            )
        self.X_train = X
        self.y_train = y
        return self

    def predict(self, X):
        """Predict a label for every sample in ``X`` by majority vote.

        Vote ties go to the label encountered first when the neighbours are
        visited from nearest to farthest; equidistant neighbours keep their
        training-set order because a stable sort is used.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``X``.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or the distance metric is unsupported.
        """
        if self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k}")

        X = self._as_samples(X)
        predictions = []
        for x in X:
            # Distances from this query to every training sample in one
            # vectorised call (broadcast over the rows of X_train).
            distances = self.compute_distance(self.X_train, x)
            # Positions of the k nearest neighbours, nearest first.
            k_indices = np.argsort(distances, kind='stable')[:self.k]
            k_nearest_labels = self.y_train[k_indices]

            # Majority vote among the neighbours' labels.
            most_common = Counter(k_nearest_labels).most_common(1)
            predictions.append(most_common[0][0])

        return np.array(predictions)

    def compute_distance(self, X1, X2):
        """Distance between ``X1`` and ``X2`` under ``self.distance_metric``.

        The inputs are broadcast against each other and the reduction runs
        over the last axis: two 1-D vectors give a scalar, while a 2-D array
        against a 1-D vector gives one distance per row.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not 'euclidean' or 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            diff = np.asarray(X1) - np.asarray(X2)
            return np.sqrt(np.sum(diff ** 2, axis=-1))
        elif self.distance_metric == 'manhattan':
            diff = np.asarray(X1) - np.asarray(X2)
            return np.sum(np.abs(diff), axis=-1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

In [3]:
# Tiny hand-checkable dataset used as a smoke test for the classifier.
X_train = np.array([[1, 2], [2, 3], [3, 4]])
y_train = np.array([0, 1, 0])
X_test = np.array([[1, 2], [3, 3]])

# Fit and predict once per neighbourhood size. The second value (5) exceeds
# the three available training points, so every training point gets a vote.
for n_neighbors in (2, 5):
    knn = KNN(k=n_neighbors, distance_metric='euclidean')
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    print(predictions)

[0 1]
[0 0]


In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and turn them into scaled feature matrices.

    Steps: read both files, take the ``'Exited'`` column of the training file
    as the target, drop the ``'id'``, ``'CustomerId'`` and ``'Surname'``
    columns, one-hot encode the object-dtype columns found in the training
    features, align the test columns to the training columns (missing ones
    filled with 0), and standardise both sets with a scaler fitted on the
    training features only.

    Parameters
    ----------
    train_path : path of the training CSV (must contain ``'Exited'``).
    test_path : path of the test CSV (must contain ``'id'``).

    Returns
    -------
    tuple
        ``(X_scaled, y, X_test_scaled, test_ids)`` where ``test_ids`` holds
        the values of the test file's ``'id'`` column.
    """
    identifier_columns = ['id', 'CustomerId', 'Surname']

    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)

    # Only the training file carries the target column.
    y = train_frame['Exited'].values

    # Remove the target and the identifier-like columns from the features.
    train_features = train_frame.drop(columns=['Exited', *identifier_columns])
    test_features = test_frame.drop(columns=identifier_columns)

    # Keep the test ids so predictions can be matched back to rows later.
    test_ids = test_frame['id'].values

    # Categorical columns are decided from the training features alone and
    # the same column list is applied to the test features.
    categorical_cols = train_features.select_dtypes(include=['object']).columns

    train_encoded = pd.get_dummies(train_features, columns=categorical_cols)
    # Force the test matrix onto the training layout: same columns, same
    # order, with 0 for any dummy column the test data did not produce.
    test_encoded = pd.get_dummies(test_features, columns=categorical_cols).reindex(
        columns=train_encoded.columns, fill_value=0
    )

    # Fit the scaler on the training features, then reuse it for the test set.
    scaler = StandardScaler()
    return (
        scaler.fit_transform(train_encoded),
        y,
        scaler.transform(test_encoded),
        test_ids,
    )

In [5]:
from joblib import Parallel, delayed

def cross_validate(X, y, knn, n_splits=5, n_jobs=-1):
    """Score ``knn`` with stratified k-fold cross-validation.

    Every fold fits and evaluates its own deep copy of ``knn``. The model
    passed in is therefore never mutated, and folds running concurrently
    cannot overwrite each other's training data, whichever joblib backend
    is active.

    Parameters
    ----------
    X, y : feature matrix and labels. Both are indexed with the integer
        index arrays produced by the splitter, so they are assumed to be
        NumPy arrays (or to support that kind of indexing).
    knn : model exposing ``fit(X, y)`` and ``predict(X)``; used as a template.
    n_splits : number of stratified folds (shuffled, fixed ``random_state=42``).
    n_jobs : forwarded to ``joblib.Parallel``.

    Returns
    -------
    numpy.ndarray
        One ``roc_auc_score`` per fold, computed from the output of
        ``predict`` on the validation part of that fold.
    """
    import copy  # local import: only needed to isolate the per-fold models

    # StratifiedKFold keeps the class balance of y in every fold.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def train_and_score(train_idx, val_idx):
        # Work on a private copy so fitting this fold cannot leak into
        # another fold or into the caller's model.
        fold_model = copy.deepcopy(knn)
        fold_model.fit(X[train_idx], y[train_idx])

        # Predict on the held-out part and score it.
        y_pred = fold_model.predict(X[val_idx])
        return roc_auc_score(y[val_idx], y_pred)

    # Evaluate the folds in parallel; results come back in fold order.
    scores = Parallel(n_jobs=n_jobs)(
        delayed(train_and_score)(train_idx, val_idx)
        for train_idx, val_idx in skf.split(X, y)
    )

    return np.array(scores)


In [7]:
# # Load and preprocess data
# X, y, X_test, test_ids = preprocess_data('train.csv', 'test.csv')

# # Create and evaluate model
# knn = KNN(k=5, distance_metric='euclidean')

# # Perform cross-validation
# cv_scores = cross_validate(X, y, knn)

# print("Cross-validation scores:", cv_scores)

# TODO: hyperparameters tuning
def tune_hyperparameters(X, y, k_values=None, metrics=None, n_splits=3):
    """Grid-search ``k`` and the distance metric by cross-validated score.

    Every ``(k, metric)`` pair is evaluated with ``cross_validate`` on a fresh
    ``KNN`` instance. The evaluations are submitted to a thread pool, but the
    results are consumed in grid order (outer loop over ``k_values``, inner
    loop over ``metrics``). The printed log and the winner among equal scores
    therefore do not depend on which thread finishes first: on a tie the
    earlier grid entry wins.

    Parameters
    ----------
    X, y : training features and labels, forwarded to ``cross_validate``.
    k_values : candidate neighbour counts; defaults to ``[3, 5, 7]``.
    metrics : candidate metric names; defaults to
        ``['euclidean', 'manhattan']``.
    n_splits : number of folds used for each candidate.

    Returns
    -------
    tuple
        ``(best_params, best_score)`` where ``best_params`` is a dict with
        the keys ``'k'`` and ``'distance_metric'``.

    Raises
    ------
    ValueError
        If the grid is empty, i.e. ``k_values`` or ``metrics`` has no entries.
    """
    if k_values is None:
        k_values = [3, 5, 7]
    if metrics is None:
        metrics = ['euclidean', 'manhattan']

    param_grid = [(k, metric) for k in k_values for metric in metrics]
    if not param_grid:
        raise ValueError("k_values and metrics must each contain at least one candidate")

    def evaluate_params(k, metric):
        knn = KNN(k=k, distance_metric=metric)
        return cross_validate(X, y, knn, n_splits=n_splits).mean()

    # Start below any attainable score so the first valid candidate is always
    # accepted and best_params cannot stay empty for a non-empty grid.
    best_score = float('-inf')
    best_params = {}

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(evaluate_params, k, metric) for k, metric in param_grid]

        # Walk the futures in submission order: all candidates still run
        # concurrently, only the reporting/selection order is fixed.
        for (k, metric), future in zip(param_grid, futures):
            mean_score = future.result()
            print(f"Testing parameters k={k}, metric={metric} => Mean score: {mean_score:.4f}")

            if mean_score > best_score:
                best_score = mean_score
                best_params = {'k': k, 'distance_metric': metric}

    print(f"\nBest parameters found: {best_params}")
    print(f"Best score: {best_score:.4f}")
    return best_params, best_score


# Final pipeline: load the data, score a baseline, tune, refit on the full
# training set with the best hyperparameters, and export test predictions.

# Read both CSVs and convert them into scaled feature matrices.
X, y, X_test, test_ids = preprocess_data('train.csv', 'test.csv')

# Baseline: a 5-neighbour Euclidean model scored by cross-validation.
knn = KNN(k=5, distance_metric='euclidean')
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)

# Search the default hyperparameter grid using 5 folds per candidate.
best_params, best_score = tune_hyperparameters(X, y, n_splits=5)

# Refit on the complete training set with the winning configuration.
best_knn = KNN(
    k=best_params['k'],
    distance_metric=best_params['distance_metric'],
)
best_knn.fit(X, y)

# Label the test rows.
y_test_pred = best_knn.predict(X_test)

# Pair each test 'id' (as returned by preprocess_data) with its prediction
# and write the table to disk without the DataFrame index.
results_df = pd.DataFrame({'id': test_ids, 'Exited': y_test_pred})
results_df.to_csv('predictions.csv', index=False)

print("Prediction results saved to predictions.csv")




Cross-validation scores: [0.75497531 0.76120996 0.76965903 0.79040082 0.75575625]
Testing parameters k=3, metric=euclidean => Mean score: 0.7677
Testing parameters k=5, metric=euclidean => Mean score: 0.7664
Testing parameters k=7, metric=euclidean => Mean score: 0.7649
Testing parameters k=3, metric=manhattan => Mean score: 0.7645
Testing parameters k=7, metric=manhattan => Mean score: 0.7628
Testing parameters k=5, metric=manhattan => Mean score: 0.7675

Best parameters found: {'k': 3, 'distance_metric': 'euclidean'}
Best score: 0.7677
Prediction results saved to predictions.csv
