In [1]:
import os

import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [2]:

# Directory containing the data files
data_directory = 'D:\\data\\PCA_data\\classification'  # insert data directory here

# os.listdir returns entries in arbitrary order; sort them so every run
# processes (and prints) the files in the same order.
data_files = sorted(f for f in os.listdir(data_directory) if f.endswith('.npy'))

# Initialize classifiers, keyed by the display name used when printing results.
# Estimators with internal randomness get a fixed random_state so repeated runs
# produce the same scores (42 matches the seed used for the train/test split).
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # NOTE: SVC() does not expose predict_proba unless built with
    # probability=True (which adds an internal 5-fold calibration and is much
    # slower); score it through decision_function instead.
    'SVM': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'MLP Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
}



In [3]:
# Process each file
def _positive_class_scores(model, X):
    """Return a continuous score per row of X for ranking metrics.

    Uses the second column of ``predict_proba`` (the class stored at
    ``model.classes_[1]``) when the estimator provides it. Estimators without
    ``predict_proba`` (e.g. ``SVC()`` built without ``probability=True``) fall
    back to ``decision_function``, which ``roc_auc_score`` and
    ``average_precision_score`` also accept as a non-thresholded score.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    return model.decision_function(X)


for file_name in data_files:
    file_path = os.path.join(data_directory, file_name)
    print(f"Processing {file_name}")

    data = np.load(file_path)
    # The slicing below needs a 2-D array with at least one feature column
    # plus the trailing label column; skip anything else instead of crashing.
    if data.ndim != 2 or data.shape[1] < 2:
        print(f"Skipping {file_name}: unexpected array shape {data.shape}")
        continue

    X = data[:, :-1]  # every column except the last is a feature
    y = data[:, -1]   # last column is the label (assumed binary -- TODO confirm)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

    for name, model in models.items():
        # Hold-out scores: fit on the 60% split, score on the remaining 40%.
        model.fit(X_train, y_train)
        test_scores = _positive_class_scores(model, X_test)
        auc_score = roc_auc_score(y_test, test_scores)
        auprc_score = average_precision_score(y_test, test_scores)

        # Using cross-validation to estimate margin of error.
        # A single cross_validate call computes both metrics from the same
        # five fits, instead of refitting every model once per metric.
        cv_results = cross_validate(
            model, X, y, cv=5, scoring=('roc_auc', 'average_precision')
        )
        cv_auc = cv_results['test_roc_auc']
        cv_auprc = cv_results['test_average_precision']

        print(f"{name} AUROC: {auc_score} ± {cv_auc.std()}")
        print(f"{name} AUPRC: {auprc_score} ± {cv_auprc.std()}")


Processing db_data_pca.npy
KNN Accuracy: 0.9118023280603761
KNN Confusion Matrix:
[[83850  1382]
 [ 6892  1688]]
KNN Classification Report:
              precision    recall  f1-score   support

        -1.0       0.92      0.98      0.95     85232
         1.0       0.55      0.20      0.29      8580

    accuracy                           0.91     93812
   macro avg       0.74      0.59      0.62     93812
weighted avg       0.89      0.91      0.89     93812

Logistic Regression Accuracy: 0.9192427408007504
Logistic Regression Confusion Matrix:
[[84687   545]
 [ 7031  1549]]
Logistic Regression Classification Report:
              precision    recall  f1-score   support

        -1.0       0.92      0.99      0.96     85232
         1.0       0.74      0.18      0.29      8580

    accuracy                           0.92     93812
   macro avg       0.83      0.59      0.62     93812
weighted avg       0.91      0.92      0.90     93812

Naive Bayes Accuracy: 0.854240395684987
Naive