In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from ngboost import NGBClassifier
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from river import forest 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Scikit-learn Classifiers
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
    HistGradientBoostingClassifier,
    BaggingClassifier, 
    IsolationForest
)
from sklearn.linear_model import (
    LogisticRegression,
    RidgeClassifier,
    RidgeClassifierCV,
    SGDClassifier,
    Perceptron,
    PassiveAggressiveClassifier,
    LogisticRegressionCV,
    BayesianRidge,
    PassiveAggressiveRegressor,
    HuberRegressor,
    RANSACRegressor,
    TheilSenRegressor
)
from sklearn.svm import SVC, LinearSVC, NuSVC, OneClassSVM
from sklearn.neighbors import (
    KNeighborsClassifier,
    RadiusNeighborsClassifier,
    NearestCentroid,
    LocalOutlierFactor
)
from sklearn.naive_bayes import (
    GaussianNB,
    MultinomialNB,
    ComplementNB,
    BernoulliNB,
    CategoricalNB
)
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis
)
from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.calibration import CalibratedClassifierCV
from sklearn.dummy import DummyClassifier

In [3]:
def create_deep_neural_network(input_shape, num_classes):
    """
    Build and compile a fully connected Keras classifier.

    The network is four ReLU stages (Dense -> BatchNormalization -> Dropout)
    of decreasing width, followed by a softmax output layer.

    Parameters:
    -----------
    input_shape : int
        Number of input features; used as the ``input_shape`` of the first layer.
    num_classes : int
        Number of units in the softmax output layer.

    Returns:
    --------
    Sequential
        Model compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and accuracy as the tracked metric.
    """
    # (Dense units, dropout rate) for each hidden stage, in order.
    stage_specs = [(256, 0.3), (128, 0.3), (64, 0.2), (32, 0.2)]

    layer_stack = []
    for position, (units, drop_rate) in enumerate(stage_specs):
        if position == 0:
            # Only the first Dense layer declares the input dimensionality.
            layer_stack.append(
                Dense(units, activation='relu', input_shape=(input_shape,))
            )
        else:
            layer_stack.append(Dense(units, activation='relu'))
        layer_stack.append(BatchNormalization())
        layer_stack.append(Dropout(drop_rate))

    # Output layer: one probability per class.
    layer_stack.append(Dense(num_classes, activation='softmax'))

    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [4]:
# Registry of display name -> estimator instance, iterated in insertion order
# by evaluate_classifiers.
# NOTE(review): several entries are regressors or outlier/novelty detectors
# (Bayesian Ridge, One-Class SVM, Isolation Forest, Local Outlier Factor and
# the "... Regressor" entries) yet are scored with accuracy_score like the
# classifiers -- confirm this is intentional.
classifiers = {}

# Tree-based models and tree ensembles
classifiers.update({
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Extra Tree': ExtraTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Hist Gradient Boosting': HistGradientBoostingClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
})

# Linear models
classifiers.update({
    'Logistic Regression': LogisticRegression(),
    'Logistic Regression CV': LogisticRegressionCV(random_state=42),
    'Ridge Classifier': RidgeClassifier(random_state=42),
    'Ridge Classifier CV': RidgeClassifierCV(),
    'SGD Classifier': SGDClassifier(random_state=42),
    'Bayesian Ridge': BayesianRidge(),
    'Perceptron': Perceptron(random_state=42),
    'Passive Aggressive': PassiveAggressiveClassifier(random_state=42),
})

# Support vector machines
classifiers.update({
    'SVM (RBF Kernel)': SVC(random_state=42),
    'SVM (Linear Kernel)': SVC(kernel='linear', random_state=42),
    'SVM (Polynomial Kernel)': SVC(kernel='poly', random_state=42),
    'Linear SVM': LinearSVC(random_state=42),
    'Nu SVM': NuSVC(random_state=42),
})

# Neighbor-based models, meta-estimators and outlier detectors
classifiers.update({
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Radius Nearest Neighbors': RadiusNeighborsClassifier(
        radius=1.0,
        outlier_label='most_frequent',
    ),
    'Nearest Centroid': NearestCentroid(),
    'One-Class SVM': OneClassSVM(),
    'Calibrated Classifier': CalibratedClassifierCV(cv=5),
    'Bagging Classifier': BaggingClassifier(random_state=42),
    'Isolation Forest': IsolationForest(random_state=42),
    'Local Outlier Factor': LocalOutlierFactor(n_neighbors=20, novelty=True),
})

# Naive Bayes family
classifiers.update({
    'Gaussian Naive Bayes': GaussianNB(),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Complement Naive Bayes': ComplementNB(),
    'Bernoulli Naive Bayes': BernoulliNB(),
    'Categorical Naive Bayes': CategoricalNB(),
})

# Neural, discriminant-analysis, semi-supervised and Gaussian-process models
classifiers.update({
    'Multi-Layer Perceptron': MLPClassifier(random_state=42, max_iter=1000),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'Label Spreading': LabelSpreading(),
    'Label Propagation': LabelPropagation(),
    'Gaussian Process': GaussianProcessClassifier(random_state=42),
})

# Baseline and regressor entries
classifiers.update({
    'Dummy Classifier': DummyClassifier(strategy='most_frequent'),
    'Gaussian Process Regressor': GaussianProcessRegressor(),
    'Passive Aggressive Regressor': PassiveAggressiveRegressor(),
    'Huber Regressor': HuberRegressor(),
    'RANSAC Regressor': RANSACRegressor(),
    'Theil Sen Regressor': TheilSenRegressor(),
})

# Models from libraries outside scikit-learn, appended after the sklearn ones.
new_classifiers = dict([
    ('XGBoost', xgb.XGBClassifier(random_state=42)),
    ('LightGBM', lgb.LGBMClassifier(random_state=42)),
    ('CatBoost', cb.CatBoostClassifier(random_state=42, verbose=False)),
    ('NGBoost', NGBClassifier(random_state=42)),
    ('River Random Forest', forest.ARFClassifier(n_models=10, seed=42)),
    # Placeholder value: evaluate_classifiers dispatches on the name and
    # builds the Keras model itself.
    ('Deep Neural Network', 'DNN'),
])
classifiers.update(new_classifiers)

In [5]:
def train_deep_neural_network(X_train, y_train, X_test):
    """
    Fit a freshly built deep neural network and predict labels for the test set.

    Parameters:
    -----------
    X_train : array-like
        Training features; ``X_train.shape[1]`` sets the network input size.
    y_train : array-like
        Training labels; the number of distinct values sets the output size.
    X_test : array-like
        Test features.

    Returns:
    --------
    numpy.ndarray
        Index of the highest-probability class for each test row.
    """
    n_features = X_train.shape[1]
    n_labels = len(np.unique(y_train))
    network = create_deep_neural_network(n_features, n_labels)

    # Stop once validation loss has not improved for 5 epochs and roll back
    # to the best weights seen.
    stopper = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )

    # 20% of the training data is held out for the validation loss.
    fit_options = dict(
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        callbacks=[stopper],
        verbose=0,
    )
    network.fit(X_train, y_train, **fit_options)

    # Softmax probabilities -> hard class indices.
    class_probabilities = network.predict(X_test)
    predicted_labels = np.argmax(class_probabilities, axis=1)

    # Release the Keras session so repeated calls do not accumulate state.
    tf.keras.backend.clear_session()

    return predicted_labels

In [6]:
def _iter_feature_dicts(X, feature_names):
    """Yield each row of ``X`` as a ``{feature_name: value}`` dict of plain Python values."""
    for row in np.asarray(X):
        yield dict(zip(feature_names, row.tolist()))


def _fit_predict_river(model, X_train, y_train, X_test, feature_names):
    """Train a fresh clone of a River model on every training row, then predict every test row."""
    # Clone so the shared entry in ``classifiers`` is never trained in place;
    # otherwise a second evaluation run would start from an already-fitted model.
    online_model = model.clone()
    for features, label in zip(_iter_feature_dicts(X_train, feature_names), y_train):
        online_model.learn_one(features, label)
    return [
        online_model.predict_one(features)
        for features in _iter_feature_dicts(X_test, feature_names)
    ]


def evaluate_classifiers(X_train, y_train, X_test, y_test, use_scaler=False):
    """
    Evaluate all classifiers with optional scaling

    Every entry of the module-level ``classifiers`` dict is trained on the
    training data and scored with ``accuracy_score`` on the test data. A model
    that raises is reported on stdout and left out of the results.

    Parameters:
    -----------
    X_train : pandas.DataFrame or array-like
        Training features.
    y_train : pandas.DataFrame, pandas.Series or array-like
        Training labels; flattened to 1-D before fitting.
    X_test : pandas.DataFrame or array-like
        Test features.
    y_test : array-like
        True test labels.
    use_scaler : bool
        Whether to use StandardScaler or not

    Returns:
    --------
    dict
        Classifier name -> accuracy in percent, sorted from best to worst.
    """
    results = {}

    # Prepare the data
    if use_scaler:
        print("Applying StandardScaler transformation...")
        scaler = StandardScaler()
        X_train_prepared = scaler.fit_transform(X_train)
        X_test_prepared = scaler.transform(X_test)
    else:
        print("Using data without StandardScaler...")
        X_train_prepared = X_train
        X_test_prepared = X_test

    # 1-D label vector shared by all models (works for DataFrame, Series or ndarray).
    y_train_flat = np.asarray(y_train).ravel()

    # River consumes one dict per sample. Scaling returns a bare ndarray, so
    # the feature names are taken from the original training frame when present.
    if hasattr(X_train, 'columns'):
        feature_names = list(X_train.columns)
    else:
        feature_names = list(range(np.asarray(X_train).shape[1]))

    # Evaluate each classifier
    for name, clf in classifiers.items():
        try:
            print(f"\nTraining {name}...")

            if name == 'Deep Neural Network':
                y_pred = train_deep_neural_network(X_train_prepared,
                                                   y_train_flat,
                                                   X_test_prepared)
            elif name == 'River Random Forest':
                # Online learner: fed row by row over the WHOLE training set
                # (the loop used to be bounded by the test-set length).
                y_pred = _fit_predict_river(clf,
                                            X_train_prepared,
                                            y_train_flat.tolist(),
                                            X_test_prepared,
                                            feature_names)
            else:
                clf.fit(X_train_prepared, y_train_flat)
                y_pred = clf.predict(X_test_prepared)

            accuracy = accuracy_score(y_test, y_pred)
            results[name] = accuracy * 100

            print(f"\nAccuracy for {name}:")
            print(results[name])

        except Exception as e:
            # Deliberate best-effort: one failing model must not abort the sweep.
            print(f"Error with {name}: {str(e)}")
            continue

    return dict(sorted(results.items(), key=lambda x: x[1], reverse=True))

In [7]:
def run_comparison(x_tr_resample, y_tr_resample, X_test, y_test):
    """
    Evaluate every classifier twice, without and then with StandardScaler,
    printing an accuracy ranking after each pass.

    Parameters:
    -----------
    x_tr_resample, y_tr_resample :
        Training features and labels, forwarded to ``evaluate_classifiers``.
    X_test, y_test :
        Test features and labels, forwarded to ``evaluate_classifiers``.
    """
    # (use_scaler flag, banner printed before the run, heading of the ranking)
    passes = (
        (
            False,
            "\nRunning models WITHOUT StandardScaler:",
            "\nAccuracy Rankings (WITHOUT StandardScaler):",
        ),
        (
            True,
            "\nRunning models WITH StandardScaler:",
            "\nAccuracy Rankings (WITH StandardScaler):",
        ),
    )

    for scale_features, run_banner, ranking_heading in passes:
        print(run_banner)
        ranking = evaluate_classifiers(
            x_tr_resample, y_tr_resample, X_test, y_test,
            use_scaler=scale_features,
        )

        print(ranking_heading)
        print("-" * 50)
        for clf_name, accuracy in ranking.items():
            print(f"{clf_name:<30} : {accuracy:.2f}%")

In [8]:
# Load the training split (file names indicate SMOTE-resampled data) and the
# test split from CSV, in the order: train X, train y, test X, test y.
x_tr_resample, y_tr_resample, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/sequential/n=7/X_train_smote.csv',
        '../Data/sequential/n=7/y_train_smote.csv',
        '../Data/sequential/n=7/X_test.csv',
        '../Data/sequential/n=7/y_test.csv',
    )
)

In [9]:
# Run the full model sweep, first on raw features and then on standardized ones.
run_comparison(
    x_tr_resample=x_tr_resample,
    y_tr_resample=y_tr_resample,
    X_test=X_test,
    y_test=y_test,
)


Running models WITHOUT StandardScaler:
Using data without StandardScaler...

Training Decision Tree...

Accuracy for Decision Tree:
98.22244794311833

Training Extra Tree...

Accuracy for Extra Tree:
98.42559674961909

Training Random Forest...

Accuracy for Random Forest:
99.13661757237176

Training Gradient Boosting...

Accuracy for Gradient Boosting:
98.83189436262062

Training Hist Gradient Boosting...

Accuracy for Hist Gradient Boosting:
98.88268156424581

Training Extra Trees...

Accuracy for Extra Trees:
99.13661757237176

Training AdaBoost...

Accuracy for AdaBoost:
98.47638395124429

Training Logistic Regression...

Accuracy for Logistic Regression:
97.15591670898934

Training Logistic Regression CV...

Accuracy for Logistic Regression CV:
97.10512950736414

Training Ridge Classifier...

Accuracy for Ridge Classifier:
96.59725749111225

Training Ridge Classifier CV...

Accuracy for Ridge Classifier CV:
96.59725749111225

Training SGD Classifier...

Accuracy for SGD Classifie