In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix  
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier


In [23]:
# Load dataset
file_path = "data/Processed_Features/W100_O50_Features.csv"  # Adjust your file path
data = pd.read_csv(file_path)

# Assume the last column is the target
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target

In [24]:
#Temporary Calculation
X = X.head(100)
y = y.head(100)

In [25]:
def create_holdout_test_set(data, test_size=0.25):
    """Separates a holdout test set randomly."""
    X = data.iloc[:, :-1].values  # Features
    y = data.iloc[:, -1].values   # Target
    X_train_valid, X_holdout, y_train_valid, y_holdout = train_test_split(X, y, test_size=test_size, random_state=42)
    return X_train_valid, X_holdout, y_train_valid, y_holdout

In [26]:
# Split into train-validation and holdout test set
X_train_valid, X_holdout, y_train_valid, y_holdout = create_holdout_test_set(data, test_size=0.25)

In [27]:
# Preprocessing and Scaling
def preprocess_and_scale(X_train, X_test):
    """Scales the data using StandardScaler."""
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

In [28]:
# Evaluation Function
def evaluate_model(model, X_test, y_test):
    """Evaluate model using accuracy, classification report, and confusion matrix."""
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    return accuracy, report, cm

In [29]:
# Evaluate on Holdout Test Set
def evaluate_on_holdout(model, X_holdout, y_holdout):
    """Evaluate model on the holdout set."""
    accuracy, report, cm = evaluate_model(model, X_holdout, y_holdout)
    print("Holdout Test Set Results")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", cm)

In [30]:
# Logistic Regression with Grid Search and K-Fold Cross-Validation
def logistic_regression_grid_search(X_train, y_train, X_test, y_test):
    """Runs Logistic Regression with Grid Search and K-Fold Cross-Validation."""
    param_grid = {'C': [0.1, 1], 'penalty': ['l2'], 'solver': ['lbfgs', 'saga']}
    kfold = KFold(n_splits=5)
    model = LogisticRegression(max_iter=1000)
    grid = GridSearchCV(model, param_grid, cv=kfold, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    accuracy, report, cm = evaluate_model(best_model, X_test, y_test)
    print("Logistic Regression with Grid Search (K-Fold)")
    print("Best Parameters:", grid.best_params_)
    print("Accuracy:", accuracy)
    # print("Classification Report:\n", report)
    # print("Confusion Matrix:\n", cm)
    return best_model

In [31]:
def decision_tree_grid_search(X_train, y_train, X_test, y_test):
    """Grid Search for Decision Tree."""
    param_grid = {'max_depth': [3, None], 'min_samples_split': [2, 5]}
    kfold = KFold(n_splits=5)
    model = DecisionTreeClassifier()
    grid = GridSearchCV(model, param_grid, cv=kfold, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    accuracy, report, cm = evaluate_model(best_model, X_test, y_test)
    print("Decision Tree Results")
    print("Best Parameters:", grid.best_params_)
    print("Accuracy:", accuracy)
    # print("Classification Report:\n", report)
    # print("Confusion Matrix:\n", cm)
    return best_model

In [32]:
def random_forest_grid_search(X_train, y_train, X_test, y_test):
    param_grid = {'n_estimators': [50], 'max_depth': [5,None], 'min_samples_split': [2]}
    kfold = KFold(n_splits=5)
    model = RandomForestClassifier()
    grid = GridSearchCV(model, param_grid, cv=kfold, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    accuracy, report, cm = evaluate_model(best_model, X_test, y_test)
    print("Random Forest Results")
    print("Best Parameters:", grid.best_params_)
    print("Accuracy:", accuracy)
    # print("Classification Report:\n", report)
    # print("Confusion Matrix:\n", cm)
    return best_model


In [33]:
def gaussian_naive_bayes(X_train, y_train, X_test, y_test):
    model = GaussianNB()
    model.fit(X_train, y_train)
    accuracy, report, cm = evaluate_model(model, X_test, y_test)
    print("Gaussian Naïve Bayes Results")
    print("Accuracy:", accuracy)
    # print("Classification Report:\n", report)
    # print("Confusion Matrix:\n", cm)
    return model


In [34]:
def svm_grid_search(X_train, y_train, X_test, y_test):
    param_grid = {'C': [0.1, 1], 'kernel': ['linear', 'rbf']}
    kfold = KFold(n_splits=5)
    model = SVC(probability=True)
    grid = GridSearchCV(model, param_grid, cv=kfold, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    accuracy, report, cm = evaluate_model(best_model, X_test, y_test)
    print("SVM Results")
    print("Best Parameters:", grid.best_params_)
    print("Accuracy:", accuracy)
    # print("Classification Report:\n", report)
    # print("Confusion Matrix:\n", cm)
    return best_model


In [35]:
def knn_grid_search(X_train, y_train, X_test, y_test):
    param_grid = {'n_neighbors': [3, 5], 'weights': ['uniform', 'distance']}
    kfold = KFold(n_splits=5)
    model = KNeighborsClassifier()
    grid = GridSearchCV(model, param_grid, cv=kfold, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    accuracy, report, cm = evaluate_model(best_model, X_test, y_test)
    print("KNN Results")
    print("Best Parameters:", grid.best_params_)
    print("Accuracy:", accuracy)
    # print("Classification Report:\n", report)
    # print("Confusion Matrix:\n", cm)
    return best_model


In [36]:
def adaboost_grid_search(X_train, y_train, X_test, y_test):
    param_grid = {'n_estimators': [50], 'learning_rate': [0.01]}
    kfold = KFold(n_splits=5)
    model = AdaBoostClassifier()
    grid = GridSearchCV(model, param_grid, cv=kfold, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    accuracy, report, cm = evaluate_model(best_model, X_test, y_test)
    print("AdaBoost Results")
    print("Best Parameters:", grid.best_params_)
    print("Accuracy:", accuracy)
    # print("Classification Report:\n", report)
    # print("Confusion Matrix:\n", cm)
    return best_model


In [37]:
def gradient_boost_grid_search(X_train, y_train, X_test, y_test):
    param_grid = {'learning_rate': [0.1, 0.2], 'n_estimators': [50], 'max_depth': [3]}
    kfold = KFold(n_splits=5)
    model = GradientBoostingClassifier()
    grid = GridSearchCV(model, param_grid, cv=kfold, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    accuracy, report, cm = evaluate_model(best_model, X_test, y_test)
    print("Gradient Boost Results")
    print("Best Parameters:", grid.best_params_)
    print("Accuracy:", accuracy)
    # print("Classification Report:\n", report)
    # print("Confusion Matrix:\n", cm)
    return best_model


In [38]:
def xgboost_grid_search(X_train, y_train, X_test, y_test):
    param_grid = {'learning_rate': [0.01], 'n_estimators': [50], 'max_depth': [3]}
    kfold = KFold(n_splits=5)
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    grid = GridSearchCV(model, param_grid, cv=kfold, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    accuracy, report, cm = evaluate_model(best_model, X_test, y_test)
    print("XGBoost Results")
    print("Best Parameters:", grid.best_params_)
    print("Accuracy:", accuracy)
    # print("Classification Report:\n", report)
    # print("Confusion Matrix:\n", cm)
    return best_model


In [39]:
def ann_grid_search(X_train, y_train, X_test, y_test):
    param_grid = {'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'activation': ['relu', 'tanh'], 'learning_rate': ['constant', 'adaptive']}
    kfold = KFold(n_splits=5)
    model = MLPClassifier(max_iter=500)
    grid = GridSearchCV(model, param_grid, cv=kfold, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    accuracy, report, cm = evaluate_model(best_model, X_test, y_test)
    print("Artificial Neural Networks (ANN) Results")
    print("Best Parameters:", grid.best_params_)
    print("Accuracy:", accuracy)
    # print("Classification Report:\n", report)
    # print("Confusion Matrix:\n", cm)
    return best_model


In [40]:
# Preprocess and scale
X_train, X_test, y_train, y_test = train_test_split(X_train_valid, y_train_valid, test_size=0.2, random_state=42)
X_train_scaled, X_test_scaled = preprocess_and_scale(X_train, X_test)
X_holdout_scaled = StandardScaler().fit_transform(X_holdout)

In [None]:
def main():
    # List of models and their functions
    models = {
        "Logistic Regression": logistic_regression_grid_search,
        "Decision Tree": decision_tree_grid_search,
        "Random Forest": random_forest_grid_search,
        "Gaussian Naive Bayes": gaussian_naive_bayes,
        "SVM": svm_grid_search,
        "KNN": knn_grid_search,
        "AdaBoost": adaboost_grid_search,
        "Gradient Boost": gradient_boost_grid_search,
        "XGBoost": xgboost_grid_search,
        "Artificial Neural Networks (ANN)": ann_grid_search,
    }

    # Run each model
    for model_name, model_function in models.items():
        print(f"\n--- Running {model_name} ---")
        best_model = model_function(X_train_scaled, y_train, X_test_scaled, y_test)
        
        # Evaluate on holdout test set
        print(f"\n--- Evaluating {model_name} on Holdout Test Set ---")
        evaluate_on_holdout(best_model, X_holdout_scaled, y_holdout)

if __name__ == "__main__":
    main()


--- Running Logistic Regression ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression with Grid Search (K-Fold)
Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy: 0.479413817166783

--- Evaluating Logistic Regression on Holdout Test Set ---
Holdout Test Set Results
Accuracy: 0.4704445530043967
Classification Report:
               precision    recall  f1-score   support

         133       0.62      0.35      0.45        74
         134       0.78      0.93      0.85        71
         135       0.59      0.93      0.72        75
         136       0.62      0.91      0.74        78
         137       0.51      0.48      0.50        75
         138       0.50      0.24      0.33        83
         139       0.32      0.34      0.33        61
         140       0.50      0.88      0.64        77
         141       0.82      0.86      0.84        59
         142       0.97      0.97      0.97        69
         143       0.53      0.57      0.55        54
         144       0.50      0.03      0.05        71
         145       0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Gaussian Naïve Bayes Results
Accuracy: 0.3913700860665271

--- Evaluating Gaussian Naive Bayes on Holdout Test Set ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Holdout Test Set Results
Accuracy: 0.3733686928606323
Classification Report:
               precision    recall  f1-score   support

         133       0.28      0.69      0.40        74
         134       0.84      0.93      0.88        71
         135       0.71      0.96      0.81        75
         136       0.78      0.94      0.85        78
         137       0.18      0.08      0.11        75
         138       0.45      0.22      0.29        83
         139       0.09      0.21      0.13        61
         140       0.30      0.81      0.44        77
         141       0.88      0.90      0.89        59
         142       0.96      0.94      0.95        69
         143       0.50      0.67      0.57        54
         144       0.00      0.00      0.00        71
         145       0.00      0.00      0.00        78
         146       0.19      0.09      0.12        69
         147       0.10      0.01      0.02        77
         148       0.44      0.48      0.46        60
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


AdaBoost Results
Best Parameters: {'learning_rate': 0.01, 'n_estimators': 50}
Accuracy: 0.0654803442661084

--- Evaluating AdaBoost on Holdout Test Set ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Holdout Test Set Results
Accuracy: 0.06518249703398701
Classification Report:
               precision    recall  f1-score   support

         133       0.00      0.00      0.00        74
         134       0.00      0.00      0.00        71
         135       0.00      0.00      0.00        75
         136       0.00      0.00      0.00        78
         137       0.00      0.00      0.00        75
         138       0.00      0.00      0.00        83
         139       0.00      0.00      0.00        61
         140       0.00      0.00      0.00        77
         141       0.00      0.00      0.00        59
         142       0.00      0.00      0.00        69
         143       0.00      0.00      0.00        54
         144       0.01      0.48      0.02        71
         145       0.20      0.01      0.02        78
         146       0.00      0.00      0.00        69
         147       0.00      0.00      0.00        77
         148       0.00      0.00      0.00        60
  

In [42]:
## Execute All Models One by One

In [None]:
run_decision_tree()

In [None]:
run_random_forest()

In [None]:
run_gradient_boosting()

In [None]:
run_ada_boost()

In [None]:
run_gaussian_nb()

In [None]:
run_svc()

In [None]:
run_knn()

In [None]:
run_xgboost()

In [None]:
run_mlp()