In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
import joblib  # For saving models

# Read the CSV file
file_path = r"F:\Studies\PainStudies Lab\Stress Assessment\WESAD Dataset\WESAD\combined_features_dataset.csv"
data = pd.read_csv(file_path)

# Drop rows where 'Label' is 0 and reset index
data_filtered = data[data['Label'] != 0].reset_index(drop=True)

# Separate features and labels
X_filtered = data_filtered.select_dtypes(include=[np.number]).drop(columns=['Label'])
y_filtered = LabelEncoder().fit_transform(data_filtered['Label'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_filtered, test_size=0.3, random_state=42)

# Define models with pipelines where needed
models = {
    "Logistic Regression": LogisticRegression(solver='saga', max_iter=5000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
    "SVM": make_pipeline(StandardScaler(), SVC(kernel='linear', probability=True, random_state=42)),
    "KNN": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5)),
    "MLP Classifier": make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(100,), max_iter=5000, learning_rate_init=0.001, solver='adam', random_state=42)),
    "Naive Bayes": GaussianNB(),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(algorithm='SAMME', random_state=42),
    "Voting Classifier": VotingClassifier(estimators=[
        ('lr', LogisticRegression(solver='saga', max_iter=5000, random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', make_pipeline(StandardScaler(), SVC(probability=True, random_state=42)))
    ], voting='soft')
}

# Evaluate each model using cross-validation and print the accuracy
print("Model Performance (Accuracy):")
model_performance = {}

for model_name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    model_performance[model_name] = (scores.mean(), scores.std())
    print(f"{model_name}: Mean Accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")

# Train and test models to report test accuracy
print("\nTest Accuracy Scores:")
test_accuracies = {}

for model_name, model in models.items():
    # Train the model on the training set and evaluate on the test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_accuracies[model_name] = test_accuracy
    print(f"{model_name}: Test Accuracy = {test_accuracy:.4f}")

# Identify the best model and save it
best_model_name = max(test_accuracies, key=test_accuracies.get)
best_model = models[best_model_name]
best_model_accuracy = test_accuracies[best_model_name]
print(f"\nBest Model: {best_model_name} with Test Accuracy = {best_model_accuracy:.4f}")

# Save the best model pipeline
model_save_path = f"{best_model_name.replace(' ', '_').lower()}_best_model.joblib"
joblib.dump(best_model, model_save_path)
print(f"\nBest Model saved to {model_save_path}")


Model Performance (Accuracy):




Logistic Regression: Mean Accuracy = 0.5517, Std = 0.0458
Random Forest: Mean Accuracy = 0.9649, Std = 0.0098
Gradient Boosting: Mean Accuracy = 0.9345, Std = 0.0151
XGBoost: Mean Accuracy = 0.9516, Std = 0.0223
SVM: Mean Accuracy = 0.7160, Std = 0.0199
KNN: Mean Accuracy = 0.8186, Std = 0.0183
MLP Classifier: Mean Accuracy = 0.8975, Std = 0.0247
Naive Bayes: Mean Accuracy = 0.4862, Std = 0.0273
Extra Trees: Mean Accuracy = 0.9725, Std = 0.0139
AdaBoost: Mean Accuracy = 0.6638, Std = 0.0325




Voting Classifier: Mean Accuracy = 0.8661, Std = 0.0230

Test Accuracy Scores:




Logistic Regression: Test Accuracy = 0.5553
Random Forest: Test Accuracy = 0.9690
Gradient Boosting: Test Accuracy = 0.9491
XGBoost: Test Accuracy = 0.9646
SVM: Test Accuracy = 0.7190
KNN: Test Accuracy = 0.8429
MLP Classifier: Test Accuracy = 0.9115
Naive Bayes: Test Accuracy = 0.4779
Extra Trees: Test Accuracy = 0.9757
AdaBoost: Test Accuracy = 0.6372




Voting Classifier: Test Accuracy = 0.8938

Best Model: Extra Trees with Test Accuracy = 0.9757

Best Model saved to extra_trees_best_model.joblib


In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint

# Read the CSV file
file_path = r"F:\Studies\PainStudies Lab\Stress Assessment\WESAD Dataset\WESAD\combined_features_dataset.csv"
data = pd.read_csv(file_path)

# Filter and preprocess data
data_filtered = data[data['Label'] != 0].reset_index(drop=True)
X_filtered = data_filtered.select_dtypes(include=[np.number]).drop(columns=['Label'])
y_filtered = LabelEncoder().fit_transform(data_filtered['Label'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_filtered, test_size=0.3, random_state=42)

# Define Extra Trees model with an improved parameter grid
models = {
    "Extra Trees": (ExtraTreesClassifier(random_state=42), {
        'n_estimators': randint(100, 500),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True, False]
    })
}

# Perform RandomizedSearchCV on the Extra Trees model
best_estimators = {}
print("Optimizing hyperparameters for Extra Trees model:")

for model_name, (model, param_grid) in models.items():
    randomized_search = RandomizedSearchCV(
        model, param_distributions=param_grid, 
        n_iter=30, cv=3, scoring='accuracy', 
        n_jobs=-1, random_state=42
    )
    randomized_search.fit(X_train, y_train)
    
    best_estimators[model_name] = randomized_search.best_estimator_
    print(f"{model_name}: Best Parameters = {randomized_search.best_params_}, Best CV Accuracy = {randomized_search.best_score_:.4f}")

# Evaluate the best model on the test set
print("\nTest Accuracy Score of the Optimized Extra Trees Model:")
model = best_estimators["Extra Trees"]
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Extra Trees: Test Accuracy = {test_accuracy:.4f}")


Optimizing hyperparameters for Extra Trees model:
Extra Trees: Best Parameters = {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 256}, Best CV Accuracy = 0.8493

Test Accuracy Score of the Optimized Extra Trees Model:
Extra Trees: Test Accuracy = 0.9156


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
import joblib  # For saving models

# Read the CSV file
file_path = r"F:\Studies\PainStudies Lab\Stress Assessment\WESAD Dataset\WESAD\combined_features_dataset.csv"
data = pd.read_csv(file_path)

# Drop rows where 'Label' is 0 and reset index
data_filtered = data[data['Label'] != 0].reset_index(drop=True)

# Separate features and labels
X_filtered = data_filtered.select_dtypes(include=[np.number]).drop(columns=['Label'])
y_filtered = LabelEncoder().fit_transform(data_filtered['Label'])

# Normalize features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_filtered)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_filtered, test_size=0.3, random_state=42)

# Define models with pipelines where needed
models = {
    "Logistic Regression": LogisticRegression(solver='saga', max_iter=5000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
    "SVM": make_pipeline(SVC(kernel='linear', probability=True, random_state=42)),
    "KNN": make_pipeline(KNeighborsClassifier(n_neighbors=5)),
    "MLP Classifier": make_pipeline(MLPClassifier(hidden_layer_sizes=(100,), max_iter=5000, learning_rate_init=0.001, solver='adam', random_state=42)),
    "Naive Bayes": GaussianNB(),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(algorithm='SAMME', random_state=42),
    "Voting Classifier": VotingClassifier(estimators=[
        ('lr', LogisticRegression(solver='saga', max_iter=5000, random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(probability=True, random_state=42))
    ], voting='soft')
}

# Evaluate each model using cross-validation and print the accuracy
print("Model Performance (Accuracy):")
model_performance = {}

for model_name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    model_performance[model_name] = (scores.mean(), scores.std())
    print(f"{model_name}: Mean Accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")

# Train and test models to report test accuracy
print("\nTest Accuracy Scores:")
test_accuracies = {}

for model_name, model in models.items():
    # Train the model on the training set and evaluate on the test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(y_pred)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_accuracies[model_name] = test_accuracy
    print(f"{model_name}: Test Accuracy = {test_accuracy:.4f}")

# Identify the best model and save it
best_model_name = max(test_accuracies, key=test_accuracies.get)
best_model = models[best_model_name]
best_model_accuracy = test_accuracies[best_model_name]
print(f"\nBest Model: {best_model_name} with Test Accuracy = {best_model_accuracy:.4f}")

# Save the best model pipeline
model_save_path = f"{best_model_name.replace(' ', '_').lower()}_best_model.joblib"
joblib.dump(best_model, model_save_path)
print(f"\nBest Model saved to {model_save_path}")

# Save the scaler to ensure the same normalization during inference
scaler_save_path = "scaler.joblib"
joblib.dump(scaler, scaler_save_path)
print(f"\nScaler saved to {scaler_save_path}")


Model Performance (Accuracy):
Logistic Regression: Mean Accuracy = 0.6866, Std = 0.0187
Random Forest: Mean Accuracy = 0.9639, Std = 0.0098
Gradient Boosting: Mean Accuracy = 0.9373, Std = 0.0145
XGBoost: Mean Accuracy = 0.9516, Std = 0.0223
SVM: Mean Accuracy = 0.7160, Std = 0.0197
KNN: Mean Accuracy = 0.8205, Std = 0.0193
MLP Classifier: Mean Accuracy = 0.9013, Std = 0.0279
Naive Bayes: Mean Accuracy = 0.4995, Std = 0.0234
Extra Trees: Mean Accuracy = 0.9734, Std = 0.0130
AdaBoost: Mean Accuracy = 0.6676, Std = 0.0299
Voting Classifier: Mean Accuracy = 0.8509, Std = 0.0165

Test Accuracy Scores:
[3 1 3 3 1 0 0 0 3 0 1 3 0 1 2 3 2 0 2 0 0 0 1 0 0 3 3 2 1 3 3 0 0 1 3 1 1
 1 3 1 1 0 0 2 3 0 0 3 0 0 3 1 0 1 1 1 0 0 3 0 1 0 0 3 3 3 0 0 1 0 3 3 1 0
 1 0 0 0 0 2 0 0 0 3 0 3 0 0 3 0 0 2 0 0 0 0 1 0 1 1 0 3 0 3 0 3 2 3 2 0 0
 0 0 0 3 0 0 3 1 3 0 2 1 1 3 0 0 3 3 0 0 1 1 2 0 0 0 0 3 1 2 3 1 0 3 3 2 1
 0 1 0 2 2 0 0 0 3 0 0 0 0 2 3 2 3 0 0 0 3 2 0 2 0 0 0 3 0 2 0 3 3 1 3 0 0
 1 3 1 0 2 0 1 0 3 1