In [None]:
# This is a simple template for a ML modeling approach based on a two-way split (splitting the data into training set and a testing set).
# This approach is convenient and a good starting point.
# In practice, more rigorous evaluation methods like cross-validation or bootstrapping are typically employed to obtain more robust model performance and generalizability.

# The script defines and evaluates several machine learning models using a two-way split approach on datasets related to bipolar disorder classification. Initially, it loads and preprocesses the training and test datasets, performing standard scaling on the features. It then iterates through a predefined set of classifiers including Random Forest, SVM, Gradient Boosting, Logistic Regression, MLP Classifier, and AdaBoost. Each model is trained on the training data and evaluated using metrics such as accuracy, precision, recall, F1-score, and confusion matrix on the test data.

In [6]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split

############ Creating a train / test split #######################

# Load your dataset
# Assuming your dataset is in a CSV file, adjust as needed
df = pd.read_csv('~/code/urop/reddit/data/2019_output/ann-combined.csv')

# Define your features and target variable
X = df.drop(columns=['MHC'])  # Replace 'MHC' with the name of your class label column
y = df['MHC']  # Replace 'MHC' with the name of your class label column

# Perform the train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Combine X and y back into train and test DataFrames
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Optionally save the splits to CSV files
train_df.to_csv('train_dataset.csv', index=False)
test_df.to_csv('test_dataset.csv', index=False)


################  Setting up the models  ###################


# Define models dictionary globally
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42, probability=True),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "MLP Classifier": MLPClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42)
}

# Function to load and preprocess dataset
def load_and_preprocess_dataset(dataset_path):
    df = pd.read_csv(dataset_path)
    
    # Rename columns
    df.rename(columns={'ID': 'userID', 'MHC': 'condition'}, inplace=True)
    
    # Assuming 'condition' is the target column
    X = df.drop(columns=['condition', 'userID'])  # Adjust columns to drop as needed
    y = df['condition']
    
    # Standard scaling of features if necessary
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, X, y

# Function to train and evaluate models using dedicated training and test datasets
def train_and_evaluate_models(X_train, y_train, X_test, y_test, results_df, dataset_name):
    for name, model in models.items():
        print(f"Training and evaluating {name}...")
        
        # Train the model on the training data
        model.fit(X_train, y_train)
        
        # Predict on the test data
        y_pred = model.predict(X_test)
        
        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        confusion_matrix_result = confusion_matrix(y_test, y_pred)
        
        # Append results to DataFrame
        temp_df = pd.DataFrame({
            'Model': [name],
            'Dataset': [dataset_name],
            'Accuracy': [accuracy],
            'Precision': [precision],
            'Recall': [recall],
            'F1-Score': [fscore],
            'Confusion Matrix': [confusion_matrix_result]
        })
        results_df = pd.concat([results_df, temp_df], ignore_index=True)
    
    return results_df

# Initialize DataFrame to store results
results_df = pd.DataFrame(columns=['Model', 'Dataset', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Confusion Matrix'])

# Specify paths to training and test datasets
train_dataset_path = "/Users/daniel24/Documents/Research/SMHD/CymoResults/ExpTradClass/data/balancedReversedEng/bipolar_train_means_based.csv"
test_dataset_path = "/Users/daniel24/Documents/Research/SMHD/CymoResults/ExpTradClass/data/balancedReversedEng/bipolar_test_means_based.csv"

print(f"\nProcessing training dataset: {train_dataset_path}")
X_train_scaled, X_train_raw, y_train = load_and_preprocess_dataset(train_dataset_path)

print(f"\nProcessing test dataset: {test_dataset_path}")
X_test_scaled, X_test_raw, y_test = load_and_preprocess_dataset(test_dataset_path)

# Train and evaluate models
results_df = train_and_evaluate_models(X_train_scaled, y_train, X_test_scaled, y_test, results_df, "Bipolar")

# Write results to a CSV file
results_df.to_csv("/Users/daniel24/Documents/Research/SMHD/CymoResults/ExpTradClass/data/balancedReversedEng/results/bipolar_model_evaluation_results.csv", index=False)
print(results_df.head())



Processing training dataset: /Users/daniel24/Documents/Research/SMHD/CymoResults/ExpTradClass/data/balancedReversedEng/bipolar_train_means_based.csv

Processing test dataset: /Users/daniel24/Documents/Research/SMHD/CymoResults/ExpTradClass/data/balancedReversedEng/bipolar_test_means_based.csv
Training and evaluating Random Forest...


  results_df = pd.concat([results_df, temp_df], ignore_index=True)


Training and evaluating SVM...
Training and evaluating Gradient Boosting...
Training and evaluating Logistic Regression...
Training and evaluating MLP Classifier...
Training and evaluating AdaBoost...
                 Model  Dataset  Accuracy  Precision    Recall  F1-Score  \
0        Random Forest  Bipolar  0.770492   0.771972  0.770492  0.770179   
1                  SVM  Bipolar  0.795082   0.802417  0.795082  0.793832   
2    Gradient Boosting  Bipolar  0.786885   0.786962  0.786885  0.786871   
3  Logistic Regression  Bipolar  0.766393   0.766680  0.766393  0.766331   
4       MLP Classifier  Bipolar  0.770492   0.770783  0.770492  0.770430   

         Confusion Matrix  
0  [[179, 65], [47, 197]]  
1  [[175, 69], [31, 213]]  
2  [[194, 50], [54, 190]]  
3  [[183, 61], [53, 191]]  
4  [[184, 60], [52, 192]]  
