In [None]:
# load derivatives
# randomly label
# feature selection
# train RFC
# train dual loop LOO


## Overall Training Process:
```
For each site (Outer Loop - Leave-One-Site-Out Cross-Validation):
    Hold out one site as the test set
    Train on the remaining N-1 sites

    For each hyperparameter configuration (Inner Loop - Grid/Random Search):
        Select a combination of:
            - Preprocessing method (normalization + feature selection)
            - Model type (SVC-lin, SVC-rbf, RFC)
            - Classifier hyperparameters (C, gamma, tree depth, etc.)

        For each fold in cross-validation (Inner Loop - Cross-Validation):
            Split the training data into train/validation folds
            Train the model on the training fold
            Evaluate on the validation fold
        
        Select the best preprocessing/model configuration based on average validation performance

    Train the final model with the best hyperparameters & preprocessing on all N-1 training sites
    Evaluate on the held-out site

```

In [67]:
import pandas as pd

# Load CSV file
features_df = pd.read_csv("/Users/isaacbevers/sensein/b2ai-wrapper/b2ai-data/bridge2ai-voice-corpus-3/derived/static_features.csv")  # Replace "file.csv" with your actual file path

# Display the first few rows
print(features_df.shape)


for i, col in enumerate(features_features_df.columns):
    print(i, col)

(12523, 133)
0 participant
1 task
2 F0semitoneFrom27.5Hz_sma3nz_amean
3 F0semitoneFrom27.5Hz_sma3nz_stddevNorm
4 F0semitoneFrom27.5Hz_sma3nz_percentile20.0
5 F0semitoneFrom27.5Hz_sma3nz_percentile50.0
6 F0semitoneFrom27.5Hz_sma3nz_percentile80.0
7 F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2
8 F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope
9 F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope
10 F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope
11 F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope
12 loudness_sma3_amean
13 loudness_sma3_stddevNorm
14 loudness_sma3_percentile20.0
15 loudness_sma3_percentile50.0
16 loudness_sma3_percentile80.0
17 loudness_sma3_pctlrange0-2
18 loudness_sma3_meanRisingSlope
19 loudness_sma3_stddevRisingSlope
20 loudness_sma3_meanFallingSlope
21 loudness_sma3_stddevFallingSlope
22 spectralFlux_sma3_amean
23 spectralFlux_sma3_stddevNorm
24 mfcc1_sma3_amean
25 mfcc1_sma3_stddevNorm
26 mfcc2_sma3_amean
27 mfcc2_sma3_stddevNorm
28 mfcc3_sma3_amean
29 mfcc3_sma3_stddevNorm
30 mfcc4_s

In [53]:
phenotype_df = pd.read_csv("/Users/isaacbevers/sensein/b2ai-wrapper/b2ai-data/bridge2ai-voice-corpus-3/derived/phenotype.tsv", sep='\t')
phenotype_df.shape

(307, 1055)

In [59]:
participants_df = pd.read_csv("/Users/isaacbevers/sensein/b2ai-wrapper/b2ai-data/bridge2ai-voice-corpus-3/bids/bids/participants.tsv", sep="\t")
print(participants_df.shape)
print("session_site" in participants_df.columns)
participants_df.columns

(307, 1151)
True


Index(['record_id', 'redcap_repeat_instrument', 'redcap_repeat_instance',
       'selected_language', 'consent_status', 'is_feasibility_participant',
       'enrollment_institution', 'age', 'eligible_studies___1',
       'eligible_studies___2',
       ...
       'vocabulary_item_word_4', 'vocabulary_item_difficulty_4',
       'vocabulary_item_word_5', 'vocabulary_item_difficulty_5',
       'vocabulary_item_word_6', 'vocabulary_item_difficulty_6',
       'random_session_id', 'random_recording_acoustic_task_id',
       'random_duration', 'random_item_generation_category'],
      dtype='object', length=1151)

In [77]:
participant_to_site = dict(zip(participants_df["record_id"], participants_df["session_site"]))
features_df["site"] = features_df["participant"].map(participant_to_site)
features_only_df = features_df.drop(columns=['site', 'participant', 'task'])  # Exclude non-feature columns
features_df.shape

(12523, 134)

### Preprocessing

> The first preprocessing step is a site-wise normalization of features. For robustness, this normalization calculates a center (as the median feature value) and a spread (as the interquartile range) per feature for demeaning and scaling data. This filter can center only, scale only or perform both centering and scaling.

In [78]:
# Site-wise normalization
import pandas as pd
import numpy as np

def site_wise_normalization(features_df, features, site_column, mode="both"):
    """
    Perform site-wise normalization using median and interquartile range (IQR).
    
    Args:
        features_df (pd.DataFrame): The dataset including site information.
        features (pd.DataFrame): The feature columns to normalize.
        site_column (str): The column representing site labels.
        mode (str): 'center', 'scale', or 'both' (default).
    
    Returns:
        pd.DataFrame: Normalized feature DataFrame.
    """
    normalized_features = features.copy()

    for site in features_df[site_column].unique():
        site_mask = features_df[site_column] == site
        site_data = features[site_mask]

        median = site_data.median()
        iqr = site_data.quantile(0.75) - site_data.quantile(0.25)  # Interquartile Range (IQR)

        if mode == "center":
            normalized_features.loc[site_mask] = site_data - median
        elif mode == "scale":
            normalized_features.loc[site_mask] = site_data / iqr
        elif mode == "both":
            normalized_features.loc[site_mask] = (site_data - median) / iqr

    return normalized_features

# Define site column and feature columns
site_column = 'site'
features_normalized_center = site_wise_normalization(features_df, features_only_df, site_column='site', mode='center')
features_normalized_scale = site_wise_normalization(features_df, features_only_df, site_column='site', mode='scale')
features_normalized_both = site_wise_normalization(features_df, features_only_df, site_column='site', mode='both')


>The second preprocessing step available is a dimensionality reduction filter excluding features highly predictive of the site of origin of data points. To do so, we fit a classifier based on extremely randomized trees [39], where the variables are the features and the responses are the sites of acquisition. We iteratively fit the classifier and remove the feature most predictive of the site at each step, until certain convergence criteria is met (either a maximum number of features to remove is reached or the performance of the classifier is very low and thus the remaining features do not predict the site at all).

In [74]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

X = features_df.drop(columns=['site', 'participant', 'task'])  # Drop non-AQM columns
y = features_df['site']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train initial ExtraTreesClassifier to predict site labels
site_predictor = ExtraTreesClassifier(n_estimators=100, random_state=42)
site_predictor.fit(X_train, y_train)

# Get initial accuracy (baseline)
initial_accuracy = site_predictor.score(X_test, y_test)
print(f"Initial site prediction accuracy: {initial_accuracy:.2f}")

# Define stopping criteria
num_sites = len(np.unique(y))  # Number of unique sites
chance_level = 1 / num_sites  # Random guessing accuracy
max_removals = 131 - 64  # Set a maximum number of features to remove (total features - MRIqc number)

# Start feature elimination loop
features_to_remove = []
iteration = 0

while iteration < max_removals:
    # Get feature importances
    feature_importances = site_predictor.feature_importances_
    
    # Identify the most predictive feature
    most_predictive_feature = X_train.columns[np.argmax(feature_importances)]
    features_to_remove.append(most_predictive_feature)

    # Remove the most predictive feature from the dataset
    X_train = X_train.drop(columns=[most_predictive_feature])
    X_test = X_test.drop(columns=[most_predictive_feature])

    # Retrain the classifier without the removed feature
    site_predictor = ExtraTreesClassifier(n_estimators=100, random_state=42)
    site_predictor.fit(X_train, y_train)

    # Get new accuracy
    new_accuracy = site_predictor.score(X_test, y_test)
    print(f"Iteration {iteration + 1}: Removed '{most_predictive_feature}', New Accuracy: {new_accuracy:.2f}")

    # Check stopping conditions
    if new_accuracy <= chance_level:
        print("Stopping: Site prediction accuracy is near chance level.")
        break

    iteration += 1

print(f"Final removed features: {features_to_remove}")
print(f"Final site prediction accuracy: {new_accuracy:.2f}")


Initial site prediction accuracy: 0.79
Iteration 1: Removed 'mfcc2_sma3_amean', New Accuracy: 0.78
Iteration 2: Removed 'mfcc3_sma3_amean', New Accuracy: 0.78
Iteration 3: Removed 'mfcc2V_sma3nz_amean', New Accuracy: 0.77
Iteration 4: Removed 'slopeUV0-500_sma3nz_amean', New Accuracy: 0.76
Iteration 5: Removed 'mfcc1V_sma3nz_amean', New Accuracy: 0.75
Iteration 6: Removed 'slopeUV500-1500_sma3nz_amean', New Accuracy: 0.75
Iteration 7: Removed 'mfcc1_sma3_amean', New Accuracy: 0.74
Iteration 8: Removed 'mfcc3V_sma3nz_amean', New Accuracy: 0.73
Iteration 9: Removed 'loudness_sma3_percentile20.0', New Accuracy: 0.72
Iteration 10: Removed 'equivalentSoundLevel_dBp', New Accuracy: 0.72
Iteration 11: Removed 'alphaRatioUV_sma3nz_amean', New Accuracy: 0.71
Iteration 12: Removed 'hammarbergIndexUV_sma3nz_amean', New Accuracy: 0.71
Iteration 13: Removed 'spectralFlux_sma3_amean', New Accuracy: 0.71
Iteration 14: Removed 'spectralFluxUV_sma3nz_amean', New Accuracy: 0.71
Iteration 15: Removed 'mf

> Finally, a third preprocessing step implements the Winnow algorithm [40] using extremely randomized trees in a similar way to the previous filter, but comparing features to a synthetic, randomly-generated feature. This feature selection filter removes those IQMs below a certain SNR level.

In [82]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier

def winnow_feature_selection(X, y, snr_threshold=1.0):
    """
    Perform Winnow-based feature selection using ExtraTreesClassifier.
    
    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target labels (e.g., site).
        snr_threshold (float): Minimum SNR threshold for feature retention.
    
    Returns:
        pd.DataFrame: Reduced feature set with low SNR features removed.
    """
    X = X.copy()
    
    # Generate a synthetic random feature (noise)
    np.random.seed(42)
    X["random_noise"] = np.random.normal(0, 1, size=len(X))

    # Train ExtraTreesClassifier to measure feature importance
    clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
    clf.fit(X, y)
    
    # Get feature importances
    feature_importances = clf.feature_importances_
    feature_names = X.columns
    
    # Identify the importance of the synthetic feature (random noise)
    random_feature_importance = feature_importances[X.columns.get_loc("random_noise")]

    # Compute SNR for each feature (ratio of feature importance to random noise importance)
    snr = feature_importances / random_feature_importance

    # Select features where SNR exceeds the threshold
    selected_features = feature_names[snr > snr_threshold].tolist()

    # Avoid ValueError: Only remove "random_noise" if it exists
    if "random_noise" in selected_features:
        selected_features.remove("random_noise")

    print(f"Removed {X.shape[1] - len(selected_features)} low-SNR features.")
    
    return X[selected_features]


# Apply Winnow feature selection for each normalization method
features_selected_center = winnow_feature_selection(features_normalized_center, features_df["site"])
features_selected_scale = winnow_feature_selection(features_normalized_scale, features_df["site"])
features_selected_both = winnow_feature_selection(features_normalized_both, features_df["site"])


Removed 35 low-SNR features.
Removed 32 low-SNR features.
Removed 38 low-SNR features.


In [84]:
def add_labels(df, column_name="label", labels=["accept", "exclude", "unsure"], random_seed=None):
    """
    Adds a column with randomly assigned labels to the DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame to modify.
    - column_name (str): Name of the new column (default: "label").
    - labels (list): List of labels to sample from (default: ["accept", "exclude", "unsure"]).
    - random_seed (int or None): Random seed for reproducibility (default: None).

    Returns:
    - pd.DataFrame: Modified DataFrame with the new column.
    """
    if random_seed is not None:
        np.random.seed(random_seed)  # Set random seed for reproducibility

    df[column_name] = np.random.choice(labels, size=len(df))
    return df

features_selected_center = add_labels(features_selected_center)
features_selected_scale = add_labels(features_selected_scale)
features_normalized_both = add_labels(features_normalized_both)



>A support-vector machine [37] finds a hyperplane in the high-dimensional space of the features that robustly separates the classes of interest. The SVC then uses the hyperplane to decide the class that is assigned to new samples in the space of features. Two hyper-parameters define the support-vector machine algorithm: a kernel function that defines the similarity between data points to ultimately compute a distance to the hyperplane, and a regularization weight C. In particular, we analyzed here the linear SVC implementation (as of now, “SVC-lin”) and the one based on radial basis functions (denoted by “SVC-rbf”). During model selection, we evaluated the regularization weight C of both SVC and the γ parameter (kernel width) particular to the SVC-rbf.

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

def train_svm_for_label_prediction(features_df, label_column="label", test_size=0.2, random_state=42):
    """
    Trains Support Vector Machines (SVM) with hyperparameter tuning to predict labels ('accept', 'exclude', 'unsure').

    Parameters:
    - features_df (pd.DataFrame): DataFrame containing features and the label column.
    - label_column (str): Name of the column containing classification labels (default: "label").
    - test_size (float): Fraction of data to use for testing (default: 0.2).
    - random_state (int): Random seed for reproducibility.

    Returns:
    - dict: Contains best models and their test accuracies.
    """

    # Ensure label column exists
    if label_column not in features_df.columns:
        raise ValueError(f"Label column '{label_column}' not found in DataFrame.")

    # Separate features and labels
    X = features_df.drop(columns=[label_column])  # Features
    y = features_df[label_column]  # Labels

    # Check for NaN values
    if X.isna().sum().sum() > 0:
        print("Warning: NaN values detected in features. Imputing missing values...")
        imputer = SimpleImputer(strategy="mean")  # Fill NaN with mean of the column
        X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    # Split dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Standardize features (SVM performs better with standardized inputs)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define hyperparameter grids for linear SVC and RBF SVC
    param_grid_lin = {'C': [0.1, 1, 10, 100]}
    param_grid_rbf = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1]}

    # Perform Grid Search for Linear SVC
    grid_search_lin = GridSearchCV(SVC(kernel="linear"), param_grid_lin, cv=5, scoring="accuracy")
    grid_search_lin.fit(X_train_scaled, y_train)
    best_svc_lin = grid_search_lin.best_estimator_

    # Perform Grid Search for RBF SVC
    grid_search_rbf = GridSearchCV(SVC(kernel="rbf"), param_grid_rbf, cv=5, scoring="accuracy")
    grid_search_rbf.fit(X_train_scaled, y_train)
    best_svc_rbf = grid_search_rbf.best_estimator_

    # Evaluate the best models on test set
    y_pred_lin = best_svc_lin.predict(X_test_scaled)
    y_pred_rbf = best_svc_rbf.predict(X_test_scaled)

    acc_lin = accuracy_score(y_test, y_pred_lin)
    acc_rbf = accuracy_score(y_test, y_pred_rbf)

    # Print results
    print(f"Best Linear SVC (C={grid_search_lin.best_params_['C']}), Test Accuracy: {acc_lin:.4f}")
    print(f"Best RBF SVC (C={grid_search_rbf.best_params_['C']}, gamma={grid_search_rbf.best_params_['gamma']}), Test Accuracy: {acc_rbf:.4f}")

    # Return best models and accuracies
    return {
        "best_linear_svc": best_svc_lin,
        "best_rbf_svc": best_svc_rbf,
        "accuracy_linear": acc_lin,
        "accuracy_rbf": acc_rbf
    }

# Example Usage on Labeled Feature Sets
results_center = train_svm_for_label_prediction(features_selected_center)
results_scale = train_svm_for_label_prediction(features_selected_scale)
results_both = train_svm_for_label_prediction(features_normalized_both)


Best Linear SVC (C=10), Test Accuracy: 0.3269
Best RBF SVC (C=0.1, gamma=1), Test Accuracy: 0.3397


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

def train_random_forest(features_df, label_column="label", test_size=0.2, random_state=42):
    """
    Trains a Random Forest Classifier (RFC) with hyperparameter tuning using GridSearchCV.

    Parameters:
    - features_df (pd.DataFrame): DataFrame containing features and the label column.
    - label_column (str): Name of the column containing classification labels (default: "label").
    - test_size (float): Fraction of data to use for testing (default: 0.2).
    - random_state (int): Random seed for reproducibility.

    Returns:
    - dict: Contains best model and its test accuracy.
    """

    # Ensure label column exists
    if label_column not in features_df.columns:
        raise ValueError(f"Label column '{label_column}' not found in DataFrame.")

    # Separate features and labels
    X = features_df.drop(columns=[label_column])  # Features
    y = features_df[label_column]  # Labels

    # Check for NaN values and impute missing data
    if X.isna().sum().sum() > 0:
        print("Warning: NaN values detected in features. Imputing missing values...")
        imputer = SimpleImputer(strategy="mean")  # Fill NaN with mean of the column
        X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    # Split dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Standardize features (optional for tree-based models, but helps when combined with other models)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define hyperparameter grid for Random Forest
    param_grid = {
        "n_estimators": [50, 100, 200],  # Number of trees
        "max_depth": [None, 10, 20],  # Maximum depth of trees
        "min_samples_split": [2, 5, 10],  # Minimum samples required to split an internal node
        "min_samples_leaf": [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    }

    # Perform Grid Search for Random Forest
    grid_search = GridSearchCV(RandomForestClassifier(random_state=random_state), param_grid, cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    best_rfc = grid_search.best_estimator_

    # Evaluate the best model on test set
    y_pred = best_rfc.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)

    # Print results
    print(f"Best Random Forest Model: {grid_search.best_params_}, Test Accuracy: {acc:.4f}")

    # Return best model and accuracy
    return {
        "best_rfc": best_rfc,
        "accuracy": acc
    }

# Example Usage on Labeled Feature Sets
results_center = train_random_forest(features_selected_center)
results_scale = train_random_forest(features_selected_scale)
results_both = train_random_forest(features_normalized_both)
