In [1]:
import os
import pandas as pd
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

## Loading Data

In [2]:
#Set training and test folder paths
training_path = 'features_train/features_train'
test_path = 'features_test/features_test'

#Load labels file
labels = pd.read_csv('labels.csv')

#Load feature description files, take out column 0 to use as header for training/test sets
features = pd.read_csv('feature_description.csv', encoding_errors='ignore', header=None, index_col=0)
features = features.index.tolist()

In [3]:
def load_data(folder_path):
    #Init empty dataframe
    res = pd.DataFrame()
    for file in os.listdir(folder_path):
        #for each speaker file
        if file.endswith('.csv'):
            #get participant id from filename, eg filename: 'spk_305.csv'
            participant = float(file.split('_')[1].split('.')[0])
            #find labels for the participant
            label = labels[labels['Participant_ID'] == participant]
            #load participant feature file
            file_path = os.path.join(folder_path, file)
            data_df = pd.read_csv(file_path, header=None, names=features)
            #Add labels and participant id columns
            data_df['participant'] = participant
            data_df['gender'] = label['Gender'].values[0]
            data_df['depression'] = label['Depression'].values[0]
            #combine everything to result
            res = pd.concat([res, data_df])
    return res

In [4]:
#Load training data
training_df = load_data(training_path)
len(training_df)

13626

In [5]:
#Load test data
test_df = load_data(test_path)
len(test_df)

3280

## Data cleaning and preprocessing 

In [6]:
# Check Missing values
missing_values = (training_df.isnull().sum()/len(training_df)) *100
total_missing_values = training_df.isnull().any(axis=1).sum()
print(f'Missing value percent % for each column, total samples {len(training_df)}')
print(f'Number of samples with missing values: {total_missing_values}')
print(missing_values)

Missing value percent % for each column, total samples 13626
Number of samples with missing values: 1
F0semitoneFrom27.5Hz_sma3nz_amean             0.007339
F0semitoneFrom27.5Hz_sma3nz_stddevNorm        0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile20.0    0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile50.0    0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile80.0    0.007339
                                                ...   
StddevUnvoicedSegmentLength                   0.007339
equivalentSoundLevel_dBp                      0.007339
participant                                   0.000000
gender                                        0.000000
depression                                    0.000000
Length: 91, dtype: float64


In [7]:
def preprocess_data(data):
    #drop missing va;ues since there is just 1 sample
    data_nona = data.dropna()
    #Normalization z-score
    scaler = StandardScaler()    
    # Store the columns to keep for later concatenation
    columns_to_keep = ['participant', 'gender', 'depression']
    # Extract the columns to be scaled and drop them from the original DataFrame
    temp = data_nona[columns_to_keep].copy()
    data_nona.drop(columns=columns_to_keep, axis=1, inplace=True)
    # Scale the remaining columns using StandardScaler and convert back to DataFrame
    scaled_data = pd.DataFrame(scaler.fit_transform(data_nona), columns=data_nona.columns)
    scaled_data.reset_index(drop=True, inplace=True)
    temp.reset_index(drop=True, inplace=True)
    # Concatenate the scaled data with the columns we kept earlier
    processed_data = pd.concat([scaled_data, temp], axis=1)
    return processed_data

## Methods to calculate metrics

In [8]:
# Calculates accuracy
# pass true and predicted labels
# return accuracy score
def calculate_total_accuracy(true_labels, predicted_labels):
    return accuracy_score(true_labels, predicted_labels)

# Calculates accuracy
# pass true and predicted labels
# return balanced accuracy score
def calculate_balanced_accuracy(true_labels, predicted_labels):
    #calculkate confusion matrix
    matrix = confusion_matrix(true_labels, predicted_labels)
    TP = matrix[1, 1]
    TN = matrix[0, 0]
    FP = matrix[0, 1]
    FN = matrix[1, 0]
    #For positive class, how many correct predictions
    accuracy_positive = TP/(TP+FN)
    #For negative class how many 
    accuracy_negative = TN/(TN+FP)
    return 0.5*(accuracy_positive + accuracy_negative)

#Calculates Equality of Opportunity
# pass true and predicted labels for male samples
# pass true and predicted labels for female samples
# return balanced accuracy score
def calculate_EO(true_labels_male, 
                 true_labels_female,
                 predicted_labels_male,
                predicted_labels_female):
    #Calculate True pistive rate for male gender with confusion matrix
    matrix_male = confusion_matrix(true_labels_male, predicted_labels_male)
    TP = matrix_male[1, 1]
    TN = matrix_male[0, 0]
    FN = matrix_male[1, 0]
    TPR_male = TP/(TP+FN)

    #Calculate True pistive rate for female gender with confusion matrix
    matrix_female = confusion_matrix(true_labels_female, predicted_labels_female)
    TP = matrix_female[1, 1]
    TN = matrix_female[0, 0]
    FN = matrix_female[1, 0]
    TPR_female = TP/(TP+FN)
    
    # Calculate EO
    return 1-abs(TPR_male-TPR_female) 

#Function to calculate majority votings
#Pass labels
#Returns mode or which label was predicted most
def majority_voting(df):
    counts = df.value_counts()
    return counts.idxmax()

In [9]:
#Function to calculate all metrics
#Pass true labels, predicted labels and a reference(test/val) dataframe
#referece dataframe should have all labels and features
#Returns a dictionary with all the metric calculated
def calculate_metrics(y_true, y_pred, test_data, EO=True):
    # Initialize metrics
    metrics = {}
    #---------------------------------------------------CALCULATING TOTAL METRICS
    #calculate total accuracy
    metrics["Total accuracy"] = calculate_total_accuracy(y_true, y_pred)
    #calculate total balanced accuracy
    metrics["Total Balanced accuracy"] = calculate_balanced_accuracy(y_true, y_pred)
    #calculate total EO
    if(EO):
        #find gender based indices for true labels from data
        male_indices = test_data[test_data['gender']==1].index
        female_indices = test_data[test_data['gender']==0].index
        #separate true labels based on indices
        male_true = y_true.loc[male_indices]
        female_true = y_true.loc[female_indices]
        #Find the corresponding indices for predicted labels from true_labels
        male_true_index_list = male_true.index.tolist()
        female_true_index_list = female_true.index.tolist()
        #Get separated predicted labels based on gender
        male_predicted = y_pred[[male_true_index_list.index(index) for index in male_true_index_list]]
        female_predicted = y_pred[[female_true_index_list.index(index) for index in female_true_index_list]]
        metrics["Total EO"] = calculate_EO(male_true, female_true, male_predicted, female_predicted)
    #-------------------------------------------------CALCULATING AGGREGATED METRICS FOR EACH PARTICIPANT
    predictions_df = pd.DataFrame({'participant': test_data['participant'], 'predicted_label': y_pred, 'true_label': y_true})
    aggregated_y_true = predictions_df.groupby('participant')['true_label'].agg(majority_voting)
    aggregated_y_pred = predictions_df.groupby('participant')['predicted_label'].agg(majority_voting)
    #Calculate aggregated accuracy score
    metrics["Aggregated accuracy score"] = calculate_total_accuracy(aggregated_y_true, aggregated_y_pred)
    #Calculate balanced aggregated accuracy
    metrics["Aggregated balanced accuracy score"] = calculate_balanced_accuracy(aggregated_y_true, aggregated_y_pred)
    if(EO):
        #Calculate aggregated EOs
        male_predictions_df = pd.DataFrame({'participant': test_data['participant'].loc[male_indices], 'predicted_label': y_pred, 'true_label': y_true})
        male_aggregated_y_true = male_predictions_df.groupby('participant')['true_label'].agg(majority_voting)
        male_aggregated_y_pred = male_predictions_df.groupby('participant')['predicted_label'].agg(majority_voting)
        female_predictions_df = pd.DataFrame({'participant': test_data['participant'].loc[female_indices], 'predicted_label': y_pred, 'true_label': y_true})
        female_aggregated_y_true = female_predictions_df.groupby('participant')['true_label'].agg(majority_voting)
        female_aggregated_y_pred = female_predictions_df.groupby('participant')['predicted_label'].agg(majority_voting)
        metrics["Aggregated EO score"] = calculate_EO(male_aggregated_y_true, female_aggregated_y_true, male_aggregated_y_pred, female_aggregated_y_pred)
    
    return metrics

## Data Modeling - Depression Classification
### What models to try?
- Decision tree
- Random forest
- TBD...........

### Model attempt: Decision tree classifier

In [10]:
#------------------------------------------------Tuning for different depths-------------------------------------------------------

warnings.filterwarnings("ignore")
training_data = preprocess_data(training_df)
X = training_data.drop(columns = ['participant', 'gender', 'depression'], axis=1)
y = training_data['depression']

# Define the depths to experiment with
depths = [3, 5, 7, 9, 15, 30, 50, 70, 90]
# Initialize metrics
metrics = {}

# Perform cross-validation for each tree depth
for depth in depths:
    # Initialize decision tree model
    tree = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
    # Cross validation k fold, 4:1::training:validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    #list to store metrics for each cross validation split
    fold_metrics = []

    # Perform cross-validation and collect metrics
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        #Fit the training set
        tree.fit(X_train, y_train)
        #Predict validation set
        y_pred = tree.predict(X_val)
        #calculate metrics
        fold_metrics.append(calculate_metrics(y_val, y_pred, training_data.iloc[val_index]))

    #find avg metrics for each depth
    metrics[depth] = {}
    sums = {}
    for metric in fold_metrics:
        for key, value in metric.items():
            sums[key] = sums.get(key, 0) + value
    for key in sums:
        sums[key]/=len(fold_metrics)
    metrics[depth] = sums

#Print metrics for all the hyperparameters (Depth)
for depth in metrics:
    print(f"for depth {depth}")
    print(metrics[depth])
    print()

for depth 3
{'Total accuracy': 0.739302752293578, 'Total Balanced accuracy': 0.5909240479132576, 'Total EO': 0.9504593123805108, 'Aggregated accuracy score': 0.7586206896551724, 'Aggregated balanced accuracy score': 0.5857142857142856, 'Aggregated EO score': 0.8333333333333334}

for depth 5
{'Total accuracy': 0.7472293577981651, 'Total Balanced accuracy': 0.6138374218217592, 'Total EO': 0.9324445056229891, 'Aggregated accuracy score': 0.7839080459770115, 'Aggregated balanced accuracy score': 0.6109126984126985, 'Aggregated EO score': 0.8833333333333332}

for depth 7
{'Total accuracy': 0.7634495412844038, 'Total Balanced accuracy': 0.6653390774441428, 'Total EO': 0.962120994404248, 'Aggregated accuracy score': 0.8160919540229885, 'Aggregated balanced accuracy score': 0.674404761904762, 'Aggregated EO score': 0.8166666666666667}

for depth 9
{'Total accuracy': 0.7696146788990825, 'Total Balanced accuracy': 0.6867998981419393, 'Total EO': 0.9746201537650562, 'Aggregated accuracy score': 0

In [11]:
# ---------------------------------------------------------Testing for best depth----------------------------------------------------------------
test_data = preprocess_data(test_df)
training_data = preprocess_data(training_df)
X_train = training_data.drop(columns = ['participant', 'gender', 'depression'], axis=1)
y_train = training_data['depression']
X_test = test_data.drop(columns = ['participant', 'gender', 'depression'], axis=1)
y_test = test_data['depression']

best_depth = 70
# Initialize decision tree model
tree = DecisionTreeClassifier(max_depth=best_depth, criterion='entropy')
#Fit the training set
tree.fit(X_train, y_train)
#Predict for test set
y_pred = tree.predict(X_test)
#get metrics
metrics = calculate_metrics(y_test, y_pred, test_data)

print(metrics)

{'Total accuracy': 0.699390243902439, 'Total Balanced accuracy': 0.5916876320821174, 'Total EO': 0.7163647215785408, 'Aggregated accuracy score': 0.7, 'Aggregated balanced accuracy score': 0.5476190476190477, 'Aggregated EO score': 0.8}


## Model attempt: Logistic regressor

In [12]:
warnings.filterwarnings("ignore")
training_data = preprocess_data(training_df)
X = training_data.drop(columns = ['participant', 'gender', 'depression'], axis=1)
y = training_data['depression']

hyperparams = [
    {'penalty': 'l1', 'C': 1.0, 'solver': 'liblinear'},
    {'penalty': 'l2', 'C': 1.0, 'solver': 'liblinear'},
    {'penalty': 'l2', 'C': 0.99, 'solver': 'liblinear'},
    {'penalty': 'l2', 'C': 0.95, 'solver': 'liblinear'},
    {'penalty': 'l2', 'C': 0.9, 'solver': 'liblinear'},
    {'penalty': 'l2', 'C': 1.0, 'solver': 'lbfgs'},
    {'penalty': 'l2', 'C': 1.0, 'solver': 'sag'},
    {'penalty': 'l2', 'C': 1.0, 'solver': 'saga'},
    {'penalty': 'l2', 'C': 1.0, 'solver': 'newton-cg'},
    {'penalty': 'l2', 'C': 1.0, 'solver': 'newton-cholesky'},
]
metrics = {}

for hyperparam in hyperparams:
    print(hyperparam)
    # Initialize Logistic regression model
    model = LogisticRegression(solver=hyperparam['solver'], C=hyperparam['C'], penalty=hyperparam['penalty'])
    # Cross validation k fold, 4:1::training:validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    #list to store metrics for each cross validation split
    fold_metrics = []
    
    # Perform cross-validation and collect metrics
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
        #Fit the training set
        model.fit(X_train, y_train)
        #Predict validation set
        y_pred = model.predict(X_val)
        #calculate metrics
        fold_metrics.append(calculate_metrics(y_val, y_pred, training_data.iloc[val_index]))
    
    #find avg metrics for each depth
    # metrics[depth] = {}
    sums = {}
    for metric in fold_metrics:
        for key, value in metric.items():
            sums[key] = sums.get(key, 0) + value
    for key in sums:
        sums[key]/=len(fold_metrics)
    print(sums)

{'penalty': 'l1', 'C': 1.0, 'solver': 'liblinear'}
{'Total accuracy': 0.7428256880733944, 'Total Balanced accuracy': 0.5884060803856384, 'Total EO': 0.9477892614269802, 'Aggregated accuracy score': 0.7609195402298852, 'Aggregated balanced accuracy score': 0.5666666666666667, 'Aggregated EO score': 0.9333333333333332}
{'penalty': 'l2', 'C': 1.0, 'solver': 'liblinear'}
{'Total accuracy': 0.7423119266055046, 'Total Balanced accuracy': 0.5882667934329493, 'Total EO': 0.9466991528254788, 'Aggregated accuracy score': 0.7586206896551724, 'Aggregated balanced accuracy score': 0.5625, 'Aggregated EO score': 0.9166666666666666}
{'penalty': 'l2', 'C': 0.99, 'solver': 'liblinear'}
{'Total accuracy': 0.7422385321100918, 'Total Balanced accuracy': 0.5882157209099668, 'Total EO': 0.9466991528254788, 'Aggregated accuracy score': 0.7586206896551724, 'Aggregated balanced accuracy score': 0.5625, 'Aggregated EO score': 0.9166666666666666}
{'penalty': 'l2', 'C': 0.95, 'solver': 'liblinear'}
{'Total accura

In [13]:
# ---------------------------------------------------------Testing for best hyperparameter----------------------------------------------------------------
test_data = preprocess_data(test_df)
training_data = preprocess_data(training_df)
X_train = training_data.drop(columns = ['participant', 'gender', 'depression'], axis=1)
y_train = training_data['depression']
X_test = test_data.drop(columns = ['participant', 'gender', 'depression'], axis=1)
y_test = test_data['depression']

best_depth = 70
# Initialize decision tree model
model = LogisticRegression(penalty='l1', C=1.0, solver='liblinear')
#Fit the training set
model.fit(X_train, y_train)
#Predict for test set
y_pred = model.predict(X_test)
#get metrics
metrics = calculate_metrics(y_test, y_pred, test_data)

print(metrics)

{'Total accuracy': 0.761890243902439, 'Total Balanced accuracy': 0.5125821245273796, 'Total EO': 0.9488448844884488, 'Aggregated accuracy score': 0.7, 'Aggregated balanced accuracy score': 0.5, 'Aggregated EO score': 1.0}


## Gender Classification

### Model attempt: Decision Tree

In [14]:
#------------------------------------------------Tuning for different depths-------------------------------------------------------

warnings.filterwarnings("ignore")
training_data = preprocess_data(training_df)
X = training_data.drop(columns = ['participant', 'gender', 'depression'], axis=1)
y = training_data['gender']

# Define the depths to experiment with
depths = [3, 5, 7, 9, 15, 30, 50, 70, 90]
# Initialize metrics
metrics = {}

# Perform cross-validation for each tree depth
for depth in depths:
    # Initialize decision tree model
    tree = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
    # Cross validation k fold, 4:1::training:validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    #list to store metrics for each cross validation split
    fold_metrics = []

    # Perform cross-validation and collect metrics
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        #Fit the training set
        tree.fit(X_train, y_train)
        #Predict validation set
        y_pred = tree.predict(X_val)
        #calculate metrics
        fold_metrics.append(calculate_metrics(y_val, y_pred, training_data.iloc[val_index], False))

    #find avg metrics for each depth
    metrics[depth] = {}
    sums = {}
    for metric in fold_metrics:
        for key, value in metric.items():
            sums[key] = sums.get(key, 0) + value
    for key in sums:
        sums[key]/=len(fold_metrics)
    metrics[depth] = sums

#Print metrics for all the hyperparameters (Depth)
for depth in metrics:
    print(f"for depth {depth}")
    print(metrics[depth])
    print()

for depth 3
{'Total accuracy': 0.9236697247706422, 'Total Balanced accuracy': 0.9192004514666632, 'Aggregated accuracy score': 0.9862068965517242, 'Aggregated balanced accuracy score': 0.9841503267973856}

for depth 5
{'Total accuracy': 0.9285871559633027, 'Total Balanced accuracy': 0.9223470848664723, 'Aggregated accuracy score': 0.993103448275862, 'Aggregated balanced accuracy score': 0.9916666666666668}

for depth 7
{'Total accuracy': 0.9289541284403671, 'Total Balanced accuracy': 0.9258356190184619, 'Aggregated accuracy score': 0.9954022988505746, 'Aggregated balanced accuracy score': 0.9944444444444445}

for depth 9
{'Total accuracy': 0.9257981651376147, 'Total Balanced accuracy': 0.9229877129799933, 'Aggregated accuracy score': 0.9954022988505746, 'Aggregated balanced accuracy score': 0.9944444444444445}

for depth 15
{'Total accuracy': 0.9225688073394496, 'Total Balanced accuracy': 0.9193688045774605, 'Aggregated accuracy score': 0.993103448275862, 'Aggregated balanced accuracy 

In [15]:
# ---------------------------------------------------------Testing for best depth----------------------------------------------------------------
test_data = preprocess_data(test_df)
training_data = preprocess_data(training_df)
X_train = training_data.drop(columns = ['participant', 'gender', 'depression'], axis=1)
y_train = training_data['gender']
X_test = test_data.drop(columns = ['participant', 'gender', 'depression'], axis=1)
y_test = test_data['gender']

best_depth = 70
# Initialize decision tree model
tree = DecisionTreeClassifier(max_depth=best_depth, criterion='entropy')
#Fit the training set
tree.fit(X_train, y_train)
#Predict for test set
y_pred = tree.predict(X_test)
#get metrics
metrics = calculate_metrics(y_test, y_pred, test_data, False)

print(metrics)

{'Total accuracy': 0.8655487804878049, 'Total Balanced accuracy': 0.8616732045232312, 'Aggregated accuracy score': 0.95, 'Aggregated balanced accuracy score': 0.9375}


## Gender classification model attempt: Logistic regression

In [None]:
#------------------------------------------------Tuning for different depths-------------------------------------------------------

warnings.filterwarnings("ignore")
training_data = preprocess_data(training_df)
X = training_data.drop(columns = ['participant', 'gender', 'depression'], axis=1)
y = training_data['gender']

# Define the depths to experiment with
depths = [3, 5, 7, 9, 15, 30, 50, 70, 90]
# Initialize metrics
metrics = {}

# Perform cross-validation for each tree depth
for depth in depths:
    # Initialize decision tree model
    tree = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
    # Cross validation k fold, 4:1::training:validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    #list to store metrics for each cross validation split
    fold_metrics = []

    # Perform cross-validation and collect metrics
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        #Fit the training set
        tree.fit(X_train, y_train)
        #Predict validation set
        y_pred = tree.predict(X_val)
        #calculate metrics
        fold_metrics.append(calculate_metrics(y_val, y_pred, training_data.iloc[val_index], False))

    #find avg metrics for each depth
    metrics[depth] = {}
    sums = {}
    for metric in fold_metrics:
        for key, value in metric.items():
            sums[key] = sums.get(key, 0) + value
    for key in sums:
        sums[key]/=len(fold_metrics)
    metrics[depth] = sums

#Print metrics for all the hyperparameters (Depth)
for depth in metrics:
    print(f"for depth {depth}")
    print(metrics[depth])
    print()