In [93]:
import os
import numpy as np
import pandas as pd
import seaborn as sn
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
import math
import random
import time
import keras
from keras.datasets import cifar10
from keras.utils import to_categorical
from keras import models, layers, optimizers, regularizers
from keras.models import Model, Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from skimage.color import rgb2gray
import sklearn
from sklearn import model_selection, preprocessing, metrics
from scipy import stats

# Load and explore dataset

In [2]:
# Root folder of the Kaggle dataset; every path below is derived from it.
wd = '/kaggle/input/woz-speech/'

# Folders that hold the per-speaker feature files of each split.
training_path = wd + 'features_train/features_train'
test_path = wd + 'features_test/features_test'

# Per-participant labels table.
labels = pd.read_csv(wd + 'labels.csv')

# The feature description file has no header row; its first column holds the
# feature names, which are used as column headers for the training/test files.
feature_description = pd.read_csv(
    wd + 'feature_description.csv',
    encoding_errors='ignore',
    header=None,
    index_col=0,
)
features = feature_description.index.tolist()
features

['F0semitoneFrom27.5Hz_sma3nz_amean',
 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm',
 'F0semitoneFrom27.5Hz_sma3nz_percentile20.0',
 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0',
 'F0semitoneFrom27.5Hz_sma3nz_percentile80.0',
 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
 'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
 'loudness_sma3_amean',
 'loudness_sma3_stddevNorm',
 'loudness_sma3_percentile20.0',
 'loudness_sma3_percentile50.0',
 'loudness_sma3_percentile80.0',
 'loudness_sma3_pctlrange0-2',
 'loudness_sma3_meanRisingSlope',
 'loudness_sma3_stddevRisingSlope',
 'loudness_sma3_meanFallingSlope',
 'loudness_sma3_stddevFallingSlope',
 'spectralFlux_sma3_amean',
 'spectralFlux_sma3_stddevNorm',
 'mfcc1_sma3_amean',
 'mfcc1_sma3_stddevNorm',
 'mfcc2_sma3_amean',
 'mfcc2_sma3_stddevNorm',
 'mfcc3_sma3_amean',
 'mfcc3_sma3_stddevNorm',
 'mfcc4_

In [4]:
def load_data(folder_path):
    """Load every per-speaker feature CSV in ``folder_path`` into one DataFrame.

    File names are expected to look like ``spk_305.csv``: the text between the
    first underscore and the following dot is parsed as the participant id.
    Each file is read without a header row, using the module-level
    ``features`` list as column names. Three columns are then appended:
    ``participant`` (the id, as a float), ``gender`` and ``depression`` (taken
    from the row of the module-level ``labels`` frame whose ``Participant_ID``
    equals the id).

    Parameters
    ----------
    folder_path : str
        Directory containing the ``*.csv`` feature files. Other files are
        ignored.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked in sorted file-name order, with a fresh
        unique ``0..n-1`` index. An empty DataFrame if the folder contains no
        CSV files.

    Raises
    ------
    IndexError
        If a participant id parsed from a file name has no row in ``labels``.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and everything derived from it) reproducible between runs.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith('.csv'):
            continue
        # Participant id from the file name, e.g. 'spk_305.csv' -> 305.0
        participant = float(file.split('_')[1].split('.')[0])
        label = labels[labels['Participant_ID'] == participant]
        if label.empty:
            raise IndexError(
                f"no row in labels for participant {participant} (file {file!r})"
            )
        data_df = pd.read_csv(os.path.join(folder_path, file), header=None, names=features)
        # Broadcast the participant-level values to every row of the file.
        data_df['participant'] = participant
        data_df['gender'] = label['Gender'].values[0]
        data_df['depression'] = label['Depression'].values[0]
        frames.append(data_df)

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of growing the frame inside the loop.
    # ignore_index gives unique row labels; keeping each file's own 0..n-1
    # index would repeat labels across participants and make label-based
    # selection (.loc) ambiguous.
    return pd.concat(frames, ignore_index=True)

In [5]:
# Load every training speaker file into one frame; the cell output is the
# total number of rows loaded.
training_df = load_data(training_path)
len(training_df)

13626

In [6]:
# Load every test speaker file into one frame; the cell output is the total
# number of rows loaded.
test_df = load_data(test_path)
len(test_df)

3280

In [7]:
# Percentage of null entries per column of the training frame:
# (null count per column / number of rows) * 100.
missing_values = (training_df.isnull().sum()/len(training_df)) *100
print(f'Missing value percent % for each column, total samples {len(training_df)}')
print(missing_values)

Missing value percent % for each column, total samples 13626
F0semitoneFrom27.5Hz_sma3nz_amean             0.007339
F0semitoneFrom27.5Hz_sma3nz_stddevNorm        0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile20.0    0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile50.0    0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile80.0    0.007339
                                                ...   
StddevUnvoicedSegmentLength                   0.007339
equivalentSoundLevel_dBp                      0.007339
participant                                   0.000000
gender                                        0.000000
depression                                    0.000000
Length: 91, dtype: float64


In [8]:
# Drop every row that contains at least one null value, then show how many
# rows remain. NOTE(review): this rebinds `training_df`, so the unfiltered
# frame is only recoverable by re-running the load cell.
training_df = training_df.dropna()
len(training_df)

13625

Only one row (out of 13,626) contained null values, so dropping it loses essentially no data.

Designating subjects as validation vs training subjects

In [34]:
def val_split(df: pd.DataFrame, train_frac: float = 0.75, seed=None):
    """Split ``df`` into training and validation rows by participant.

    A random subset of the participant ids that actually occur in ``df`` is
    assigned to training; all rows of the remaining participants form the
    validation set, so no participant appears in both.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``participant`` column whose values can be cast to int.
        The frame itself is not modified.
    train_frac : float, default 0.75
        Fraction of the distinct participants assigned to training; the count
        is ``floor(n_participants * train_frac)``.
    seed : optional
        If given, a private ``random.Random(seed)`` draws the sample, making
        the split reproducible. If ``None``, the global ``random`` module is
        used, as before.

    Returns
    -------
    (train, val) : tuple of pandas.DataFrame
        Copies of the matching rows with ``participant`` cast to int.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be within [0, 1], got {train_frac}")

    participant_ids = df['participant'].astype(int)
    # Sample from the ids that exist in the data. Sampling from
    # range(min_id, max_id) would also draw ids with no rows (shrinking the
    # real training share) and could never pick the largest id.
    unique_ids = np.unique(participant_ids).tolist()
    n_train = math.floor(len(unique_ids) * train_frac)
    sampler = random if seed is None else random.Random(seed)
    train_subs = sampler.sample(unique_ids, n_train)

    # assign() works on a copy, so the caller's frame keeps its original dtype.
    converted = df.assign(participant=participant_ids)
    in_train = participant_ids.isin(train_subs)
    return converted[in_train], converted[~in_train]

In [121]:
# Participant-level split of the training frame into training and validation
# rows. NOTE(review): val_split draws with the `random` module and no seed is
# set in this notebook, so the split differs between runs.
train, val = val_split(training_df)

In [40]:
def x_y_split(df: pd.DataFrame):
    """Separate the model inputs from the two outcomes.

    Returns a 3-tuple ``(x, gender, depression)``: ``x`` is ``df`` without the
    ``participant``, ``gender`` and ``depression`` columns; the other two are
    the ``gender`` and ``depression`` columns of ``df``.
    """
    non_feature_columns = ['participant', 'gender', 'depression']
    return df.drop(columns=non_feature_columns), df['gender'], df['depression']

In [122]:
# For each split: x_* holds the feature columns, g_* the gender column and
# d_* the depression column.
x_train, g_train, d_train = x_y_split(train)
x_val, g_val, d_val = x_y_split(val)
x_test, g_test, d_test = x_y_split(test_df)

In [94]:
from sklearn.metrics import confusion_matrix
def calculate_total_accuracy(true_labels, predicted_labels):
    """Fraction of predictions that equal the true label.

    The two inputs are compared position by position (any pandas index is
    ignored), so they must be in the same row order.

    Parameters
    ----------
    true_labels, predicted_labels : array-like
        Label sequences of equal length.

    Returns
    -------
    float
        Accuracy in ``[0, 1]``.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    # Computed with numpy (already imported at the top of the notebook):
    # `accuracy_score` is never imported, so calling it raised NameError on a
    # fresh kernel.
    y_true = np.asarray(true_labels).ravel()
    y_pred = np.asarray(predicted_labels).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"length mismatch: {y_true.size} true labels vs {y_pred.size} predictions"
        )
    if y_true.size == 0:
        raise ValueError("cannot compute accuracy of empty label arrays")
    return float(np.mean(y_true == y_pred))

def calculate_balanced_accuracy(true_labels, predicted_labels, labels=(0, 1)):
    """Balanced accuracy of a binary prediction: the mean of the two recalls.

    Parameters
    ----------
    true_labels, predicted_labels : array-like
        True and predicted class labels, in the same row order.
    labels : pair, default ``(0, 1)``
        ``(negative_label, positive_label)``. Assumes the data is coded with
        these two values; pass the pair explicitly for any other coding.

    Returns
    -------
    float
        ``0.5 * (TP / (TP + FN) + TN / (TN + FP))`` when both classes occur in
        ``true_labels``. If only one class occurs, the recall of that class
        (with a warning). ``nan`` if neither label occurs.
    """
    import warnings

    negative, positive = labels
    # Pinning the labels keeps the matrix 2x2 with a fixed layout even when a
    # class is missing; otherwise matrix[1, 1] raises IndexError for
    # single-class input.
    matrix = confusion_matrix(true_labels, predicted_labels, labels=[negative, positive])
    TN, FP = matrix[0]
    FN, TP = matrix[1]

    recalls = []
    # Recall is only defined for a class with at least one true sample;
    # skipping the others avoids a 0/0 division.
    if TP + FN > 0:
        recalls.append(TP / (TP + FN))
    if TN + FP > 0:
        recalls.append(TN / (TN + FP))

    if not recalls:
        return float('nan')
    if len(recalls) < 2:
        warnings.warn(
            "only one class is present in true_labels; "
            "balanced accuracy is the recall of that class"
        )
    return float(sum(recalls) / len(recalls))

def _true_positive_rate(true_labels, predicted_labels, labels=(0, 1)):
    """Return TP / (TP + FN) for one group, or ``nan`` if it has no positives."""
    negative, positive = labels
    # Fixed labels keep the matrix 2x2 even when a class is absent from the group.
    matrix = confusion_matrix(true_labels, predicted_labels, labels=[negative, positive])
    FN, TP = matrix[1]
    positives = TP + FN
    if positives == 0:
        return float('nan')
    return float(TP / positives)


def calculate_EO(true_labels_male,
                 true_labels_female,
                 predicted_labels_male,
                 predicted_labels_female,
                 labels=(0, 1)):
    """Equality of Opportunity between the male and female groups.

    Parameters
    ----------
    true_labels_male, predicted_labels_male : array-like
        True and predicted labels of the male samples, in the same row order.
    true_labels_female, predicted_labels_female : array-like
        True and predicted labels of the female samples, in the same row order.
    labels : pair, default ``(0, 1)``
        ``(negative_label, positive_label)``. Assumes the data is coded with
        these two values.

    Returns
    -------
    float
        ``1 - |TPR_male - TPR_female|``, where TPR is the true positive rate
        of each group. ``nan`` if either group has no positive true labels,
        because its TPR is then undefined.
    """
    TPR_male = _true_positive_rate(true_labels_male, predicted_labels_male, labels)
    TPR_female = _true_positive_rate(true_labels_female, predicted_labels_female, labels)
    return 1 - abs(TPR_male - TPR_female)

def majority_voting(df):
    """Return the value that occurs most often in ``df``.

    ``df`` is any pandas object with ``value_counts`` (the notebook passes a
    Series of labels); the result is the index entry of the largest count.
    """
    return df.value_counts().idxmax()

In [95]:
def calculate_metrics(y_true, y_pred, test_data):
    """Compute row-level and participant-level accuracy and fairness metrics.

    Parameters
    ----------
    y_true : array-like
        True labels, one per row of ``test_data``, in the same row order.
    y_pred : array-like
        Predicted labels, one per row of ``test_data``, in the same row order.
    test_data : pandas.DataFrame
        Reference frame for the evaluated rows. Must contain a ``participant``
        column and a ``gender`` column coded 1 for male and 0 for female.

    Returns
    -------
    dict
        Keys ``"Total accuracy"``, ``"Total Balanced accuracy"``,
        ``"Total EO"``, ``"Aggregated accuracy score"``,
        ``"Aggregated balanced accuracy score"`` and ``"Aggregated EO score"``.
        The "Total" values are computed over individual rows; the "Aggregated"
        values are computed after reducing each participant's true and
        predicted labels to their majority vote.

    Raises
    ------
    ValueError
        If ``y_true``, ``y_pred`` and ``test_data`` differ in length.
    """
    # Put everything into one frame aligned purely by position. Selecting
    # predictions through index labels only works if every input carries the
    # same unique index; the previous lookup effectively took the first N
    # predictions instead of the rows belonging to each gender.
    rows = pd.DataFrame({
        'participant': np.asarray(test_data['participant']),
        'gender': np.asarray(test_data['gender']),
        'true_label': np.asarray(y_true).ravel(),
        'predicted_label': np.asarray(y_pred).ravel(),
    })
    male_rows = rows[rows['gender'] == 1]
    female_rows = rows[rows['gender'] == 0]

    def _per_participant(subset):
        # Majority vote of the true and predicted labels per participant.
        grouped = subset.groupby('participant')
        return (grouped['true_label'].agg(majority_voting),
                grouped['predicted_label'].agg(majority_voting))

    # Named `results` so the sklearn `metrics` module imported at the top of
    # the notebook is not shadowed.
    results = {}

    # ---- row-level metrics
    results["Total accuracy"] = calculate_total_accuracy(
        rows['true_label'], rows['predicted_label'])
    results["Total Balanced accuracy"] = calculate_balanced_accuracy(
        rows['true_label'], rows['predicted_label'])
    results["Total EO"] = calculate_EO(
        male_rows['true_label'], female_rows['true_label'],
        male_rows['predicted_label'], female_rows['predicted_label'])

    # ---- participant-level metrics
    aggregated_y_true, aggregated_y_pred = _per_participant(rows)
    results["Aggregated accuracy score"] = calculate_total_accuracy(
        aggregated_y_true, aggregated_y_pred)
    results["Aggregated balanced accuracy score"] = calculate_balanced_accuracy(
        aggregated_y_true, aggregated_y_pred)

    male_aggregated_y_true, male_aggregated_y_pred = _per_participant(male_rows)
    female_aggregated_y_true, female_aggregated_y_pred = _per_participant(female_rows)
    results["Aggregated EO score"] = calculate_EO(
        male_aggregated_y_true, female_aggregated_y_true,
        male_aggregated_y_pred, female_aggregated_y_pred)

    return results

# Feature selection

Running PCA to find the principal components in the data.

In [72]:
from sklearn import decomposition
from sklearn.feature_selection import SelectKBest

In [125]:
class PCA_(decomposition.PCA):
    """PCA that carries the train/validation/test splits it is applied to.

    After ``run_pca()`` the attributes ``x_train``, ``x_val`` and ``x_test``
    hold the splits projected onto the principal components fitted on the
    training split. Before that call they hold the inputs as given.

    Parameters
    ----------
    x_train, x_val, x_test : array-like of shape (n_samples, n_features)
        Feature matrices of the three splits.
    """

    def __init__(self, x_train, x_val, x_test):
        super().__init__()
        self.x_train = x_train
        self.x_val = x_val
        self.x_test = x_test
        # Keep the untransformed inputs so run_pca() can be called again
        # without fitting on already-projected data.
        self._raw_splits = (x_train, x_val, x_test)

    def run_pca(self):
        """Fit on the original training split and project all three splits."""
        raw_train, raw_val, raw_test = self._raw_splits
        # Find all PCs from the training data only.
        self.fit(raw_train)
        # Transform every split into the new feature space.
        self.x_train = self.transform(raw_train)
        self.x_val = self.transform(raw_val)
        self.x_test = self.transform(raw_test)

    def best_PCs(self, Y, k):
        """Select the ``k`` columns that best predict ``Y`` on the training split.

        Uses ``SelectKBest`` with its default scoring function, fitted on
        ``self.x_train`` and ``Y``. Call ``run_pca()`` first; otherwise the
        selection runs on the untransformed inputs.

        Returns the ``(train, val, test)`` matrices reduced to those columns.
        """
        best = SelectKBest(k=k).fit(self.x_train, Y)
        return best.transform(self.x_train), best.transform(self.x_val), best.transform(self.x_test)
        

In [126]:
# Fit PCA on the training features and project the train/val/test features;
# the projected splits are then available as pca.x_train / x_val / x_test.
pca = PCA_(x_train, x_val, x_test)
pca.run_pca()

Running independent component analysis to compare to PCA.

In [127]:
class ICA_(decomposition.FastICA):
    """FastICA that carries the train/validation/test splits it is applied to.

    After ``run_ica()`` the attributes ``x_train``, ``x_val`` and ``x_test``
    hold the splits transformed with the unmixing fitted on the training
    split. Before that call they hold the inputs as given.

    Parameters
    ----------
    x_train, x_val, x_test : array-like of shape (n_samples, n_features)
        Feature matrices of the three splits.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``FastICA``. FastICA starts from a random initialisation,
        so pass a fixed value to obtain the same components on every run.
    """

    def __init__(self, x_train, x_val, x_test, random_state=None):
        super().__init__(random_state=random_state)
        self.x_train = x_train
        self.x_val = x_val
        self.x_test = x_test
        # Keep the untransformed inputs so run_ica() can be called again
        # without fitting on already-transformed data.
        self._raw_splits = (x_train, x_val, x_test)

    def run_ica(self):
        """Fit on the original training split and transform all three splits."""
        raw_train, raw_val, raw_test = self._raw_splits
        # Find all ICs from the training data only.
        self.fit(raw_train)
        # Transform every split into the new feature space.
        self.x_train = self.transform(raw_train)
        self.x_val = self.transform(raw_val)
        self.x_test = self.transform(raw_test)

    def best_ICs(self, Y, k):
        """Select the ``k`` columns that best predict ``Y`` on the training split.

        Uses ``SelectKBest`` with its default scoring function, fitted on
        ``self.x_train`` and ``Y``. Call ``run_ica()`` first; otherwise the
        selection runs on the untransformed inputs.

        Returns the ``(train, val, test)`` matrices reduced to those columns.
        """
        best = SelectKBest(k=k).fit(self.x_train, Y)
        return best.transform(self.x_train), best.transform(self.x_val), best.transform(self.x_test)
        

In [129]:
# Fit ICA on the training features and transform the train/val/test features;
# the transformed splits are then available as ica.x_train / x_val / x_test.
# NOTE(review): no random_state is set here, so the components (and every
# result derived from them) presumably vary between runs - confirm.
ica = ICA_(x_train, x_val, x_test)
ica.run_ica()



# Linear Perceptron
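Train a linear perceptron on the k best PCs (and, for comparison, the k best ICs) for predicting gender and depression, and pick the k that gives the highest balanced accuracy on the validation set.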

In [76]:
from sklearn import linear_model

In [106]:
def perceptron(x_train, y_train, x_val, y_val):
    """Train a default ``Perceptron`` and score it on the validation split.

    The model is fitted on ``(x_train, y_train)``; the return value is the
    balanced accuracy of its predictions for ``x_val`` against ``y_val``.
    """
    classifier = linear_model.Perceptron().fit(x_train, y_train)
    val_predictions = classifier.predict(x_val)
    return calculate_balanced_accuracy(y_val, val_predictions)


In [130]:
# Sweep the number of selected components k = 1..59 for the gender task.
# For each k, a perceptron is trained on the k best PCs and on the k best ICs,
# and its validation balanced accuracy is stored under key k.
pc_g_stats = {}
ic_g_stats = {}
for k in range(1, 60):
    # PCA features: select the k best PCs for gender, then train and score.
    g_x_train, g_x_val, g_x_test = pca.best_PCs(g_train, k)
    pc_g_stats[k] = perceptron(g_x_train, g_train, g_x_val, g_val)
    
    # ICA features: same procedure. This rebinds g_x_train / g_x_val / g_x_test,
    # so after the loop they hold the ICA selection for the last k.
    g_x_train, g_x_val, g_x_test = ica.best_ICs(g_train, k)
    ic_g_stats[k] = perceptron(g_x_train, g_train, g_x_val, g_val)

# k with the highest validation score for PCs (the smallest such k on ties,
# since max() returns the first maximal key).
g_n_pcs_best = max(pc_g_stats, key=pc_g_stats.get)
g_acc_best = pc_g_stats[g_n_pcs_best]

print('For predicting gender with PCs, the best performance was achieved by using the', g_n_pcs_best, 'best PCs. This achieved a balanced accuracy of', g_acc_best)

# Same for ICs. NOTE(review): g_acc_best is overwritten here, so after this
# cell it holds the IC score, not the PC score.
g_n_ics_best = max(ic_g_stats, key=ic_g_stats.get)
g_acc_best = ic_g_stats[g_n_ics_best]

print('For predicting gender with ICs, the best performance was achieved by using the', g_n_ics_best, 'best ICs. This achieved a balanced accuracy of', g_acc_best)

For predicting gender with PCs, the best performance was achieved by using the 19 best PCs. This achieved a balanced accuracy of 0.8927601570004363
For predicting gender with ICs, the best performance was achieved by using the 43 best ICs. This achieved a balanced accuracy of 0.9115269387501703


In [131]:
# Sweep the number of selected components k = 1..59 for the depression task.
# For each k, a perceptron is trained on the k best PCs and on the k best ICs,
# and its validation balanced accuracy is stored under key k.
pc_d_stats = {}
ic_d_stats = {}
for k in range(1, 60):
    # PCA features: select the k best PCs for depression, then train and score.
    d_x_train, d_x_val, d_x_test = pca.best_PCs(d_train, k)
    pc_d_stats[k] = perceptron(d_x_train, d_train, d_x_val, d_val)
    
    # ICA features: same procedure. This rebinds d_x_train / d_x_val / d_x_test,
    # so after the loop they hold the ICA selection for the last k.
    d_x_train, d_x_val, d_x_test = ica.best_ICs(d_train, k)
    ic_d_stats[k] = perceptron(d_x_train, d_train, d_x_val, d_val)

# k with the highest validation score for PCs (the smallest such k on ties,
# since max() returns the first maximal key).
d_n_pcs_best = max(pc_d_stats, key=pc_d_stats.get)
d_acc_best = pc_d_stats[d_n_pcs_best]

print('For predicting depression with PCs, the best performance was achieved by using the', d_n_pcs_best, 'best PCs. This achieved a balanced accuracy of', d_acc_best)

# Same for ICs. NOTE(review): d_acc_best is overwritten here, so after this
# cell it holds the IC score, not the PC score.
d_n_ics_best = max(ic_d_stats, key=ic_d_stats.get)
d_acc_best = ic_d_stats[d_n_ics_best]

print('For predicting depression with ICs, the best performance was achieved by using the', d_n_ics_best, 'best ICs. This achieved a balanced accuracy of', d_acc_best)

For predicting depression with PCs, the best performance was achieved by using the 17 best PCs. This achieved a balanced accuracy of 0.5414190093509015
For predicting depression with ICs, the best performance was achieved by using the 57 best ICs. This achieved a balanced accuracy of 0.5316878136781915
