In [1]:
import os
import pandas as pd

## Loading Data

In [2]:
# Directories that hold the per-speaker feature CSVs
training_path = 'features_train/features_train'
test_path = 'features_test/features_test'

# Participant-level labels table
labels = pd.read_csv('labels.csv')

# Feature names: the description file has no header row, so its first column
# (read as the index) supplies the column headers for the training/test sets.
# NOTE(review): encoding_errors='ignore' silently drops undecodable bytes -
# confirm no feature name is affected.
features = (
    pd.read_csv('feature_description.csv', encoding_errors='ignore', header=None, index_col=0)
    .index
    .tolist()
)
features

['F0semitoneFrom27.5Hz_sma3nz_amean',
 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm',
 'F0semitoneFrom27.5Hz_sma3nz_percentile20.0',
 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0',
 'F0semitoneFrom27.5Hz_sma3nz_percentile80.0',
 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
 'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
 'loudness_sma3_amean',
 'loudness_sma3_stddevNorm',
 'loudness_sma3_percentile20.0',
 'loudness_sma3_percentile50.0',
 'loudness_sma3_percentile80.0',
 'loudness_sma3_pctlrange0-2',
 'loudness_sma3_meanRisingSlope',
 'loudness_sma3_stddevRisingSlope',
 'loudness_sma3_meanFallingSlope',
 'loudness_sma3_stddevFallingSlope',
 'spectralFlux_sma3_amean',
 'spectralFlux_sma3_stddevNorm',
 'mfcc1_sma3_amean',
 'mfcc1_sma3_stddevNorm',
 'mfcc2_sma3_amean',
 'mfcc2_sma3_stddevNorm',
 'mfcc3_sma3_amean',
 'mfcc3_sma3_stddevNorm',
 'mfcc4_

In [3]:
def load_data(folder_path, labels_df=None, feature_names=None):
    """Load every per-speaker feature CSV in a folder into one labelled DataFrame.

    Each ``*.csv`` file in ``folder_path`` is read without a header row and its
    columns are named after ``feature_names``. Three columns are then appended:
    ``participant`` (id parsed from the filename, e.g. ``'spk_305.csv'`` -> 305.0),
    and ``gender`` / ``depression`` looked up in ``labels_df`` by ``Participant_ID``.

    Parameters
    ----------
    folder_path : str
        Directory containing the speaker files.
    labels_df : pandas.DataFrame, optional
        Table with ``Participant_ID``, ``Gender`` and ``Depression`` columns.
        Defaults to the notebook-level ``labels`` frame.
    feature_names : list of str, optional
        Column names for the feature files. Defaults to the notebook-level
        ``features`` list.

    Returns
    -------
    pandas.DataFrame
        Rows of all speaker files stacked in sorted-filename order; each file's
        own row index is kept as-is. Empty if the folder has no ``.csv`` files.

    Raises
    ------
    IndexError
        If a speaker file has no matching row in ``labels_df``.
    """
    if labels_df is None:
        labels_df = labels
    if feature_names is None:
        feature_names = features

    frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the result reproducible between runs and machines.
    for file in sorted(os.listdir(folder_path)):
        #only speaker feature files are of interest
        if not file.endswith('.csv'):
            continue
        #get participant id from filename, eg filename: 'spk_305.csv'
        participant = float(file.split('_')[1].split('.')[0])
        #find labels for the participant
        label = labels_df[labels_df['Participant_ID'] == participant]
        if label.empty:
            #fail with a clear message instead of an opaque out-of-bounds error
            raise IndexError(
                f"No label row found for participant {participant} (file '{file}')"
            )
        #load participant feature file
        data_df = pd.read_csv(os.path.join(folder_path, file), header=None, names=feature_names)
        #Add labels and participant id columns
        data_df['participant'] = participant
        data_df['gender'] = label['Gender'].values[0]
        data_df['depression'] = label['Depression'].values[0]
        frames.append(data_df)

    #pd.concat rejects an empty list, so keep the "no files -> empty frame" result
    if not frames:
        return pd.DataFrame()
    #concatenate once at the end rather than re-copying the result per file
    return pd.concat(frames)

In [4]:
#Build the training frame from every speaker file, then show its row count
training_df = load_data(folder_path=training_path)
training_df.shape[0]

13626

In [5]:
#Build the test frame from every speaker file, then show its row count
test_df = load_data(folder_path=test_path)
test_df.shape[0]

3280

In [6]:
#Preview the first five rows of the test frame
test_df.head(n=5)

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,participant,gender,depression
0,32.160255,0.200581,23.145561,35.63253,36.815937,13.670376,-65.04982,0.0,28.64151,0.0,...,0.840336,1.754386,0.145,0.065,0.266667,0.164384,-24.85618,305.0,1,0
1,28.780031,0.074786,27.129395,28.150295,31.058764,3.929369,52.741413,13.374495,3.220003,4.861248,...,3.070175,3.139013,0.17,0.140712,0.145,0.123119,-20.882643,305.0,1,0
2,29.038708,0.144522,25.411283,25.819115,34.090847,8.679564,65.17677,0.0,-13.047282,0.0,...,1.190476,2.531646,0.075,0.015,0.196667,0.110252,-21.540741,305.0,1,0
3,24.198637,0.077389,22.477812,24.03218,25.9715,3.493689,106.85341,211.20033,14.058883,9.024993,...,2.503682,1.699926,0.166087,0.145086,0.385,0.467377,-23.443264,305.0,1,0
4,23.637993,0.130217,18.551594,25.037369,26.02095,7.469356,40.880257,21.864357,4.867825,10.950583,...,1.754386,2.409639,0.1125,0.078859,0.224,0.308649,-31.355045,305.0,1,0


## Data cleaning and preprocessing 

- Should 0 values be treated as missing data?
- If we impute missing values, should the imputation be done separately for each participant or across the whole dataset?

In [7]:
# Share of null entries per column, expressed as a percentage of all rows
total_rows = len(training_df)
null_counts = training_df.isnull().sum()
missing_values = null_counts.div(total_rows).mul(100)
print(f'Missing value percent % for each column, total samples {total_rows}')
print(missing_values)

Missing value percent % for each column, total samples 13626
F0semitoneFrom27.5Hz_sma3nz_amean             0.007339
F0semitoneFrom27.5Hz_sma3nz_stddevNorm        0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile20.0    0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile50.0    0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile80.0    0.007339
                                                ...   
StddevUnvoicedSegmentLength                   0.007339
equivalentSoundLevel_dBp                      0.007339
participant                                   0.000000
gender                                        0.000000
depression                                    0.000000
Length: 91, dtype: float64


## Data Modeling - Gender Classification
### What models to try?
- Decision tree
- Random forest
- TBD (further models still to be decided)

In [8]:
# Feature matrix (all feature columns) and gender target
X = training_df[features]
y = training_df['gender']