In [None]:
# import libraries
import pandas as pd
import numpy as np
import random
from glob import glob
import os
import librosa
import noisereduce as nr
from scipy.io import wavfile
import warnings
warnings.filterwarnings("ignore")

#**Data cleaning**


In [None]:
# Load the metadata table; the message below reports its row count as the number of audio files.
df = pd.read_csv('emotions_data.csv')
print('The dataset has {} audio files'.format(len(df)))
# Preview one randomly chosen row.
df.sample()

The dataset has 16783 audio files


Unnamed: 0,path,filename,dataset,duration,sample_rate,gender,age,emotion
3043,../Audio files/CREMA-D/1038_ITH_ANG_XX.wav,1038_ITH_ANG_XX,CREMA-D,2.869,16000,male,21,angry


Audio data can be processed in many ways. As noted in the preliminary exploration of the data, the main technical problems with our files are:
- The files have different sample rates
- All files tested start and end with some silence
- Some samples are noisier than others

For the first problem we can resample the files so that they are all at 16000 Hz, which is the lowest sample rate among the samples, the most common one across the whole dataset, and an acceptable value to represent the human voice.

In the second case we will apply a `trim` function, which in effect cuts the initial and final silence from an audio signal.

For the noise the problem is more difficult, since the noise removal operation also affects the quality of the features to be extracted. For this reason I will use the `noisereduce` module and apply a light reduction of 10% of the stationary noise to all the samples. The reduction is kept this light because the operation can also noticeably lower the quality of the signal, removing meaningful information that the next steps need.
(https://pypi.org/project/noisereduce/)

In [None]:
# Output folder for the cleaned recordings. It is relative (not '/Audio files/...'),
# so it sits next to the '../Audio files/<dataset>/' inputs listed in `df` and is the
# same '../Audio files/cleaned_samples/' location the cleaned files are read from later.
path = '../Audio files/cleaned_samples/'

def clean_files(dataset):
    """Resample, trim and lightly denoise every recording of one dataset.

    For each row of the global `df` whose 'dataset' column equals `dataset`, the
    file at 'path' is loaded at 16000 Hz, leading/trailing silence is trimmed
    (top_db=20), stationary noise is reduced by 10%, and the result is written to
    `path` as '<filename>_cleaned.wav'.

    Parameters
    ----------
    dataset : str
        Value of the 'dataset' column selecting the rows to process.

    Returns
    -------
    None
        The cleaned audio is written to disk; nothing is returned.
    """
    # wavfile.write does not create missing folders.
    os.makedirs(path, exist_ok=True)

    selected = df.loc[df['dataset'] == dataset, ['path', 'filename']]
    for source_path, filename in selected.itertuples(index=False, name=None):
        y, sr = librosa.load(source_path, sr=16000)
        y_trim, _ = librosa.effects.trim(y, top_db=20)
        # Light (10%) stationary reduction, to avoid stripping useful signal.
        y_noise_rem = nr.reduce_noise(y=y_trim, sr=sr, prop_decrease=0.1, stationary=True)
        name = os.path.join(path, filename + '_cleaned.wav')
        wavfile.write(name, sr, y_noise_rem)

In [None]:
# Clean every recording that `df` tags as 'CREMA-D' (resample to 16 kHz, trim silence, light denoise).
clean_files('CREMA-D')

CPU times: user 3min 26s, sys: 46.5 s, total: 4min 12s
Wall time: 4min 19s


In [None]:
# Same cleaning pass for the rows of `df` tagged 'RAVDESS'.
clean_files('RAVDESS')

CPU times: user 2min 42s, sys: 8.34 s, total: 2min 51s
Wall time: 2min 54s


In [None]:
# Same cleaning pass for the rows of `df` tagged 'SAVEE'.
clean_files('SAVEE')

CPU times: user 1min 14s, sys: 4.53 s, total: 1min 19s
Wall time: 1min 20s


In [None]:
# Same cleaning pass for the rows of `df` tagged 'TESS'.
clean_files('TESS')

CPU times: user 3min 8s, sys: 20.7 s, total: 3min 29s
Wall time: 3min 35s


In [None]:
# Same cleaning pass for the rows of `df` tagged 'EmoV_DB'.
clean_files('EmoV_DB')

CPU times: user 4min 26s, sys: 35.2 s, total: 5min 1s
Wall time: 5min 11s


In [None]:
# Same cleaning pass for the rows of `df` tagged 'JL-Corpus'.
clean_files('JL-Corpus')

CPU times: user 1min 30s, sys: 7.04 s, total: 1min 37s
Wall time: 1min 39s


In [None]:
# Build the list of cleaned files from `df` itself: one path per row, in row order.
# The features extracted from these files are later concatenated with `df` column-wise
# by position, so the list has to follow the row order of `df`. Sorting a glob by
# modification time only matches that order by accident (it breaks as soon as files
# are copied, re-cleaned, or the datasets are processed in a different order).
cleaned_dir = '../Audio files/cleaned_samples/'
cleaned_files = [os.path.join(cleaned_dir, name + '_cleaned.wav') for name in df['filename']]

# Fail early instead of silently extracting features for an incomplete file set.
missing_files = [f for f in cleaned_files if not os.path.isfile(f)]
if missing_files:
    raise FileNotFoundError(
        '{} cleaned files are missing, e.g. {}'.format(len(missing_files), missing_files[0])
    )

#**Feature extraction**


Fundamental frequencies extraction
f0 mean, f0 median, f0 standard deviation, f0 min, f0 25% percentile, f0 75% percentile, f0 max


In [None]:
# Module-level accumulators: get_f0 appends one value per processed file to each.
f0_mean, f0_median, f0_std, f0_0, f0_25, f0_75, f0_100 = ([] for _ in range(7))

def get_f0(file):
    """Append summary statistics of one file's fundamental frequency to the f0_* lists.

    The file is loaded at 16000 Hz and its f0 track is estimated with
    librosa.pyin (fmin=50, fmax=1500, frame_length=1024). NaN entries of the
    track are ignored by the nan-aware statistics.

    Parameters
    ----------
    file : path of the audio file to analyse.

    Returns
    -------
    None
        Results are stored in f0_mean, f0_median, f0_std, f0_0, f0_25, f0_75, f0_100.
    """
    signal, rate = librosa.load(file, sr=16000)

    # pyin returns three values; only the first (the f0 track) is used here.
    pitch_track = librosa.pyin(signal, sr=rate, fmin=50, fmax=1500, frame_length=1024)[0]

    for bucket, statistic in ((f0_mean, np.nanmean), (f0_median, np.nanmedian), (f0_std, np.nanstd)):
        bucket.append(statistic(pitch_track))

    # min, lower quartile, upper quartile, max
    for bucket, q in ((f0_0, 0), (f0_25, 25), (f0_75, 75), (f0_100, 100)):
        bucket.append(np.nanpercentile(pitch_track, q))

In [None]:
# Run the pitch extraction on every cleaned file. get_f0 appends to the module-level
# f0_* lists, so re-running this cell without first re-creating those lists adds a
# second copy of every value.
for file in cleaned_files:
    get_f0(file)

CPU times: user 7h 28min 24s, sys: 17min 47s, total: 7h 46min 12s
Wall time: 7h 46min 24s


In [None]:
# Gather the per-file f0 statistics into one table, one column per statistic;
# the dict order fixes the column order.
df_f0 = pd.DataFrame({
    'f0_mean': f0_mean,
    'f0_median': f0_median,
    'f0_std': f0_std,
    'f0_0': f0_0,
    'f0_25': f0_25,
    'f0_75': f0_75,
    'f0_100': f0_100,
})
# Preview one randomly chosen row.
df_f0.sample()

Unnamed: 0,f0_mean,f0_median,f0_std,f0_0,f0_25,f0_75,f0_100
12430,295.935383,228.416468,273.725123,155.114476,215.596437,237.841423,1475.70591


In [None]:
# Module-level accumulators: one mean and one variance value per file and per feature.
zcr_mean, zcr_var, spectral_centroid_mean, spectral_centroid_var, rms_mean, rms_var, chroma_stft_mean, chroma_stft_var = [], [], [], [], [], [], [], []
rolloff_mean, rolloff_var, spectral_bandwidth_mean, spectral_bandwidth_var, harmony_mean, harmony_var, chroma_cqt_mean, chroma_cqt_var = [], [], [], [], [], [], [], []
spectral_contrast_mean, spectral_contrast_var, spectral_flatness_mean, spectral_flatness_var, chroma_cens_mean, chroma_cens_var = [], [], [], [], [], []

for file in cleaned_files:
    y, sr = librosa.load(file, sr=16000)

    # Each feature is computed once and reused for both its mean and its variance
    # (the harmonic separation in particular is expensive).
    # Arguments are passed by keyword: recent librosa releases reject positional
    # `y, sr`, and the sample-rate-dependent features must receive the actual `sr`
    # instead of falling back to librosa's 22050 Hz default.
    per_file_features = (
        (zcr_mean, zcr_var, librosa.feature.zero_crossing_rate(y=y)),
        (spectral_centroid_mean, spectral_centroid_var, librosa.feature.spectral_centroid(y=y, sr=sr)),
        (spectral_contrast_mean, spectral_contrast_var, librosa.feature.spectral_contrast(y=y, sr=sr)),
        (spectral_flatness_mean, spectral_flatness_var, librosa.feature.spectral_flatness(y=y)),
        (rms_mean, rms_var, librosa.feature.rms(y=y)),
        (chroma_stft_mean, chroma_stft_var, librosa.feature.chroma_stft(y=y, sr=sr)),
        (chroma_cqt_mean, chroma_cqt_var, librosa.feature.chroma_cqt(y=y, sr=sr)),
        (chroma_cens_mean, chroma_cens_var, librosa.feature.chroma_cens(y=y, sr=sr)),
        (rolloff_mean, rolloff_var, librosa.feature.spectral_rolloff(y=y, sr=sr)),
        (spectral_bandwidth_mean, spectral_bandwidth_var, librosa.feature.spectral_bandwidth(y=y, sr=sr)),
        (harmony_mean, harmony_var, librosa.effects.harmonic(y)),
    )

    # Statistics are taken over every element of the feature array.
    for mean_values, var_values, feature in per_file_features:
        mean_values.append(np.mean(feature))
        var_values.append(np.var(feature))

CPU times: user 5h 3min 52s, sys: 33min 3s, total: 5h 36min 56s
Wall time: 2h 47min 13s


In [None]:
# Assemble the per-file mean/variance lists into one table; the dict order fixes the column order.
df_features = pd.DataFrame({
    'zcr_mean': zcr_mean,
    'zcr_var': zcr_var,
    'spectral_centroid_mean': spectral_centroid_mean,
    'spectral_centroid_var': spectral_centroid_var,
    'spectral_contrast_mean': spectral_contrast_mean,
    'spectral_contrast_var': spectral_contrast_var,
    'spectral_flatness_mean': spectral_flatness_mean,
    'spectral_flatness_var': spectral_flatness_var,
    'rms_mean': rms_mean,
    'rms_var': rms_var,
    'chroma_stft_mean': chroma_stft_mean,
    'chroma_stft_var': chroma_stft_var,
    'chroma_cqt_mean': chroma_cqt_mean,
    'chroma_cqt_var': chroma_cqt_var,
    'chroma_cens_mean': chroma_cens_mean,
    'chroma_cens_var': chroma_cens_var,
    'spectral_bandwidth_mean': spectral_bandwidth_mean,
    'spectral_bandwidth_var': spectral_bandwidth_var,
    'rolloff_mean': rolloff_mean,
    'rolloff_var': rolloff_var,
    'harmony_mean': harmony_mean,
    'harmony_var': harmony_var,
})
# Check a random row
df_features.sample()

Unnamed: 0,zcr_mean,zcr_var,spectral_centroid_mean,spectral_centroid_var,spectral_contrast_mean,spectral_contrast_var,spectral_flatness_mean,spectral_flatness_var,rms_mean,rms_var,...,chroma_cqt_mean,chroma_cqt_var,chroma_cens_mean,chroma_cens_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,harmony_mean,harmony_var
15443,0.100789,0.007527,2038.675889,1702130.0,20.540552,37.336241,0.027502,0.003225,0.107086,0.006086,...,0.359874,0.076113,0.244829,0.023392,1490.425241,300715.401892,2864.853896,3368736.0,8.1e-05,0.006406


MFCC

In [None]:
def extract_mfcc(file):
    """Return the per-coefficient mean and variance of a file's MFCCs.

    The file is loaded at 16000 Hz and 30 MFCCs are computed (fmin=50). The
    matrix is computed once and reduced over its second axis, i.e. one mean and
    one variance per coefficient row.

    Parameters
    ----------
    file : path of the audio file to analyse.

    Returns
    -------
    numpy.ndarray
        1-D array holding the coefficient means followed by the coefficient
        variances.
    """
    y, sr = librosa.load(file, sr=16000)
    # Computed a single time and shared by both statistics.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, fmin=50, n_mfcc=30)
    return np.concatenate((np.mean(mfcc, axis=1), np.var(mfcc, axis=1)))

# One MFCC statistics vector per cleaned file, in the order of cleaned_files.
extracted_mfcc = [extract_mfcc(file) for file in cleaned_files]

# Column labels: mfcc1_mean .. mfcc30_mean, then mfcc1_var .. mfcc30_var.
name_mfcc_mean = ['mfcc' + str(k) + '_mean' for k in range(1, 31)]
name_mfcc_var = ['mfcc' + str(k) + '_var' for k in range(1, 31)]
name_mfcc = name_mfcc_mean + name_mfcc_var

df_mfcc = pd.DataFrame(extracted_mfcc, columns=name_mfcc)
# Preview one randomly chosen row.
df_mfcc.sample()

Unnamed: 0,mfcc1_mean,mfcc2_mean,mfcc3_mean,mfcc4_mean,mfcc5_mean,mfcc6_mean,mfcc7_mean,mfcc8_mean,mfcc9_mean,mfcc10_mean,...,mfcc21_var,mfcc22_var,mfcc23_var,mfcc24_var,mfcc25_var,mfcc26_var,mfcc27_var,mfcc28_var,mfcc29_var,mfcc30_var
12577,-220.369965,81.935516,-23.836346,-0.899617,0.663118,-2.636038,-16.423683,-13.724349,-8.89509,-11.408977,...,58.321266,46.76601,31.269527,34.102951,34.067581,79.207108,79.191185,109.316628,123.586655,142.036484


In [None]:
# The feature tables carry no key column, so they are attached to `df` purely by row
# position. pd.concat(axis=1) would silently pad with NaN if a table had a different
# number of rows, so stop here when the row counts disagree.
feature_tables = [df_f0, df_features, df_mfcc]
row_counts = [len(df)] + [len(table) for table in feature_tables]
if len(set(row_counts)) != 1:
    raise ValueError(
        'Row count mismatch between metadata and feature tables: {}'.format(row_counts)
    )

# sample_rate is dropped from the combined table.
data_features = pd.concat([df] + feature_tables, axis=1).drop(columns=['sample_rate'])
data_features.sample()

Unnamed: 0,path,filename,dataset,duration,gender,age,emotion,f0_mean,f0_median,f0_std,...,mfcc21_var,mfcc22_var,mfcc23_var,mfcc24_var,mfcc25_var,mfcc26_var,mfcc27_var,mfcc28_var,mfcc29_var,mfcc30_var
793,../Audio files/CREMA-D/1010_TIE_NEU_XX.wav,1010_TIE_NEU_XX,CREMA-D,2.636,female,27,neutral,190.633147,196.56412,41.089922,...,20.663019,18.28289,22.221407,24.661129,40.810772,72.538109,50.762707,33.290585,70.890846,104.620483


In [None]:
# NOTE(review): writes the same frame that a later cell saves as 'emotions_data_features.csv';
# looks like a leftover scratch copy — confirm whether it is still needed.
data_features.to_csv('emotions_data_features2222.csv', index=False)

In [None]:
# NOTE(review): writes the same frame that a later cell saves as 'emotions_data_features.csv';
# looks like a leftover scratch copy — confirm whether it is still needed.
data_features.to_csv('emotions_data_features1111.csv', index=False)

In [None]:
# Save the combined metadata + feature table; index=False leaves the row index out of the file.
data_features.to_csv('emotions_data_features.csv', index=False)