# This Jupyter notebook performs data manipulation on the raw data
I have used the MFCCs (Mel-frequency cepstral coefficients) of each audio file as the features for my model. This notebook extracts the MFCCs and also handles data augmentation.

## Loading Libraries

In [None]:
import pandas as pd
import glob
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import pandas as pd
import os
import IPython.display as ipd
from tqdm import tqdm_notebook
import IPython.display as ipd

Getting file paths of the audio files.

In [None]:
# Collect the path of every .wav file found in the `val` folder.
files = glob.glob(pathname=r"D:\Kaggle\datasets\emotion_small\val/*.wav")
# Total number of audio files that matched the pattern.
print(len(files))

Getting the CSV file with file names and labels. I have used this dataframe as a reference for storing the MFCCs.

In [None]:
# Load the reference table (file names and labels) that drives MFCC extraction.
df = pd.read_csv(
    filepath_or_buffer=r"D:\Kaggle\datasets\emotion_small\meld_val_small.csv",
    encoding="utf-8",
)
# Preview the first rows as the cell output.
df.head()

# Augmentation methods
Data augmentation can increase the model's robustness and also helps increase the training data size when we only have access to a small dataset.

In [None]:
def noise(data):
    """
    Add white (Gaussian) noise to a 1-D signal.

    The noise is scaled by a random fraction (uniform in [0, 0.05)) of the
    signal's maximum value, so each call uses a different noise level.
    Returns a new float64 array; the input array is left untouched.
    """
    peak = np.amax(data)
    # 0.05 caps the noise at 5% of the peak value; raise it for a noisier result.
    amplitude = 0.05 * np.random.uniform() * peak
    # One standard-normal sample per input sample (any other distribution
    # from numpy.random could be substituted here).
    white = np.random.normal(size=data.shape[0])
    return data.astype('float64') + amplitude * white
    
def shift(data):
    """
    Circularly shift the signal by a random number of samples.

    The offset is drawn uniformly from roughly -5000..5000 samples and
    truncated to an integer. Samples pushed past one end wrap around to the
    other, so the output has the same length as the input.
    """
    offset_samples = np.random.uniform(low=-5, high=5) * 1000
    return np.roll(data, int(offset_samples))
    
def stretch(data, rate=0.8):
    """
    Stretching the sound.

    Delegates to librosa.effects.time_stretch.

    Parameters
    ----------
    data : audio time series handed to librosa.
    rate : stretch factor forwarded to librosa (default 0.8).

    Returns
    -------
    The time-stretched signal. Note that its length differs from the input
    length whenever rate != 1, so the data "expands" slightly at the default.
    """
    # `rate` must be passed by keyword: it is keyword-only in librosa >= 0.10,
    # and the keyword form is also accepted by older releases.
    return librosa.effects.time_stretch(data, rate=rate)
    
def pitch(data, sample_rate):
    """
    Pitch Tuning.

    Shifts the pitch of the signal by a random number of steps using
    librosa.effects.pitch_shift.

    Parameters
    ----------
    data : NumPy array holding the audio signal (converted to float64).
    sample_rate : sampling rate of `data`, forwarded to librosa as `sr`.

    Returns
    -------
    The pitch-shifted signal returned by librosa.
    """
    bins_per_octave = 12
    pitch_pm = 2
    # uniform() is in [0, 1), so the shift lies in [0, 4) steps, i.e. it is
    # always upward. NOTE(review): a symmetric +/- pitch_pm range may have been
    # intended -- confirm before changing.
    pitch_change = pitch_pm * 2 * np.random.uniform()
    # `sr` is keyword-only in librosa >= 0.10; the keyword form also works on
    # older releases, whereas the positional form raises TypeError on new ones.
    return librosa.effects.pitch_shift(
        data.astype('float64'),
        sr=sample_rate,
        n_steps=pitch_change,
        bins_per_octave=bins_per_octave,
    )
    
def dyn_change(data):
    """
    Scale the whole signal by a single random gain.

    The gain is drawn uniformly from [-0.5, 7), so it can attenuate, amplify
    or (for negative values) invert the signal.
    """
    gain = np.random.uniform(low=-0.5, high=7)
    scaled = data * gain
    return scaled
    
def speedNpitch(data):
    """
    Speed and pitch tuning.

    Resamples the signal by linear interpolation at a random speed factor in
    (1.2, 1.5], which shortens it, then zero-pads the tail so the returned
    array has the same length and dtype as the input.

    Parameters
    ----------
    data : 1-D NumPy array holding the audio signal.

    Returns
    -------
    A new array; the caller's `data` is left unmodified.
    """
    # Nothing to resample; np.interp would raise on an empty sample grid.
    if len(data) == 0:
        return data.copy()

    # you can change low and high here
    length_change = np.random.uniform(low=0.8, high=1)
    speed_fac = 1.2 / length_change
    # Read the signal at positions spaced speed_fac apart (faster playback).
    positions = np.arange(0, len(data), speed_fac)
    resampled = np.interp(positions, np.arange(0, len(data)), data)
    minlen = min(data.shape[0], resampled.shape[0])

    # Build the result in a fresh buffer instead of zeroing and overwriting
    # the input in place, which silently destroyed the caller's array.
    out = np.zeros_like(data)
    out[0:minlen] = resampled[0:minlen]
    return out

# Creating a dictionary to store the MFCCs
The code below creates a dictionary which stores the MFCCs of the audio files after the necessary data augmentations. This is done for easy data loading in the future. The file names are used as keys.

In [None]:
def get_dictionary(n_mfcc,df=df,augmentation = None,directory ='D:/Kaggle/datasets/emotion_small/val/' ):
    """
    Build a {file name: MFCC matrix} dictionary for every row of `df`.

    Each file listed in df['fname'] is loaded from `directory` (3 seconds,
    starting 0.5 s in, at 44.1 kHz), randomly cropped or zero-padded to exactly
    3 seconds, optionally augmented, and converted to MFCCs via a log-scaled
    mel spectrogram.

    Parameters
    ----------
    n_mfcc : number of MFCC coefficients to compute per frame.
    df : dataframe with an 'fname' column holding the audio file names.
    augmentation : iterable of augmentation names applied in order. Allowed
        names: 'noise', 'shift', 'stretch', 'pitch', 'dyn_change',
        'speedNpitch'. None or an empty list means no augmentation.
    directory : folder that contains the audio files.

    Returns
    -------
    dict mapping str(file name) to the MFCC array returned by librosa.

    Raises
    ------
    ValueError
        If `augmentation` contains a name that is not listed above.
    """
    sample_rate = 44100
    input_length = sample_rate * 3   # number of samples in a 3-second clip

    # Name -> callable lookup. `pitch` also needs the sample rate, which the
    # previous if/elif chain forgot to pass (TypeError at call time).
    augmenters = {
        'noise': noise,
        'shift': shift,
        'stretch': stretch,
        'pitch': lambda signal: pitch(signal, sample_rate),
        'dyn_change': dyn_change,
        'speedNpitch': speedNpitch,
    }
    augmentation = [] if augmentation is None else list(augmentation)

    # Fail fast, before any audio is loaded, with a message naming the culprit.
    unknown = [aug for aug in augmentation if aug not in augmenters]
    if unknown:
        raise ValueError(
            "Unknown augmentation(s) %r; expected one of %r"
            % (unknown, sorted(augmenters))
        )

    df_dict = {}
    # Iterate over the column values so a non-default dataframe index
    # (e.g. after filtering) cannot break the lookup.
    for fname in tqdm_notebook(df['fname']):
        fname = str(fname)
        file_path = os.path.join(str(directory), fname)
        data, _ = librosa.load(file_path, res_type='kaiser_fast',duration=3,sr=sample_rate,offset=0.5)

        # Random offset / Padding
        if len(data) > input_length:
            max_offset = len(data) - input_length
            offset = np.random.randint(max_offset)
            data = data[offset:(input_length+offset)]
        elif len(data) < input_length:
            max_offset = input_length - len(data)
            offset = np.random.randint(max_offset)
            data = np.pad(data, (offset, max_offset - offset), "constant") #padding

        # Data augmentation -- applied to every clip, whether it was cropped
        # or padded (it used to be nested inside the padding branch only).
        # NOTE: 'stretch' changes the clip length, so the resulting MFCC matrix
        # has a different number of frames than the other entries.
        for aug in augmentation:
            data = augmenters[aug](data)

        # `y` is keyword-only in librosa >= 0.10.
        S = librosa.feature.melspectrogram(y=data, sr=sample_rate)
        log_S = librosa.power_to_db(S,ref = np.max)
        #getting the MFCC values
        mfcc = librosa.feature.mfcc(S = log_S, sr=sample_rate, n_mfcc=n_mfcc)
        df_dict[fname] = mfcc   #storing the values in the dictionary with the file name as the key
    return df_dict

Getting the MFCC dictionary.

In [None]:
# Extract 100 MFCC coefficients per frame for every file, without augmentation.
df_dict = get_dictionary(100, augmentation=[])

### Saving the dictionary as a .npy file.

In [None]:
# df_dict is a plain Python dict, so NumPy stores it as a pickled object
# array; read it back with np.load(path, allow_pickle=True).item().
np.save(
    file='D:/Kaggle/datasets/emotion_small/mfcc_dictionaries/mfcc_val_small.npy',
    arr=df_dict,
)