In [1]:
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
# Paths for data.
# Root directory of the evaluation audio; the loading cell below expects the
# audio files to sit exactly one directory level beneath this root.
Ravdess = "TESTING/"

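Each RAVDESS filename is made of seven hyphen-separated two-digit identifiers (e.g. `03-01-01-01-01-02-01.wav`), in this order:\
Modality (01 = full-AV, 02 = video-only, 03 = audio-only).\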
Vocal channel (01 = speech, 02 = song).\
Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).\
Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.\
Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").\
Repetition (01 = 1st repetition, 02 = 2nd repetition).\
Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

In [3]:
# Collect the emotion code and the path of every .wav clip found one directory
# level below `Ravdess`.
# Listings are sorted because os.listdir returns entries in arbitrary order;
# sorting keeps the row order (and every downstream output) reproducible.
ravdess_directory_list = sorted(os.listdir(Ravdess))

file_emotion = []
file_path = []
for actor_dir in ravdess_directory_list:
    # each sub-directory holds the recordings we need to extract.
    actor_path = os.path.join(Ravdess, actor_dir)
    if not os.path.isdir(actor_path):
        # stray files sitting next to the sub-directories are not audio folders
        continue
    for file_name in sorted(os.listdir(actor_path)):
        stem, extension = os.path.splitext(file_name)
        if extension.lower() != '.wav':
            # e.g. hidden OS metadata files; their names cannot be parsed below
            continue
        # third part in each file name represents the emotion associated to that file.
        file_emotion.append(int(stem.split('-')[2]))
        file_path.append(os.path.join(actor_path, file_name))

# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Ravdess_df = pd.concat([emotion_df, path_df], axis=1)

# changing integers to actual emotions.
# Assign the result back instead of calling replace(inplace=True) on the
# attribute-accessed column: the chained in-place form operates on a temporary
# object under pandas Copy-on-Write and would leave Ravdess_df unchanged.
emotion_names = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
                 5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'}
Ravdess_df['Emotions'] = Ravdess_df['Emotions'].replace(emotion_names)
Ravdess_df.head()

Unnamed: 0,Emotions,Path
0,neutral,TESTING/samples/03-01-01-01-01-02-01.wav
1,calm,TESTING/samples/03-01-02-01-01-01-01.wav
2,calm,TESTING/samples/03-01-02-02-01-01-01.wav
3,calm,TESTING/samples/03-01-02-02-02-01-01.wav
4,happy,TESTING/samples/03-01-03-01-01-01-01.wav


In [4]:
len(Ravdess_df)

10

In [5]:
# Plain alias (no copy is made): data_path and Ravdess_df are the same DataFrame object.
data_path = Ravdess_df

In [6]:
def noise(data):
    """Return a copy of `data` with additive Gaussian noise.

    The noise amplitude is a random fraction (below 3.5%) of the largest
    sample value in `data`; the input array itself is not modified.
    """
    peak = np.amax(data)
    amplitude = 0.035 * np.random.uniform() * peak
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude * gaussian

def stretch(data, rate=0.8):
    """Time-stretch `data` with ``librosa.effects.time_stretch``.

    Parameters
    ----------
    data : audio samples to stretch.
    rate : stretch factor forwarded to librosa (default 0.8).

    Returns
    -------
    Whatever ``librosa.effects.time_stretch`` returns for the input.
    """
    # `rate` must be passed by keyword: recent librosa releases made it
    # keyword-only, and the keyword form is also accepted by older releases.
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly shift `data` by a random whole number of samples.

    The offset is drawn uniformly from [-5, 5), scaled by 1000 and truncated
    to an integer, so the roll is always fewer than 5000 samples either way.
    """
    offset = np.random.uniform(low=-5, high=5) * 1000
    return np.roll(data, int(offset))

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift `data` with ``librosa.effects.pitch_shift``.

    Parameters
    ----------
    data : audio samples to shift.
    sampling_rate : sampling rate of `data`, forwarded as librosa's ``sr``.
    pitch_factor : forwarded as librosa's ``n_steps`` (default 0.7).

    Returns
    -------
    Whatever ``librosa.effects.pitch_shift`` returns for the input.
    """
    # `sr` and `n_steps` must be passed by keyword: recent librosa releases
    # made them keyword-only, and the keyword form also works on older ones.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

# Load the first clip listed in the table as a sample signal for trying out
# the augmentation helpers.
path = data_path.Path.iloc[0]
data, sample_rate = librosa.load(path)

In [7]:
def extract_features(data, sample_rate=22050):
    """Summarise an audio signal as one flat feature vector.

    Every librosa feature below is averaged with ``np.mean(feature.T, axis=0)``
    and the resulting means are concatenated in this order: zero-crossing
    rate, chroma (computed from the STFT magnitude), MFCCs, RMS value,
    mel spectrogram.

    Parameters
    ----------
    data : np.ndarray
        Audio samples, as returned by ``librosa.load``.
    sample_rate : int, optional
        Sampling rate of `data`. It is an explicit parameter so the function
        no longer depends on a notebook-global ``sample_rate`` being defined
        by an earlier cell. The default of 22050 is librosa's default load
        rate, which every ``librosa.load`` call in this notebook uses.

    Returns
    -------
    np.ndarray
        1-D float64 vector of the concatenated feature means.
    """
    def frame_mean(feature_matrix):
        # Same reduction for every feature: transpose, then mean over axis 0.
        return np.mean(feature_matrix.T, axis=0)

    # ZCR
    zcr = frame_mean(librosa.feature.zero_crossing_rate(y=data))

    # Chroma_stft (needs the magnitude spectrogram)
    stft = np.abs(librosa.stft(data))
    chroma_stft = frame_mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate))

    # MFCC
    mfcc = frame_mean(librosa.feature.mfcc(y=data, sr=sample_rate))

    # Root Mean Square Value
    rms = frame_mean(librosa.feature.rms(y=data))

    # MelSpectrogram
    mel = frame_mean(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    # Stack horizontally; the cast keeps the float64 result the original
    # obtained by starting from an empty float64 array.
    return np.hstack((zcr, chroma_stft, mfcc, rms, mel)).astype(np.float64)

def get_features(path):
    """Load one audio file and return feature rows for three variants of it.

    Rows of the returned 2-D array, in order:
      0. the clip as loaded,
      1. the clip with noise added (``noise``),
      2. the clip stretched (``stretch``) and then pitch-shifted (``pitch``).
    """
    # offset/duration trim the start and the end of each recording
    # (skip the first 0.6 s, keep at most 2.5 s).
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    variants = (
        data,                                # without augmentation
        noise(data),                         # data with noise
        pitch(stretch(data), sample_rate),   # data with stretching and pitching
    )

    # one feature vector per variant, stacked vertically
    return np.vstack([extract_features(variant) for variant in variants])

In [8]:
# Accumulate one feature row per (clip, variant) in X and the matching label in Y.
X, Y = [], []
for path, emotion in zip(data_path.Path, data_path.Emotions):
    feature = get_features(path)
    X.extend(feature)
    # get_features returns one row per variant of the clip (3 in total),
    # so the clip's emotion is repeated once for each row.
    Y.extend([emotion] * len(feature))

In [9]:
len(X), len(Y), data_path.Path.shape

(30, 30, (10,))

In [10]:
# Tabulate the feature rows. The CSV is written before the label column is
# attached, so 'features_test.csv' contains the features only.
Features = pd.DataFrame(X)
Features.to_csv('features_test.csv', index=False)
Features = Features.assign(labels=Y)
Features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.271272,0.674966,0.723259,0.724594,0.681302,0.670643,0.674574,0.630036,0.680146,0.708276,...,6.998011e-06,7.050108e-06,6.670963e-06,6.999257e-06,1.21788e-05,9.449916e-06,8.465686e-06,2.638513e-06,1.788902e-07,neutral
1,0.332678,0.802282,0.84025,0.818895,0.800384,0.808311,0.712412,0.649171,0.711171,0.750695,...,0.0001554928,0.0001595514,0.0001619412,0.0001640301,0.0001787189,0.000169857,0.0001604263,0.0001617161,0.0001573211,neutral
2,0.161267,0.640367,0.651222,0.741996,0.684241,0.637188,0.655669,0.688134,0.607509,0.662895,...,8.802704e-07,1.243989e-06,1.323868e-06,1.295318e-06,8.929181e-07,1.325259e-06,1.662287e-06,5.423552e-07,2.56008e-08,neutral
3,0.244276,0.611601,0.698656,0.730102,0.686579,0.656195,0.657558,0.631722,0.669574,0.674671,...,5.811672e-06,4.467301e-06,1.176222e-05,6.051253e-06,3.856389e-06,4.652284e-06,4.236149e-06,1.549818e-06,1.118311e-07,calm
4,0.209021,0.61622,0.650349,0.73372,0.703466,0.706819,0.688933,0.644491,0.663127,0.691119,...,5.827814e-06,4.522502e-06,1.181359e-05,6.061449e-06,3.851339e-06,4.711811e-06,4.261416e-06,1.58141e-06,1.377198e-07,calm
5,0.146571,0.583243,0.645292,0.732506,0.686274,0.631657,0.637448,0.625146,0.63754,0.660131,...,8.805935e-07,8.513472e-07,7.774731e-07,1.601502e-06,2.0813e-06,7.560968e-07,7.1546e-07,2.996915e-07,1.83479e-08,calm
6,0.184828,0.595758,0.655787,0.72769,0.700262,0.663431,0.669693,0.665442,0.728932,0.689506,...,1.724708e-06,1.69631e-06,2.040566e-06,3.192627e-06,5.84485e-06,6.540728e-06,5.534759e-06,2.244282e-06,1.209406e-07,calm
7,0.245379,0.702829,0.753318,0.79769,0.785328,0.778472,0.745212,0.684155,0.740546,0.730967,...,2.888433e-05,2.988849e-05,2.894223e-05,3.031636e-05,3.388246e-05,3.496637e-05,3.339548e-05,3.03604e-05,2.6156e-05,calm
8,0.12687,0.647036,0.565606,0.647612,0.679484,0.62597,0.631253,0.655692,0.675817,0.708252,...,3.145892e-07,2.529523e-07,3.109891e-07,3.263989e-07,4.345117e-07,8.237755e-07,8.989333e-07,3.056483e-07,1.888676e-08,calm
9,0.170067,0.546961,0.641392,0.680814,0.608562,0.574102,0.531891,0.497852,0.499035,0.549248,...,2.197276e-05,1.230685e-05,1.597811e-05,2.259575e-05,3.581418e-05,4.93844e-05,3.513531e-05,1.290045e-05,7.454701e-07,calm


In [11]:
# Split the table back into the feature matrix (every column except the last)
# and the label vector.
X = Features.drop(columns=Features.columns[-1]).values
Y = Features.loc[:, 'labels'].values

In [12]:
# Persist the (unscaled) feature matrix X to 'Features.p'.
import pickle

# The context manager closes the file deterministically; passing a bare
# open(...) to pickle.dump left the handle open until garbage collection.
with open("Features.p", "wb") as features_file:
    pickle.dump(X, features_file)

In [13]:
Y

array(['neutral', 'neutral', 'neutral', 'calm', 'calm', 'calm', 'calm',
       'calm', 'calm', 'calm', 'calm', 'calm', 'happy', 'happy', 'happy',
       'happy', 'happy', 'happy', 'happy', 'happy', 'happy', 'sad', 'sad',
       'sad', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad'], dtype=object)

In [14]:
# Keep the per-sample ground-truth labels under their own name: Y is reassigned
# in the following cells and no longer holds them afterwards.
y1=Y

In [15]:
# Y is re-used here for the list of all eight emotion class names
# (the per-sample labels were saved in y1 beforehand).
Y = ['neutral','calm','happy','sad','angry','fear','disgust','surprise']

In [16]:
# Convert the class list to an ndarray. The encoder cell below wraps Y in
# np.array again, so this step is redundant but harmless.
Y =np.array(Y)

In [17]:
# As this is a multiclass classification problem onehotencoding our Y.
# The encoder is fitted on the eight class names only (not on the per-sample
# labels) so that its inverse_transform can later turn model outputs back into
# emotion names. Y becomes the 8x8 one-hot matrix of those class names.
# NOTE(review): decoding is only correct if this column order matches the
# encoder that was used when the model was trained — confirm.
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

In [18]:
Y 

array([[0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])

In [19]:
X.shape, Y.shape

((30, 162), (8, 8))

In [20]:
# Provisional binding only: x_test is overwritten with the scaled features in the next cell.
x_test=X

In [21]:
# scaling our data with sklearn's Standard scaler
# NOTE(review): the scaler is fitted on the evaluation features themselves
# (fit_transform on X), so its statistics come from these rows only. If the
# model in 'model1.h5' was trained on features standardised with training-set
# statistics — presumably it was; confirm — the scaler fitted at training time
# should be persisted and applied here with transform() instead.
scaler = StandardScaler()
x_test = scaler.fit_transform(X)
x_test.shape

(30, 162)

In [22]:
# Append a trailing axis of length 1 to the 2-D scaled matrix so that its
# shape matches what the model consumes.
x_test = x_test[:, :, np.newaxis]
x_test.shape

(30, 162, 1)

In [23]:
# Load the previously saved Keras model from 'model1.h5'.
# Note: this import rebinds the name `keras` (first bound by `import keras` in
# the imports cell) to `tensorflow.keras` for the rest of the notebook.
from tensorflow import keras
model = keras.models.load_model('model1.h5')

In [24]:
model.predict(x_test).shape

(30, 8)

In [25]:
# predicting on test data.
# pred_test holds one row per sample and one column per class (the previous
# cell's output shows shape (30, 8)); the encoder fitted on the class names
# then maps each row back to an emotion name.
pred_test = model.predict(x_test)
y_pred = encoder.inverse_transform(pred_test)

In [26]:
y_pred

array([['happy'],
       ['disgust'],
       ['happy'],
       ['sad'],
       ['happy'],
       ['happy'],
       ['calm'],
       ['calm'],
       ['neutral'],
       ['disgust'],
       ['disgust'],
       ['neutral'],
       ['angry'],
       ['angry'],
       ['surprise'],
       ['angry'],
       ['angry'],
       ['happy'],
       ['angry'],
       ['angry'],
       ['angry'],
       ['sad'],
       ['disgust'],
       ['sad'],
       ['sad'],
       ['happy'],
       ['sad'],
       ['happy'],
       ['surprise'],
       ['disgust']], dtype='<U8')

In [27]:
y_pred.shape

(30, 1)

In [28]:
# NOTE(review): at this point Y is the 8x8 one-hot matrix of the class-name
# list, not the per-sample labels, so y_test merely contains the eight class
# names. The ground truth used for the comparison below is y1; y_test is not
# referenced by the cells that follow.
y_test = encoder.inverse_transform(Y)

In [29]:
# Side-by-side table of the model's predictions and the ground-truth labels.
df = pd.DataFrame({
    'Predicted Labels': y_pred.flatten(),
    'Actual Labels': y1.flatten(),
})

df.head(10)

Unnamed: 0,Predicted Labels,Actual Labels
0,happy,neutral
1,disgust,neutral
2,happy,neutral
3,sad,calm
4,happy,calm
5,happy,calm
6,calm,calm
7,calm,calm
8,neutral,calm
9,disgust,calm


In [30]:
df[df['Predicted Labels'] == df['Actual Labels']]

Unnamed: 0,Predicted Labels,Actual Labels
6,calm,calm
7,calm,calm
17,happy,happy
21,sad,sad
23,sad,sad
24,sad,sad
26,sad,sad


In [32]:
#import tensorflow as tf

#model = tf.keras.models.load_model('model1.h5')
#converter = tf.lite.TFLiteConverter.from_keras_model(model)
#tflite_model = converter.convert()
#open("emotion.tflite", "wb").write(tflite_model)