In [5]:
import pandas as pd
import numpy as np
import os
import sys
from tqdm.notebook import tqdm, trange

# librosa is a Python library for analyzing audio and music. It will be used to extract the data from the audio files
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from joblib import dump, load

# to play the audio files
from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [6]:
Ravdess=r"C:\Users\Sharan Prabhath\Downloads\dataset_audio\RAVDESS\\"
Tess =r"C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess"

In [7]:
ravdess_directory_list = os.listdir(Ravdess)

file_emotion = []
file_path = []
for dir in ravdess_directory_list:
    actor = os.listdir(Ravdess + dir)
    for file in actor:
        part = file.split('.')[0]
        part = part.split('-')
        # third part in each file represents the emotion associated to that file.
        file_emotion.append(int(part[2]))
        file_path.append(Ravdess + dir + '/' + file)
        
# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Ravdess_df = pd.concat([emotion_df, path_df], axis=1)

# changing integers to actual emotions.
Ravdess_df.Emotions.replace({1:'neutral', 2:'calm', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'}, inplace=True)
Ravdess_df.head()

Unnamed: 0,Emotions,Path
0,neutral,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
1,neutral,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
2,neutral,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
3,neutral,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
4,calm,C:\Users\Sharan Prabhath\Downloads\dataset_aud...


In [8]:
tess_directory_list = os.listdir(Tess)

file_emotion = []
file_path = []

for file in tess_directory_list:
    part = file.split('.')[0]
    part = part.split('_')[2]
    if part=='ps':
        file_emotion.append('surprise')
    else:
        file_emotion.append(part)
    file_path.append(Tess + '/' + file)
    print(Tess + '/' + file)
        
# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Tess_df = pd.concat([emotion_df, path_df], axis=1)
Tess_df.head()

C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_back_angry.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_back_sad.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_bar_angry.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_bar_sad.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_base_angry.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_base_sad.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_bath_angry.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_bath_sad.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_bean_angry.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_bean_sad.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_beg_angry.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_beg_sad.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_bite_angry.wav
C:\Users\Sharan Prabhath\Downloads\dataset_audio\Tess/OAF_bite_sad.

Unnamed: 0,Emotions,Path
0,angry,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
1,sad,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
2,angry,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
3,sad,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
4,angry,C:\Users\Sharan Prabhath\Downloads\dataset_aud...


In [9]:
# creating Dataframe using all the 2 dataframes we created so far.
data_path = pd.concat([Ravdess_df,Tess_df], axis = 0) 
data_path.to_csv("data_path.csv",index=False)
data_path.head()

Unnamed: 0,Emotions,Path
0,neutral,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
1,neutral,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
2,neutral,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
3,neutral,C:\Users\Sharan Prabhath\Downloads\dataset_aud...
4,calm,C:\Users\Sharan Prabhath\Downloads\dataset_aud...


In [10]:
def create_waveplot(data, sr, e):
    plt.figure(figsize=(10, 3))
    plt.title('Waveplot for audio with {} emotion'.format(e), size=15)
    librosa.display.waveshow(data, sr=sr)
    plt.show()

def create_spectrogram(data, sr, e):
    # stft function converts the data into short term fourier transform
    X = librosa.stft(data)
    Xdb = librosa.amplitude_to_db(abs(X))
    plt.figure(figsize=(12, 3))
    plt.title('Spectrogram for audio with {} emotion'.format(e), size=15)
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')   
    #librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar()

In [11]:
def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data):
    return librosa.effects.time_stretch(data, rate=0.8)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data,sr=sampling_rate, n_steps=pitch_factor)

# taking any example and checking for techniques.
path = np.array(data_path.Path)[1]
data, sample_rate = librosa.load(path)

In [12]:
def extract_features(data):
    # ZCR
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result=np.hstack((result, zcr)) # stacking horizontally

    # Chroma_stft
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) # stacking horizontally

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) # stacking horizontally

    # Root Mean Square Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) # stacking horizontally

    # MelSpectogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) # stacking horizontally
    
    return result

def get_features(path):
    # duration and offset are used to take care of the no audio in start and the ending of each audio files as seen above.
    data, sample_rate = librosa.load(path)
    
    # without augmentation
    res1 = extract_features(data)
    result = np.array(res1)
    
    # data with noise
    noise_data = noise(data)
    res2 = extract_features(noise_data)
    result = np.vstack((result, res2)) # stacking vertically
    
    # data with stretching and pitching
    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch)
    result = np.vstack((result, res3)) # stacking vertically
    
    return result

In [13]:
X, Y = [], []
for path, emotion in tqdm(zip(data_path.Path, data_path.Emotions), total=len(data_path.Path)):
   if emotion in ['angry','happy','fear','sad']:      #filter emotions
       feature = get_features(path)
       for ele in feature:
           X.append(ele)
           # appending emotion 3 times as we have made 3 augmentation techniques on each audio file.
           Y.append(emotion)

  0%|          | 0/2840 [00:00<?, ?it/s]

In [15]:
len(X), len(Y), data_path.Path.shape

(5304, 5304, (2840,))

In [16]:
Features = pd.DataFrame(X)
Features['labels'] = Y
Features.to_csv('features.csv', index=False)
Features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.401143,0.754535,0.789104,0.761949,0.774475,0.771636,0.714849,0.670709,0.698012,0.71702,...,1e-05,6e-06,1.1e-05,7e-06,1e-05,1.1e-05,8e-06,8e-06,8.0886e-07,happy
1,0.298206,0.791347,0.804972,0.789385,0.812579,0.823839,0.708128,0.652464,0.696029,0.731479,...,1e-05,7e-06,1.1e-05,7e-06,1e-05,1.2e-05,9e-06,9e-06,1.167237e-06,happy
2,0.181118,0.73161,0.753697,0.767895,0.748587,0.753665,0.71552,0.683826,0.663693,0.693985,...,1e-06,2e-06,2e-06,2e-06,2e-06,1e-06,2e-06,1e-06,6.943449e-08,happy
3,0.333294,0.755315,0.749708,0.754065,0.767402,0.723681,0.704144,0.684734,0.70971,0.71632,...,1.3e-05,9e-06,6e-06,4e-06,3e-06,3e-06,3e-06,4e-06,3.601406e-07,happy
4,0.365931,0.817403,0.825541,0.840508,0.867017,0.842843,0.728638,0.70197,0.726923,0.743014,...,0.000462,0.00044,0.000459,0.000452,0.000448,0.000467,0.000442,0.000454,0.0004390219,happy


In [17]:
Features = pd.read_csv('features.csv')
X = Features.iloc[: ,:-1].values
Y = Features['labels'].values

In [18]:
# As this is a multiclass classification problem onehotencoding our Y.
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

In [19]:
# splitting data
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((3978, 162), (3978, 4), (1326, 162), (1326, 4))

In [20]:
# scaling our data with sklearn's Standard scaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

dump(scaler, 'std_scaler.bin', compress=True)

['std_scaler.bin']

In [21]:
# making our data compatible to model.
x_train = np.expand_dims(x_train, axis=2)
x_test = np.expand_dims(x_test, axis=2)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((3978, 162, 1), (3978, 4), (1326, 162, 1), (1326, 4))

In [23]:
model = Sequential()
model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=(x_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))
model.add(Dropout(0.2))

model.add(Conv1D(64, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Flatten())
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(units=4, activation='softmax'))
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_4 (Conv1D)           (None, 162, 256)          1536      
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 81, 256)          0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (None, 81, 256)           327936    
                                                                 
 max_pooling1d_5 (MaxPooling  (None, 41, 256)          0         
 1D)                                                             
                                                                 
 conv1d_6 (Conv1D)           (None, 41, 128)           163968    
                                                                 
 max_pooling1d_6 (MaxPooling  (None, 21, 128)         

In [24]:
callback = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=3,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=False,
)

history=model.fit(x_train, y_train, batch_size=64, epochs=50, validation_data=(x_test, y_test), callbacks=[callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50


In [25]:
print("Accuracy of our model on test data : " , model.evaluate(x_test,y_test)[1]*100 , "%")

Accuracy of our model on test data :  83.18250179290771 %


In [34]:
model.save('SER_model.h5')