#Stage 1 - Import libraries and load data

In [25]:
import os
import random
import sys

#%tensorflow_version 1.4 

## Package
import glob 
import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import plotly.graph_objs as go
#import plotly.offline as py
#import plotly.tools as tls
import seaborn as sns
import scipy.io.wavfile
from sklearn.model_selection import StratifiedShuffleSplit
#py.init_notebook_mode(connected=True)

from scipy import signal
from tqdm import tqdm


from keras import regularizers
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping
from keras.callbacks import  History, ReduceLROnPlateau, CSVLogger
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.utils import to_categorical

from sklearn.preprocessing import LabelEncoder

In [26]:
#from google.colab import drive
#drive.mount('/content/drive')

# Read list of audio files from the source directory
source_dir_list = os.listdir('/Users/saurabh/Downloads/Emotion_Detection 2/sed_data_set')
source_dir_list.sort()
source_dir_list = source_dir_list[1:]
print (source_dir_list)



['.ipynb_checkpoints', 'Actor_01', 'Actor_02', 'Actor_03', 'Actor_04', 'Actor_05', 'Actor_06', 'Actor_07', 'Actor_08', 'Actor_09', 'Actor_10', 'Actor_11', 'Actor_12', 'Actor_13', 'Actor_14', 'Actor_15', 'Actor_16', 'Actor_17', 'Actor_18', 'Actor_19', 'Actor_20', 'Actor_21', 'Actor_22', 'Actor_23', 'Actor_24']


In [27]:
audio_data = pd.DataFrame(columns=['path', 'source', 'actor', 'gender',
                                'intensity', 'statement', 'repetition', 'emotion'])
count = 0
for directory in source_dir_list:
    file_list = os.listdir('/Users/saurabh/Downloads/Emotion_Detection 2/sed_data_set' + directory)
    for files in file_list:
        name = files.split('.')[0].split('-')
        path = '/Users/saurabh/Downloads/Emotion_Detection 2/sed_data_set' + directory + '/' + files
        src = int(name[1])
        actor = int(name[-1])
        emotion = int(name[2])
        
        if int(actor)%2 == 0:
            gender = "female"
        else:
            gender = "male"
        
        if name[3] == '01':
            intensity = 0
        else:
            intensity = 1
        
        if name[4] == '01':
            statement = 0
        else:
            statement = 1
        
        if name[5] == '01':
            repeat = 0
        else:
            repeat = 1
            
        audio_data.loc[count] = [path, src, actor, gender, intensity, statement, repeat, emotion]
        count += 1

FileNotFoundError: [Errno 2] No such file or directory: '/Users/saurabh/Downloads/Emotion_Detection 2/sed_data_set.ipynb_checkpoints'

In [None]:
print (len(audio_data)) #Total number of audio files 
audio_data.head() #About the audio files in the source directory

#Stage 2 - Plotting a random audio file's waveform and its spectrogram


In [None]:
filename = audio_data.path[random.randint(0,len(audio_data)-1)]
print (filename)

samples, sample_rate = librosa.load(filename)
sample_rate, samples

In [None]:
len(samples), sample_rate

In [None]:
# Define Log Specgram function to generate frequency, times and spectrogram from the audio file
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [None]:
freqs, times, spectrogram = log_specgram(samples, sample_rate)

fig = plt.figure(figsize=(16, 10))
fig1 = fig.add_subplot(211)
fig1.set_title('Raw wave of ' + filename)
fig1.set_ylabel('Amplitude')
librosa.display.waveplot(samples, sr=sample_rate)

fig2 = fig.add_subplot(212)
fig2.imshow(spectrogram.T, aspect='auto', origin='lower', 
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])
fig2.set_yticks(freqs[::16])
fig2.set_xticks(times[::16])
fig2.set_title('Spectrogram of ' + filename)
fig2.set_ylabel('Freqs in Hz')
fig2.set_xlabel('Seconds')

In [None]:
#Trim silence part of the audio file
aa , bb = librosa.effects.trim(samples, top_db=30)
aa, bb

In [None]:
# Plotting Mel Power Spectrogram
S = librosa.feature.melspectrogram(aa, sr=sample_rate, n_mels=128)

# Convert to log scale (dB). We'll use the peak power (max) as reference.
log_S = librosa.power_to_db(S, ref=np.max)

plt.figure(figsize=(12,4))
librosa.display.specshow(log_S, sr=sample_rate, x_axis='time', y_axis='mel')
plt.title('Mel power spectrogram ')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()

In [None]:
# Plotting MFCC
mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)

#Second order differentiation of the Mel spectrogram
delta2_mfcc = librosa.feature.delta(mfcc, order=2)

plt.figure(figsize=(12, 4))
librosa.display.specshow(delta2_mfcc)
plt.ylabel('MFCC coeffs')
plt.xlabel('Time')
plt.title('MFCC')
plt.colorbar()
plt.tight_layout()

In [None]:
# Silence Trimmed Sample Audio 
ipd.Audio(aa, rate=sample_rate)

#Stage 3- Define the Output classes


In [None]:
label_list = []

for i in range(len(audio_data)):
    if audio_data.emotion[i] == 1:
        lb = "_neutral"
    elif audio_data.emotion[i] == 2:
        lb = "_calm"
    elif audio_data.emotion[i] == 3:
        lb = "_happy"
    elif audio_data.emotion[i] == 4:
        lb = "_sad"
    elif audio_data.emotion[i] == 5:
        lb = "_angry"
    elif audio_data.emotion[i] == 6:
        lb = "_fearful"
    elif audio_data.emotion[i] == 7:
        lb = "_disgust"
    elif audio_data.emotion[i] == 8:
        lb = "_surprised"
    else:
        lb = "_none"
        
    # Add gender to the label 
    label_list.append(audio_data.gender[i]  + lb)

#Append the labels to audio data

audio_data['label'] = label_list
audio_data.tail()

#Stage 4 - Extracting features of all audio files using librosa


In [None]:
#Extract the features using MFCC
data = pd.DataFrame(columns=['feature'])
for i in tqdm(range(len(audio_data))):

    X, sample_rate = librosa.load(audio_data.path[i], res_type='kaiser_fast',duration=3,sr=22050*2,offset=0.5)
    sample_rate = np.array(sample_rate)
    pure_X , bb = librosa.effects.trim(X, top_db=30)
    mfccs = np.mean(librosa.feature.mfcc(y=pure_X, sr=sample_rate, n_mfcc=13), axis=0)
    feature = mfccs
    data.loc[i] = [feature]

In [None]:
# Display features of first 5 Audio Files 
data.head()

In [None]:
# Covert features into a DataFrame and label them accordingly

feature_data = pd.DataFrame(data['feature'].values.tolist())
labels = audio_data.label

labeled_data = pd.concat([feature_data,labels],axis=1)

new_labeled_data = labeled_data.rename(index=str, columns={"0": "label"})

new_labeled_data.head(10)

In [None]:
# Replace null values with 0

print(new_labeled_data.isnull().sum().sum())

new_labeled_data = labeled_data.fillna(0)
new_labeled_data.isnull().sum().sum()

#Plot Emotion Distribution of Data

In [None]:
#Plot the graph of total number of audio files based on its emotional class 

tmp_data = pd.DataFrame()
tmp_data['Emotion'] = list(audio_data.label.value_counts().keys())
tmp_data['Count'] = list(audio_data.label.value_counts())
fig, ax = plt.subplots(figsize=(14, 7))
ax = sns.barplot(x="Emotion", y='Count', color='#FF5733', data=tmp_data)
ax.set_title("Data Distribution")
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)

In [None]:
X = new_labeled_data.drop(['label'], axis=1)
y = new_labeled_data.label
X_Stratified = StratifiedShuffleSplit(1, test_size=0.2, random_state=0)
for train_index, test_index in X_Stratified.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

x_train_cnn = np.expand_dims(X_train, axis=2)
x_test_cnn = np.expand_dims(X_test, axis=2)


In [None]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))
y_test = np_utils.to_categorical(lb.fit_transform(y_test))

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


In [None]:
x_train_cnn = np.expand_dims(X_train, axis=2)
x_test_cnn = np.expand_dims(X_test, axis=2)

x_train_cnn.shape

In [None]:
new_labeled_data=pd.read_csv('./data2.csv')
X = new_labeled_data.drop(['label'], axis=1)
y = new_labeled_data.label
X_Stratified = StratifiedShuffleSplit(1, test_size=0.2, random_state=0)
for train_index, test_index in X_Stratified.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))
y_test = np_utils.to_categorical(lb.fit_transform(y_test))

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


from sklearn.externals import joblib
scaler_filename = "scaler.save"
joblib.dump(scaler, scaler_filename) 



x_train_cnn = np.expand_dims(X_train, axis=2)
x_test_cnn = np.expand_dims(X_test, axis=2)

x_train_cnn.shape

In [None]:
print(y_train[0],y_train[1])

In [None]:
x_test_cnn.shape

In [None]:
#Setup Keras util functions
import keras
from keras import backend as K

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def fscore(y_true, y_pred):
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    f_score = 2 * (p * r) / (p + r + K.epsilon())
    return f_score

def get_lr_metric(optimizer):
    def lr(y_true, y_pred):
        return optimizer.lr
    return lr


In [None]:
import tensorflow as tf 
print(tf.__version__)
model = Sequential()
model.add(Conv1D(256, 8, padding='same',input_shape=(X_train.shape[1],1)))
model.add(Activation('relu'))
model.add(Conv1D(256, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.10))
model.add(MaxPooling1D(pool_size=(8)))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=(8)))
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))
model.add(Flatten())
# Edit according to target class no.
model.add(Dense(16))
model.add(Activation(tf.nn.softmax))
opt = keras.optimizers.SGD(lr=0.01, momentum=0.5, decay=0.0, nesterov=False)


In [None]:
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [None]:
lr_reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=20, min_lr=0.01)
mcp_save = ModelCheckpoint('./CNN4.h5', save_best_only=True, monitor='val_loss', mode='min')
cnnhistory=model.fit(x_train_cnn,y_train, batch_size=60, epochs=200, validation_data=(x_test_cnn, y_test), callbacks=[mcp_save, lr_reduce])

In [None]:
import json
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)


from keras.models import model_from_json
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("./CNN4.h5")
print("Loaded model from disk")

##
#test_sound=one_sound('---.wav')
#a,b,c,d,e=feature_extraction(test_sound)
#test_f=stacking 3horizontal (a,b,c,d,e)
#loaded_model.predict(test_f)



# evaluate loaded model on test data
loaded_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
score = loaded_model.evaluate(x_test_cnn, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

preds = loaded_model.predict(x_test_cnn, batch_size=16,
                         verbose=1)

### Prediction Part

In [None]:
model

In [None]:
model.save('CNn6.h5')


In [None]:
new_model = keras.models.load_model('CNn6.h5')


In [None]:
loaded_model.predict_classes(tmp)

In [None]:
loaded_model.predict_classes(tmp)

In [None]:
tmp=x_test_cnn[100].reshape(1,193,1)

In [None]:
#data = pd.DataFrame(columns=['feature'])
audio_path='./sed_data_set/Actor_07/03-01-08-02-01-02-07.wav'
mfccs, chroma, mel, contrast, tonnetz = feature_extraction(audio_path)
extracted_features= np.hstack([mfccs,chroma, mel, contrast, tonnetz]) 
#data.loc[i] = [extracted_features]