<a href="https://colab.research.google.com/github/shehadeh7/APS360-Project/blob/main/NSL_Mean_Models/processingmodel_withMFCCOnly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install pydub
! pip install noisereduce
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive (speech-emotion-recognition-en.zip) from Kaggle
# into the current working directory.
# Dataset page: https://www.kaggle.com/dmitrybabko/speech-emotion-recognition-en
# Needs the API token installed at ~/.kaggle/kaggle.json by the setup cell.
!kaggle datasets download dmitrybabko/speech-emotion-recognition-en

Downloading speech-emotion-recognition-en.zip to /content
100% 985M/987M [00:18<00:00, 55.1MB/s]
100% 987M/987M [00:18<00:00, 55.3MB/s]


In [None]:
!mkdir ./data

In [None]:
%%capture
!unzip speech-emotion-recognition-en.zip -d ./data

In [None]:
import numpy as np
import os
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

In [None]:
# Integer class ids shared by all corpora (these are the training targets).
emotions = {
  'happy': 0,
  'sad': 1,
  'angry': 2,
  'disgust': 3,
  'fear': 4,
  'neutral': 5
}

# Ravdess: the third '-'-separated filename field is a two-digit emotion code.
# Codes that are not listed here yield no label and the file is skipped.
ravdess_emotions = {
    '01': emotions['neutral'],
    '03': emotions['happy'],
    '04': emotions['sad'],
    '05': emotions['angry'],
    '06': emotions['fear'],
    '07': emotions['disgust'],
}

# Crema: the third '_'-separated filename field is a three-letter emotion code.
crema_emotions = {
  'SAD': emotions['sad'],
  'ANG': emotions['angry'],
  'DIS': emotions['disgust'],
  'FEA': emotions['fear'],
  'HAP': emotions['happy'],
  'NEU': emotions['neutral']
}

# Savee: keyed by the first letter of the emotion code. The two-letter code
# 'sa' (sad) is handled explicitly in label_from_filename, so 's' is absent.
savee_emotions = {
    'a': emotions['angry'],
    'd': emotions['disgust'],
    'f': emotions['fear'],
    'h': emotions['happy'],
    'n': emotions['neutral']
}


def label_from_filename(dataset, file_name):
  """Return the integer emotion label encoded in a file name, or None.

  Args:
    dataset: Name of the top-level directory under the data root
      ('Ravdess', 'Crema', 'Tess'); any other value is parsed with the
      remaining (Savee-style) naming convention.
    file_name: Base name of the audio file.

  Returns:
    A value from ``emotions``, or None when the encoded emotion is not one of
    the six classes or the name lacks the expected fields.
  """
  try:
    if dataset == 'Ravdess':
      return ravdess_emotions.get(file_name.split('-')[2])
    if dataset == 'Crema':
      return crema_emotions.get(file_name.split('_')[2])
    if dataset == 'Tess':
      # The last field still carries the extension, e.g. 'happy.wav'.
      return emotions.get(file_name.split('_')[2].split('.')[0])
    emotion_code = file_name.split('_')[1][:2]
    if emotion_code == 'sa':
      return emotions['sad']
    return savee_emotions.get(emotion_code[0])
  except IndexError:
    # The name has too few fields for its convention (e.g. a stray
    # non-audio file): skip it instead of aborting the whole scan.
    return None


# Rows of [file_path, dataset, emotion_label] for every usable audio file.
processed_data = []

data_path = "/content/data"
for root, dirs, files in os.walk(data_path):
  # os.walk yields entries in arbitrary filesystem order; sorting makes the
  # row order (and any seeded split built from it) reproducible.
  dirs.sort()
  for file in sorted(files):
    file_path = os.path.join(root, file)

    # The dataset is the first path component below data_path, independent of
    # how deeply data_path itself is nested.
    dataset = os.path.relpath(file_path, data_path).split(os.sep)[0]
    emotion_label = label_from_filename(dataset, file)

    # Explicit None test: label 0 ('happy') is falsy but valid.
    if emotion_label is not None:
      processed_data.append([file_path, dataset, emotion_label])

In [None]:
import librosa


from pydub import AudioSegment, effects
import noisereduce as nr

# NOTE(review): frame_length and hop_length are not referenced by any code
# visible in this notebook; extract_feature calls librosa with its defaults.
frame_length = 2048
hop_length = 512
total_length = 180000 # samples per clip after padding/cropping in extract_feature; TODO: confirm this covers the longest clip

def extract_feature(file_name):
    """Compute an MFCC matrix for one audio file.

    Pipeline: peak-normalise, trim leading/trailing silence, pad or crop to
    ``total_length`` samples, reduce noise, then compute 15 MFCCs.

    Args:
        file_name: Path to an audio file readable by both librosa and pydub.

    Returns:
        The array produced by ``librosa.feature.mfcc`` with ``n_mfcc=15``
        (one row per coefficient, one column per frame), or None when the
        decoded signal is entirely zero.
    """
    # librosa is used for the native sample rate and for the all-zero check.
    y, sample_rate = librosa.load(file_name, sr=None)

    # Don't process corrupted (all-zero) audio signals.
    if not np.any(y):
        return None

    rawsound = AudioSegment.from_file(file_name)
    # get_array_of_samples() interleaves the channels of multi-channel audio,
    # which would be misread below as a mono signal twice as long. Downmix
    # first; mono files are left unchanged.
    if rawsound.channels > 1:
        rawsound = rawsound.set_channels(1)
    # Peak-normalise leaving 5 dB of headroom (i.e. the peak sits at -5 dBFS).
    normalizedsound = effects.normalize(rawsound, headroom=5.0)
    # Transform the normalized audio to an np.array of samples.
    normal_x = np.array(normalizedsound.get_array_of_samples(), dtype='float32')
    # Trim silence from the beginning and the end.
    xt, _ = librosa.effects.trim(normal_x, top_db=30)
    # Pad or crop so every clip has exactly total_length samples.
    padded_x = librosa.util.fix_length(xt, size=total_length)
    # Noise reduction.
    final_x = nr.reduce_noise(y=padded_x, sr=sample_rate)

    # Feature extraction: MFCCs only.
    mfccs = librosa.feature.mfcc(y=final_x, sr=sample_rate, n_mfcc=15)
    return mfccs

In [None]:
# Run extract_feature over every labelled file. Labels and MFCC matrices are
# appended together, so the two lists stay index-aligned; files for which
# extract_feature returns None are skipped.
labels, features, mfccs = [], [], []
mylists = [mfccs]
i = 0  # 1-based count of files visited so far
for i, data in enumerate(processed_data, start=1):
    wav_path, emotion_label = data[0], data[2]
    feature_matrix = extract_feature(wav_path)
    if feature_matrix is not None:
        labels.append(emotion_label)
        mfccs.append(feature_matrix)
    # Progress report every 100 files.
    if not i % 100:
        print(i)

In [None]:
# Stack the per-file MFCC matrices into one float32 array, then exchange the
# last two axes so each sample is laid out as (frames, coefficients).
a_mfccs = np.asarray(mfccs).astype('float32').swapaxes(1, 2)

print('MFCCS shape:', a_mfccs.shape)

MFCCS shape: (11317, 352, 15)


In [None]:
# Model inputs (MFCC array) and targets (labels as an np array).
X, Y = a_mfccs, np.array(labels)

In [None]:
# Persist features and labels to disk. np.save appends the ".npy" extension,
# producing x_data.npy and y_data.npy.
x_path, y_path = './x_data', './y_data'
for target_path, array in ((x_path, X), (y_path, Y)):
    np.save(target_path, array)

In [None]:
# Bundle the saved feature and label arrays into one archive.
! zip mfcc_only_X.zip x_data.npy y_data.npy

  adding: x_data.npy (deflated 74%)
  adding: y_data.npy (deflated 94%)


In [None]:
# X and Y are left in extraction order here. To shuffle both with the same
# seeded permutation instead: X, Y = shuffle(X, Y, random_state=0)
# Sanity check: the first dimension of X and Y should match.
for array in (X, Y):
    print(array.shape)

(11317, 352, 15)
(11317,)


In [None]:
# !pip install neural-structured-learning

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Seed TensorFlow's global random generator (weight initialisation, dropout)
# to match the fixed random_state of the splits below. This improves
# run-to-run repeatability; it does not guarantee bit-identical results.
tf.random.set_seed(1)

# 60/20/20 train/validation/test split: hold out 20% for testing, then take
# 25% of the remaining 80% (0.25 x 0.8 = 0.2) for validation.
X_train_0, X_test, y_train_0, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train_0, y_train_0, test_size=0.25, random_state=1)

# Base model: each (frames, coefficients) MFCC matrix is flattened and passed
# through a dense network with batch normalisation and dropout; the final
# softmax has one unit per emotion class (6).
model = tf.keras.Sequential([
    tf.keras.Input(shape=X.shape[1:]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(6, activation=tf.nn.softmax)
])

# Optional (disabled): wrap the model with adversarial regularization from
# neural-structured-learning.
# adv_config = nsl.configs.make_adv_reg_config(multiplier=0.2, adv_step_size=0.05)
# adv_model = nsl.keras.AdversarialRegularization(model, adv_config=adv_config)

# The optimizer is RMSprop, so it is named accordingly (it was previously
# bound to a variable called `adam`).
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0001)

# Integer class ids as targets -> sparse categorical cross-entropy.
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Keep the History object so the training/validation curves can be inspected.
history = model.fit(X_train, y_train, batch_size=64, epochs=200, validation_data=(X_val, y_val))

# The held-out test split is not evaluated in this cell. When ready:
# model.evaluate(X_test, y_test)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7f973ff04290>

In [None]:
# Inspect the label array.
# NOTE(review): Y is built from the integer emotion ids (third field of each
# processed_data row), but the output recorded for this cell shows dataset
# names, so it appears to come from a different kernel state. Re-run to refresh.
print(Y)

['Ravdess' 'Ravdess' 'Ravdess' ... 'Crema' 'Crema' 'Crema']
