<a href="https://colab.research.google.com/github/shehadeh7/APS360-Project/blob/main/APS360_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install pydub
! pip install noisereduce

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download dataset from Kaggle (direct link: https://www.kaggle.com/dmitrybabko/speech-emotion-recognition-en)
!kaggle datasets download dmitrybabko/speech-emotion-recognition-en

In [5]:
!mkdir ./data

In [6]:
%%capture
!unzip speech-emotion-recognition-en.zip -d ./data

In [7]:
import os

emotions = {
  'happy': 0, 
  'sad': 1, 
  'angry': 2, 
  'disgust': 3, 
  'fear': 4, 
  'neutral': 5
}

ravdess_emotions = {
    '01': emotions['neutral'],
    '03': emotions['happy'],
    '04': emotions['sad'],
    '05': emotions['angry'],
    '06': emotions['fear'],
    '07': emotions['disgust'],
}

crema_emotions = {
  'SAD': emotions['sad'], 
  'ANG': emotions['angry'], 
  'DIS': emotions['disgust'],
  'FEA': emotions['fear'], 
  'HAP': emotions['happy'], 
  'NEU': emotions['neutral']
}

savee_emotions = {
    'a': emotions['angry'],
    'd': emotions['disgust'],
    'f': emotions['fear'],
    'h': emotions['happy'],
    'n': emotions['neutral']
}

processed_data = []

data_path = "/content/data"
for root, dirs, files in os.walk(data_path):
  for file in files:
    file_path = os.path.join(root, file)

    dataset = file_path.split('/')[3]
    if dataset == 'Ravdess':
      emotion_label = ravdess_emotions.get(file.split('-')[2], None)
    elif dataset == 'Crema':
      emotion_label = crema_emotions.get(file.split('_')[2], None)
    elif dataset == 'Tess':
      emotion_label = emotions.get(file.split('_')[2].split('.')[0], None)
    else:
      emotion_code = file.split('_')[1][:2]
      if (emotion_code == 'sa'):
        emotion_label = emotions['sad']
      else:
        emotion_label = savee_emotions.get(emotion_code[0], None)

    if (emotion_label != None):
      processed_data.append([file_path, dataset, emotion_label])

In [8]:
import librosa
import numpy as np

from pydub import AudioSegment, effects
import noisereduce as nr

frame_length = 2048
hop_length = 512
total_length = 180000 # verify this value?

def extract_feature(file_name): 

    _, sample_rate = librosa.load(file_name, sr=None)

    rawsound = AudioSegment.from_file(file_name) 
    # Normalize the audio to +5.0 dBFS.
    normalizedsound = effects.normalize(rawsound, headroom = 5.0) 
    # Transform the normalized audio to np.array of samples.
    normal_x = np.array(normalizedsound.get_array_of_samples(), dtype = 'float32')
    # Trim silence from the beginning and the end.
    xt, index = librosa.effects.trim(normal_x, top_db=30)
    # Pad for duration equalization.
    padded_x = librosa.util.fix_length(xt, size=total_length)
    # Noise reduction.
    final_x = nr.reduce_noise(y=padded_x, sr=sample_rate)
    
    # Features extraction   
    stft = np.abs(librosa.stft(final_x))
    mfccs = librosa.feature.mfcc(y=final_x, sr=sample_rate, n_mfcc=15)
    chroma = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
    mel = librosa.feature.melspectrogram(final_x, sr=sample_rate)
    contrast = librosa.feature.spectral_contrast(S=stft, sr=sample_rate)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(final_x), sr=sample_rate)
    return mfccs,chroma,mel,contrast,tonnetz


In [None]:
#print(processed_data)

In [21]:
# Obtain features from each wav file
labels = []
features = []
i = 0

mfccs = []
chroma = []
mel = []
contrast = []
tonnetz = []
mylists = [mfccs, chroma, mel, contrast, tonnetz]
for data in processed_data:
    labels.append(data[2])
    result = extract_feature(data[0])
    for x, lst in zip(result, mylists):
        lst.append(x)
    i+=1
    if (i>=1000):
        break


In [22]:
# Change feature lists to np array of size timestamp x features
a_mfccs = np.asarray(mfccs).astype('float32')
a_mfccs = np.swapaxes(a_mfccs, 1, 2)
a_chroma = np.asarray(chroma).astype('float32')
a_chroma = np.swapaxes(a_chroma, 1, 2)
a_mel = np.asarray(mel).astype('float32')
a_mel = np.swapaxes(a_mel, 1, 2)
a_contrast = np.asarray(contrast).astype('float32')
a_contrast = np.swapaxes(a_contrast, 1, 2)
a_tonnetz = np.asarray(tonnetz).astype('float32')
a_tonnetz = np.swapaxes(a_tonnetz, 1, 2)

print('MFCCS shape:',a_mfccs.shape)
print('CHROMA shape:',a_chroma.shape)

MFCCS shape: (1000, 352, 15)
CHROMA shape: (1000, 352, 12)


In [23]:
# Create np arrays for data and labels
X = np.concatenate((a_mfccs, a_chroma, a_mel, a_contrast, a_tonnetz), axis=2)
Y = np.array(labels)

In [24]:
import sklearn
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# dataSize = 1000
# testSplit = 0.1 # 10% for testsplit

#Create a Gaussian Classifier
clf=RandomForestClassifier(criterion='entropy')

#Train the model using the training sets y_pred=clf.predict(X_test)
# Flatten X to a 2d array for random forest
nsamples, nx, ny = X.shape
X_2d = X.reshape((nsamples,nx*ny))
clf.fit(X_2d[:800,:], Y[:800])



RandomForestClassifier(criterion='entropy')

In [25]:
y_pred=clf.predict(X_2d[800:,:])

pscore = sklearn.metrics.accuracy_score(y_pred, Y[800:])
print(pscore)


0.515
