In [5]:
import sys
import os
import pandas as pd
# insert the path to the source code directory
module_path = os.path.abspath(os.path.join('..', '..')) 
sys.path.insert(0, module_path)

from src.utils import load_emodb, load_ravdess, load_tess, load_crema_d, filter_emotions

emodb = filter_emotions(load_emodb())
ravdess = filter_emotions(load_ravdess())
tess = filter_emotions(load_tess())
crema = filter_emotions(load_crema_d())
df = pd.concat([emodb, ravdess, tess, crema]).sample(1000)

Path to dataset files: /Users/sofiafernandes/.cache/kagglehub/datasets/piyushagni5/berlin-database-of-emotional-speech-emodb/versions/1
Path to dataset files: /Users/sofiafernandes/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1
['Actor_16', 'Actor_11', 'Actor_18', 'Actor_20', 'Actor_21', 'Actor_19', 'Actor_10', 'Actor_17', 'Actor_04', 'Actor_03', 'Actor_02', 'Actor_05', 'audio_speech_actors_01-24', 'Actor_12', 'Actor_15', 'Actor_23', 'Actor_24', 'Actor_22', 'Actor_14', 'Actor_13', 'Actor_09', 'Actor_07', 'Actor_06', 'Actor_01', 'Actor_08']
Path to dataset files: /Users/sofiafernandes/.cache/kagglehub/datasets/ejlok1/toronto-emotional-speech-set-tess/versions/1
['TESS']
['YAF_disgust', 'OAF_Pleasant_surprise', 'OAF_happy', 'YAF_sad', 'TESS Toronto emotional speech set data', 'YAF_happy', 'YAF_neutral', 'OAF_Fear', 'OAF_angry', 'YAF_pleasant_surprised', 'YAF_fear', 'OAF_neutral', 'OAF_disgust', 'YAF_angry', 'OAF_Sad']
Path to dataset files: /Users/sofiafe

# A Novel Hybrid Deep Learning Technique for Speech Emotion Detection using Feature Engineering

https://arxiv.org/pdf/2507.07046v1

In [6]:
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="librosa")
from tqdm import tqdm
tqdm.pandas()

def prepare_dataset(df, max_len=200):
    X, y = [], []
    for _, row in tqdm(df.iterrows()):
        feats = extract_features(row["filename"], max_len=max_len)
        X.append(feats)
        y.append(row["label"])

    X = np.array(X)   # (samples, frames, features)
    le = LabelEncoder()
    y = le.fit_transform(y)  # numeric labels
    return X, y, le


def extract_features(filename, max_len=200):
    """
    Extracts acoustic features from an audio file and pads/truncates them to max_len frames.
    Returns a feature vector (frames x features).
    """
    y, sr = librosa.load(filename, sr=None)

    # ---- MFCCs ----
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

    # ---- Standard deviation across frames (static) ----
    mfcc_std = np.std(mfcc, axis=1, keepdims=True)  # (20,1)
    mfcc_std = np.repeat(mfcc_std, mfcc.shape[1], axis=1)  # repeat across time frames

    # ---- Chroma features ----
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_cqt = librosa.feature.chroma_cqt(y=y, sr=sr)
    chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)
    chroma = np.vstack([chroma_stft, chroma_cqt, chroma_cens])

    # ---- Log Mel Spectrogram ----
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=64)
    mel = librosa.power_to_db(mel)

    # ---- Spectral Contrast ----
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr, n_bands=6)

    # ---- Energy (RMS) ----
    energy = librosa.feature.rms(y=y)
    energy = np.repeat(energy, mfcc.shape[1], axis=1)  # repeat to match frames

    # ---- Zero Crossing Rate ----
    zcr = librosa.feature.zero_crossing_rate(y)
    zcr = np.repeat(zcr, mfcc.shape[1], axis=1)

    # ---- Align all features to the same frame length ----
    min_frames = min(mfcc.shape[1], chroma.shape[1], mel.shape[1],
                     contrast.shape[1], energy.shape[1], zcr.shape[1])

    mfcc = mfcc[:, :min_frames]
    mfcc_delta = mfcc_delta[:, :min_frames]
    mfcc_delta2 = mfcc_delta2[:, :min_frames]
    mfcc_std = mfcc_std[:, :min_frames]
    chroma = chroma[:, :min_frames]
    mel = mel[:, :min_frames]
    contrast = contrast[:, :min_frames]
    energy = energy[:, :min_frames]
    zcr = zcr[:, :min_frames]

    # ---- Concatenate ----
    features = np.vstack([mfcc, mfcc_delta, mfcc_delta2, mfcc_std,
                          chroma, mel, contrast, energy, zcr])

    # ---- Pad/Truncate to max_len ----
    if features.shape[1] < max_len:
        pad_width = max_len - features.shape[1]
        features = np.pad(features, ((0,0),(0,pad_width)), mode="constant")
    else:
        features = features[:, :max_len]

    return features.T  # (frames, features)



In [7]:
from tensorflow.keras import layers, models

def build_model(input_shape, num_classes):
    inp = layers.Input(shape=input_shape)

    # 3 stacked BiLSTM layers with normalization and dropout
    x = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(inp) #512
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    x = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    # Final BiLSTM layer output per sequence → aggregated via last timestep
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=False))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    # Dense + LeakyReLU
    x = layers.Dense(128)(x)
    x = layers.LeakyReLU()(x)
    x = layers.Dropout(0.3)(x)

    # Final output (clip-level softmax)
    out = layers.Dense(num_classes, activation="softmax")(x)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    return model


In [8]:
len(df)

1000

In [9]:
# Prepare dataset
X, y, le = prepare_dataset(df, max_len=200)

# Train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


0it [00:00, ?it/s]

1000it [05:34,  2.99it/s]


In [10]:
# Build model
model = build_model(input_shape=(200, X.shape[2]), num_classes=len(le.classes_))

model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=16, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7f7ef3ebd970>

In [11]:
# evaluate
from sklearn.metrics import classification_report, confusion_matrix
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_classes, target_names=le.classes_))


              precision    recall  f1-score   support

       angry       0.49      0.61      0.54        33
     disgust       0.40      0.31      0.35        26
        fear       0.36      0.12      0.18        34
       happy       0.29      0.46      0.36        35
     neutral       0.32      0.30      0.31        27
         sad       0.51      0.60      0.55        35
    surprise       1.00      0.70      0.82        10

    accuracy                           0.42       200
   macro avg       0.48      0.44      0.44       200
weighted avg       0.43      0.42      0.41       200



In [None]:
# save end to end model


In [15]:
from src.utils import load_iemocap, load_meld
iemocap = filter_emotions(load_iemocap())
meld = filter_emotions(load_meld())
y_test = le.fit_transform(meld['label'])

X_test = meld['filename'].progress_apply(extract_features)

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_classes, target_names=le.classes_))

  y, sr = librosa.load(filename, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(filename, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(filename, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(filename, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(filename, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(filename, sr

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).