In [None]:
!pip install librosa soundfile scikit-learn numpy



In [None]:
import kagglehub

# Download the latest version of the RAVDESS emotional speech audio dataset.
# `path` holds whatever local directory kagglehub.dataset_download returns.
path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'ravdess-emotional-speech-audio' dataset.
Path to dataset files: /kaggle/input/ravdess-emotional-speech-audio


*RAVDESS* /
   * Actor_01/
   * Actor_02/
   * Actor_03/
   * ...

Inside each folder we have

* 03-01-01-01-01-01-01.wav
* 03-01-02-01-01-01-01.wav etc

To decode a file name, split it on `-`.

So,

03-01-05-01-02-01-12.wav

becomes

03 | 01 | 05 | 01 | 02 | 01 | 12

The third field is the emotion code, which maps to

* 01 → neutral
* 02 → calm
* 03 → happy
* 04 → sad
* 05 → angry
* 06 → fear
* 07 → disgust
* 08 → surprise



In [None]:

import os

# Root of the dataset. Reuse the directory returned by the kagglehub download
# cell instead of a hard-coded absolute path, so the scan follows the data to
# wherever it was actually placed.
BASE_PATH = path

# Third dash-separated field of a RAVDESS file name -> emotion label.
EMOTION_MAP = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fear",
    "07": "disgust",
    "08": "surprise"
}

# Parallel lists: wav path, emotion label and speaker (actor folder) per file.
files = []
labels = []
speakers = []

for folder in sorted(os.listdir(BASE_PATH)):

    folder_path = os.path.join(BASE_PATH, folder)

    # Only the per-actor directories hold the recordings we want.
    if not folder.startswith("Actor_") or not os.path.isdir(folder_path):
        continue

    # os.listdir returns entries in arbitrary order; sort so that the sample
    # order (and therefore files[0], files[1], ...) is the same on every run.
    for fname in sorted(os.listdir(folder_path)):

        if not fname.endswith(".wav"):
            continue

        parts = fname.split("-")
        if len(parts) < 3:
            continue

        emo = parts[2]

        # Skip files whose emotion code is not one we know how to label.
        if emo not in EMOTION_MAP:
            continue

        files.append(os.path.join(folder_path, fname))
        labels.append(EMOTION_MAP[emo])
        speakers.append(folder)

print("files   :", len(files))
print("labels  :", len(labels))
print("speakers:", len(speakers))

# Fail here with a clear message rather than later inside feature extraction.
assert files, f"no RAVDESS .wav files found under {BASE_PATH}"


files   : 1440
labels  : 1440
speakers: 1440


In [None]:
import librosa
import numpy as np

SR = 16000      # sample rate every file is resampled to on load
N_MFCC = 40     # number of MFCC coefficients per frame

def extract_mfcc_dd_cmvn_features(path):
    """Return a fixed-length utterance embedding built from MFCC statistics.

    The audio is loaded as mono at ``SR`` Hz, leading/trailing silence is
    trimmed (``top_db=30``), and MFCCs plus their first- and second-order
    deltas are stacked into a ``(3 * N_MFCC, T)`` matrix. Each row is then
    summarised over time by its mean and standard deviation.

    The statistics are taken from the *un-normalised* frames on purpose:
    applying per-utterance mean/variance normalisation before pooling forces
    every pooled mean to ~0 and every pooled std to ~1, so all utterances
    collapse to (almost) the same vector and the classifier sees no signal.
    Feature scaling across the dataset is left to the ``StandardScaler`` step
    of the downstream pipeline.

    Parameters
    ----------
    path : str
        Path to an audio file readable by ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``2 * 3 * N_MFCC`` (240 with the defaults):
        the per-coefficient means followed by the per-coefficient stds.
    """

    y, sr = librosa.load(path, sr=SR, mono=True)
    y, _ = librosa.effects.trim(y, top_db=30)

    mfcc = librosa.feature.mfcc(
        y=y,
        sr=sr,
        n_mfcc=N_MFCC
    )

    delta1 = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)

    feats = np.vstack([mfcc, delta1, delta2])   # (120, T)

    # Pool over the time axis on the raw features (see docstring for why no
    # per-utterance CMVN is applied first).
    feat_mean = np.mean(feats, axis=1)
    feat_std  = np.std(feats, axis=1)

    final_feat = np.hstack([feat_mean, feat_std])

    return final_feat


In [None]:
from tqdm import tqdm
import numpy as np

# One pooled MFCC vector per wav file, stacked into an (n_files, n_features)
# matrix; labels and speaker ids become arrays aligned with its rows.
X = np.array([extract_mfcc_dd_cmvn_features(wav_path) for wav_path in tqdm(files)])
y = np.array(labels)
groups = np.array(speakers)

print(X.shape, y.shape, groups.shape)


100%|██████████| 1440/1440 [00:30<00:00, 47.50it/s]

(1440, 240) (1440,) (1440,)





In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Single 80/20 split that keeps each speaker entirely on one side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# safety check: sizes of both sides, and the speakers they share (should be none)
print("Train samples:", X_train.shape[0])
print("Test samples :", X_test.shape[0])
print("Common speakers:", set(groups[train_idx]) & set(groups[test_idx]))


Train samples: 1140
Test samples : 300
Common speakers: set()


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# RBF-kernel SVM with class re-weighting, fed standardised features.
svm_clf = SVC(kernel="rbf", C=10, gamma="scale", class_weight="balanced")

pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm", svm_clf),
])

pipeline.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out speakers and print per-class precision / recall / F1.
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       angry       0.13      0.17      0.15        40
        calm       0.21      0.33      0.25        40
     disgust       0.23      0.17      0.20        40
        fear       0.20      0.20      0.20        40
       happy       0.09      0.07      0.08        40
     neutral       0.60      0.15      0.24        20
         sad       0.12      0.12      0.12        40
    surprise       0.14      0.12      0.13        40

    accuracy                           0.17       300
   macro avg       0.22      0.17      0.17       300
weighted avg       0.19      0.17      0.17       300



In [None]:
import librosa
import numpy as np

SR = 16000
N_MELS = 64
MAX_LEN = 300   # number of time frames (we will pad / cut)

def extract_logmel(path):
    """Return a fixed-size, normalised log-mel spectrogram for one audio file.

    The audio is loaded as mono at ``SR`` Hz and silence-trimmed
    (``top_db=30``). A mel power spectrogram (``N_MELS`` bands, ``n_fft=1024``,
    ``hop_length=256``) is converted to dB and standardised with the mean and
    standard deviation of the whole utterance. The time axis is then
    zero-padded on the right or truncated to exactly ``MAX_LEN`` frames; since
    padding happens after normalisation, the pad value 0 equals the
    utterance mean.

    Parameters
    ----------
    path : str
        Path to an audio file readable by ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(N_MELS, MAX_LEN)`` and dtype ``float32``. float32
        matches the later redefinition of this function used for ONNX
        inference and halves the memory of the stacked dataset.
    """

    y, sr = librosa.load(path, sr=SR, mono=True)
    y, _ = librosa.effects.trim(y, top_db=30)

    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_mels=N_MELS,
        n_fft=1024,
        hop_length=256
    )

    logmel = librosa.power_to_db(mel)

    # per-utterance normalization (epsilon guards against a zero std)
    logmel = (logmel - np.mean(logmel)) / (np.std(logmel) + 1e-8)

    # logmel shape = (N_MELS, T)

    # pad / cut time axis
    n_frames = logmel.shape[1]
    if n_frames < MAX_LEN:
        logmel = np.pad(logmel, ((0, 0), (0, MAX_LEN - n_frames)))
    else:
        logmel = logmel[:, :MAX_LEN]

    return logmel.astype(np.float32)


In [None]:
from tqdm import tqdm

# Stack one log-mel "image" per wav file into a single array.
X_img = np.array([extract_logmel(wav_path) for wav_path in tqdm(files)])

print(X_img.shape)


100%|██████████| 1440/1440 [00:19<00:00, 72.57it/s]


(1440, 64, 300)


In [None]:
# Add the trailing channel axis expected by the CNN input:
# (N, mel bands, frames) -> (N, mel bands, frames, 1).
# Guarded on ndim so that re-running this cell cannot append a second
# singleton axis and silently produce a 5-D tensor.
if X_img.ndim == 3:
    X_img = X_img[..., np.newaxis]

print(X_img.shape)


(1440, 64, 300, 1, 1)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string emotion labels as integer class ids. `le` is kept so that
# predicted ids can be mapped back to names with le.inverse_transform.
le = LabelEncoder()
y_enc = le.fit_transform(labels)

print(le.classes_)


['angry' 'calm' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']


In [None]:
from sklearn.model_selection import GroupShuffleSplit
import numpy as np

# Speaker-disjoint 80/20 split over the log-mel images and integer labels.
# Note: this rebinds X_train / X_test / y_train / y_test, which previously
# held the MFCC features used by the SVM cells.
groups = np.array(speakers)
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

train_idx, test_idx = next(gss.split(X_img, y_enc, groups=groups))

X_train, X_test = X_img[train_idx], X_img[test_idx]
y_train, y_test = y_enc[train_idx], y_enc[test_idx]

print(X_train.shape, X_test.shape)


(1140, 64, 300, 1, 1) (300, 64, 300, 1, 1)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout masks and batch shuffling are repeatable between runs (some GPU
# kernels may still introduce small run-to-run differences).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

num_classes = len(np.unique(y_enc))

# Small 3-block CNN over single-channel log-mel images. The input shape is
# derived from the feature-extraction constants so the two cannot drift apart.
model = models.Sequential([
    layers.Input(shape=(N_MELS, MAX_LEN, 1)),

    layers.Conv2D(16, (3,3), activation="relu"),
    layers.MaxPool2D((2,2)),

    layers.Conv2D(32, (3,3), activation="relu"),
    layers.MaxPool2D((2,2)),

    layers.Conv2D(64, (3,3), activation="relu"),
    layers.MaxPool2D((2,2)),

    layers.Flatten(),

    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),

    # One softmax output per emotion class.
    layers.Dense(num_classes, activation="softmax")
])

# Labels are integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [None]:
# Train the CNN for 15 epochs with batches of 32. No validation data is
# passed, so the log reports training loss / accuracy only.
history = model.fit(
    X_train, y_train,
    epochs=15,
    batch_size=32
)


Epoch 1/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 458ms/step - accuracy: 0.1786 - loss: 2.0153
Epoch 2/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 449ms/step - accuracy: 0.3922 - loss: 1.6027
Epoch 3/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 460ms/step - accuracy: 0.5522 - loss: 1.2685
Epoch 4/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 459ms/step - accuracy: 0.5971 - loss: 1.0951
Epoch 5/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 448ms/step - accuracy: 0.7124 - loss: 0.8147
Epoch 6/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 450ms/step - accuracy: 0.7918 - loss: 0.5980
Epoch 7/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 449ms/step - accuracy: 0.8481 - loss: 0.4589
Epoch 8/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 481ms/step - accuracy: 0.8855 - loss: 0.3198
Epoch 9/15
[1m36/36[0m [32m━━

In [None]:
# Score the trained CNN on the held-out speakers. evaluate returns the loss
# followed by the compiled metric (accuracy).
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test accuracy:", test_acc)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 115ms/step - accuracy: 0.3889 - loss: 3.8135
Test accuracy: 0.4099999964237213


In [None]:
def predict_emotion_cnn_with_gate(path):
    """Classify the emotion of one audio file, but only if the gate accepts it.

    The file is first passed to ``speech_music_gate``; anything other than the
    verdict ``"speech"`` short-circuits with a rejection string. Accepted
    files are converted to a log-mel image, given batch and channel axes, and
    scored by the notebook-level ``model``; the arg-max class id is mapped
    back to its label with ``le``.

    NOTE(review): ``speech_music_gate`` is not defined in the visible part of
    this notebook; confirm it exists before this function is called.

    Parameters
    ----------
    path : str
        Path to the audio file to classify.

    Returns
    -------
    str
        The predicted emotion label, or ``"REJECTED (not clean speech)"``.
    """
    if speech_music_gate(path) != "speech":
        return "REJECTED (not clean speech)"

    # (mel, frames) -> (1, mel, frames, 1)
    batch = extract_logmel(path)[np.newaxis, ..., np.newaxis]

    scores = model.predict(batch)
    best = np.argmax(scores, axis=1)[0]

    return le.inverse_transform([best])[0]


In [None]:
# Smoke test of the gate and the gated CNN predictor on one dataset file.
# NOTE(review): speech_music_gate is not defined in the visible part of this
# notebook; confirm it is defined before this cell runs.
test_path = files[1]

print("Gate result:", speech_music_gate(test_path))
print("Emotion:", predict_emotion_cnn_with_gate(test_path))


Gate result: speech
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Emotion: calm


In [None]:
def debug_gate(path):
    """Print the mean spectral flatness and zero-crossing rate of a file.

    The audio is loaded as mono at ``SR`` Hz and silence-trimmed
    (``top_db=30``) before both statistics are averaged over all frames.
    Nothing is returned; the values are only printed.
    """
    signal, _ = librosa.load(path, sr=SR, mono=True)
    signal, _ = librosa.effects.trim(signal, top_db=30)

    stats = {
        "flatness": np.mean(librosa.feature.spectral_flatness(y=signal)),
        "zcr": np.mean(librosa.feature.zero_crossing_rate(signal)),
    }

    print("flatness:", stats["flatness"], "zcr:", stats["zcr"])

debug_gate(files[0])


flatness: 0.021596361 zcr: 0.09652432528409091


In [None]:
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
import numpy as np
import soundfile as sf

RECORD_SECONDS = 4   # default recording length in seconds

def record_audio(filename="mic.wav", seconds=RECORD_SECONDS):
    """Record from the browser microphone in Colab and save it to ``filename``.

    A JavaScript snippet is evaluated in the notebook front end: it records
    for ``seconds`` seconds with ``MediaRecorder``, releases the microphone,
    and hands the clip back as a base64 data URL. The browser produces a WebM
    container, so the bytes are converted with ``ffmpeg`` into the format
    implied by ``filename``'s extension. If ``ffmpeg`` is missing or the
    conversion fails, the raw browser bytes are written to ``filename``
    unchanged and a warning is printed.

    Parameters
    ----------
    filename : str
        Destination path for the recording.
    seconds : int or float
        Recording duration in seconds.

    Returns
    -------
    None
        The function only writes the file and prints status lines.
    """
    # Local imports keep this cell self-contained.
    import os
    import shutil
    import subprocess

    js = f"""
    async function record() {{
      const stream = await navigator.mediaDevices.getUserMedia({{audio: true}});
      const mediaRecorder = new MediaRecorder(stream);
      let chunks = [];

      mediaRecorder.ondataavailable = e => chunks.push(e.data);
      // Register the stop handler before stopping so the event cannot be missed.
      const stopped = new Promise(resolve => mediaRecorder.onstop = resolve);
      mediaRecorder.start();

      await new Promise(resolve => setTimeout(resolve, {seconds*1000}));
      mediaRecorder.stop();

      await stopped;

      // Release the microphone so the browser stops capturing audio.
      stream.getTracks().forEach(track => track.stop());

      const blob = new Blob(chunks, {{ type: 'audio/webm' }});
      const reader = new FileReader();
      reader.readAsDataURL(blob);

      await new Promise(resolve => reader.onloadend = resolve);

      return reader.result;
    }}
    record();
    """

    data = output.eval_js(js)
    # The data URL looks like "<header>,<base64 payload>"; only the payload is needed.
    header, encoded = data.split(",", 1)
    audio = b64decode(encoded)

    converted = False
    ffmpeg = shutil.which("ffmpeg")

    if ffmpeg is not None:
        # Write the browser bytes under an honest extension, then transcode.
        raw_path = filename + ".webm"
        with open(raw_path, "wb") as f:
            f.write(audio)
        try:
            subprocess.run(
                [ffmpeg, "-y", "-loglevel", "error", "-i", raw_path, filename],
                check=True
            )
            converted = True
        except subprocess.CalledProcessError as err:
            print("Warning: ffmpeg conversion failed:", err)
        finally:
            os.remove(raw_path)

    if not converted:
        # Best-effort fallback: store the raw browser bytes as they are.
        with open(filename, "wb") as f:
            f.write(audio)
        print("Warning: saved raw WebM data; the file extension may not match its contents.")

    print("Saved:", filename)

record_audio("mic.wav", seconds=4)


Saved: mic.wav


In [None]:
import librosa

# Sanity-check the recording length. Dedicated names are used so that the
# notebook-level `y` (the label array built for the SVM) is not overwritten
# by an audio waveform.
mic_y, mic_sr = librosa.load("mic.wav", sr=16000)
print("Duration:", len(mic_y) / mic_sr)


  y, sr = librosa.load("mic.wav", sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Duration: 2.88


In [None]:
import numpy as np

def predict_realtime(path):
    """Gate an audio file, print the gate verdict, and classify its emotion.

    The verdict of ``speech_music_gate`` is printed first. Only the verdict
    ``"speech"`` proceeds to the CNN: the file is turned into a normalised
    log-mel image with batch and channel axes, scored silently
    (``verbose=0``) by the notebook-level ``model``, and the arg-max class id
    is mapped back to its label with ``le``.

    NOTE(review): ``speech_music_gate`` is not defined in the visible part of
    this notebook; confirm it exists before this function is called.

    Parameters
    ----------
    path : str
        Path to the audio file to classify.

    Returns
    -------
    str
        The predicted emotion label, or ``"REJECTED (not clean speech)"``.
    """
    gate = speech_music_gate(path)
    print("Gate:", gate)

    if gate == "speech":
        # (mel, frames) -> (1, mel, frames, 1)
        x = extract_logmel(path)[np.newaxis, ..., np.newaxis]
        prob = model.predict(x, verbose=0)
        return le.inverse_transform([np.argmax(prob, axis=1)[0]])[0]

    return "REJECTED (not clean speech)"


In [None]:
# Classify the microphone recording with the gated CNN predictor.
print("Predicted emotion:", predict_realtime("mic.wav"))


  y, sr = librosa.load(path, sr=SR, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=SR, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Gate: speech
Predicted emotion: sad


In [None]:
# Persist the trained CNN in the native Keras format.
model.save("emotion_cnn_model.keras")

# Persist the fitted LabelEncoder with pickle so that class ids can be mapped
# back to emotion names when the model is used outside this session.
import pickle
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

print("Saved model and label encoder.")


Saved model and label encoder.


In [None]:
# Import the Colab download helper under an alias: a bare
# `from google.colab import files` rebinds the notebook-level `files` list of
# wav paths, breaking any later (or re-run) cell that indexes into it.
from google.colab import files as colab_files

colab_files.download("emotion_cnn_model.keras")
colab_files.download("label_encoder.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Writes the model again to the same path already used by the earlier save cell.
model.save("emotion_cnn_model.keras")


In [None]:
# Run the backend script on the recorded clip. backend_emotion_server.py is
# not created by this notebook; it must already exist in the working directory.
!python backend_emotion_server.py mic.wav


2026-02-13 16:06:10.543683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770998770.628153   27651 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770998770.639831   27651 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770998770.666149   27651 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770998770.666212   27651 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770998770.666219   27651 computation_placer.cc:177] computation placer alr

In [None]:
!pip install fastapi uvicorn librosa tensorflow soundfile



In [None]:
# Serve the `app` object from backend_emotion_server with uvicorn in
# auto-reload mode. The server runs in the foreground (its log says
# "Press CTRL+C to quit"), so this cell does not finish until interrupted.
!python -m uvicorn backend_emotion_server:app --reload

[32mINFO[0m:     Will watch for changes in these directories: ['/content']
[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started reloader process [[36m[1m28086[0m] using [36m[1mWatchFiles[0m
2026-02-13 16:07:39.990681: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770998860.015697   28093 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770998860.022724   28093 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770998860.040944   28093 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.

In [None]:
!pip install tf2onnx



In [None]:
import tensorflow as tf
import tf2onnx

# Reload the saved Keras model from disk (this rebinds the notebook-level `model`).
model = tf.keras.models.load_model("emotion_cnn_model.keras")

# Input signature for export: variable batch size, 64 x 300 x 1 float32
# tensor, exposed under the name "input".
input_signature = [tf.TensorSpec([None, 64, 300, 1], tf.float32, name="input")]

# Wrap the model call in a tf.function with that signature for tf2onnx.
@tf.function(input_signature=input_signature)
def model_fn(x):
    return model(x)

output_path = "emotion_cnn_model.onnx"

tf2onnx.convert.from_function(
    model_fn,
    input_signature=input_signature,
    output_path=output_path
)

# NOTE(review): printed unconditionally; nothing here checks that the
# conversion succeeded or that the output file exists.
print("ONNX model saved successfully.")


ERROR:tf2onnx.tfonnx:rewriter <function rewrite_constant_fold at 0x7a8f9e2cd1c0>: exception `np.cast` was removed in the NumPy 2.0 release. Use `np.asarray(arr, dtype=dtype)` instead.


ONNX model saved successfully.


In [None]:
# Import the Colab download helper under an alias so it does not rebind the
# notebook-level `files` list of wav paths.
from google.colab import files as colab_files

colab_files.download("emotion_cnn_model.onnx")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.24.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Downloading onnxruntime-1.24.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnxruntime
Successfully installed onnxruntime-1.24.1


In [None]:
import onnxruntime as ort

# Open the exported model with ONNX Runtime and report the name and shape of
# its first input, which the inference code needs to build the feed dict.
session = ort.InferenceSession("emotion_cnn_model.onnx")

print("ONNX model loaded successfully.")
print("Input name:", session.get_inputs()[0].name)
print("Input shape:", session.get_inputs()[0].shape)


ONNX model loaded successfully.
Input name: input
Input shape: ['unk__33', 64, 300, 1]


In [None]:
import librosa
import pickle

# Restore the LabelEncoder from label_encoder.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only load a file that
# was produced by this notebook or another trusted source.
with open("label_encoder.pkl", "rb") as f:
    le = pickle.load(f)

def extract_logmel(path):
    """Return a (64, 300) float32 normalised log-mel spectrogram for a file.

    Self-contained variant for inference: the sample rate (16 kHz), number of
    mel bands (64) and target frame count (300) are fixed inside the function.
    The audio is loaded as mono, silence-trimmed (``top_db=30``), converted to
    a mel power spectrogram (``n_fft=1024``, ``hop_length=256``) in dB, and
    standardised with the utterance's own mean and standard deviation. The
    time axis is cut to at most 300 frames and zero-padded on the right up to
    exactly 300.

    Note: this definition replaces any earlier ``extract_logmel`` in the
    notebook namespace.
    """
    target_sr, n_bands, n_frames = 16000, 64, 300

    audio, rate = librosa.load(path, sr=target_sr, mono=True)
    audio, _ = librosa.effects.trim(audio, top_db=30)

    power = librosa.feature.melspectrogram(
        y=audio, sr=rate, n_mels=n_bands, n_fft=1024, hop_length=256
    )

    db = librosa.power_to_db(power)
    db = (db - np.mean(db)) / (np.std(db) + 1e-8)

    # Truncate first, then pad whatever is still missing (zero when long enough).
    db = db[:, :n_frames]
    missing = n_frames - db.shape[1]
    db = np.pad(db, ((0, 0), (0, missing)))

    return db.astype(np.float32)


audio_path = "mic.wav"

# (mel, frames) -> (1, mel, frames, 1), the layout of the exported model input.
x = extract_logmel(audio_path)[np.newaxis, ..., np.newaxis]

# Feed the tensor under the session's own input name and fetch all outputs.
inputs = {session.get_inputs()[0].name: x}
outputs = session.run(None, inputs)

# First output, first (only) batch row: one score per class.
probs = outputs[0][0]
cls = np.argmax(probs)

emotion = str(le.inverse_transform([cls])[0])
confidence = float(probs[cls])

print("Emotion:", emotion)
print("Confidence:", confidence)


Emotion: calm
Confidence: 0.996911883354187


  y, sr = librosa.load(path, sr=SR, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
