# Moodify - Speech Emotion Recognition (SER)

In [None]:
import warnings
from pathlib import Path

import librosa

# import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from ipyfilechooser import FileChooser
from IPython.display import Audio, display
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

warnings.filterwarnings("ignore")

## Load Audio Data

In [None]:
# Interactive file picker rooted at the raw-data folder, listing only WAV files.
fc = FileChooser("../data/raw", filter_pattern=["*.wav"])

display(fc)

In [None]:
# Sample WAV used when nothing was picked in the FileChooser widget
FALLBACK_WAV_PATH = "../data/raw/01-01-05-02-02-01-11.wav"

# `fc.selected` is None until the user confirms a file in the widget.
_picked_wav = fc.selected
SAMPLE_WAV_SPEECH_PATH = Path(
    FALLBACK_WAV_PATH if _picked_wav is None else _picked_wav
)

In [None]:
def print_stats(waveform, sample_rate=None, src=None):
    """Print summary statistics for an audio waveform.

    Args:
        waveform: Array-like object exposing ``shape``, ``dtype`` and the
            reductions ``max()``, ``min()``, ``mean()`` and ``std()``, whose
            results support ``.item()`` (e.g. a NumPy array or torch tensor).
        sample_rate: Optional sample rate; printed only when truthy.
        src: Optional source description (e.g. a file path); printed as a
            header only when truthy.
    """
    if src:
        print("-" * 10)
        print("Source:", src)
        print("-" * 10)
    if sample_rate:
        print("Sample Rate:", sample_rate)
    # The statistics do not depend on the sample rate, so they are always
    # reported, even when the caller omits `sample_rate`.
    print("Shape:", tuple(waveform.shape))
    print("Dtype:", waveform.dtype)
    print(f" - Max:     {waveform.max().item():6.3f}")
    print(f" - Min:     {waveform.min().item():6.3f}")
    print(f" - Mean:    {waveform.mean().item():6.3f}")
    print(f" - Std Dev: {waveform.std().item():6.3f}")
    print()

In [None]:
# sr=None keeps the file's native sampling rate instead of resampling on load.
waveform, sample_rate = librosa.load(path=SAMPLE_WAV_SPEECH_PATH, sr=None)
print_stats(waveform, sample_rate=sample_rate, src=SAMPLE_WAV_SPEECH_PATH)
# Last expression of the cell: renders an inline audio player.
Audio(data=waveform, rate=sample_rate)

## Compute Features using Librosa

In [None]:
# Features of the recording at its native sampling rate.
spectral_centroid = librosa.feature.spectral_centroid(y=waveform, sr=sample_rate)
spectral_bandwidth = librosa.feature.spectral_bandwidth(y=waveform, sr=sample_rate)
chromagram = librosa.feature.chroma_stft(y=waveform, sr=sample_rate)
stft_magnitude = np.abs(librosa.stft(waveform))
spectrogram = librosa.amplitude_to_db(stft_magnitude, ref=np.max)
mel_spectrogram = librosa.feature.melspectrogram(y=waveform, sr=sample_rate)

# 3x2 grid, filled row by row in the same order as subplot indices 1..6.
fig, axes = plt.subplots(3, 2, figsize=(20, 10))
fig.suptitle("Original Audio Features")
(ax_wave, ax_centroid), (ax_bandwidth, ax_chroma), (ax_spec, ax_mel) = axes

librosa.display.waveshow(waveform, sr=sample_rate, ax=ax_wave)
ax_wave.set_title("Waveform")

ax_centroid.semilogy(spectral_centroid.T, label="Spectral Centroid")
ax_centroid.set_title("Spectral Centroid")

ax_bandwidth.semilogy(spectral_bandwidth.T, label="Spectral Bandwidth")
ax_bandwidth.set_title("Spectral Bandwidth")

librosa.display.specshow(
    chromagram, sr=sample_rate, x_axis="time", y_axis="chroma", ax=ax_chroma
)
ax_chroma.set_title("Chromagram")

librosa.display.specshow(
    spectrogram, sr=sample_rate, x_axis="time", y_axis="log", ax=ax_spec
)
ax_spec.set_title("Spectrogram")

# The mel spectrogram is a power spectrum, so convert it to dB for display.
mel_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
librosa.display.specshow(
    mel_db, sr=sample_rate, x_axis="time", y_axis="mel", ax=ax_mel
)
ax_mel.set_title("Mel Spectrogram")

fig.tight_layout()
plt.show()

## Speech Emotion Prediction using a Fine-Tuned OpenAI Whisper Model

In [None]:
# Pick the best available backend: CUDA GPU first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Checkpoint identifier shared by the classifier and its feature extractor.
model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"

feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)
model = AutoModelForAudioClassification.from_pretrained(model_id)

# Mapping from class index to emotion label, taken from the model config.
id2label = model.config.id2label

In [None]:
# Resample the waveform to the sampling rate the feature extractor expects
target_sr = feature_extractor.sampling_rate
resampled_waveform = librosa.resample(
    y=waveform, orig_sr=sample_rate, target_sr=target_sr
)

# Recompute the same features on the resampled signal.
spectral_centroid = librosa.feature.spectral_centroid(
    y=resampled_waveform, sr=target_sr
)
spectral_bandwidth = librosa.feature.spectral_bandwidth(
    y=resampled_waveform, sr=target_sr
)
chromagram = librosa.feature.chroma_stft(y=resampled_waveform, sr=target_sr)
spectrogram = librosa.amplitude_to_db(
    np.abs(librosa.stft(resampled_waveform)), ref=np.max
)
mel_spectrogram = librosa.feature.melspectrogram(y=resampled_waveform, sr=target_sr)

plt.figure(figsize=(20, 10))
# This figure shows the resampled signal, so it is labelled accordingly.
plt.suptitle("Resampled Audio Features")

plt.subplot(3, 2, 1)
librosa.display.waveshow(resampled_waveform, sr=target_sr)
plt.title("Waveform")

plt.subplot(3, 2, 2)
plt.semilogy(spectral_centroid.T, label="Spectral Centroid")
plt.title("Spectral Centroid")

plt.subplot(3, 2, 3)
plt.semilogy(spectral_bandwidth.T, label="Spectral Bandwidth")
plt.title("Spectral Bandwidth")

plt.subplot(3, 2, 4)
librosa.display.specshow(chromagram, sr=target_sr, x_axis="time", y_axis="chroma")
plt.title("Chromagram")

plt.subplot(3, 2, 5)
librosa.display.specshow(spectrogram, sr=target_sr, x_axis="time", y_axis="log")
plt.title("Spectrogram")

plt.subplot(3, 2, 6)
# The mel spectrogram is a power spectrum, so convert it to dB for display.
librosa.display.specshow(
    librosa.power_to_db(mel_spectrogram, ref=np.max),
    sr=target_sr,
    x_axis="time",
    y_axis="mel",
)
plt.title("Mel Spectrogram")

plt.tight_layout()
plt.show()

In [None]:
# Convert the resampled audio into model-ready PyTorch tensors.
inputs = feature_extractor(
    resampled_waveform,
    sampling_rate=feature_extractor.sampling_rate,
    truncation=True,
    return_tensors="pt",
)

# The model and its inputs must live on the same device.
model = model.to(device)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = model(**inputs)

# Class probabilities with the singleton dimensions squeezed away.
probs = outputs.logits.softmax(dim=-1).squeeze()
emotion_labels = model.config.id2label

# Map each label to its probability, then order from most to least likely.
emotion_scores = {
    emotion_labels[idx]: prob for idx, prob in enumerate(probs.tolist())
}
emotion_scores = dict(
    sorted(emotion_scores.items(), key=lambda pair: pair[1], reverse=True)
)

emotions = list(emotion_scores)
scores = list(emotion_scores.values())

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=emotions, y=scores, palette="viridis", hue=emotions, ax=ax)
ax.set_xlabel("Emotion")
ax.set_ylabel("Confidence Score")
ax.set_title("Emotion Predictions")
ax.set_ylim(0, 1)
plt.show()

print("\nEmotion Predictions:\n")
for label, confidence in emotion_scores.items():
    print(f"- {label}: {confidence:.4f}")

# Highest-probability class index, mapped back to its label.
top_emotion_index = probs.argmax().item()
top_emotion = emotion_labels[top_emotion_index]

print(
    f"\nPredominant Emotion: {top_emotion} (Confidence: {emotion_scores[top_emotion]:.4f})"
)