# Dashboard for the Emotion Classifier
The instructions to launch the dashboard are in the README.md file of my [GitHub repository](https://github.com/siryacaiazza/emotion-detector).

In [1]:
# Install required libraries

!pip install -q streamlit

!pip install -q pyngrok

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Write a file to retrieve the models

%%writefile config.py
root_dir = '/content'
resnet_dir = root_dir + '/resnet_best_early_stop.pth'
custom_dir = root_dir + '/custom_best_early_stop.pth'

Writing config.py


In [3]:
%%writefile app.py

# Import packages
import streamlit as st
import plotly.express as px
import os
import torch
import torch.nn as nn
import torch.optim as optim
import config
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from IPython.core.debugger import Tracer
import soundfile as sf
import torch.utils.data as data
from torchvision import transforms, utils, models, ops
from multiprocessing import cpu_count, Pool
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import seaborn as sns
import io
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import librosa
import librosa.display
import time
import librosa.effects
import IPython.display as ipd
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
import copy

# Define the same functions of the code, but return the figures to display in the dashboard
def plot_waveplot(audio_data, sr):
    """Draw the waveform of an audio signal and hand the figure back.

    Args:
        audio_data: Audio samples, forwarded to ``librosa.display.waveshow``.
        sr: Sampling rate of ``audio_data``.

    Returns:
        The matplotlib figure containing the plot; it is not shown or closed here.
    """
    figure, axes = plt.subplots(figsize=(10, 3))
    librosa.display.waveshow(audio_data, sr=sr, ax=axes)
    # Label the axes once the waveform has been drawn.
    axes.set(xlabel='Time (s)', ylabel='Amplitude')
    axes.set_title('Waveplot of the audio', size=15)
    return figure

def plot_spectrogram(audio_data, sr):
    """Draw the dB-scaled STFT magnitude of an audio signal.

    Args:
        audio_data: Audio samples, forwarded to ``librosa.stft``.
        sr: Sampling rate of ``audio_data``, used to scale the plot axes.

    Returns:
        The matplotlib figure containing the spectrogram and its colorbar.
    """
    # Short-time Fourier transform -> magnitude -> decibels.
    magnitude = np.abs(librosa.stft(audio_data))
    magnitude_db = librosa.amplitude_to_db(magnitude)

    figure, axes = plt.subplots(figsize=(11, 3))
    mesh = librosa.display.specshow(magnitude_db, sr=sr, x_axis='time', y_axis='hz', ax=axes)
    axes.set_title('Spectrogram of the audio', size=15)
    figure.colorbar(mesh, ax=axes, format="%+2.f dB")
    return figure

def plot_mfcc(audio_data, sr):
    """Compute 40 MFCCs of an audio signal and draw them as an image.

    Args:
        audio_data: Audio samples, forwarded to ``librosa.feature.mfcc``.
        sr: Sampling rate of ``audio_data``.

    Returns:
        A ``(figure, mfcc)`` pair: the matplotlib figure and the MFCC matrix
        that was plotted.
    """
    figure, axes = plt.subplots(figsize=(10, 4))
    coefficients = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=40)
    # Fixed colour limits keep the colour scale comparable between uploads.
    mesh = librosa.display.specshow(
        coefficients, sr=sr, x_axis='time', vmin=-1000, vmax=200, ax=axes
    )
    axes.set_title('MFCC of the audio')
    figure.colorbar(mesh, ax=axes, format='%+2.0f dB')
    return figure, coefficients

# Create a class to preprocess the uploaded file audio
class InferenceDataset(data.Dataset):
    """Turns one raw audio clip into the normalized MFCC matrix fed to the models.

    Only the preprocessing helper is implemented here (no ``__len__`` or
    ``__getitem__``), so the class is used as a preprocessor, not iterated.

    Args:
        ms: Duration in milliseconds that every clip is tiled/cropped to.
        target_sr: Sampling rate in Hz the audio is resampled to before the
            MFCCs are computed.
    """

    def __init__(self, ms=4000, target_sr=16000):
        self.ms = ms
        self.target_sr = target_sr
        # Number of samples in a clip of `ms` milliseconds at `target_sr`.
        self.n_samples = self.target_sr * ms // 1000

    def _preprocess(self, y, sr):
        """Convert raw samples into a globally normalized 40-coefficient MFCC matrix.

        Steps: downmix to mono, resample to ``self.target_sr``, tile clips that
        are too short, crop the centre ``self.n_samples`` samples, compute the
        MFCCs (30 ms window, 15 ms hop) and standardize them with the mean and
        standard deviation of the whole matrix.

        Args:
            y: Audio samples as a NumPy array, 1-D or 2-D (multi-channel).
            sr: Sampling rate of ``y`` in Hz.

        Returns:
            The normalized MFCC array, or ``None`` when processing fails (the
            error is reported through ``st.error``).
        """
        try:
            y = np.asarray(y)

            # Convert to mono if needed. soundfile returns (frames, channels)
            # while librosa uses (channels, frames); librosa.to_mono averages
            # every axis but the last, which would collapse a soundfile array
            # to one value per channel. Averaging over the shortest axis
            # (taken to be the channel axis) handles both layouts.
            if y.ndim > 1:
                y = y.mean(axis=int(np.argmin(y.shape)))

            if y.size == 0:
                raise ValueError("the audio contains no samples")

            # Resample to the target rate
            if sr != self.target_sr:
                y = librosa.resample(y, orig_sr=sr, target_sr=self.target_sr)

            # Replicate if audio is too short
            if len(y) < self.n_samples:
                y = np.tile(y, self.n_samples // len(y) + 1)

            # Crop in the center
            start = (len(y) - self.n_samples) // 2
            y = y[start:start + self.n_samples]

            # 30 ms analysis window with a 15 ms hop
            win_length = int(0.03 * self.target_sr)
            hop_length = int(0.015 * self.target_sr)

            # Compute the MFCC
            mfcc = librosa.feature.mfcc(
                y=y, sr=self.target_sr, n_mfcc=40,
                win_length=win_length, hop_length=hop_length,
            )

            # Normalize with global statistics; a zero spread would otherwise
            # turn the whole matrix into NaN/inf.
            spread = np.std(mfcc)
            if spread == 0:
                spread = 1.0
            return (mfcc - np.mean(mfcc)) / spread
        except Exception as e:
            st.error(f"Error during audio processing: {e}")
            return None

# Default dimensions used as constructor defaults by both classifiers.
NUM_CLASSES = 14
NUM_MFCC_FEATURES = 40
# Presumably the frame count of a 4 s clip at 16 kHz with a 240-sample hop
# (see InferenceDataset) -- TODO confirm against the training notebook.
NUM_TIME_FRAMES = 267

# The network definitions are repeated in this file so they can be instantiated here.
class AudioClassifier(nn.Module):
    """Custom CNN classifier operating on a single-channel 2-D feature map.

    Five conv -> batch-norm -> ReLU -> max-pool stages narrow the channels
    from 512 down to 64, followed by global average pooling and two linear
    layers that produce ``num_classes`` logits.

    Args:
        num_mfcc_features: Accepted for interface compatibility; not used by
            any layer.
        num_time_frames: Accepted for interface compatibility; not used by
            any layer.
        num_classes: Size of the output layer.
    """

    def __init__(self, num_mfcc_features = NUM_MFCC_FEATURES, num_time_frames = NUM_TIME_FRAMES, num_classes = NUM_CLASSES):
        super().__init__()
        # Every convolution shares the same 5x5 "same"-padded geometry.
        conv_args = dict(kernel_size=(5, 5), stride=1, padding=2)

        # NOTE: attribute names and creation order are kept as-is so that
        # state_dict keys and parameter initialisation order do not change.
        self.relu = nn.ReLU()

        self.conv0 = nn.Conv2d(1, 512, **conv_args)
        self.bn0 = nn.BatchNorm2d(512)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=2)

        self.conv1 = nn.Conv2d(512, 512, **conv_args)
        self.bn1 = nn.BatchNorm2d(512)

        self.conv2 = nn.Conv2d(512, 256, **conv_args)
        self.bn2 = nn.BatchNorm2d(256)

        self.conv3 = nn.Conv2d(256, 128, **conv_args)
        self.bn3 = nn.BatchNorm2d(128)

        self.conv4 = nn.Conv2d(128, 64, **conv_args)
        self.bn4 = nn.BatchNorm2d(64)

        # Collapses whatever spatial size is left to 1x1.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Regularisation and the classification head.
        self.dropout1 = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.3)
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, num_classes)

    def _conv_stage(self, x, conv, bn):
        """Apply one conv -> batch-norm -> ReLU -> max-pool stage."""
        return self.pool(self.relu(bn(conv(x))))

    def forward(self, x):
        """Return the class logits for a batch of single-channel feature maps."""
        x = self._conv_stage(x, self.conv0, self.bn0)
        x = self._conv_stage(x, self.conv1, self.bn1)
        x = self._conv_stage(x, self.conv2, self.bn2)
        x = self._conv_stage(x, self.conv3, self.bn3)
        # The first dropout sits between the fourth and fifth stage.
        x = self._conv_stage(self.dropout1(x), self.conv4, self.bn4)

        # Global pooling, then flatten to (batch, 64).
        x = self.avgpool(x)
        x = x.reshape(x.size(0), -1)

        hidden = self.dropout2(self.relu(self.fc1(x)))
        return self.fc2(hidden)

# Use ResNet18 for a pre-trained audio classifier
class PretrainedAudioClassifier(nn.Module):
    """ResNet18 backbone adapted to single-channel input, with a linear head.

    Args:
        num_mfcc_features: Accepted for interface compatibility; not used by
            any layer.
        num_time_frames: Accepted for interface compatibility; not used by
            any layer.
        num_classes: Size of the output layer.
    """

    def __init__(self, num_mfcc_features = NUM_MFCC_FEATURES, num_time_frames = NUM_TIME_FRAMES, num_classes = NUM_CLASSES):
        super().__init__()
        # NOTE(review): weights='DEFAULT' asks torchvision for its pretrained
        # weights every time the class is built, even when a checkpoint is
        # loaded right afterwards -- confirm this is wanted at inference time.
        backbone = models.resnet18(weights='DEFAULT')
        # The stock first convolution takes 3 channels; swap in a 1-channel one.
        backbone.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        # Drop the original fully connected layer so the backbone emits features.
        backbone.fc = nn.Identity()
        self.resnet = backbone

        # Pooling applied to the backbone features before classification.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # 512 is the feature size ResNet18 produces ahead of its FC layer.
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return the class logits for a batch of single-channel feature maps."""
        features = self.resnet(x)

        # Restore the spatial axes the pooling layer expects when the
        # backbone output has fewer than four dimensions.
        rank = features.dim()
        if rank == 2:
            # (batch, channels) -> (batch, channels, 1, 1)
            features = features[:, :, None, None]
        elif rank == 3:
            # (batch, channels, n) -> (batch, channels, 1, n)
            features = features.unsqueeze(-2)

        pooled = self.avgpool(features)
        flat = pooled.reshape(pooled.size(0), -1)
        return self.classifier(flat)

# --- Configuration ---

# Set the page configuration for a wider layout
# Page configuration (wide layout); this is the first Streamlit call in the script.
st.set_page_config(page_title="Audio Analysis Dashboard", layout="wide")

# --- Page Header ---
# Introductory text shown under the title, rendered as markdown.
_PAGE_INTRO = """
Welcome to the emotion classification dashboard! Upload your audio files in .wav format, visualize
their MFCC, waveplot and spectrogram,
and run inference with different models.
"""
st.title("😄 Emotion Classification Dashboard")
st.markdown(_PAGE_INTRO)

@st.cache_resource
def load_model(model_name):
    """Build the requested classifier, load its checkpoint and switch it to eval mode.

    The result is cached by ``st.cache_resource``, so each model name is
    loaded at most once per cache lifetime.
    NOTE(review): a failed load returns ``(None, labels)``, and that value is
    presumably cached as well -- clear the Streamlit cache after fixing a path.

    Args:
        model_name: ``"ResNet18"`` or ``"Custom Model"``.

    Returns:
        ``(model, labels)``. ``model`` is ``None`` when the name is unknown or
        loading fails. ``labels`` is always the list of class names, indexed
        like the model outputs.
    """
    st.write(f"Loading model: {model_name}...")
    # Give the labels in the right order (alphabetical)
    labels = ['female_angry', 'female_disgust', 'female_fear', 'female_happy', 'female_neutral', 'female_sad', 'female_surprise', 'male_angry', 'male_disgust', 'male_fear', 'male_happy', 'male_neutral', 'male_sad', 'male_surprise']
    num_classes = len(labels)
    try:
        # Pick the architecture and its checkpoint. The remaining constructor
        # arguments keep their defaults; neither network uses them.
        if model_name == "ResNet18":
            model = PretrainedAudioClassifier(num_classes=num_classes)
            checkpoint_path = config.resnet_dir
        elif model_name == "Custom Model":
            model = AudioClassifier(num_classes=num_classes)
            checkpoint_path = config.custom_dir
        else:
            st.error(f"Unknown model name: {model_name}")
            return None, labels

        # SECURITY: torch.load unpickles the file; only load checkpoints that
        # come from a trusted source.
        checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        # The checkpoint is a dict; the weights live under 'net_state_dict'.
        model.load_state_dict(checkpoint['net_state_dict'])

        # Eval mode disables dropout and makes batch-norm use its running
        # statistics. Every branch used to return before reaching this call.
        model.eval()
        st.success(f"{model_name} loaded successfully!")
        return model, labels
    except Exception as e:
        st.error(f"Error loading model '{model_name}': {e}. Check path, model architecture, and file integrity.")
        return None, labels

@st.cache_data
def preprocess_for_inference(audio_file_bytes):
    """Decode uploaded audio bytes and extract the MFCCs used for inference.

    Args:
        audio_file_bytes: Raw content of the uploaded audio file.

    Returns:
        ``(samples, sample_rate, mfcc)``. ``samples`` is a mono float32 array,
        and ``mfcc`` is the output of ``InferenceDataset._preprocess`` (``None``
        if that step failed). All three are ``None`` when decoding fails; the
        error is reported through ``st.error``.
    """
    try:
        # Decode the waveform and its sample rate from the in-memory file.
        y_raw, sr_raw = sf.read(io.BytesIO(audio_file_bytes))

        # Ensure the type is compatible
        if y_raw.dtype != np.float32:
            y_raw = y_raw.astype(np.float32)

        # soundfile returns multi-channel audio as (frames, channels), whereas
        # the librosa-based plots and feature extraction expect a 1-D signal
        # (or channels first). Downmix to mono by averaging the channels.
        if y_raw.ndim > 1:
            y_raw = y_raw.mean(axis=1)

        if y_raw.size == 0:
            raise ValueError("the audio file contains no samples")

        # Preprocess the file with the InferenceDataset class
        inference_processor = InferenceDataset(ms=4000)
        processed_mfcc = inference_processor._preprocess(y_raw, sr_raw)

        return y_raw, sr_raw, processed_mfcc

    except Exception as e:
        st.error(f"Error during audio processing for inference: {e}")
        st.error(f"Type of uploaded file: {type(audio_file_bytes)}")
        st.error(f"Length of uploaded bytes: {len(audio_file_bytes)} bytes")
        return None, None, None


st.header("📤 Upload Audio File")
uploaded_file = st.file_uploader("Choose an audio file...", type=["wav"])

# Preprocessing results; they stay None until an upload has been processed.
audio_data_raw = None
sr_raw = None
mfccs_for_model = None

if uploaded_file is not None:
    # Read the file
    file_contents = uploaded_file.read()
    # Display the audio player directly from the uploaded file bytes
    st.audio(file_contents, format=f'audio/{uploaded_file.type.split("/")[-1]}', start_time=0)
    st.success("File uploaded successfully!")

    with st.spinner("Processing audio and extracting features..."):
        # Pass the byte stream to the processing function
        audio_data_raw, sr_raw, mfccs_for_model = preprocess_for_inference(file_contents)

    if audio_data_raw is not None and mfccs_for_model is not None:

        st.subheader("🎧 Audio Features & Models Inference")
        # Wider left column for the plots, narrower right column for inference.
        col1, col2 = st.columns([2, 1])

        with col1:
            st.subheader("📊 Visualization of Audio Features")

            # One tab per plot; each figure is closed once rendered to free memory.
            tab_waveform, tab_mfcc, tab_spectrogram = st.tabs(["Waveform", "MFCC", "Spectrogram"])
            with tab_waveform:
                st.write("#### Waveform")
                fig_wave = plot_waveplot(audio_data_raw, sr_raw)
                st.pyplot(fig_wave)
                plt.close(fig_wave)
            with tab_mfcc:
                st.write("#### Mel-frequency Cepstral Coefficients (MFCCs)")
                fig_mfcc, _ = plot_mfcc(audio_data_raw, sr_raw)
                st.pyplot(fig_mfcc)
                plt.close(fig_mfcc)
            with tab_spectrogram:
                st.write("#### Spectrogram")
                fig_spec = plot_spectrogram(audio_data_raw, sr_raw)
                st.pyplot(fig_spec)
                plt.close(fig_spec)

        # The second column runs the inference.
        with col2:
            st.header("🧠 Model Inference")
            model_options = ["ResNet18", "Custom Model"]
            # The selectbox already returns the chosen option itself.
            current_model_name = st.selectbox("Choose a model for inference:", model_options)

            st.info(f"Selected Model: **{current_model_name}**")

            if st.button(f"Run Inference with {current_model_name}"):
                # Load the model
                model, labels = load_model(current_model_name)

                if model is not None and labels:
                    st.subheader(f"Results from {current_model_name}:")
                    with st.spinner("Running inference..."):
                        mfccs_tensor = None
                        try:
                            # Add batch and channel axes in front of the MFCC matrix.
                            mfccs_tensor = torch.tensor(mfccs_for_model).float().unsqueeze(0).unsqueeze(0)

                            # Inference must run with dropout disabled and
                            # batch-norm using its running statistics.
                            model.eval()

                            with torch.no_grad():  # Disable gradient calculation for inference
                                outputs = model(mfccs_tensor)
                                # Softmax turns the logits into probabilities in [0, 1]
                                probabilities = torch.softmax(outputs, dim=1)[0]
                                # Index of the most likely class, as a Python int
                                predicted_class_idx = torch.argmax(probabilities).item()
                                # Find the predicted label name
                                predicted_label = labels[predicted_class_idx]

                                st.success(f"**Prediction:** `{predicted_label}`")
                                st.write("---")
                                st.write("#### Probabilities:")
                                # Display probabilities for all classes, most likely first
                                prob_df = pd.DataFrame({
                                    'Class': labels,
                                    'Probability': probabilities.numpy()
                                })
                                prob_df = prob_df.sort_values(by='Probability', ascending=False)
                                st.dataframe(prob_df.set_index('Class'))

                        except Exception as e:
                            st.error(f"Error during model inference: {e}. Please check model input shape and loading.")
                            st.write(f"Expected MFCC shape: {mfccs_tensor.shape if mfccs_tensor is not None else 'N/A'}")
                            st.write(f"Raw MFCCs for model shape: {mfccs_for_model.shape}")
                else:
                    st.error("Could not load the selected model. Please check model definitions and paths.")

else:
    st.info("Upload an audio file (.wav) to get started!")

Writing app.py


In [4]:
# Start the Streamlit server for app.py as a background shell job (trailing &).
# nohup appends the server output to nohup.out; check that file if the app does not come up.
!nohup streamlit run app.py &

nohup: appending output to 'nohup.out'


In [5]:
# Expose the local Streamlit server through an ngrok tunnel
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never hard-code the authtoken in the notebook. It is read from the
# NGROK_AUTHTOKEN environment variable, or prompted for without echo. A token
# that was ever committed must be treated as leaked and revoked in the ngrok
# dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# Setup a tunnel to the streamlit port 8501
public_url = ngrok.connect(addr=8501, proto="http")
public_url

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


<NgrokTunnel: "https://b70c8d1ef150.ngrok-free.app" -> "http://localhost:8501">