In [1]:
# install required Python packages
! pip install gradio matplotlib seaborn numpy scipy
! pip install torchaudio transformers librosa

# import the required libraries
import os
import time
import shutil
import numpy as np
import librosa
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor


Collecting gradio
  Using cached gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scipy
  Downloading scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Using cached aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting anyio<5.0,>=3.0 (from gradio)
  Using cached anyio-4.9.0-py3-none-any.whl.metadata (4.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Using cached fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Using cached ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Using cached gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Using cached groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)

In [3]:
# load model

# if you want to use a different model or it is stored in a different spot, replace the path below with the path to your model
save_path = os.path.expanduser("~/desktop/finetuned_wav2vec2_emotion_model")

# Validate with a real exception rather than `assert`: assertions are removed
# when Python runs with -O, which would let a missing directory slip through
# to from_pretrained with a far less helpful error.
if not os.path.isdir(save_path):
    raise FileNotFoundError(f"{save_path} not found!")

# Both the classifier weights and the feature-extractor config are read from
# the same directory.
model = Wav2Vec2ForSequenceClassification.from_pretrained(save_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(save_path)

# Order the label names by their numeric class id so that position i in this
# list lines up with index i of the model's logits. Keys are passed through
# int() because they may be stored as strings.
_id2label = model.config.id2label
emotion_labels = [_id2label[class_id] for class_id in sorted(_id2label, key=int)]


def get_unique_filename(directory, base_name, extension):
    """Return a path inside ``directory`` that does not exist yet.

    The first candidate is ``base_name + extension``. If that path is taken,
    ``base_name(1)extension``, ``base_name(2)extension``, ... are probed in
    order and the first free one is returned.

    Parameters
    ----------
    directory : str
        Folder the file is meant to live in.
    base_name : str
        File name without its extension.
    extension : str
        Extension including the leading dot (may be empty).

    Returns
    -------
    str
        A joined path for which ``os.path.exists`` was false when checked.
    """
    candidate = os.path.join(directory, base_name + extension)
    attempt = 0
    # keep bumping the numeric suffix until we land on an unused name
    while os.path.exists(candidate):
        attempt += 1
        candidate = os.path.join(directory, f"{base_name}({attempt}){extension}")
    return candidate

def analyze_emotion_with_save(audio_path):
    """Copy an audio file to ~/Downloads and classify the emotion it contains.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to analyse.

    Returns
    -------
    tuple
        ``(figure, emotion, confidence, status)`` where ``figure`` is a bar
        chart of the per-label probabilities, ``emotion`` is the label with
        the highest probability, ``confidence`` is that probability formatted
        to two decimals and ``status`` reports where the copy was saved.
        A missing, empty or non-string path yields ``(None, "", "", "")``.
        A file that stays empty for 10 seconds yields an upload-timeout
        message in ``status``. Any other failure is caught and reported as
        ``(None, "Error", "0.00", "Analysis failed: ...")``.
    """
    if not audio_path or not isinstance(audio_path, str) or not os.path.exists(audio_path):
        return None, "", "", ""
    try:
        # wait for file to be ready; a monotonic clock keeps the timeout
        # immune to wall-clock adjustments
        deadline = time.monotonic() + 10
        while not os.path.exists(audio_path) or os.path.getsize(audio_path) == 0:
            time.sleep(0.1)
            if time.monotonic() > deadline:
                return None, "", "", "Audio file did not finish uploading. Please try again."

        # generate unique filename so earlier recordings are never overwritten
        downloads_dir = os.path.expanduser("~/Downloads")
        os.makedirs(downloads_dir, exist_ok=True)
        base, ext = os.path.splitext(os.path.basename(audio_path))
        unique_dest = get_unique_filename(downloads_dir, base, ext)
        shutil.copy(audio_path, unique_dest)
        status = f"Audio saved to {unique_dest}"

        # run emotion analysis
        # Rate handed to the feature extractor; presumably the rate the model
        # was fine-tuned on -- TODO confirm against the training setup.
        target_sr = 8000
        # librosa.load resamples to `sr` itself, so no separate resample step
        audio, _ = librosa.load(audio_path, sr=target_sr)
        audio, _ = librosa.effects.trim(audio)
        inputs = feature_extractor(audio, sampling_rate=target_sr, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()[0]
        idx = int(np.argmax(probs))
        emotion, conf = emotion_labels[idx], float(probs[idx])

        # plot results
        # Build the figure without pyplot: pyplot keeps every figure it
        # creates alive in a global registry (a leak in a long-running app)
        # and its "current figure" state is shared between handler threads.
        from matplotlib.figure import Figure

        colors = sns.color_palette("husl", len(emotion_labels))
        fig = Figure(figsize=(8, 3))
        ax = fig.subplots()
        ax.bar(emotion_labels, probs, color=colors)
        ax.set_ylabel("Confidence")
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_horizontalalignment("right")
        fig.tight_layout()

        return fig, emotion, f"{conf:.2f}", status
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of
        # letting the handler crash
        return None, "Error", "0.00", f"Analysis failed: {e}"


def clear_all():
    """Return blank values for a reset: two ``None`` entries followed by three empty strings."""
    empty_components = (None, None)
    empty_texts = ("",) * 3
    return empty_components + empty_texts

# Build the Gradio interface: a header, then one row with the audio input and
# Clear button on the left and the result widgets on the right.
with gr.Blocks(theme="soft") as demo:
    # Page header / usage instructions shown above the controls.
    gr.Markdown(
        """
        # Emotion Recognition from English Speech
        Speak directly into your mic or upload a .wav file to detect the speaker's emotion using a fine-tuned Wav2Vec2 model!
        - Your recording will be saved to your Downloads folder.
        """
    )
    with gr.Row():
        # Left column (narrower): audio source and reset button.
        with gr.Column(scale=1):
            # type="filepath" makes the handler receive a path string, which is
            # what analyze_emotion_with_save checks for.
            audio_input = gr.Audio(type="filepath", label="Upload .wav", streaming=False)
            clear_btn = gr.Button("Clear")
        # Right column (wider): probability chart plus the text results.
        with gr.Column(scale=2):
            plot_output = gr.Plot(label="Emotion Confidence")
            emotion_output = gr.Textbox(label="Predicted Emotion")
            conf_output = gr.Textbox(label="Confidence")
            status_output = gr.Textbox(label="Status")

    # Run the analysis on the audio component's change event; the four return
    # values of analyze_emotion_with_save map onto the four outputs in order.
    audio_input.change(
        analyze_emotion_with_save,
        inputs=audio_input,
        outputs=[plot_output, emotion_output, conf_output, status_output]
    )
    # Reset everything; clear_all returns five values, one per output below.
    # NOTE(review): clearing audio_input presumably fires the change handler
    # again with an empty value -- the handler's guard returns blanks then.
    clear_btn.click(
        clear_all,
        inputs=None,
        outputs=[audio_input, plot_output, emotion_output, conf_output, status_output]
    )

if __name__ == "__main__":
    # share=True  -> also serves the app on a public URL
    # inbrowser=True -> asks Gradio to open the app in a browser tab
    # debug=True  -> keeps this cell running until it is interrupted
    # NOTE(review): with share=True anyone holding the public link can trigger
    # the handler, which writes files into ~/Downloads -- confirm that is intended.
    demo.launch(share=True, inbrowser=True, debug=True)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://bfca4b30b7c1cff13b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://bfca4b30b7c1cff13b.gradio.live
