In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
base_path = "/content/drive/MyDrive/SER_yt"
import os
os.makedirs(base_path, exist_ok=True)

In [3]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [4]:
import gradio as gr
import librosa
import numpy as np
import torch
import torch.nn.functional as F
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

In [5]:
#Load your model and processor
model_path = "/content/drive/MyDrive/SER_yt/my_saved_model"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
processor = Wav2Vec2Processor.from_pretrained(model_path)
model.eval()


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [6]:
# Label mapping
id2label = {
    0: "happy",
    1: "neutral",
    2: "sad",
    3: "fear",
    4: "disgust",
    5: "ps",
    6: "angry"
}


In [24]:
def predict_emotion_gradio(audio_input):
    try:
        # Check if the input is a tuple (mic recording): (sr, numpy_array)
        if isinstance(audio_input, tuple):
            sr, audio_np = audio_input
            audio_np = audio_np.astype(np.float32)

        # Else it's a file path (upload)
        else:
            audio_np, sr = librosa.load(audio_input, sr=None)
            audio_np = audio_np.astype(np.float32)

        # Resample if needed
        if sr != 16000:
            audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=16000)
            sr = 16000

        # Pad or trim to 2 seconds (32000 samples)
        audio_np = audio_np[:32000] if len(audio_np) > 32000 else np.pad(audio_np, (0, 32000 - len(audio_np)), mode='constant')

        # Process input
        inputs = processor(audio_np, sampling_rate=sr, return_tensors="pt")

        with torch.no_grad():
            logits = model(**inputs).logits
            probs = F.softmax(logits, dim=-1)

        predicted_id = torch.argmax(probs, dim=-1).item()
        confidence = torch.max(probs).item()

        return f"{id2label[predicted_id]} (Confidence: {round(confidence * 100, 2)}%)"

    except Exception as e:
        return f"Error: {str(e)}"


In [28]:
interface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=gr.Audio(type="numpy", label="Upload or Record Audio"),
    outputs="text",
    title="Speech Emotion Recognition",
    description="Upload or record a .wav audio and get the predicted emotion"
)



In [None]:
interface.launch(share=True, debug=True)