# Streamlit Sample (GUI)

## Libraries

In [1]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import librosa
import torch
import numpy as np
import pathlib
from IPython import display


## Sample data

In [2]:
import kagglehub

# Kaggle dataset handle; kagglehub fetches the latest published version
dataset_handle = "bitlord/urdu-language-speech-dataset"
path = kagglehub.dataset_download(dataset_handle)

# Show where the files ended up so later cells can be checked against it
print("Path to dataset files:", path)


Path to dataset files: /home/fiatch/.cache/kagglehub/datasets/bitlord/urdu-language-speech-dataset/versions/143


## Model

[Original model](https://huggingface.co/firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3)

In [3]:
# Checkpoint identifier shared by the feature extractor and the classifier
model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"

# Feature extractor that turns raw waveforms into model inputs
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)

# Audio classification model and its class-index -> label-name mapping
model = AutoModelForAudioClassification.from_pretrained(model_id)
id2label = model.config.id2label

## Function definitions

In [4]:
def preprocess_audio(audio_path, feature_extractor, max_duration=30.0):
    """Load an audio file and convert it into model-ready features.

    The waveform is loaded at the feature extractor's sampling rate, then
    forced to exactly ``max_duration`` seconds: longer clips are cut at the
    end, shorter clips are zero-padded at the end.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        feature_extractor: Object exposing ``sampling_rate`` and callable on a
            waveform (here the extractor loaded next to the model).
        max_duration: Target clip length in seconds.
            NOTE(review): presumably this should stay at 30 s for
            Whisper-based checkpoints — confirm before changing.

    Returns:
        Whatever ``feature_extractor`` returns when called with
        ``return_tensors="pt"``.
    """
    target_rate = feature_extractor.sampling_rate
    waveform, _ = librosa.load(audio_path, sr=target_rate)

    max_length = int(target_rate * max_duration)
    shortfall = max_length - len(waveform)
    if shortfall >= 0:
        # Too short (or exact): append trailing zeros up to the target length
        waveform = np.pad(waveform, (0, shortfall))
    else:
        # Too long: keep only the leading max_length samples
        waveform = waveform[:max_length]

    return feature_extractor(
        waveform,
        sampling_rate=target_rate,
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )

In [5]:
def predict_emotion(audio_path, model, feature_extractor, id2label, max_duration=30.0):
    """Classify the emotion expressed in a single audio file.

    Args:
        audio_path: Path of the audio file to classify.
        model: Classification model whose output exposes ``.logits``.
        feature_extractor: Feature extractor forwarded to ``preprocess_audio``.
        id2label: Mapping from predicted class index to label name.
        max_duration: Clip length in seconds forwarded to ``preprocess_audio``.

    Returns:
        The entry of ``id2label`` for the highest-scoring class.
    """
    features = preprocess_audio(audio_path, feature_extractor, max_duration)

    # Run on the GPU when one is available, otherwise on the CPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = model.to(device)
    batch = {name: tensor.to(device) for name, tensor in features.items()}

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = model(**batch).logits

    best_class = torch.argmax(logits, dim=-1).item()
    return id2label[best_class]

## Example

In [6]:
import sounddevice as sd
from scipy.io.wavfile import write
import os

In [42]:
# List the audio devices sounddevice can see; the index in the first column is
# what sd.default.device expects ('>' / '<' presumably mark the current
# default input / output).
audio_devices = sd.query_devices()
print(audio_devices)

   0 HD-Audio Generic: ZQE-CAA (hw:0,3), ALSA (0 in, 2 out)
   1 HD-Audio Generic: MSI MAG271C (hw:0,7), ALSA (0 in, 2 out)
   2 HD-Audio Generic: HDMI 2 (hw:0,8), ALSA (0 in, 8 out)
   3 HD-Audio Generic: HDMI 3 (hw:0,9), ALSA (0 in, 8 out)
   4 EPOS H3PRO Dongle: USB Audio (hw:1,0), ALSA (1 in, 2 out)
   5 HD-Audio Generic: ALC1220 Analog (hw:2,0), ALSA (2 in, 6 out)
   6 HD-Audio Generic: ALC1220 Digital (hw:2,1), ALSA (0 in, 2 out)
   7 HD-Audio Generic: ALC1220 Alt Analog (hw:2,2), ALSA (2 in, 0 out)
>  8 HD Webcam C525: USB Audio (hw:3,0), ALSA (1 in, 0 out)
<  9 EPOS H3PRO: USB Audio (hw:4,0), ALSA (1 in, 2 out)
  10 hdmi, ALSA (0 in, 2 out)


In [41]:
# Device indices come from the sd.query_devices() listing and are specific to
# this machine; adjust them before running elsewhere.
input_device, output_device = 8, 9
sd.default.device = (input_device, output_device)  # (input_device, output_device)

In [43]:
# Look up the currently selected input device (first entry of sd.default.device)
# and report the sample rate it suggests for recording.
selected_input = sd.default.device[0]
device_info = sd.query_devices(selected_input, 'input')
print(f"Suggested sample rate: {device_info['default_samplerate']}")


Suggested sample rate: 48000.0


In [34]:
# Dump every property of the input device queried above.
# NOTE(review): the saved output shows index 9 (EPOS H3PRO) although the input
# device is set to 8 earlier in the notebook, and the execution counts are out
# of order — the output looks stale; re-run top to bottom to confirm.
print(device_info)

{'name': 'EPOS H3PRO: USB Audio (hw:4,0)', 'index': 9, 'hostapi': 0, 'max_input_channels': 1, 'max_output_channels': 2, 'default_low_input_latency': 0.024, 'default_low_output_latency': 0.008, 'default_high_input_latency': 0.096, 'default_high_output_latency': 0.032, 'default_samplerate': 48000.0}


In [35]:
# Pick one labelled clip ("Happy" class) from the downloaded dataset.
# The path is assembled from components instead of concatenating with "/" so
# it stays valid regardless of the platform separator; it is converted back to
# str so downstream consumers receive the same type as before.
audio_path = str(pathlib.Path(path) / "Happy" / "SM5_F4_H058.wav")

# Last expression of the cell: renders an inline audio player for the clip
display.Audio(audio_path)


In [10]:
# Classify the dataset clip selected above
predicted_emotion = predict_emotion(
    audio_path,
    model=model,
    feature_extractor=feature_extractor,
    id2label=id2label,
)
print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: happy


## Recording test

In [46]:
# Recording settings
duration = 3  # seconds
sample_rate = 48000  # Hz
output_path = "data/raw"
os.makedirs(output_path, exist_ok=True)

In [52]:
# Record audio
print("🎙️ Recording... Speak now!")
audio = sd.rec(int(duration * sample_rate), samplerate=int(sample_rate), channels=1, dtype='int16')
sd.wait()
print("✅ Done recording!")

# Save as WAV
filename = os.path.join(output_path, "speech_sample.wav")
write(filename, sample_rate, audio)
print(f"📁 Audio saved to: {filename}")

🎙️ Recording... Speak now!
✅ Done recording!
📁 Audio saved to: data/raw/speech_sample.wav


In [53]:
# Classify the clip that was just recorded from the microphone
predicted_emotion = predict_emotion(
    filename,
    model=model,
    feature_extractor=feature_extractor,
    id2label=id2label,
)
print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: fearful
