In [24]:
from google.colab import drive
# Mount Google Drive at /content/drive so the model and dataset paths used
# further down (all under /content/drive/MyDrive) resolve inside this runtime.
# force_remount=True requests a fresh mount even when one is already present.
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [25]:
import os

# Project folder on the mounted Drive. It is created on first run;
# exist_ok=True makes re-running this cell a no-op when it already exists.
base_path = "/content/drive/MyDrive/SER_yt"
os.makedirs(base_path, exist_ok=True)

In [26]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torch
import librosa
import numpy as np
import torch.nn.functional as F

# Load the saved sequence-classification model and its matching processor from
# the same checkpoint directory on Drive.
model_path = "/content/drive/MyDrive/SER_yt/my_saved_model"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
processor = Wav2Vec2Processor.from_pretrained(model_path)
# Switch to inference mode (disables training-only behaviour such as dropout).
# Left as the cell's last bare expression, so the notebook also echoes the
# model architecture as the cell output.
model.eval()


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [27]:
# Corrected label map: class index -> emotion name. The position of each name
# in the sequence below is the class index the model emits.
# NOTE(review): "ps" is presumably TESS's "pleasant surprise" class - confirm.
id2label = dict(enumerate((
    "happy",
    "neutral",
    "sad",
    "fear",
    "disgust",
    "ps",
    "angry",
)))

In [28]:
import torch.nn.functional as F

def predict_emotion(audio_path, sample_rate=16000, clip_seconds=2.0):
    """Classify the emotion expressed in a single audio file.

    The recording is loaded with librosa at ``sample_rate``, forced to exactly
    ``clip_seconds`` of audio (longer clips are truncated, shorter ones are
    zero-padded at the end), passed through the module-level ``processor`` and
    ``model``, and the most probable class is looked up in ``id2label``.

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.
        sample_rate: Sampling rate in Hz used both when loading the audio and
            when calling the processor. Defaults to 16000.
        clip_seconds: Fixed clip length in seconds. Defaults to 2.0, i.e.
            32000 samples at the default sampling rate.

    Returns:
        tuple: ``(label, confidence)`` where ``label`` is the ``id2label``
        entry of the highest-probability class and ``confidence`` is that
        class's softmax probability as a percentage rounded to two decimals.

    Raises:
        ValueError: If ``sample_rate * clip_seconds`` is less than one sample.
    """
    target_len = int(sample_rate * clip_seconds)
    if target_len <= 0:
        raise ValueError("sample_rate * clip_seconds must be at least one sample")

    # Load and preprocess the audio (librosa resamples to the requested rate)
    speech, _ = librosa.load(audio_path, sr=sample_rate)

    # Pad with trailing zeros or trim so every clip has exactly target_len samples
    if len(speech) < target_len:
        speech = np.pad(speech, (0, target_len - len(speech)), mode='constant')
    speech = speech[:target_len]

    # Process with processor
    inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")

    # Keep the input tensors on the same device as the model weights so the
    # function keeps working if the model has been moved to a GPU.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Predict
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=-1)

    # A single reduction over the class axis gives both the winning probability
    # and its index (one file per call, so each holds exactly one element).
    confidence, predicted_id = probs.max(dim=-1)

    return id2label[predicted_id.item()], round(confidence.item() * 100, 2)


In [30]:
# Sanity check on one known TESS recording: the file lives in the YAF_sad
# folder, so the expected prediction is "sad".
test_audio_path = "/content/drive/MyDrive/SER_yt/Dataset/TESS Toronto emotional speech set data/YAF_sad/YAF_wife_sad.wav"
emotion, confidence = predict_emotion(test_audio_path)
print(f"Predicted Emotion: {emotion} (Confidence: {confidence}%)")

Predicted Emotion: sad (Confidence: 86.87%)


In [34]:
import os
import random

dataset_root = "/content/drive/MyDrive/SER_yt/Dataset/TESS Toronto emotional speech set data"

# Gather all .wav file paths recursively. The extension check is
# case-insensitive, and the list is sorted so its order does not depend on the
# order in which os.walk happens to visit the filesystem.
all_audio_files = sorted(
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk(dataset_root)
    for filename in filenames
    if filename.lower().endswith(".wav")
)

# os.walk yields nothing for a missing directory, so fail with an explicit
# message instead of the opaque IndexError random.choice raises on an empty list.
if not all_audio_files:
    raise FileNotFoundError(f"No .wav files found under {dataset_root!r}")

# Choose one random file
random_audio_path = random.choice(all_audio_files)

# Predict emotion
emotion, confidence = predict_emotion(random_audio_path)
print(f"Random File: {os.path.basename(random_audio_path)}")
print(f"Predicted Emotion: {emotion} (Confidence: {confidence}%)")


Random File: OAF_dog_fear.wav
Predicted Emotion: fear (Confidence: 86.92%)
