In [2]:
# Install required libraries
!pip install torch transformers librosa soundfile

import torch
import librosa
import soundfile as sf
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

# Define emotion labels (example labels, adjust based on your dataset).
# Declared before the model so the classification head can be sized to match.
emotion_labels = {
    0: "neutral",
    1: "happy",
    2: "sad",
    3: "angry",
    4: "fearful",
    5: "disgust",
    6: "surprised"
}

# Load pre-trained model and feature extractor.
# NOTE: the recorded output of this cell reports the classifier/projector
# weights as newly initialized for this checkpoint, so the model has to be
# fine-tuned on labelled emotion data before its predictions are meaningful.
model_name = "facebook/wav2vec2-large-robust-ft-swbd-300h"  # Example model, fine-tune for emotion detection
# Size the classification head to the label set. Without `num_labels` the head
# is built with the library's default number of labels, so the higher indices
# in `emotion_labels` could never be predicted.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotion_labels),
)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# Turn an audio file on disk into model-ready tensors
def preprocess_audio(file_path):
    """Load ``file_path`` at 16 kHz and run it through the feature extractor.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        The output of the module-level ``feature_extractor`` for the loaded
        waveform, requested as PyTorch tensors with padding enabled.
    """
    # librosa resamples to the requested rate while loading.
    waveform, sample_rate = librosa.load(file_path, sr=16000)
    return feature_extractor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )

# Classify the emotion of a single audio file
def detect_emotion(file_path):
    """Return the emotion label the model predicts for ``file_path``.

    Args:
        file_path: Path of the audio file to classify.

    Returns:
        The entry of ``emotion_labels`` for the highest-scoring class, or
        ``"unknown"`` when that class index has no entry in the mapping.
    """
    model_inputs = preprocess_audio(file_path)
    # Inference only: gradient tracking is switched off.
    with torch.no_grad():
        output = model(**model_inputs)
    class_index = output.logits.argmax(dim=-1).item()
    return emotion_labels.get(class_index, "unknown")

# Test the function: classify one sample file and print the label string that
# detect_emotion returns for it.
file_path = "surprised.wav"  # Replace with your audio file path
emotion = detect_emotion(file_path)
print(f"Detected Emotion: {emotion}")



Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-robust-ft-swbd-300h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected Emotion: happy


In [None]:
import torch
import librosa
import soundfile as sf
import numpy as np
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import matplotlib.pyplot as plt

# Define emotion labels.
# Declared before the model so the classification head can be sized to match.
emotion_labels = {
    0: "neutral",
    1: "happy",
    2: "sad",
    3: "angry",
    4: "fearful",
    5: "disgust",
    6: "surprised"
}

# Load pre-trained model and feature extractor.
# NOTE(review): presumably this checkpoint carries no emotion-classification
# head, so the classifier would start from random weights and need fine-tuning
# before the metrics computed in this cell are meaningful - confirm.
model_name = "facebook/wav2vec2-large-robust-ft-swbd-300h"  # Example model, you can replace it with a suitable one
# Size the classification head to the label set. Without `num_labels` the head
# is built with the library's default number of labels, so the higher indices
# in `emotion_labels` could never be predicted.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotion_labels),
)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# Turn an audio file on disk into model-ready tensors
def preprocess_audio(file_path):
    """Load ``file_path`` at 16 kHz and run it through the feature extractor.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        The output of the module-level ``feature_extractor`` for the loaded
        waveform, requested as PyTorch tensors with padding enabled.
    """
    # librosa resamples to the requested rate while loading.
    waveform, sample_rate = librosa.load(file_path, sr=16000)
    return feature_extractor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )

# Classify the emotion of a single audio file
def detect_emotion(file_path):
    """Predict the emotion class of ``file_path``.

    Args:
        file_path: Path of the audio file to classify.

    Returns:
        A ``(class_index, label)`` tuple: the index of the highest-scoring
        class and its entry in ``emotion_labels`` (``"unknown"`` when the
        index has no entry in the mapping).
    """
    model_inputs = preprocess_audio(file_path)
    # Inference only: gradient tracking is switched off.
    with torch.no_grad():
        output = model(**model_inputs)
    class_index = output.logits.argmax(dim=-1).item()
    return class_index, emotion_labels.get(class_index, "unknown")

# Define a simple dataset (file paths and ground truth labels)
test_files = ["surprised.wav"]  # Replace with your audio files
ground_truth = [6]  # Replace with actual labels corresponding to the audio files

# Every file needs exactly one label; fail before running inference rather
# than after it if the two lists have drifted apart.
if len(test_files) != len(ground_truth):
    raise ValueError(
        f"test_files has {len(test_files)} entries but ground_truth has {len(ground_truth)}"
    )

# Store predicted labels
predicted_labels = []

# Perform emotion detection for each file in the test set
for file in test_files:
    predicted_class, _ = detect_emotion(file)
    predicted_labels.append(predicted_class)

# Calculate metrics.
# zero_division=0 makes the score explicit (0) for classes that have no
# predicted or no true samples, instead of triggering UndefinedMetricWarning.
# This is easy to hit with a tiny test set like the one above.
accuracy = accuracy_score(ground_truth, predicted_labels)
precision = precision_score(ground_truth, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(ground_truth, predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(ground_truth, predicted_labels, average='weighted', zero_division=0)

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Plot the metrics as one bar per score
metrics = [accuracy, precision, recall, f1]
metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

plt.figure(figsize=(10, 6))
plt.bar(metrics_names, metrics, color='skyblue')
plt.xlabel('Metric')
plt.ylabel('Score')
plt.title('Emotion Detection Performance Metrics')
plt.ylim(0, 1)  # all four scores lie in [0, 1]
plt.show()


In [4]:
!pip install transformers torchaudio librosa pandas numpy matplotlib seaborn

import torch
import librosa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

# Publicly available checkpoint used for inference
model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
processor = Wav2Vec2Processor.from_pretrained(model_name)
# NOTE(review): the recorded output of this cell reports `classifier.*` and
# `projector.*` as newly initialized for this checkpoint, i.e. the head used
# for prediction is untrained when loaded through this class. Check the model
# card for the intended model class before trusting the scores.
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

# Output index -> label name, used when plotting and printing scores.
# NOTE(review): assumed to match the order and number of the model's outputs -
# TODO confirm against the checkpoint's config.
emotion_labels = dict(enumerate(["Anger", "Sadness", "Neutral", "Happiness", "Fear"]))

def analyze_emotion(audio_path):
    """Score one audio file with the module-level ``processor`` and ``model``.

    Args:
        audio_path: Path of the audio file; it is loaded with librosa at 16 kHz.

    Returns:
        NumPy array holding the first row of the softmax-normalised logits.
    """
    # The returned sampling rate is not needed: it equals the requested 16 kHz.
    samples, _ = librosa.load(audio_path, sr=16000)

    model_inputs = processor(
        samples,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
        return_attention_mask=True,
    )

    # Inference only: gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**model_inputs).logits

    # Normalise across dim 1, then keep the first (only) item of the batch.
    return torch.softmax(logits, dim=1).numpy()[0]

def plot_results(waveform, sr, probabilities):
    """Draw a three-panel figure: waveform, spectrogram, and per-label scores.

    Args:
        waveform: Audio samples, passed to librosa's ``waveshow`` and ``stft``.
        sr: Sampling rate of ``waveform``.
        probabilities: Scores plotted against the values of the module-level
            ``emotion_labels``, in the same order.
    """
    # `import librosa` alone does not load the display submodule on every
    # librosa release, so import it explicitly before it is used below.
    import librosa.display

    plt.figure(figsize=(15, 5))

    # Waveform
    plt.subplot(1, 3, 1)
    librosa.display.waveshow(waveform, sr=sr)
    plt.title("Audio Waveform")

    # Spectrogram: magnitude of the STFT converted to decibels
    plt.subplot(1, 3, 2)
    X = librosa.stft(waveform)
    Xdb = librosa.amplitude_to_db(abs(X))
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
    plt.colorbar(format="%+2.0f dB")
    plt.title("Spectrogram")

    # Emotion probabilities
    plt.subplot(1, 3, 3)
    sns.barplot(x=list(emotion_labels.values()), y=probabilities, palette="viridis")
    plt.title("Emotion Probabilities")
    plt.xticks(rotation=45)
    plt.ylabel("Confidence")

    plt.tight_layout()
    plt.show()

# Usage: score one audio file, plot it, then list the per-label scores.
audio_path = "sad.wav"  # Replace with your file
probs = analyze_emotion(audio_path)
# Load the waveform again for plotting; analyze_emotion returns only the scores.
waveform, sr = librosa.load(audio_path, sr=16000)
plot_results(waveform, sr, probs)

# Print results
print("Emotion Predictions:")
# zip() stops at the shorter of the two sequences, so a label without a
# matching score (or a score without a label) is silently left out.
for emotion, prob in zip(emotion_labels.values(), probs):
    print(f"{emotion}: {prob:.4f}")

Collecting torch==2.5.1 (from torchaudio)
  Using cached torch-2.5.1-cp310-cp310-win_amd64.whl.metadata (28 kB)
Using cached torch-2.5.1-cp310-cp310-win_amd64.whl (203.1 MB)
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 2.2.2
    Uninstalling torch-2.2.2:
      Successfully uninstalled torch-2.2.2
Successfully installed torch-2.5.1


  You can safely remove it manually.

[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/661M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: cannot import name '_data_ptr_allocated' from 'torch.distributed.utils' (C:\Users\SASAPU TARUN\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\distributed\utils.py)

In [5]:
!pip install --upgrade torch torchaudio transformers

Collecting torch
  Downloading torch-2.6.0-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting torchaudio
  Downloading torchaudio-2.6.0-cp310-cp310-win_amd64.whl.metadata (6.7 kB)
Downloading torch-2.6.0-cp310-cp310-win_amd64.whl (204.2 MB)
   ---------------------------------------- 0.0/204.2 MB ? eta -:--:--
   ---------------------------------------- 1.3/204.2 MB 8.4 MB/s eta 0:00:25
    --------------------------------------- 2.9/204.2 MB 7.6 MB/s eta 0:00:27
    --------------------------------------- 4.2/204.2 MB 7.6 MB/s eta 0:00:27
   - -------------------------------------- 5.5/204.2 MB 7.3 MB/s eta 0:00:28
   - -------------------------------------- 6.6/204.2 MB 6.9 MB/s eta 0:00:29
   - -------------------------------------- 7.3/204.2 MB 6.2 MB/s eta 0:00:32
   - -------------------------------------- 8.4/204.2 MB 6.0 MB/s eta 0:00:33
   - -------------------------------------- 9.7/204.2 MB 5.9 MB/s eta 0:00:34
   -- ------------------------------------- 10.7/204.2 MB 5.9


[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
!pip install transformers torchaudio librosa pandas numpy matplotlib seaborn

import torch
import librosa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

# Publicly available checkpoint used for inference
model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
processor = Wav2Vec2Processor.from_pretrained(model_name)
# NOTE(review): the recorded output of this cell reports `classifier.*` and
# `projector.*` as newly initialized for this checkpoint, i.e. the head used
# for prediction is untrained when loaded through this class. Check the model
# card for the intended model class before trusting the scores.
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

# Output index -> label name, used when plotting and printing scores.
# NOTE(review): assumed to match the order and number of the model's outputs -
# TODO confirm against the checkpoint's config.
emotion_labels = dict(enumerate(["Anger", "Sadness", "Neutral", "Happiness", "Fear"]))

def analyze_emotion(audio_path):
    """Score one audio file with the module-level ``processor`` and ``model``.

    Args:
        audio_path: Path of the audio file; it is loaded with librosa at 16 kHz.

    Returns:
        NumPy array holding the first row of the softmax-normalised logits.
    """
    # The returned sampling rate is not needed: it equals the requested 16 kHz.
    samples, _ = librosa.load(audio_path, sr=16000)

    model_inputs = processor(
        samples,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
        return_attention_mask=True,
    )

    # Inference only: gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**model_inputs).logits

    # Normalise across dim 1, then keep the first (only) item of the batch.
    return torch.softmax(logits, dim=1).numpy()[0]

def plot_results(waveform, sr, probabilities):
    """Draw a three-panel figure: waveform, spectrogram, and per-label scores.

    Args:
        waveform: Audio samples, passed to librosa's ``waveshow`` and ``stft``.
        sr: Sampling rate of ``waveform``.
        probabilities: Scores plotted against the values of the module-level
            ``emotion_labels``, in the same order.
    """
    # `import librosa` alone does not load the display submodule on every
    # librosa release, so import it explicitly before it is used below.
    import librosa.display

    plt.figure(figsize=(15, 5))

    # Waveform
    plt.subplot(1, 3, 1)
    librosa.display.waveshow(waveform, sr=sr)
    plt.title("Audio Waveform")

    # Spectrogram: magnitude of the STFT converted to decibels
    plt.subplot(1, 3, 2)
    X = librosa.stft(waveform)
    Xdb = librosa.amplitude_to_db(abs(X))
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
    plt.colorbar(format="%+2.0f dB")
    plt.title("Spectrogram")

    # Emotion probabilities
    plt.subplot(1, 3, 3)
    sns.barplot(x=list(emotion_labels.values()), y=probabilities, palette="viridis")
    plt.title("Emotion Probabilities")
    plt.xticks(rotation=45)
    plt.ylabel("Confidence")

    plt.tight_layout()
    plt.show()

# Usage: score one audio file, plot it, then list the per-label scores.
audio_path = "sad.wav"  # Replace with your file
probs = analyze_emotion(audio_path)
# Load the waveform again for plotting; analyze_emotion returns only the scores.
waveform, sr = librosa.load(audio_path, sr=16000)
plot_results(waveform, sr, probs)

# Print results
print("Emotion Predictions:")
# zip() stops at the shorter of the two sequences, so a label without a
# matching score (or a score without a label) is silently left out.
for emotion, prob in zip(emotion_labels.values(), probs):
    print(f"{emotion}: {prob:.4f}")




[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: cannot import name '_data_ptr_allocated' from 'torch.distributed.utils' (C:\Users\SASAPU TARUN\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\distributed\utils.py)