# **Identifying Emotional Inconsistencies in Speech**

**Anomaly Detection Logic**
| Case             | Logic                                                   | Confidence | Message                                                               |
| ---------------- | ------------------------------------------------------- | ---------- | --------------------------------------------------------------------- |
| ✅ Full Match     | Text emotion matches majority audio emotion             | High       | "No vocal anomaly detected. Emotion is consistent across modalities." |
| ⚠️ Partial Match | Text matches at least one audio model, but not majority | Medium     | "Partial match. Voice may carry subtle differences in tone."          |
| ❌ Mismatch       | Text emotion does not match any audio model             | High       | "Vocal anomaly detected. Spoken emotion differs from text meaning."   |


In [76]:
import os
import torch
import joblib

# Constants
SAMPLE_RATE = 16000
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Class index -> emotion name, used to decode the RoBERTa text classifier output.
text_emotions_map = { 0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise" }

# For each text emotion, the audio labels that count as consistent with it.
text_to_audio_map = {
    "sadness": ["SAD"],
    "joy": ["HAP"],
    "love": ["HAP", "NEU"],
    "anger": ["ANG", "DIS"],   # Disgust can be close to anger
    "fear": ["FEA", "DIS"],    # Fear sometimes overlaps with disgust
    "surprise": ["HAP", "NEU"]
}

# Working directory of the kernel (os.getcwd()); every path below is resolved against it.
current_dir = os.getcwd()
print("current_dir", current_dir)

# Sample audio to predict (relative to current_dir)
basic_audio_path = "test/youre-funny-1.wav"

# Trained models and their locations.
# NOTE: despite the `_path` suffix, `svm_model_path` and `rf_model_path` hold the
# *loaded estimator objects* (later cells call `.predict` on them); the names are
# kept unchanged so those cells keep working.
# NOTE(security): joblib.load unpickles its input and can execute arbitrary code;
# only load model files that come from a trusted source.
svm_model_path = joblib.load(f"{current_dir}/SVM/svm_model.joblib")
cnn_model_path = f"{current_dir}/CNN/cnn_model.pth"
wav2vec2_model_path = f"{current_dir}/Wav2Vec2/best_model_v1.pt"
roberta_model_path = f"{current_dir}/roberta/model"
roberta_tokinezer_path = f"{current_dir}/roberta/tokenizer"
rf_model_path = joblib.load(f"{current_dir}/RF/rf_model.joblib")
# Label encoder shared by the SVM and RF predictors; resolved like its siblings.
le = joblib.load(f"{current_dir}/SVM/label_encoder.joblib")
json_path = "anomaly_results.json"


# Label normalisation helpers
audio_emotion_map = {'HAP': 0, 'SAD': 1, 'ANG': 2, 'FEA': 3, 'DIS': 4, 'NEU': 5}
# Inverse of audio_emotion_map, derived so the two can never drift apart.
label_to_emotion = {index: code for code, index in audio_emotion_map.items()}
normalized_label_map = { 'ANG': 'angry', 'DIS': 'disgust', 'FEA': 'fear', 'HAP': 'happy', 'NEU': 'neutral', 'SAD': 'sad'}

# Dictionary that accumulates every model's result for the final report
result_map = { "audio_input": basic_audio_path }
audio_input_path = f"{current_dir}/{basic_audio_path}"

current_dir d:\Projects\Research Paper\Voice Emotion Classification\experiment\SoundSense


In [77]:
from manage_models_v2 import *

# Normalize the input audio. On success, audio_input_path is repointed at the
# converted file; on a ValueError the error is printed and the original path is kept.
try:
    audio, sr, output_path = convert_wav_audio(audio_input_path, output_dir="test/normalised")
except ValueError as err:
    print(f"Error: {err}")
else:
    audio_input_path = output_path  # later cells read the corrected file
    print(f"Processed audio shape: {audio.shape}, Sample rate: {sr}, New input path: {audio_input_path}")

Corrected audio saved to: test/normalised\youre-funny-1.wav
Processed audio shape: (1, 37905), Sample rate: 16000, New input path: test/normalised\youre-funny-1.wav


# **Convert Audio to Text using wav2vec2-base-960h**

In [78]:
import torchaudio
import IPython.display as ipd 
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load pre-trained ASR model and processor
# Both come from the same "facebook/wav2vec2-base-960h" checkpoint so the processor's
# vocabulary matches the CTC head used for decoding in transcribe_audio.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
# Inference only. The model is not moved to `device` in this cell, and
# transcribe_audio feeds it inputs without moving them either.
model.eval()

# Load, downmix and resample audio
def load_audio(file_path, target_sr=16000):
    """Load an audio file as a mono 1-D waveform at ``target_sr`` Hz.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sr: Desired sample rate in Hz; the audio is resampled only when
            the file's own rate differs.

    Returns:
        A 1-D tensor of samples.
    """
    waveform, sample_rate = torchaudio.load(file_path)
    # The loaded tensor has a leading channel axis. Averaging over it turns
    # stereo (or any multi-channel) input into mono; for a single channel it
    # just drops the axis. A plain squeeze() would leave stereo files as
    # (2, time), which downstream code would treat as a batch of two clips.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)
    if sample_rate != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
        waveform = resampler(waveform)
    return waveform

# Speech-to-text helper
def transcribe_audio(file_path):
    """Transcribe the audio file at ``file_path`` and return lower-cased text.

    The waveform from ``load_audio`` is encoded by the module-level
    ``processor``, scored by the CTC ``model``, and the highest-scoring token
    at each frame is decoded back into a string.
    """
    speech = load_audio(file_path)
    encoded = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only
        frame_logits = model(encoded.input_values).logits
    best_token_ids = frame_logits.argmax(dim=-1)
    # Only the first (and only) item of the batch is decoded.
    return processor.decode(best_token_ids[0]).lower()

# Transcribe the input clip and record the transcript in the shared result map.
text = transcribe_audio(audio_input_path)
result_map["audio_text"] = text
# Last expression of the cell: renders an inline audio player for the clip.
ipd.Audio(audio_input_path)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
# Show the transcript that transcribe_audio produced for the input clip.
print("Transcribed Text:", text)

Transcribed Text: ha ha you're funny your funny


# **Predict Emotion from Audio Text using RoBERTa**

In [80]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch.nn.functional as F

# Predict the emotion expressed by a piece of text
def roberta_model(text) -> "tuple[str, float]":
    """Classify the emotion of ``text`` with the trained RoBERTa model.

    The model and tokenizer are loaded from ``roberta_model_path`` and
    ``roberta_tokinezer_path`` on every call.

    Args:
        text: The text to classify (here, the transcript of the audio clip).

    Returns:
        ``(emotion, probability)`` where ``emotion`` is the name looked up in
        ``text_emotions_map`` for the highest-scoring class and ``probability``
        is the softmax probability of that class.
    """
    # Local name differs from the function name so the function is not shadowed
    # inside its own body.
    classifier = RobertaForSequenceClassification.from_pretrained(roberta_model_path)
    tokenizer = RobertaTokenizer.from_pretrained(roberta_tokinezer_path)

    # Set the model to evaluation mode
    classifier.eval()

    # Move the model to GPU if available
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classifier.to(run_device)

    # Tokenize the input text. Over-long transcripts are truncated so they cannot
    # exceed the model's position-embedding table; RoBERTa reserves two positions
    # for its padding offset, hence the "- 2".
    max_tokens = classifier.config.max_position_embeddings - 2
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_tokens)

    # Move the inputs to the same device as the model
    inputs = {key: value.to(run_device) for key, value in inputs.items()}

    # Pass the inputs through the model
    with torch.no_grad():  # No gradients needed for inference
        outputs = classifier(**inputs)

    # Get the predicted label (highest logit value)
    logits = outputs.logits
    label = logits.argmax(dim=-1).item()

    # Probability of the predicted label via softmax over the class logits
    probabilities = F.softmax(logits, dim=-1)
    predicted_probability = probabilities[0][label].item()

    return text_emotions_map[label], predicted_probability

# Classify the transcript and record the predicted text emotion for the final report.
# Only the emotion name is stored in result_map; the probability is just printed.
text_emotion, predicted_probability  = roberta_model(text)
result_map["roberta_text_emotion"] = text_emotion
print(f"Predicted Text Emotion: {text_emotion} with probability of {predicted_probability}")

Predicted Text Emotion: surprise with probability of 0.9983890056610107


# **Predict Emotion from Audio Tone using Wav2Vec2.0**

In [81]:
import torchaudio
import os
import numpy as np
from transformers import Wav2Vec2FeatureExtractor
from manage_models import EmotionRecognitionModel  

# Load model
# The classifier gets one output per entry in audio_emotion_map, then the trained
# weights stored at wav2vec2_model_path are restored onto the active device.
wav2vec2_model = EmotionRecognitionModel(num_labels=len(audio_emotion_map))
# NOTE(review): torch.load unpickles the checkpoint, so only load trusted files.
# This call emits a warning in the recorded cell output; presumably the
# `weights_only` FutureWarning -- confirm, and consider weights_only=True if the
# file holds a plain state_dict.
wav2vec2_model.load_state_dict(torch.load(wav2vec2_model_path, map_location=device))
wav2vec2_model.to(device)
wav2vec2_model.eval()  # inference only

# Load feature extractor
# Turns raw waveforms into the `input_values` tensor consumed by predict_emotion_wav2vec2.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")

# Preprocessing function
def load_and_process_audio(file_path, sample_rate=SAMPLE_RATE):
    """Load an audio file as a mono 1-D waveform at ``sample_rate`` Hz.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        sample_rate: Desired sample rate in Hz; the audio is resampled only
            when the file's own rate differs.

    Returns:
        A 1-D tensor of samples.
    """
    waveform, sr = torchaudio.load(file_path)
    # Average over the leading channel axis. This downmixes stereo input to mono
    # and simply drops the axis for single-channel files. squeeze(0) alone leaves
    # a 2-channel file as (2, time), which the feature extractor would treat as
    # a batch of two clips.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
        waveform = resampler(waveform)
    return waveform

# Tone-based emotion prediction with the Wav2Vec2 classifier
def predict_emotion_wav2vec2(file_path):
    """Return the emotion code (a key of ``audio_emotion_map``) predicted for a clip.

    The waveform is loaded with ``load_and_process_audio``, converted to model
    inputs by ``feature_extractor`` and scored by ``wav2vec2_model``; the index
    of the highest logit is mapped back to its emotion code.
    """
    samples = load_and_process_audio(file_path).numpy()

    # Turn raw samples into the tensor the model expects, on the model's device
    encoded = feature_extractor(
        samples,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )
    model_input = encoded.input_values.to(device)

    with torch.no_grad():  # inference only
        class_scores = wav2vec2_model(model_input)
        best_class = torch.argmax(class_scores, dim=-1).item()

    # Invert audio_emotion_map (code -> index) to look the code up by index
    index_to_code = dict(zip(audio_emotion_map.values(), audio_emotion_map.keys()))
    return index_to_code[best_class]

# Predict the clip's emotion with the Wav2Vec2 classifier, convert the emotion code
# to its lower-case name via normalized_label_map, and record it for the final report.
wav2vec2_predicted_emotion = normalized_label_map[predict_emotion_wav2vec2(audio_input_path)]
result_map["wav2vec2_audio_emotion"] = wav2vec2_predicted_emotion
print("Predicted Emotion:", wav2vec2_predicted_emotion)

  wav2vec2_model.load_state_dict(torch.load(wav2vec2_model_path, map_location=device))


Predicted Emotion: disgust


# **Predict Emotion from Audio Tone using CNN**

In [82]:
from manage_models import AudioCNN, extract_features 

# Build the CNN and restore the trained weights stored at cnn_model_path.
cnn_model = AudioCNN()
# NOTE(review): torch.load unpickles the checkpoint, so only load trusted files.
# This call emits a warning in the recorded cell output; presumably the
# `weights_only` FutureWarning -- confirm before changing the call.
cnn_model.load_state_dict(torch.load(cnn_model_path, map_location=device))
cnn_model.eval()  # inference only
cnn_model.to(device)

# Tone-based emotion prediction with the CNN
def predict_emotion_cnn(file_path):
    """Return the emotion code predicted by ``cnn_model`` for an audio file.

    Returns ``None`` when ``extract_features`` yields no features for the file.
    Otherwise the features are turned into a single-item batch, scored by the
    model, and the highest-scoring class is looked up in ``label_to_emotion``.
    """
    cnn_model.eval()
    features = extract_features(file_path, augment=False)
    if features is None:
        return None

    # Add a leading batch axis and a trailing axis, then move that trailing
    # axis to position 1 before feeding the model.
    batch = torch.tensor(features, dtype=torch.float32).unsqueeze(0).unsqueeze(-1)
    batch = batch.to(device).permute(0, 3, 1, 2)

    with torch.no_grad():  # inference only
        scores = cnn_model(batch)
        best_class = torch.max(scores, 1)[1]
    return label_to_emotion[best_class.item()]


# predict_emotion_cnn returns None when feature extraction fails. Keep that None
# (as the SVM/RF cells do) instead of raising KeyError on normalized_label_map[None];
# evaluate_mismatch skips models without a prediction.
cnn_raw_label = predict_emotion_cnn(audio_input_path)
cnn_audio_emotion = normalized_label_map[cnn_raw_label] if cnn_raw_label is not None else None
result_map["cnn_audio_emotion"] = cnn_audio_emotion
print(f"Predicted emotion: {cnn_audio_emotion}")

Predicted emotion: angry


  cnn_model.load_state_dict(torch.load(cnn_model_path, map_location=device))


# **Predict Emotion from Audio Tone using SVM**

In [83]:
from manage_models_v2 import ml_model_extract_features 


def predict_emotion_svm(file_path):
    """Predict the emotion label of an audio file with the SVM model.

    Returns the label decoded by the label encoder ``le``, or ``None`` when
    feature extraction yields nothing or any step raises (the error is printed).
    """
    try:
        feature_vector = ml_model_extract_features(file_path)
        if feature_vector is not None:
            # The estimator expects a 2-D batch: (1, n_features)
            batch = np.expand_dims(feature_vector, axis=0)
            encoded_label = svm_model_path.predict(batch)
            return le.inverse_transform(encoded_label)[0]
        print("Failed to extract features.")
        return None
    except Exception as e:
        print(f"Error in SVM prediction: {e}")
        return None

# Predict the clip's emotion with the SVM and record it for the final report.
# The value is None when predict_emotion_svm could not produce a prediction.
svm_predicted_emotion = predict_emotion_svm(audio_input_path)
result_map["svm_audio_emotion"] = svm_predicted_emotion
print("SVM Predicted Emotion:", svm_predicted_emotion)

SVM Predicted Emotion: angry


# **Predict Emotion from Audio Tone using Random Forest**

In [84]:
def predict_emotion_rf(file_path):
    """Predict the emotion label of an audio file with the Random Forest model.

    Returns the label decoded by the label encoder ``le``, or ``None`` when
    feature extraction yields nothing or any step raises (the error is printed).
    """
    try:
        feature_vector = ml_model_extract_features(file_path)
        if feature_vector is not None:
            # The estimator expects a 2-D batch: (1, n_features)
            batch = np.expand_dims(feature_vector, axis=0)
            encoded_label = rf_model_path.predict(batch)
            return le.inverse_transform(encoded_label)[0]
        print("Failed to extract features.")
        return None
    except Exception as e:
        print(f"Error in RF prediction: {e}")
        return None

# Predict the clip's emotion with the Random Forest and record it for the final report.
# The value is None when predict_emotion_rf could not produce a prediction.
rf_predicted_emotion = predict_emotion_rf(audio_input_path)
result_map["rf_audio_emotion"] = rf_predicted_emotion
# The label names the model that actually produced this value (it previously said "SVM").
print("RF Predicted Emotion:", rf_predicted_emotion)


SVM Predicted Emotion: angry


# **Anomaly Detection using Predicted Results**


In [85]:
# Normalization from audio model outputs to CREMA-D standard
audio_normalizer = { "angry": "ANG", "happy": "HAP", "sad": "SAD", "neutral": "NEU", "fear": "FEA", "disgust": "DIS" }

def evaluate_mismatch(result_map):
    """Compare the text emotion with the audio models' emotions.

    Args:
        result_map: Dict holding ``"roberta_text_emotion"`` plus the four audio
            predictions (``"wav2vec2_audio_emotion"``, ``"cnn_audio_emotion"``,
            ``"svm_audio_emotion"``, ``"rf_audio_emotion"``). An audio entry that
            is missing or falsy (e.g. ``None`` after a failed prediction) is
            skipped.

    Returns:
        A new dict containing every entry of ``result_map`` plus ``status``,
        ``match_ratio``, ``text_emotion``, ``expected_audio_emotions`` and
        ``audio_predictions``.

    ``match_ratio`` is the share of usable audio predictions whose normalized
    label is listed in ``text_to_audio_map`` for the text emotion. A ratio of
    1.0 is reported as consistent, at least 0.5 as a partial mismatch, and
    anything lower as a mismatch.

    NOTE(review): the table at the top of the notebook describes "Full Match" as
    agreeing with the *majority* of audio models, while this function requires
    all of them. Confirm which rule is intended.
    """
    text_emotion = result_map["roberta_text_emotion"].lower()
    valid_audio_emotions = text_to_audio_map.get(text_emotion, [])

    # Collect predictions from all audio models. A model whose cell was skipped
    # simply contributes nothing instead of raising KeyError.
    audio_keys = (
        "wav2vec2_audio_emotion",
        "cnn_audio_emotion",
        "svm_audio_emotion",
        "rf_audio_emotion",
    )
    audio_preds = [result_map.get(key) for key in audio_keys]

    # Normalize predictions to match CREMA-D labels (ANG, DIS, etc.).
    # Labels unknown to audio_normalizer become None and count as non-matching.
    # str() guards against predictors that return non-string label objects.
    normalized_preds = [audio_normalizer.get(str(pred).lower()) for pred in audio_preds if pred]

    # Count matching predictions
    matches = [pred for pred in normalized_preds if pred in valid_audio_emotions]

    # Decision logic
    if not normalized_preds:
        # No audio model produced a prediction, so there is nothing to compare
        # (and dividing by zero must be avoided).
        match_ratio = 0.0
        status = "⚠️ Unable to evaluate: no audio model produced a prediction."
    else:
        match_ratio = len(matches) / len(normalized_preds)
        if match_ratio == 1.0:
            status = "✅ No vocal-text mismatch detected. Emotion is consistent across modalities."
        elif match_ratio >= 0.5:
            status = "⚠️ Partial mismatch: Some models disagree. Possible nuanced expression or mild anomaly."
        else:
            status = "❗ Mismatch detected: Text and voice emotions do not align clearly. Possible vocal anomaly."

    # Detailed explanation merged with the raw per-model results
    report = {
        "status": status,
        "match_ratio": match_ratio,
        "text_emotion": text_emotion,
        "expected_audio_emotions": valid_audio_emotions,
        "audio_predictions": normalized_preds,
    }

    return {**report, **result_map}

# **Final Anomaly Report**

In [86]:
import json
# Build the combined report from every model's recorded result.
anomaly_report = evaluate_mismatch(result_map)

# Persist the full report as indented JSON at json_path (overwrites any existing file).
with open(json_path, "w") as f:
    json.dump(anomaly_report, f, indent=4)

# Print only the headline verdict; the JSON file holds the details.
print(anomaly_report["status"])
print(f"For complete anomany report check {json_path}")

❗ Mismatch detected: Text and voice emotions do not align clearly. Possible vocal anomaly.
For complete anomany report check anomaly_results.json
