In [None]:
# Import Libraries
import random
from pathlib import Path

import joblib
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoFeatureExtractor, AutoModel

In [None]:
# Configuration - Must match training configuration
# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained model identifier; its last path component is embedded in the
# checkpoint filename built below.
PRETRAINED_MODEL_NAME = "openai/whisper-large-v3"
# Used as the LSTM input size when the classifier is instantiated.
PRETRAINED_MODEL_OUTPUT_DIM = 1280
# Not referenced by the evaluation cells visible here; kept so the
# configuration mirrors the training notebook.
PRETRAINED_MAX_LEN_PADDING = 448
SAMPLE_RATE = 16000

# Paths for saved models and features
MODEL_DIR = Path('./model')
FEATURE_DIR = Path('./model_features')
LABEL_ENCODER_PATH = MODEL_DIR.joinpath('label_encoder.joblib')
FEATURES_METADATA_PATH = FEATURE_DIR.joinpath('features_metadata.csv')
BEST_MODEL_PATH = MODEL_DIR.joinpath(
    'best_lstm_with_{}_features.pth'.format(PRETRAINED_MODEL_NAME.rsplit("/", 1)[-1])
)

# LSTM Model Parameters (must match training, otherwise the checkpoint's
# tensor shapes will not line up with the freshly built model)
LSTM_HIDDEN_SIZE = 256
LSTM_NUM_LAYERS = 3
LSTM_DROPOUT = 0.3

In [None]:
# Model Architecture (Re-define as in serv9.ipynb)
class EmotionClassifierLSTM(nn.Module):
    """Stacked bidirectional LSTM followed by a two-layer classification head.

    The sub-module attribute names are also the ``state_dict`` key prefixes,
    so they must stay as-is for a saved checkpoint to load into this class.
    """

    def __init__(self, input_dim: int, hidden_dim: int, n_layers: int, num_classes: int, dropout_prob: float):
        """Build the recurrent encoder and the classifier head.

        Args:
            input_dim: Size of the last axis of the input features.
            hidden_dim: LSTM hidden size per direction; also the width of the
                hidden linear layer.
            n_layers: Number of stacked LSTM layers.
            num_classes: Number of output logits.
            dropout_prob: Dropout probability used between LSTM layers (only
                when ``n_layers > 1``) and before the output layer.
        """
        super().__init__()

        # Inter-layer dropout is only requested for a stacked LSTM.
        inter_layer_dropout = dropout_prob if n_layers > 1 else 0
        self.lstm_layer = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=True,
        )

        # Both directions are concatenated, hence the doubled input width.
        self.classifier_hidden_layer = nn.Linear(2 * hidden_dim, hidden_dim)
        self.relu_activation = nn.ReLU()
        self.dropout_layer = nn.Dropout(dropout_prob)
        self.output_layer = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
        """Map a batch-first feature sequence to one row of class logits per item.

        Args:
            input_features: Tensor whose first axis is the batch and whose
                second axis is the sequence position (``batch_first=True``).

        Returns:
            Tensor of logits with one row per batch item.
        """
        lstm = self.lstm_layer
        # Explicit all-zero initial states: (layers * 2 directions, batch, hidden).
        state_shape = (lstm.num_layers * 2, input_features.size(0), lstm.hidden_size)
        initial_hidden = torch.zeros(state_shape, device=input_features.device)
        initial_cell = torch.zeros(state_shape, device=input_features.device)

        sequence_output, _ = lstm(input_features, (initial_hidden, initial_cell))

        # Summarise the sequence by its final position.
        # NOTE(review): at this position the backward direction has processed
        # only the last step. Changing the pooling would require retraining,
        # so it is kept identical to the training-time definition.
        final_step = sequence_output[:, -1, :]

        hidden = self.relu_activation(self.classifier_hidden_layer(final_step))
        return self.output_layer(self.dropout_layer(hidden))

In [None]:
# Dataset Class (Re-define as in serv9.ipynb)
class EmotionDataset(Dataset):
    """Dataset that yields ``(feature_tensor, label_tensor)`` pairs from a metadata frame.

    Each row of ``data_df`` names a feature file readable by ``torch.load``
    and a textual label that ``label_encoder`` can transform to an integer id.
    """

    def __init__(self,
                 data_df: pd.DataFrame,
                 feature_col: str,
                 label_col: str,
                 label_encoder: "LabelEncoder"):
        """Store the metadata frame and the lookup configuration.

        Args:
            data_df: One row per sample.
            feature_col: Column holding the path of the saved feature tensor.
            label_col: Column holding the textual label.
            label_encoder: Fitted encoder exposing ``transform``.
        """
        self.data_df = data_df
        self.feature_col = feature_col
        self.label_col = label_col
        self.label_encoder = label_encoder

    def __len__(self) -> int:
        """Return the number of rows in the metadata frame."""
        return len(self.data_df)

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Load the feature tensor and encoded label for the row at ``index``.

        Args:
            index: Positional row index (``iloc`` semantics).

        Returns:
            ``(feature_tensor, label_tensor)``; the label is a 0-d ``torch.long``
            tensor.
        """
        data_row = self.data_df.iloc[index]
        feature_path = data_row[self.feature_col]
        label_text = data_row[self.label_col]

        # Always materialise features on the CPU. A tensor saved from a GPU
        # would otherwise be restored onto that GPU, which fails on CPU-only
        # hosts and cannot be pinned by a DataLoader using pin_memory=True.
        # The consumer moves each batch to its target device.
        feature_tensor = torch.load(feature_path, map_location="cpu")
        label_id = self.label_encoder.transform([label_text])[0]
        label_tensor = torch.tensor(label_id, dtype=torch.long)

        return feature_tensor, label_tensor

In [None]:
# Load data and prepare for evaluation
print(f"Loading features metadata from: {FEATURES_METADATA_PATH}")
df_features = pd.read_csv(FEATURES_METADATA_PATH)

# NOTE: joblib files are pickle-based; only load an encoder you produced yourself.
print(f"Loading label encoder from: {LABEL_ENCODER_PATH}")
label_encoder = joblib.load(LABEL_ENCODER_PATH)
NUM_CLASSES = len(label_encoder.classes_)

print(f"Mapping classes: {dict(zip(label_encoder.classes_, range(NUM_CLASSES)))}")

# Re-split data to get the test set (must match training split).
# The first split holds out 30% of the rows and the second keeps half of that
# hold-out as the test set. Both are stratified on the encoded label with
# random_state=42; change neither unless the training notebook changes too.
df_features['encoded_labels'] = label_encoder.transform(df_features['labels'])
_, val_test_df = train_test_split(df_features, test_size=0.30, stratify=df_features['encoded_labels'], random_state=42)
_, test_df = train_test_split(val_test_df, test_size=0.50, stratify=val_test_df['encoded_labels'], random_state=42)

print(f"Test set size: {len(test_df)}")

# Create test dataset and data loader
test_dataset = EmotionDataset(
    data_df=test_df,
    feature_col="feature_path",
    label_col="labels",
    label_encoder=label_encoder,
)

# Evaluation batch size. The model runs in eval mode, so this trades memory
# for throughput and does not have to equal the training batch size.
BATCH_SIZE = 16
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only machine just triggers a warning.
    pin_memory=(DEVICE.type == "cuda"),
    drop_last=False,
)

In [None]:
# Initialize and load the best model
print(f"Initializing model and loading weights from: {BEST_MODEL_PATH}")

# Rebuild the architecture with the training-time hyperparameters so every
# tensor in the checkpoint finds a parameter of matching shape.
model = EmotionClassifierLSTM(
    PRETRAINED_MODEL_OUTPUT_DIM,  # input_dim
    LSTM_HIDDEN_SIZE,             # hidden_dim
    LSTM_NUM_LAYERS,              # n_layers
    NUM_CLASSES,                  # num_classes
    LSTM_DROPOUT,                 # dropout_prob
)
model = model.to(DEVICE)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=DEVICE))
model.eval()  # inference mode: disables dropout

print("Model loaded successfully.")

In [None]:
# Function to get predictions (from serv9.ipynb)
def get_predictions(model, data_loader, device):
    """Run ``model`` over every batch of ``data_loader`` and collect class ids.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.

    Args:
        model: Callable module returning one row of logits per batch item.
        data_loader: Iterable of ``(inputs, targets)`` tensor pairs.
        device: Device the inputs and targets are moved to before inference.

    Returns:
        ``(predicted_ids, true_ids)`` as two NumPy arrays in loader order.
    """
    model.eval()
    predicted_ids = []
    target_ids = []

    with torch.no_grad():
        for batch_inputs, batch_targets in tqdm(data_loader):
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            logits = model(batch_inputs)
            # Index of the largest logit along the class axis.
            _, batch_predictions = torch.max(logits, 1)

            predicted_ids.extend(batch_predictions.cpu().numpy())
            target_ids.extend(batch_targets.cpu().numpy())

    return np.array(predicted_ids), np.array(target_ids)

# Get predictions from the test set
# class_names[i] is the original label string for encoded id i.
class_names = label_encoder.classes_

print("Getting predictions on the test set...")
y_pred, y_true = get_predictions(model=model, data_loader=test_loader, device=DEVICE)
print("Predictions obtained.")

In [None]:
# Visualize Confusion Matrix
print("Generating Confusion Matrix...")
plt.figure(figsize=(10, 8))
# Pin the label set to every encoded class id. Without `labels=`, a class that
# appears in neither y_true nor y_pred is dropped from the matrix and the
# class_names tick labels no longer line up with its rows and columns.
sns.heatmap(confusion_matrix(y_true, y_pred, labels=list(range(len(class_names)))),
            annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Test Set')
plt.show()
print("Confusion Matrix displayed.")

In [None]:
# Classification Report
print("Generating Classification Report...")
print("\nClassification Report - Test Set:")
# Passing the full list of encoded ids keeps `target_names` aligned with the
# reported rows even when a class is missing from the test predictions. That
# mismatch previously raised ValueError and fell back to a report without
# class names. Absent classes are now listed with support 0.
print(classification_report(
    y_true,
    y_pred,
    labels=list(range(len(class_names))),
    target_names=class_names,
    zero_division=0,
))
print("Classification Report displayed.")

In [None]:
# Sanity Check with Test Data (Optional, from serv9.ipynb)
def run_sanity_checks(current_model, test_df, label_encoder, device, num_checks=20, seed=None):
    """Predict a random handful of test rows and print a per-sample comparison.

    Args:
        current_model: Model called with a single-item batch; switched to eval mode.
        test_df: Frame with ``feature_path`` and ``labels`` columns; an optional
            ``path`` column is printed as the original audio location.
        label_encoder: Fitted encoder exposing ``inverse_transform``.
        device: Device the saved feature tensors are loaded onto.
        num_checks: Maximum number of rows to sample. Values above the frame
            length are capped and negative values are treated as 0.
        seed: Optional seed for the row sampling. ``None`` (the default) draws
            from the global ``random`` state, as before.

    Returns:
        None. Results are printed.
    """
    current_model.eval()  # evaluation mode
    correct_predictions = 0

    print("--- Starting Sanity Check ---\n")
    total_samples = len(test_df)
    # random.sample rejects a negative or oversize k, so clamp to [0, total].
    sample_size = max(0, min(num_checks, total_samples))
    # A private generator makes a seeded run reproducible without touching
    # the global random state.
    sampler = random if seed is None else random.Random(seed)
    indices = sampler.sample(range(total_samples), sample_size)

    for idx, df_idx in enumerate(indices):
        row = test_df.iloc[df_idx]
        feature_path = row['feature_path']
        true_label = row['labels']

        # Load features and predict
        with torch.no_grad():
            # unsqueeze(0) adds the batch axis for the single sample.
            feature_tensor = torch.load(feature_path, map_location=device).unsqueeze(0)
            output = current_model(feature_tensor)
            _, predicted_id = torch.max(output, 1)
            predicted_label = label_encoder.inverse_transform(predicted_id.cpu().numpy())[0]

        is_correct = true_label == predicted_label

        # Display results
        original_audio_path = row.get('path', 'N/A')
        print(f"Sample {idx+1}/{sample_size}:")
        print(f"   Original Audio   : {original_audio_path}")
        print(f"   Feature File     : {feature_path}")
        print(f"   True Label       : {true_label}")
        print(f"   Predicted Label  : {predicted_label}")
        result = "Correct" if is_correct else "INCORRECT"
        print(f"   Result           : {result}\n")

        if is_correct:
            correct_predictions += 1

    # Final summary
    accuracy = (correct_predictions / sample_size) * 100 if sample_size > 0 else 0
    print("--- Sanity Check Summary ---")
    print(f"Total samples checked : {sample_size}")
    print(f"Correct predictions   : {correct_predictions}")
    print(f"Accuracy              : {accuracy:.2f}%\n")
    print("--- Sanity Check Complete ---")

# Number of random test rows to predict and print.
NUM_SANITY_CHECKS = 20
run_sanity_checks(
    current_model=model,
    test_df=test_df,
    label_encoder=label_encoder,
    device=DEVICE,
    num_checks=NUM_SANITY_CHECKS,
)