# Model Evaluation Notebook
This notebook evaluates a fine-tuned Audio Spectrogram Transformer (AST) model on an unseen audio dataset. It loads the fine-tuned model and uses its feature extractor to process raw audio directly.

In [None]:
import evaluate
import numpy as np
import torch
from datasets import Audio, Dataset, DatasetDict, load_dataset
from transformers import ASTFeatureExtractor, ASTForAudioClassification, Trainer, TrainingArguments

## Configuration

In [None]:
# --- Evaluation Configuration ---
# Placeholder: replace with the dataset to evaluate on, e.g. a Hub dataset ID
# such as 'username/my_test_audio_dataset' or a local path.
DATASET_NAME = "YOUR_EVALUATION_DATASET_NAME_OR_PATH"

# --- Model & Cache Paths ---
# Identifier passed to `from_pretrained` for both the model and the feature
# extractor.
# NOTE(review): the current value reads like a dataset placeholder; replace it
# with the fine-tuned model's Hub ID or the directory it was saved to.
MODEL_HUB_ID = "username/my_test_audio_dataset"
# Cache directory passed to `load_dataset` and the model's `from_pretrained`.
CACHE_DIR = './cache'

# --- Audio Processing Parameters (should match training) ---
# Sample rate, in Hz, that the audio column is cast to (16 kHz).
TARGET_SAMPLE_RATE = 16000
# Length of one audio chunk in milliseconds (1 second).
CHUNK_LENGTH_MS = 1000
# The same chunk length expressed in samples at TARGET_SAMPLE_RATE.
CHUNK_LENGTH_SAMPLES = int(TARGET_SAMPLE_RATE * CHUNK_LENGTH_MS / 1000)

# --- Evaluation Hyperparameters ---
# Per-device batch size used during evaluation.
BATCH_SIZE = 32

## Device Configuration

In [None]:
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

## Load Dataset

In [None]:
def load_and_prepare_dataset(dataset_name: str, split: str = "train") -> Dataset:
    """Load one split of the evaluation dataset and normalise its audio column.

    Args:
        dataset_name: Dataset identifier or path forwarded to ``load_dataset``.
        split: Name of the split to load. Defaults to ``"train"``, which is
            what this notebook originally hard-coded.

    Returns:
        The requested split as a single ``Dataset`` (passing ``split`` to
        ``load_dataset`` yields one split, not a ``DatasetDict``), with its
        ``"audio"`` column cast to mono audio at ``TARGET_SAMPLE_RATE``.

    Raises:
        Exception: Any error raised while loading or casting is printed and
            then re-raised unchanged.
    """
    try:
        ds = load_dataset(dataset_name, split=split, cache_dir=CACHE_DIR)
        # Ensure audio is at the target sample rate and mono
        ds = ds.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLE_RATE, mono=True))
    except Exception as e:
        print(f"Failed to load or prepare dataset {dataset_name}: {e}")
        raise

    print(f"Dataset {dataset_name} loaded successfully with {ds.num_rows} examples.")
    return ds


## Metrics Computation

In [None]:
# Load each metric once, in the same order as before, so compute_metrics can
# reuse the loaded objects on every call.
accuracy_metric, recall_metric, precision_metric, f1_metric = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "recall", "precision", "f1")
)

# Averaging strategy passed to precision/recall/F1; "binary" targets a
# two-class problem.
AVERAGE_MODE = "binary"

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict merging the outputs of the accuracy, precision, recall and F1
        metrics, in that order. Predictions are the argmax of ``logits`` over
        the last axis; precision, recall and F1 use ``AVERAGE_MODE``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Accuracy takes no averaging mode; the other three share AVERAGE_MODE.
    metric_calls = (
        (accuracy_metric, {}),
        (precision_metric, {"average": AVERAGE_MODE}),
        (recall_metric, {"average": AVERAGE_MODE}),
        (f1_metric, {"average": AVERAGE_MODE}),
    )

    scores = {}
    for metric, extra_kwargs in metric_calls:
        scores.update(
            metric.compute(predictions=predicted_classes, references=labels, **extra_kwargs)
        )
    return scores

## Load Fine-tuned Model and Feature Extractor

In [None]:
# Load the fine-tuned model and its matching feature extractor from
# MODEL_HUB_ID, then move the model to DEVICE.
try:
    print(f"Attempting to load model and feature extractor from Hugging Face Hub ID: {MODEL_HUB_ID}")
    model = ASTForAudioClassification.from_pretrained(MODEL_HUB_ID, cache_dir=CACHE_DIR)
    # Pin the extractor's sampling rate to the rate the dataset is cast to,
    # and share the model's cache directory so both downloads land together.
    feature_extractor = ASTFeatureExtractor.from_pretrained(
        MODEL_HUB_ID,
        cache_dir=CACHE_DIR,
        sampling_rate=TARGET_SAMPLE_RATE,
    )
    print(f"Successfully loaded model and feature extractor from {MODEL_HUB_ID}")
    model.to(DEVICE)
except Exception as e:
    print(f"Error loading model/feature extractor from {MODEL_HUB_ID}: {e}")
    # Re-raise so the notebook stops here, rather than failing in a later
    # cell with a NameError on `model` or `feature_extractor`.
    raise

## Preprocessing Function for Evaluation
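This function processes raw audio using the loaded `ASTFeatureExtractor`. It passes `CHUNK_LENGTH_SAMPLES` (1 second) as the truncation/padding length when creating the spectrogram features, with the aim of staying consistent with the training process. Note that this relies on the extractor honouring the `max_length`, `padding` and `truncation` arguments; confirm this against the installed `transformers` version.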

In [None]:
def preprocess_for_evaluation(examples):
    """Convert a batch of decoded audio into feature-extractor outputs.

    Args:
        examples: Batch dict whose ``'input_values'`` entry holds one audio
            dict per example; only each dict's ``"array"`` is read here.

    Returns:
        The same ``examples`` dict, with ``'input_values'`` overwritten by the
        feature extractor's ``input_values`` output (requested as PyTorch
        tensors).
    """
    waveforms = []
    for audio in examples['input_values']:
        waveforms.append(audio["array"])

    # NOTE(review): max_length/padding/truncation are meant to fix every clip
    # to CHUNK_LENGTH_SAMPLES, but ASTFeatureExtractor appears to pad/truncate
    # to its own configured max_length (in spectrogram frames) and may ignore
    # these keyword arguments. Confirm against the installed transformers
    # version and against how training preprocessed its audio.
    extracted = feature_extractor(
        waveforms,
        return_tensors="pt",
        sampling_rate=TARGET_SAMPLE_RATE,
        truncation=True,
        padding="max_length",
        max_length=CHUNK_LENGTH_SAMPLES,
    )

    # Overwrite the raw audio with the features under the same column name,
    # which is the name the trainer is given for the model input.
    examples["input_values"] = extracted.input_values
    return examples


In [None]:
# Fetch the evaluation data with its audio column already cast to mono at
# TARGET_SAMPLE_RATE.
dataset = load_and_prepare_dataset(dataset_name=DATASET_NAME)

In [None]:
# Apply the preprocessing
dataset = dataset.rename_column('audio', 'input_values')
processed_dataset = dataset.with_transform(preprocess_for_evaluation)

## Trainer Initialization and Evaluation

In [None]:
# Evaluation-only settings: training is switched off, evaluation is switched
# on, and no reporting integration is requested.
training_args = TrainingArguments(
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=BATCH_SIZE,
    report_to="none",
    output_dir="./eval_results",
)

In [None]:
# Connect the loaded model, the transformed evaluation data and the metric
# function; the feature extractor is passed as the trainer's processing class.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=feature_extractor,
    eval_dataset=processed_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run evaluation on the trainer's eval dataset; `results` holds the mapping
# that `trainer.evaluate()` returns.
print("Starting evaluation...")
results = trainer.evaluate()

In [None]:
# Print every entry returned by the evaluation, one "name: value" per line.
print("--- Evaluation Results ---")
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")