In [17]:
import os
import random
import shutil

import evaluate
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import Dataset as TorchDataset
from transformers import ASTConfig, ASTForAudioClassification, Trainer, TrainingArguments

# --- Set Random Seeds for Reproducibility ---
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators with *seed*.

    The CUDA generators are seeded too, but only when a CUDA device is available.
    """
    for seed_generator in (random.seed, np.random.seed, torch.manual_seed):
        seed_generator(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# --- Configuration ---
# Pretrained checkpoint and filesystem locations
MODEL_NAME = "MIT/ast-finetuned-audioset-10-10-0.4593"
MANIFEST_PATH = 'data/processed/manifest.csv'
OUTPUT_DIR = './results'
BEST_MODEL_DIR = './best_model'

# Cross-validation / optimisation hyper-parameters
NUM_EPOCHS = 10
BATCH_SIZE = 8
NUM_FOLDS = 10
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01

# Normalization statistics used for AST inputs (AudioSet pretraining values)
AUDIOSET_MEAN = -4.2677393
AUDIOSET_STD = 4.5689974

# --- Class Labels ---
# The position of a name in this list is its integer class id.
CLASS_LABELS = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]
id2label = dict(enumerate(CLASS_LABELS))
label2id = {name: index for index, name in id2label.items()}

# --- Custom Dataset with Lazy Loading ---
class SpectrogramDataset(TorchDataset):
    """
    Map-style dataset that reads pre-computed spectrogram ``.npy`` files on demand.

    For every item the array stored at the row's ``spectrogram_path`` is
    loaded, oriented so that its first axis has 128 entries (mel bins),
    optionally augmented, normalized as ``(x - mean) / (2 * std)`` and
    returned transposed (``[time, 128]``) as a float32 tensor together with
    the row's ``classID`` as a long tensor.
    """
    def __init__(self, dataframe, mean=AUDIOSET_MEAN, std=AUDIOSET_STD, transform=None):
        # Drop the incoming index so rows are numbered 0..N-1
        self.df = dataframe.reset_index(drop=True)
        self.transform = transform
        self.mean, self.std = mean, std

    def __len__(self):
        return len(self.df)

    def normalize_spectrogram(self, spectrogram):
        """Shift by ``self.mean`` and divide by twice ``self.std``."""
        scale = self.std * 2
        return (spectrogram - self.mean) / scale

    def __getitem__(self, idx):
        record = self.df.iloc[idx]

        # The array is only read from disk when the item is requested
        spec = np.load(record['spectrogram_path'])

        # A 3-D array is expected to carry a leading singleton axis; drop it
        if spec.ndim == 3:
            spec = spec.squeeze(0)

        # Bring the 128-entry (mel) axis to the front, or fail if neither
        # of the first two axes has 128 entries
        if spec.shape[0] != 128:
            if spec.shape[1] != 128:
                raise ValueError(f"Expected one dimension to be 128 (n_mels), got shape {spec.shape}")
            spec = spec.T

        # Augmentation (if any) runs on the [n_mels, time] layout, before normalization
        if self.transform:
            spec = self.transform(spec)

        spec = self.normalize_spectrogram(spec)

        # The model input is laid out as [time, 128], hence the final transpose
        return {
            'input_values': torch.tensor(spec.T, dtype=torch.float32),
            'labels': torch.tensor(record['classID'], dtype=torch.long),
        }

# --- Data Collator for Batching ---
class SpectrogramCollator:
    """
    Batch spectrogram samples into fixed-size tensors by truncating or padding.

    Every sample's ``input_values`` (shape ``[time, num_mel_bins]``) is cut or
    right-padded along the time axis to exactly ``target_length`` frames and
    the results are stacked into ``[batch_size, target_length, num_mel_bins]``.

    Args:
        target_length: Number of time frames every sample is resized to.
        num_mel_bins: Required size of the feature axis of every sample.
        pad_value: Value written into padded frames. The default ``0.0``
            reproduces plain zero padding.
            NOTE(review): padding happens after the dataset has normalized
            the spectrogram; a pipeline that zero-pads *before* normalizing
            ends up with ``(0 - mean) / (2 * std)`` in the padded frames, so
            pass that value here if that is the behaviour to match — confirm
            against the feature extractor used for pretraining.
    """
    def __init__(self, target_length=1024, num_mel_bins=128, pad_value=0.0):
        self.target_length = target_length  # Fixed number of time frames per sample
        self.num_mel_bins = num_mel_bins
        self.pad_value = pad_value

    def _fit_to_length(self, input_val):
        """Return ``input_val`` truncated or right-padded to ``target_length`` frames."""
        current_time = input_val.shape[0]
        if current_time > self.target_length:
            return input_val[:self.target_length, :]
        if current_time < self.target_length:
            pad_amount = self.target_length - current_time
            # new_full keeps the sample's dtype and device
            padding = input_val.new_full((pad_amount, input_val.shape[1]), self.pad_value)
            return torch.cat([input_val, padding], dim=0)
        return input_val

    def __call__(self, features):
        """
        Collate a list of samples into one batch.

        Args:
            features: Non-empty sequence of dicts, each holding an
                ``'input_values'`` tensor of shape ``[time, num_mel_bins]``
                and a ``'labels'`` tensor.

        Returns:
            Dict with ``'input_values'`` of shape
            ``[batch, target_length, num_mel_bins]`` and the stacked ``'labels'``.

        Raises:
            ValueError: If ``features`` is empty or a sample does not have
                shape ``[time, num_mel_bins]``.
        """
        if not features:
            raise ValueError("Cannot collate an empty batch")

        padded_inputs = []
        labels = []

        for f in features:
            input_val = f['input_values']  # Shape: [time, num_mel_bins]

            # Explicit check instead of `assert`, which is stripped under -O
            if input_val.ndim != 2 or input_val.shape[1] != self.num_mel_bins:
                raise ValueError(
                    f"Expected input_values of shape [time, {self.num_mel_bins}], "
                    f"got {tuple(input_val.shape)}"
                )

            padded_inputs.append(self._fit_to_length(input_val))
            labels.append(f['labels'])

        return {
            'input_values': torch.stack(padded_inputs, dim=0),
            'labels': torch.stack(labels, dim=0)
        }

# --- Simple Data Augmentation ---
class SpectrogramAugmentation:
    """Randomly overwrite a time span and/or a frequency band (SpecAugment-style).

    With chance ``1 - probability`` the input is returned untouched. Otherwise
    a copy is made and each of the two masks is applied independently on a
    coin flip, provided the axis is longer than the mask parameter. Masked
    cells are set to the mean of the (possibly already masked) copy.
    """
    def __init__(self, time_mask_param=20, freq_mask_param=20, probability=0.5):
        self.probability = probability
        self.freq_mask_param = freq_mask_param
        self.time_mask_param = time_mask_param

    @staticmethod
    def _random_span(max_width, axis_size):
        """Draw a span of 1..max_width cells lying inside an axis of axis_size cells."""
        width = random.randint(1, max_width)
        start = random.randint(0, axis_size - width)
        return start, start + width

    def __call__(self, spectrogram):
        # Skip augmentation entirely for this sample
        if random.random() > self.probability:
            return spectrogram

        # Work on a copy so the caller's array is never modified
        masked = spectrogram.copy()

        # Mask along axis 1 (time)
        if random.random() > 0.5:
            if masked.shape[1] > self.time_mask_param:
                begin, end = self._random_span(self.time_mask_param, masked.shape[1])
                masked[:, begin:end] = masked.mean()

        # Mask along axis 0 (frequency)
        if random.random() > 0.5:
            if masked.shape[0] > self.freq_mask_param:
                begin, end = self._random_span(self.freq_mask_param, masked.shape[0])
                masked[begin:end, :] = masked.mean()

        return masked

# --- Load Data ---
print("Loading manifest...")
manifest_df = pd.read_csv(MANIFEST_PATH)

# Every spectrogram listed in the manifest must exist on disk before training starts
print("Validating spectrogram paths...")
missing_files = list(
    filter(lambda candidate: not os.path.exists(candidate), manifest_df['spectrogram_path'])
)

# Report how many files are missing (plus a small sample) and abort
if missing_files:
    print(f"WARNING: {len(missing_files)} spectrogram files not found!")
    print(f"First few missing: {missing_files[:5]}")
    raise FileNotFoundError("Some spectrogram files are missing")

print(f"Found {len(manifest_df)} spectrograms")

# --- Custom Metrics ---
# Metric objects are loaded once and reused; the previous version called
# `evaluate.load` four times on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called *name*, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores; the
            arg-max over axis 1 is taken as the predicted class) and
            ``label_ids`` (reference class ids).

    Returns:
        Dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    predictions = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids

    scores = {
        'accuracy': _get_metric("accuracy").compute(
            predictions=predictions, references=labels
        )['accuracy'],
    }
    # The remaining metrics share the same call shape and macro averaging
    for name in ("precision", "recall", "f1"):
        scores[name] = _get_metric(name).compute(
            predictions=predictions, references=labels, average='macro'
        )[name]
    return scores

# --- Main Training Loop ---
fold_accuracies = []
fold_f1_scores = []
best_fold_f1 = 0.0

augmentation = SpectrogramAugmentation()
data_collator = SpectrogramCollator(target_length=1024)

# Explicit label ids keep the reports and the confusion matrix aligned with
# CLASS_LABELS even when a class has no samples in the held-out fold.
report_label_ids = list(range(len(CLASS_LABELS)))

for k in range(1, NUM_FOLDS + 1):
    print(f"\n{'='*60}")
    print(f"Starting Training for Fold {k}/{NUM_FOLDS}")
    print(f"{'='*60}")

    # Release cached GPU memory before the next model is created
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Hold out fold k for evaluation and train on every other fold
    train_df = manifest_df[manifest_df['fold'] != k]
    eval_df = manifest_df[manifest_df['fold'] == k]

    print(f"Train samples: {len(train_df)}, Eval samples: {len(eval_df)}")

    # Augmentation is applied to the training split only
    train_dataset = SpectrogramDataset(train_df, transform=augmentation)
    eval_dataset = SpectrogramDataset(eval_df, transform=None)

    # Debug: print sample/batch shapes once, on the first fold
    if k == 1:
        print(f"\nChecking dataset format...")
        sample = train_dataset[0]
        print(f"  Sample input_values shape: {sample['input_values'].shape}")
        print(f"  Expected: [time, 128]")

        test_batch = data_collator([train_dataset[0], train_dataset[1]])
        print(f"  Batched input_values shape: {test_batch['input_values'].shape}")
        print(f"  Expected: [2, time, 128]\n")

    # Fresh pretrained model per fold; ignore_mismatched_sizes lets the
    # classifier head be re-created for len(CLASS_LABELS) outputs
    model = ASTForAudioClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(CLASS_LABELS),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )

    # Training Arguments
    training_args = TrainingArguments(
        output_dir=f'{OUTPUT_DIR}/fold_{k}',
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=0.1,
        logging_dir=f'./logs/fold_{k}',
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=4 if torch.cuda.is_available() else 0,
        seed=42,
        remove_unused_columns=False,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )

    # Train
    print("Starting training...")
    trainer.train()

    # Evaluate: one inference pass yields both the aggregate metrics and the
    # per-sample predictions (previously evaluate() and predict() each ran
    # the model over the whole fold). The "eval" prefix keeps the metric
    # keys identical to those returned by trainer.evaluate().
    print("Evaluating on validation fold...")
    predictions = trainer.predict(eval_dataset, metric_key_prefix="eval")
    eval_results = predictions.metrics

    current_accuracy = eval_results['eval_accuracy']
    current_f1 = eval_results['eval_f1']

    fold_accuracies.append(current_accuracy)
    fold_f1_scores.append(current_f1)

    print(f"\nFold {k} Results:")
    print(f"  Accuracy:  {current_accuracy:.4f}")
    print(f"  Precision: {eval_results['eval_precision']:.4f}")
    print(f"  Recall:    {eval_results['eval_recall']:.4f}")
    print(f"  F1 Score:  {current_f1:.4f}")

    # Detailed classification report
    pred_labels = np.argmax(predictions.predictions, axis=1)
    true_labels = predictions.label_ids

    print("\nDetailed Classification Report:")
    print(classification_report(
        true_labels,
        pred_labels,
        labels=report_label_ids,
        target_names=CLASS_LABELS,
        digits=4
    ))

    # Save best model.
    # NOTE: each fold's F1 is measured on a different held-out fold, so
    # "best" here is a heuristic choice rather than a like-for-like comparison.
    if current_f1 > best_fold_f1:
        best_fold_f1 = current_f1
        print(f"\nNew best model found in fold {k} (F1: {current_f1:.4f})!")
        print(f"Saving to {BEST_MODEL_DIR}")
        trainer.save_model(BEST_MODEL_DIR)

        # Save confusion matrix and metrics next to the model
        cm = confusion_matrix(true_labels, pred_labels, labels=report_label_ids)
        np.save(f'{BEST_MODEL_DIR}/confusion_matrix.npy', cm)

        class_report = classification_report(
            true_labels,
            pred_labels,
            labels=report_label_ids,
            target_names=CLASS_LABELS,
            output_dict=True
        )
        pd.DataFrame(class_report).transpose().to_csv(
            f'{BEST_MODEL_DIR}/class_metrics.csv'
        )

    # Cleanup: remove this fold's checkpoints; the last fold's directory is kept
    if k < NUM_FOLDS:
        checkpoint_dir = f'{OUTPUT_DIR}/fold_{k}'
        if os.path.exists(checkpoint_dir):
            shutil.rmtree(checkpoint_dir)

# --- Final Summary ---
print("\n" + "=" * 60, "Cross-Validation Complete", "=" * 60, sep="\n")

# Mean and (population) standard deviation of the per-fold scores
mean_accuracy, std_accuracy = (stat(fold_accuracies) for stat in (np.mean, np.std))
mean_f1, std_f1 = (stat(fold_f1_scores) for stat in (np.mean, np.std))

print(f"\nFold Accuracies: {[format(acc, '.4f') for acc in fold_accuracies]}")
print(f"Average Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")

print(f"\nFold F1 Scores: {[format(f1, '.4f') for f1 in fold_f1_scores]}")
print(f"Average F1 Score: {mean_f1:.4f} ± {std_f1:.4f}")

print(f"\nBest model saved to: {BEST_MODEL_DIR}")
print(f"Best F1 Score: {best_fold_f1:.4f}")

# Persist one row per fold so the results can be inspected later
results_summary = pd.DataFrame(
    {
        'fold': range(1, NUM_FOLDS + 1),
        'accuracy': fold_accuracies,
        'f1_score': fold_f1_scores,
    }
)
results_summary.to_csv(f'{OUTPUT_DIR}/cv_results.csv', index=False)
print(f"\nResults saved to: {OUTPUT_DIR}/cv_results.csv")

Loading manifest...
Validating spectrogram paths...
Found 8732 spectrograms

Starting Training for Fold 1/10
Train samples: 7859, Eval samples: 873

Checking dataset format...
  Sample input_values shape: torch.Size([10, 128])
  Expected: [time, 128]
  Batched input_values shape: torch.Size([2, 1024, 128])
  Expected: [2, time, 128]



Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4241,1.072684,0.715922,0.733641,0.748837,0.712498
2,0.3482,1.399186,0.778923,0.802379,0.802143,0.791332
3,0.1016,1.631806,0.774341,0.782862,0.806057,0.791302
4,0.1244,1.783436,0.764032,0.773579,0.798676,0.777257
5,0.0249,2.293173,0.705613,0.754509,0.746624,0.740723
6,0.0121,2.025012,0.757159,0.783489,0.790317,0.782356
7,0.0187,2.021762,0.768614,0.7906,0.804619,0.7902
8,0.0397,1.813349,0.75945,0.792311,0.791169,0.781042
9,0.0055,1.677921,0.78236,0.809106,0.814456,0.806296
10,0.0165,1.819502,0.785796,0.80466,0.81599,0.808419


Downloading builder script: 7.56kB [00:00, 10.5MB/s]
Downloading builder script: 7.38kB [00:00, 9.41MB/s]
Downloading builder script: 6.79kB [00:00, 10.4MB/s]


Evaluating on validation fold...



Fold 1 Results:
  Accuracy:  0.7858
  Precision: 0.8047
  Recall:    0.8160
  F1 Score:  0.8084

Detailed Classification Report:
                  precision    recall  f1-score   support

 air_conditioner     0.6333    0.5700    0.6000       100
        car_horn     0.8947    0.9444    0.9189        36
children_playing     0.9223    0.9500    0.9360       100
        dog_bark     0.8532    0.9300    0.8900       100
        drilling     0.5400    0.5400    0.5400       100
   engine_idling     0.8953    0.8021    0.8462        96
        gun_shot     0.9459    1.0000    0.9722        35
      jackhammer     0.6522    0.6250    0.6383       120
           siren     0.8095    0.9884    0.8901        86
    street_music     0.9000    0.8100    0.8526       100

        accuracy                         0.7858       873
       macro avg     0.8047    0.8160    0.8084       873
    weighted avg     0.7836    0.7858    0.7827       873


New best model found in fold 1 (F1: 0.8084)!
Saving to ./best_model

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4881,0.813151,0.743243,0.791228,0.767179,0.763684
2,0.317,1.208485,0.772523,0.82662,0.787932,0.789257
3,0.1203,1.475425,0.778153,0.853207,0.788295,0.792459
4,0.1266,0.986272,0.807432,0.842151,0.805522,0.816413
5,0.0246,1.32187,0.810811,0.845513,0.819097,0.824455
6,0.0011,1.376557,0.800676,0.810253,0.796658,0.791573
7,0.0001,1.178063,0.837838,0.863845,0.83652,0.845992
8,0.0314,1.294562,0.797297,0.827906,0.811227,0.814143
9,0.0107,1.39669,0.808559,0.840684,0.810145,0.818428
10,0.0121,1.409237,0.829955,0.864873,0.82852,0.831471


Evaluating on validation fold...



Fold 2 Results:
  Accuracy:  0.8378
  Precision: 0.8638
  Recall:    0.8365
  F1 Score:  0.8460

Detailed Classification Report:
                  precision    recall  f1-score   support

 air_conditioner     0.6058    0.6300    0.6176       100
        car_horn     1.0000    0.6905    0.8169        42
children_playing     0.8829    0.9800    0.9289       100
        dog_bark     0.9245    0.9800    0.9515       100
        drilling     0.8557    0.8300    0.8426       100
   engine_idling     0.6667    0.7400    0.7014       100
        gun_shot     1.0000    0.8857    0.9394        35
      jackhammer     0.8077    0.7000    0.7500       120
           siren     1.0000    0.9890    0.9945        91
    street_music     0.8952    0.9400    0.9171       100

        accuracy                         0.8378       888
       macro avg     0.8638    0.8365    0.8460       888
    weighted avg     0.8423    0.8378    0.8374       888


New best model found in fold 2 (F1: 0.8460)!
Saving to ./best_model

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4614,1.015183,0.743784,0.79964,0.771678,0.777334
2,0.1864,1.306763,0.743784,0.813523,0.781665,0.763517
3,0.1177,1.254789,0.775135,0.793079,0.801767,0.784514
4,0.159,1.472486,0.755676,0.799471,0.78727,0.77648
5,0.0183,2.188994,0.674595,0.739752,0.705084,0.694337
6,0.0701,1.473075,0.783784,0.813726,0.814695,0.807257
7,0.0212,1.524834,0.791351,0.809519,0.816062,0.807763
8,0.0265,1.622917,0.794595,0.822029,0.818565,0.811957
9,0.0006,1.890111,0.784865,0.818879,0.812014,0.804313
10,0.0003,1.845352,0.79027,0.825699,0.817325,0.807735


Evaluating on validation fold...



Fold 3 Results:
  Accuracy:  0.7946
  Precision: 0.8220
  Recall:    0.8186
  F1 Score:  0.8120

Detailed Classification Report:
                  precision    recall  f1-score   support

 air_conditioner     0.8750    0.5600    0.6829       100
        car_horn     0.8571    0.9767    0.9130        43
children_playing     0.8333    0.9500    0.8879       100
        dog_bark     1.0000    0.8700    0.9305       100
        drilling     0.6967    0.8500    0.7658       100
   engine_idling     0.7674    0.6168    0.6839       107
        gun_shot     1.0000    0.9722    0.9859        36
      jackhammer     0.6415    0.5667    0.6018       120
           siren     0.9360    0.9832    0.9590       119
    street_music     0.6131    0.8400    0.7089       100

        accuracy                         0.7946       925
       macro avg     0.8220    0.8186    0.8120       925
    weighted avg     0.8056    0.7946    0.7912       925


Starting Training for Fold 4/10
Train samples: 7742, Eval samples: 990

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4787,0.972032,0.741414,0.789106,0.75602,0.748239
2,0.2597,1.01706,0.774747,0.845179,0.76053,0.774443
3,0.1112,1.074,0.811111,0.850904,0.801262,0.805368
4,0.0849,1.019274,0.813131,0.827316,0.812911,0.817881
5,0.0828,1.123505,0.824242,0.833485,0.816375,0.820305
6,0.0682,1.28544,0.819192,0.838501,0.818581,0.812236
7,0.0311,1.101576,0.829293,0.842003,0.82837,0.832608
8,0.0295,1.253309,0.826263,0.851056,0.820018,0.829376
9,0.0042,1.094727,0.841414,0.848257,0.839175,0.838935
10,0.022,1.119292,0.841414,0.857857,0.839814,0.840802


Evaluating on validation fold...



Fold 4 Results:
  Accuracy:  0.8414
  Precision: 0.8579
  Recall:    0.8398
  F1 Score:  0.8408

Detailed Classification Report:
                  precision    recall  f1-score   support

 air_conditioner     0.8617    0.8100    0.8351       100
        car_horn     0.9556    0.7288    0.8269        59
children_playing     0.9175    0.8900    0.9036       100
        dog_bark     0.9300    0.9300    0.9300       100
        drilling     0.5274    0.7700    0.6260       100
   engine_idling     0.8667    0.9720    0.9163       107
        gun_shot     0.9474    0.9474    0.9474        38
      jackhammer     0.7500    0.4500    0.5625       120
           siren     0.9486    1.0000    0.9736       166
    street_music     0.8738    0.9000    0.8867       100

        accuracy                         0.8414       990
       macro avg     0.8579    0.8398    0.8408       990
    weighted avg     0.8521    0.8414    0.8385       990


Starting Training for Fold 5/10
Train samples: 7796, Eval samples: 936

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4616,0.821743,0.805556,0.842547,0.795454,0.800918
2,0.3163,0.83351,0.818376,0.84043,0.83326,0.828376
3,0.2225,0.963042,0.845085,0.876719,0.857215,0.857967
4,0.0927,0.945159,0.870726,0.891886,0.881287,0.878162
5,0.0869,1.013343,0.852564,0.887841,0.863624,0.868093
6,0.1041,1.025642,0.867521,0.882656,0.87716,0.873726
7,0.074,1.151937,0.866453,0.883147,0.878581,0.871432
8,0.0477,1.18542,0.875,0.899698,0.88641,0.881966
9,0.022,1.215128,0.869658,0.887475,0.881385,0.878451
10,0.0158,1.201162,0.877137,0.891564,0.888572,0.884638


Evaluating on validation fold...



Fold 5 Results:
  Accuracy:  0.8771
  Precision: 0.8916
  Recall:    0.8886
  F1 Score:  0.8846

Detailed Classification Report:
                  precision    recall  f1-score   support

 air_conditioner     0.9149    0.8600    0.8866       100
        car_horn     0.9592    0.9592    0.9592        98
children_playing     0.9560    0.8700    0.9110       100
        dog_bark     0.9684    0.9200    0.9436       100
        drilling     0.9062    0.8700    0.8878       100
   engine_idling     0.6849    0.4673    0.5556       107
        gun_shot     0.8889    1.0000    0.9412        40
      jackhammer     0.6860    0.9833    0.8082       120
           siren     1.0000    0.9859    0.9929        71
    street_music     0.9510    0.9700    0.9604       100

        accuracy                         0.8771       936
       macro avg     0.8916    0.8886    0.8846       936
    weighted avg     0.8823    0.8771    0.8734       936


New best model found in fold 5 (F1: 0.8846)!
Saving to ./best_model

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4271,1.100714,0.755772,0.820525,0.757158,0.756576
2,0.1608,1.584369,0.761847,0.782747,0.789444,0.776715
3,0.1375,1.489294,0.748481,0.786614,0.774848,0.770579
4,0.1122,1.721475,0.787363,0.845844,0.804261,0.806784
5,0.0431,1.764671,0.793439,0.825072,0.810972,0.809905
6,0.0706,1.91525,0.792224,0.830552,0.813347,0.817425
7,0.049,1.374851,0.839611,0.867037,0.859705,0.861255
8,0.0411,1.722301,0.798299,0.820513,0.817829,0.810991
9,0.0301,1.505311,0.821385,0.831285,0.835426,0.830654
10,0.0017,1.442772,0.831106,0.828417,0.843396,0.825191


Evaluating on validation fold...



Fold 6 Results:
  Accuracy:  0.8396
  Precision: 0.8670
  Recall:    0.8597
  F1 Score:  0.8613

Detailed Classification Report:
                  precision    recall  f1-score   support

 air_conditioner     0.5648    0.6100    0.5865       100
        car_horn     0.9333    1.0000    0.9655        28
children_playing     0.8972    0.9600    0.9275       100
        dog_bark     0.9300    0.9300    0.9300       100
        drilling     0.9688    0.9300    0.9490       100
   engine_idling     0.6373    0.6075    0.6220       107
        gun_shot     1.0000    1.0000    1.0000        46
      jackhammer     0.9811    0.7647    0.8595        68
           siren     0.9275    0.8649    0.8951        74
    street_music     0.8304    0.9300    0.8774       100

        accuracy                         0.8396       823
       macro avg     0.8670    0.8597    0.8613       823
    weighted avg     0.8442    0.8396    0.8400       823


Starting Training for Fold 7/10
Train samples: 7894, Eval samples: 838

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4603,1.068901,0.76611,0.78939,0.764565,0.743404
2,0.2645,1.388488,0.775656,0.816854,0.776745,0.769408
3,0.1523,1.328987,0.824582,0.818007,0.828345,0.810711
4,0.0713,1.202387,0.823389,0.827523,0.826429,0.818319
5,0.1037,1.452576,0.816229,0.806272,0.82038,0.80082
6,0.0292,1.55336,0.793556,0.816719,0.799461,0.788637
7,0.0854,1.632049,0.798329,0.825271,0.803358,0.789088
8,0.0257,1.520984,0.818616,0.817186,0.823411,0.808805
9,0.0123,1.622335,0.819809,0.843992,0.82314,0.810685
10,0.0125,1.527813,0.837709,0.864267,0.837283,0.829605


Evaluating on validation fold...



Fold 7 Results:
  Accuracy:  0.8377
  Precision: 0.8643
  Recall:    0.8373
  F1 Score:  0.8296

Detailed Classification Report:
                  precision    recall  f1-score   support

 air_conditioner     0.5612    0.7800    0.6527       100
        car_horn     0.9310    0.9643    0.9474        28
children_playing     0.9691    0.9400    0.9543       100
        dog_bark     0.8868    0.9400    0.9126       100
        drilling     0.8148    0.8800    0.8462       100
   engine_idling     0.8532    0.8774    0.8651       106
        gun_shot     0.9792    0.9216    0.9495        51
      jackhammer     0.8421    0.2105    0.3368        76
           siren     0.9091    0.9091    0.9091        77
    street_music     0.8962    0.9500    0.9223       100

        accuracy                         0.8377       838
       macro avg     0.8643    0.8373    0.8296       838
    weighted avg     0.8511    0.8377    0.8247       838


Starting Training for Fold 8/10
Train samples: 7926, Eval samples: 806

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4272,1.317024,0.677419,0.756169,0.713166,0.695247
2,0.2555,1.520271,0.720844,0.811856,0.730601,0.731276
3,0.1827,1.733972,0.734491,0.787118,0.752416,0.747203
4,0.156,1.913136,0.753102,0.83084,0.751831,0.756661
5,0.0675,1.459318,0.781638,0.813124,0.786904,0.786657
6,0.0189,2.112428,0.765509,0.838832,0.774108,0.7788
7,0.0447,2.257657,0.748139,0.810698,0.76554,0.75749
8,0.0065,1.968717,0.75062,0.789879,0.774394,0.749416
9,0.0193,2.216465,0.76799,0.833839,0.775725,0.776917
10,0.0028,2.231504,0.774194,0.850107,0.787271,0.789573


Evaluating on validation fold...



Fold 8 Results:
  Accuracy:  0.7742
  Precision: 0.8501
  Recall:    0.7873
  F1 Score:  0.7896

Detailed Classification Report:
                  precision    recall  f1-score   support

 air_conditioner     0.9189    0.3400    0.4964       100
        car_horn     1.0000    0.8667    0.9286        30
children_playing     0.7759    0.9000    0.8333       100
        dog_bark     0.9694    0.9500    0.9596       100
        drilling     0.4921    0.9300    0.6436       100
   engine_idling     0.9398    0.8864    0.9123        88
        gun_shot     1.0000    1.0000    1.0000        30
      jackhammer     0.7917    0.4872    0.6032        78
           siren     0.8654    0.5625    0.6818        80
    street_music     0.7480    0.9500    0.8370       100

        accuracy                         0.7742       806
       macro avg     0.8501    0.7873    0.7896       806
    weighted avg     0.8240    0.7742    0.7652       806


Starting Training for Fold 9/10
Train samples: 7916, Eval samples: 816

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5405,0.653464,0.827206,0.845364,0.854345,0.836097
2,0.3029,0.924083,0.832108,0.855653,0.842484,0.844423
3,0.2534,0.760183,0.873775,0.896677,0.886469,0.882392
4,0.0926,0.836112,0.882353,0.897131,0.890964,0.890048
5,0.0013,1.241603,0.834559,0.857966,0.862969,0.834597
6,0.0576,1.237832,0.846814,0.874117,0.866668,0.866515
7,0.0694,1.011456,0.856618,0.882374,0.881189,0.87194
8,0.026,1.11754,0.870098,0.891878,0.889844,0.884154
9,0.0291,1.189393,0.867647,0.89091,0.88975,0.880417
10,0.0133,1.292359,0.857843,0.883714,0.881969,0.871854


Evaluating on validation fold...



Fold 9 Results:
  Accuracy:  0.8824
  Precision: 0.8971
  Recall:    0.8910
  F1 Score:  0.8900

Detailed Classification Report:
                  precision    recall  f1-score   support

 air_conditioner     0.8267    0.6200    0.7086       100
        car_horn     0.9412    1.0000    0.9697        32
children_playing     0.9670    0.8800    0.9215       100
        dog_bark     0.8922    0.9100    0.9010       100
        drilling     0.9231    0.8400    0.8796       100
   engine_idling     0.8866    0.9663    0.9247        89
        gun_shot     1.0000    0.8387    0.9123        31
      jackhammer     0.8152    0.9146    0.8621        82
           siren     0.9425    1.0000    0.9704        82
    street_music     0.7769    0.9400    0.8507       100

        accuracy                         0.8824       816
       macro avg     0.8971    0.8910    0.8900       816
    weighted avg     0.8857    0.8824    0.8799       816


New best model found in fold 9 (F1: 0.8900)!
Saving to ./best_model

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4562,0.583094,0.818399,0.851066,0.827681,0.83064
2,0.2923,0.650437,0.854241,0.857823,0.871504,0.857376
3,0.1183,0.946698,0.841099,0.872213,0.850643,0.850774
4,0.156,0.924812,0.864994,0.907969,0.869587,0.879451
5,0.0448,0.973469,0.873357,0.901891,0.87922,0.881629
6,0.0768,0.688573,0.900836,0.917661,0.90679,0.908569
7,0.0148,0.889033,0.891278,0.913132,0.895975,0.899991
8,0.0336,0.918052,0.888889,0.908357,0.895992,0.89655
9,0.0253,0.884037,0.88172,0.904989,0.887552,0.888932
10,0.0185,0.845626,0.893668,0.914865,0.899917,0.90111


Evaluating on validation fold...



Fold 10 Results:
  Accuracy:  0.9008
  Precision: 0.9177
  Recall:    0.9068
  F1 Score:  0.9086

Detailed Classification Report:
                  precision    recall  f1-score   support

 air_conditioner     0.8509    0.9700    0.9065       100
        car_horn     1.0000    0.9394    0.9688        33
children_playing     0.8962    0.9500    0.9223       100
        dog_bark     0.8070    0.9200    0.8598       100
        drilling     1.0000    0.8300    0.9071       100
   engine_idling     0.9494    0.8065    0.8721        93
        gun_shot     0.9697    1.0000    0.9846        32
      jackhammer     0.9307    0.9792    0.9543        96
           siren     0.9091    0.7229    0.8054        83
    street_music     0.8636    0.9500    0.9048       100

        accuracy                         0.9008       837
       macro avg     0.9177    0.9068    0.9086       837
    weighted avg     0.9067    0.9008    0.8998       837


New best model found in fold 10 (F1: 0.9086)!
Saving to ./best_model