In [None]:
### Installations ###
#####################

!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,

In [None]:
### Imports ###
###############

# general
import shutil
import os
import pandas as pd
import numpy  as np
from sklearn.model_selection import StratifiedShuffleSplit

# Audio Spectrogram Transformer Model and data loading
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer
from datasets import Dataset
from datasets import load_dataset, Audio
import torch

from torch.utils.data import DataLoader
import torchaudio

In [None]:
### Set data directory ###
##########################

# connect to Google Drive (Colab only); the mount has to happen before any
# path below /content/drive is read
from google.colab import drive
drive.mount('/content/drive')

# folder on the mounted Drive that the dataset archive is copied from
data_dir = '/content/drive/MyDrive/0_Masterarbeit/4_High_Level_Features/data'

Mounted at /content/drive


### Prepare Dataset

In [None]:
# Load audio samples
# Copy the dataset archive from Drive into the working directory and unpack it.
shutil.copy(os.path.join(data_dir, 'MER_audio_taffc_dataset.zip'), './')
# exist_ok=True keeps this cell re-runnable: a bare makedirs raises
# FileExistsError once the folder exists from an earlier execution.
os.makedirs('4Q Audio Emotion', exist_ok=True)
shutil.unpack_archive('./MER_audio_taffc_dataset.zip', extract_dir='./4Q Audio Emotion')

In [None]:
### Annotations ###
###################

# load the annotation table from the unpacked dataset folder
df = pd.read_csv('./4Q Audio Emotion/panda_dataset_taffc_annotations.csv')

In [None]:
# include class mappings
# Quadrant name -> integer class id, added as a `classes` column next to the
# original `Quadrant` strings.
class2id = dict(zip(('Q1', 'Q2', 'Q3', 'Q4'), range(4)))
df = df.assign(classes=df['Quadrant'].map(class2id))

In [None]:
# check dataset: display the annotation frame with the mapped `classes` column
df

Unnamed: 0,Song,Quadrant,classes
0,MT0000004637,Q3,2
1,MT0000011357,Q2,1
2,MT0000011975,Q2,1
3,MT0000040632,Q1,0
4,MT0000044741,Q3,2
...,...,...,...
895,MT0035332835,Q2,1
896,MT0035334027,Q2,1
897,MT0036111736,Q2,1
898,MT0036368550,Q1,0


In [None]:
# bring into HuggingFace dataset format
# Convert the annotation frame and class-encode the `classes` column in one
# chained expression.
ds = Dataset.from_pandas(df).class_encode_column("classes")

In [None]:
# function to include audio_path
def audio_path(example):
    """Return the `audio` entry for one row, pointing at its mp3 file.

    The file is addressed as ./4Q Audio Emotion/<Quadrant>/<Song>.mp3, using
    the row's 'Quadrant' and 'Song' values.
    """
    track_stem = os.path.join('./4Q Audio Emotion', example['Quadrant'], example['Song'])
    mp3_file = track_stem + '.mp3'
    return {'audio': {'path': mp3_file}}

In [None]:
# apply function
# `audio_path` is the helper defined in the previous cell; it adds the `audio`
# column that is cast to an Audio feature afterwards. The former name `prepro`
# is not defined anywhere in this notebook and raises a NameError on a fresh
# kernel.
ds = ds.map(audio_path)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

In [None]:
# Cast the `audio` column to an Audio feature with a 16000 Hz sampling rate
ds = ds.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
### Feature extraction ###
##########################

# initialize feature extractor from the pretrained AST checkpoint
ast_checkpoint_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(ast_checkpoint_name)

In [None]:
# write function to extract features
def preprocess_function(examples):
    """Run the feature extractor over one batch of audio examples.

    Each item of `examples["audio"]` contributes its "array" entry; the list of
    arrays is passed to the module-level `feature_extractor` and the
    extractor's output is returned unchanged.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    # NOTE(review): max_length/truncation look carried over from a recipe for
    # waveform-based extractors -- confirm the AST extractor actually uses them.
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [None]:
# apply function
# Extract features batch-wise, dropping the `audio` column, then rename the
# target column from `classes` to `label`.
encoded_ds = (
    ds.map(preprocess_function, remove_columns="audio", batched=True)
    .rename_column("classes", "label")
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

In [None]:
### Train Test Split ###
########################

# 80/20 split, stratified on the `label` column, with a fixed seed
encoded_ds = encoded_ds.train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=42,
)

In [None]:
# check dataset: show the train/test splits with their features and row counts
encoded_ds

DatasetDict({
    train: Dataset({
        features: ['Song', 'Quadrant', 'label', 'input_values'],
        num_rows: 720
    })
    test: Dataset({
        features: ['Song', 'Quadrant', 'label', 'input_values'],
        num_rows: 180
    })
})

### Training

In [None]:
### Evaluate ###
################

import evaluate

# load the accuracy metric that compute_metrics reports
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# function to compute accuracy
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    The predicted class is the arg-max over axis 1 of `eval_pred.predictions`;
    it is scored against `eval_pred.label_ids` with the module-level
    `accuracy` metric, whose result is returned as-is.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
### Load Audio Spectrogram Transformer Model ###
################################################

# https://towardsdatascience.com/adding-custom-layers-on-top-of-a-hugging-face-model-f1ccdfc257bd
checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
# Request a 4-class head; ignore_mismatched_sizes lets the classifier weights
# be newly initialised where their shape differs from the checkpoint's.
model = AutoModelForAudioClassification.from_pretrained(
    checkpoint,
    num_labels=4,
    ignore_mismatched_sizes=True,
)


Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# check model architecture (displays the module tree of the loaded model)
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (de

In [None]:
### Initialize Training Arguments ###
#####################################

training_args = TrainingArguments(
    # output and checkpointing
    output_dir="my_awesome_mind_model",
    save_strategy="epoch",
    push_to_hub=False,
    # evaluation and best-model selection
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # optimisation
    optim="adafactor",
    learning_rate=3e-5,
    warmup_ratio=0.1,
    num_train_epochs=5,
    fp16=True,
    # batching: 4 samples per device, gradients accumulated over 8 steps
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=4,
    # logging and reproducibility
    logging_steps=10,
    seed=123,
)

In [None]:
### Initialize Trainer ###
##########################

# NOTE(review): the "test" split is used as the evaluation set here, so the
# per-epoch metrics are computed on it -- confirm that no separate validation
# split is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["test"],
    tokenizer=feature_extractor,  # the feature extractor is passed in the tokenizer slot
    compute_metrics=compute_metrics,
)

In [None]:
#torch.cuda.empty_cache()

In [None]:
### Training ###
################

# run fine-tuning with the trainer configured in the previous cells
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,0.9425,0.928995,0.622222
2,0.6658,0.596767,0.744444
2,0.4018,0.578451,0.761111
4,0.1774,0.64419,0.744444
4,0.0621,0.628862,0.766667


TrainOutput(global_step=110, training_loss=0.4830246464772658, metrics={'train_runtime': 774.5531, 'train_samples_per_second': 4.648, 'train_steps_per_second': 0.142, 'total_flos': 2.3859985398104064e+17, 'train_loss': 0.4830246464772658, 'epoch': 4.89})

In [None]:
# save best model
# Target folder on the mounted Drive; the model goes into its own sub-directory.
save_dir = '/content/drive/MyDrive/0_Masterarbeit/2_Pipelines/Models'
best_model_dir = os.path.join(save_dir, 'best_audio_mood_model')
trainer.save_model(best_model_dir)