<a href="https://colab.research.google.com/github/sebi061/VideoAdEngagement/blob/main/2_Training_feature%20extraction%20models/5_Sound_detection_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Installations ###
#####################

!pip uninstall -y transformers
!pip install transformers==4.28.0 datasets evaluate

[0mCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_

In [None]:
### Imports ###
###############

# general
import numpy  as np
import pandas as pd
import os
import shutil

# torch
import torch

# Hugging Face model and training
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer
from datasets import Dataset
from datasets import load_dataset, Audio

In [None]:
### Set data directory ###
##########################

# Google Drive locations: input archives are read from data_dir,
# the fine-tuned model is written below save_dir
data_dir = '/content/drive/MyDrive/VideoAdEngagement/2_Training_feature extraction models/Data'
save_dir = '/content/drive/MyDrive/VideoAdEngagement/2_Training_feature extraction models/trained_models'

# mount Google Drive so that both locations are reachable from the runtime
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
### Voice data ###
##################
# https://zenodo.org/record/1188976#.YO6yI-gzaUk

# copy the archive from Drive into the local runtime and unpack it
shutil.copy(os.path.join(data_dir, 'Audio_Speech_Actors_01-24.zip'), './')
# exist_ok=True keeps the cell re-runnable: without it a second run raises FileExistsError
os.makedirs('./Voice_data', exist_ok=True)
shutil.unpack_archive('./Audio_Speech_Actors_01-24.zip', extract_dir = './Voice_data')

In [None]:
# collect the paths of all files, one sub-folder of ./Voice_data at a time
# sorted() makes the order deterministic: os.listdir returns entries in arbitrary
# order, and the seeded per-label sampling further below depends on the row order
voice_root = './Voice_data'
files = []
for folder in sorted(os.listdir(voice_root)):
  folder_path = os.path.join(voice_root, folder)
  if not os.path.isdir(folder_path):
    # a stray file in the archive root would make the inner os.listdir raise
    continue
  for file in sorted(os.listdir(folder_path)):
    files.append(os.path.join(folder_path, file))

In [None]:
# wrap the collected file paths in a single-column dataframe
voice_df = pd.DataFrame(files, columns=['audio'])

In [None]:
# write function to extract labels encoded in file names

def assign_emotion(example):
  """Return the voice label encoded in a file name, or 'delete'.

  The file name (without directory and extension) is split on '-'. The third
  field is read as the emotion code and the fourth as the intensity code, e.g.
  '03-01-04-02-01-02-21.wav' -> emotion '04', intensity '02'.

  Only intensity '02' is kept, with emotion codes '02' (calm), '03' (happy),
  '04' (sad) and '05' (angry). Everything else, including names that do not
  have at least four fields, yields 'delete'.

  Parameters
  ----------
  example : str
    Path or bare file name of an audio clip.

  Returns
  -------
  str
    'calm voice', 'happy voice', 'sad voice', 'angry voice' or 'delete'.
  """
  emotion_labels = {
    '02': 'calm voice',
    '03': 'happy voice',
    '04': 'sad voice',
    '05': 'angry voice',
  }

  # parse the file name itself instead of fixed character offsets of the full
  # path, so the result no longer depends on where the data was extracted
  stem = os.path.splitext(os.path.basename(example))[0]
  fields = stem.split('-')
  if len(fields) < 4:
    return 'delete'

  emotion_code, intensity_code = fields[2], fields[3]
  if intensity_code != '02':
    return 'delete'

  return emotion_labels.get(emotion_code, 'delete')

In [None]:
# derive the label of every clip from its file path
voice_df['label'] = voice_df['audio'].map(assign_emotion)

In [None]:
# keep only the rows that were not marked as 'delete'
is_kept = voice_df['label'] != 'delete'
voice_df = voice_df[is_kept]

In [None]:
# draw 40 clips per label with a fixed seed (to match count of ESC-50 dataset),
# then renumber the rows from 0
per_label = voice_df.groupby('label')
voice_df = per_label.sample(n=40, random_state=42)
voice_df = voice_df.reset_index(drop=True)

In [None]:
# sanity check: number of sampled clips per label
label_counts = voice_df['label'].value_counts()
label_counts

angry voice    40
calm voice     40
happy voice    40
sad voice      40
Name: label, dtype: int64

In [None]:
### Sound data ###
##################
# fetch the ESC-50 archive from Drive and unpack it into the working directory
archive_on_drive = os.path.join(data_dir, 'ESC-50-master.zip')
shutil.copy(archive_on_drive, './')
shutil.unpack_archive('./ESC-50-master.zip', extract_dir = './')

In [None]:
# read the ESC-50 metadata table shipped with the archive
esc50_meta_csv = './ESC-50-master/meta/esc50.csv'
sound_df = pd.read_csv(esc50_meta_csv)

In [None]:
# helper that turns a file name into its path inside the ESC-50 audio folder
def make_file_dir(example):
  """Join `example` onto './ESC-50-master/audio' and return the resulting path."""
  audio_root = './ESC-50-master/audio'
  full_path = os.path.join(audio_root, example)
  return full_path

In [None]:
# build the path of every clip from its file name
sound_df['audio'] = sound_df['filename'].map(make_file_dir)

In [None]:
# remove the metadata columns that are not used and rename 'category' to 'label'
unused_columns = ['filename', 'fold', 'target', 'esc10', 'src_file', 'take']
sound_df = sound_df.drop(columns=unused_columns)
sound_df = sound_df.rename(columns={"category": "label"})

In [None]:
# concatenate both dataframes (sound rows first, then voice rows) under a fresh index
frames = [sound_df, voice_df]
df_final = pd.concat(frames, ignore_index=True)

In [None]:
### Bring into Hugging Face format and prepare for training ###
###############################################################

# convert the combined dataframe into a Hugging Face Dataset and show its summary
ds = Dataset.from_pandas(df=df_final)
ds

Dataset({
    features: ['label', 'audio'],
    num_rows: 2160
})

In [None]:
# treat the 'audio' column as audio with a sampling rate of 16000 Hz
audio_feature = Audio(sampling_rate=16000)
ds = ds.cast_column("audio", audio_feature)

In [None]:
# check dataset: display the last example (its label and its audio entry)
ds[-1]

{'label': 'sad voice',
 'audio': {'path': './Voice_data/Actor_21/03-01-04-02-01-02-21.wav',
  'array': array([-1.89702405e-08,  2.17053184e-08, -2.43229010e-08, ...,
         -6.92779167e-14, -1.75282011e-12, -3.37063710e-13]),
  'sampling_rate': 16000}}

In [None]:
# encode the 'label' column as class labels, needed for the stratified train test split
ds = ds.class_encode_column(column="label")

Casting to class labels:   0%|          | 0/2160 [00:00<?, ? examples/s]

In [None]:
# read the class names from the encoded 'label' feature and print them
label_feature = ds.features['label']
labels = label_feature.names
print(labels)

['airplane', 'angry voice', 'breathing', 'brushing_teeth', 'calm voice', 'can_opening', 'car_horn', 'cat', 'chainsaw', 'chirping_birds', 'church_bells', 'clapping', 'clock_alarm', 'clock_tick', 'coughing', 'cow', 'crackling_fire', 'crickets', 'crow', 'crying_baby', 'dog', 'door_wood_creaks', 'door_wood_knock', 'drinking_sipping', 'engine', 'fireworks', 'footsteps', 'frog', 'glass_breaking', 'hand_saw', 'happy voice', 'helicopter', 'hen', 'insects', 'keyboard_typing', 'laughing', 'mouse_click', 'pig', 'pouring_water', 'rain', 'rooster', 'sad voice', 'sea_waves', 'sheep', 'siren', 'sneezing', 'snoring', 'thunderstorm', 'toilet_flush', 'train', 'vacuum_cleaner', 'washing_machine', 'water_drops', 'wind']


In [None]:
# build both lookup tables: label name -> id and id -> label name
# (ids are the positions in `labels`, stored as strings)
class_ids = [str(position) for position in range(len(labels))]
label2id = dict(zip(labels, class_ids))
id2label = dict(zip(class_ids, labels))

In [None]:
# check if it worked: print the label name -> id mapping
print(label2id)

{'airplane': '0', 'angry voice': '1', 'breathing': '2', 'brushing_teeth': '3', 'calm voice': '4', 'can_opening': '5', 'car_horn': '6', 'cat': '7', 'chainsaw': '8', 'chirping_birds': '9', 'church_bells': '10', 'clapping': '11', 'clock_alarm': '12', 'clock_tick': '13', 'coughing': '14', 'cow': '15', 'crackling_fire': '16', 'crickets': '17', 'crow': '18', 'crying_baby': '19', 'dog': '20', 'door_wood_creaks': '21', 'door_wood_knock': '22', 'drinking_sipping': '23', 'engine': '24', 'fireworks': '25', 'footsteps': '26', 'frog': '27', 'glass_breaking': '28', 'hand_saw': '29', 'happy voice': '30', 'helicopter': '31', 'hen': '32', 'insects': '33', 'keyboard_typing': '34', 'laughing': '35', 'mouse_click': '36', 'pig': '37', 'pouring_water': '38', 'rain': '39', 'rooster': '40', 'sad voice': '41', 'sea_waves': '42', 'sheep': '43', 'siren': '44', 'sneezing': '45', 'snoring': '46', 'thunderstorm': '47', 'toilet_flush': '48', 'train': '49', 'vacuum_cleaner': '50', 'washing_machine': '51', 'water_drop

In [None]:
# split off 20% as test set, stratified by label, with a fixed seed
ds = ds.train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=42,
)

In [None]:
### Feature extraction ###
##########################

# load the feature extractor published with the pretrained checkpoint
feature_extractor = AutoFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path="MIT/ast-finetuned-audioset-10-10-0.4593"
)

Downloading (…)rocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [None]:
# function that turns a batch of audio examples into model inputs (spectrograms)
def preprocess_function(examples):
    """Run the feature extractor on the audio arrays of a batch.

    Reads the "array" entry of every item in `examples["audio"]`, passes the
    list to `feature_extractor` together with the extractor's own sampling
    rate, and returns the extractor's output unchanged.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    # NOTE(review): max_length / truncation are forwarded as given; whether this
    # particular feature extractor honours them is not visible here — confirm
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [None]:
# run the feature extraction batch-wise over both splits and drop the 'audio' column
encoded_ds = ds.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
)

Map:   0%|          | 0/1728 [00:00<?, ? examples/s]

Map:   0%|          | 0/432 [00:00<?, ? examples/s]

In [None]:
### Model training ###
######################

# accuracy is the metric used to evaluate the model
import evaluate
accuracy = evaluate.load(path="accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# function that scores a set of predictions with the accuracy metric
def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation result.

    Takes the argmax over axis 1 of `eval_pred.predictions` as the predicted
    class ids and compares them with `eval_pred.label_ids`; returns the result
    of `accuracy.compute`.
    """
    scores = eval_pred.predictions
    true_ids = eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=true_ids)

In [None]:
# Load the pretrained Audio Spectrogram Transformer with a head sized for our labels
checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # the classifier shape in the checkpoint differs from num_labels, so a mismatch is allowed
    ignore_mismatched_sizes=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([54, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([54]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration, grouped by topic

training_args = TrainingArguments(
    # output
    output_dir="sound_detection_model",
    push_to_hub=False,
    # evaluate and save once per epoch; the best model by accuracy is loaded at the end
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # optimisation
    optim="adafactor",
    learning_rate=3e-5,
    warmup_ratio=0.1,
    num_train_epochs=3,
    fp16=True,
    # batching: 4 examples per device, gradients accumulated over 8 steps
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=4,
    # logging and reproducibility
    logging_steps=10,
    seed=123,
)

In [None]:
# build the trainer from the model, the training configuration and both encoded splits
train_split = encoded_ds["train"]
eval_split = encoded_ds["test"]
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,  # the feature extractor is passed in the tokenizer argument
    eval_dataset=eval_split,
    train_dataset=train_split,
)

In [None]:
# pre-training evaluation: metrics on the test split before any fine-tuning step
trainer.evaluate()

{'eval_loss': 4.26597785949707,
 'eval_accuracy': 0.018518518518518517,
 'eval_runtime': 66.4953,
 'eval_samples_per_second': 6.497,
 'eval_steps_per_second': 1.624}

In [None]:
# fine-tune the pre-trained model on the combined ESC-50 and voice dataset
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7628,0.506118,0.900463
2,0.2161,0.235413,0.951389
3,0.0811,0.142083,0.972222


TrainOutput(global_step=162, training_loss=0.8004293003935873, metrics={'train_runtime': 1120.0447, 'train_samples_per_second': 4.628, 'train_steps_per_second': 0.145, 'total_flos': 3.515492676277371e+17, 'train_loss': 0.8004293003935873, 'epoch': 3.0})

In [None]:
# save best model (training was configured with load_best_model_at_end=True)
# The target is derived from save_dir; the former extra save into a hard-coded,
# unrelated Drive folder duplicated this step and was removed.
best_model_dir = os.path.join(save_dir, 'best_sound_detection_model')
trainer.save_model(best_model_dir)