<a href="https://colab.research.google.com/github/politeles/audio_classification/blob/main/VADSAudioClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementing an audio classifier for the VADS dataset
The VADS dataset contains a sample of different techniques of violin playing.

We will train a classifier using hugging face transformers library.
But first we have to download and preprocess the data, and perform operations like normalization.

This classifier implementation follows the guide from HuggingFace: https://huggingface.co/learn/audio-course/en/chapter4/fine-tuning


In [1]:
import os
from google.colab import userdata
# Note: `userdata.get` is a Colab API. If you're not using Colab, set the env
# vars as appropriate for your system.
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

In [2]:
pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-k9qcfvu3
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-k9qcfvu3
  Resolved https://github.com/huggingface/transformers to commit d1b92369ca193da49f9f7ecd01b08ece45c2c9aa
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers==4.52.0.dev0)
  Downloading huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.30.1-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.2/481.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hd

In [3]:
pip install datasets[audio]

Collecting datasets[audio]
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets[audio])
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.8 MB/s[0m eta [36m0:0

In [4]:
from datasets import load_dataset

In [5]:
vads = load_dataset("politeles/vads")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md:   0%|          | 0.00/4.90k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2515 [00:00<?, ? examples/s]

In [6]:
vads

DatasetDict({
    train: Dataset({
        features: ['audio', 'technique', 'microphone', 'string', 'note'],
        num_rows: 2515
    })
})

# Undersampling the dataset

In [7]:
from collections import Counter

technique_counts = Counter(vads["train"]["technique"])
underrepresented_class = min(technique_counts, key=technique_counts.get)
undersample_count = technique_counts[underrepresented_class]

In [8]:
import pandas as pd
from datasets import Dataset

# Create a Pandas DataFrame from the dataset
df = pd.DataFrame(vads["train"])

# Undersample the DataFrame
undersampled_df =  df.groupby("technique", group_keys=False).apply(lambda x: x.sample(n=undersample_count) if len(x) > undersample_count else x).reset_index(drop=True)

# Convert the undersampled DataFrame back to a Hugging Face dataset
undersampled_dataset = Dataset.from_pandas(undersampled_df)

  undersampled_df =  df.groupby("technique", group_keys=False).apply(lambda x: x.sample(n=undersample_count) if len(x) > undersample_count else x).reset_index(drop=True)


In [9]:
undersampled_dataset

Dataset({
    features: ['audio', 'technique', 'microphone', 'string', 'note'],
    num_rows: 765
})

# Generate train test split

In [10]:
undersampled_dataset = undersampled_dataset.train_test_split(test_size=0.2)

In [11]:
undersampled_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'technique', 'microphone', 'string', 'note'],
        num_rows: 612
    })
    test: Dataset({
        features: ['audio', 'technique', 'microphone', 'string', 'note'],
        num_rows: 153
    })
})

# Preprocessing data

In [12]:
from transformers import AutoFeatureExtractor

model_id = "ntu-spml/distilhubert"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True)

preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

In [13]:
sampling_rate = feature_extractor.sampling_rate

In [14]:
sampling_rate

16000

## Resample data
The original data is sampled at 48Khz, we have to resample to 16Khz.

In [15]:
from datasets import Audio

undersampled_dataset = undersampled_dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

## Normalize audio

In [16]:
import numpy as np

sample = undersampled_dataset["train"][0]["audio"]

print(f"Mean: {np.mean(sample['array']):.3}, Variance: {np.var(sample['array']):.3}")

Mean: 0.00144, Variance: 0.00338


In [17]:
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")

print(
    f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}"
)

inputs keys: ['input_values', 'attention_mask']
Mean: 1.91e-09, Variance: 1.0


## Preprocess function
Truncates the audio to 1 second in the sampled audio rate

In [18]:
max_duration = 1.0


def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )
    return inputs

In [19]:
vads_encoded = undersampled_dataset.map(
    preprocess_function,
    remove_columns=["audio", "microphone","string","note"],
    batched=True,
    batch_size=100,
    num_proc=1,
)
vads_encoded

Map:   0%|          | 0/612 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['technique', 'input_values', 'attention_mask'],
        num_rows: 612
    })
    test: Dataset({
        features: ['technique', 'input_values', 'attention_mask'],
        num_rows: 153
    })
})

To allow the model to process we need to rename the column "technique" to "label"

In [20]:
vads_encoded = vads_encoded.rename_column("technique","label")

In [21]:
vads_encoded['train']['label'][10]

4

# Fine tunning a model


In [22]:
id2label_fn = vads["train"].features["technique"].int2str

In [23]:
id2label = {
    str(i): id2label_fn(i)
    for i in range(len(vads["train"].features["technique"].names))
}
label2id = {v: k for k, v in id2label.items()}

In [24]:
from transformers import AutoModelForAudioClassification

num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/94.0M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
from transformers import TrainingArguments

model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10



In [27]:
model_name

'distilhubert'

In [29]:
training_args = TrainingArguments(
    f"{model_name}-finetuned-vads",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=True,
)

In [30]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [31]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [33]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=vads_encoded["train"],
    eval_dataset=vads_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpoliteles[0m ([33mpoliteles-conservatorio-superior-andr-s-de-vandelvira[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6541,1.545101,0.712418
2,0.8964,0.777341,0.823529
3,0.529,0.464979,0.915033
4,0.303,0.351308,0.915033
5,0.21,0.268394,0.928105
6,0.1162,0.237334,0.921569
7,0.0663,0.18214,0.941176
8,0.0738,0.183719,0.941176
9,0.0307,0.178707,0.947712
10,0.0326,0.176018,0.954248


TrainOutput(global_step=770, training_loss=0.495980218168977, metrics={'train_runtime': 147.6332, 'train_samples_per_second': 41.454, 'train_steps_per_second': 5.216, 'total_flos': 1.39190215104e+16, 'train_loss': 0.495980218168977, 'epoch': 10.0})

In [34]:
trainer.save_model(f"{model_name}-finetuned-vads")

model.safetensors:   0%|          | 0.00/94.8M [00:00<?, ?B/s]

events.out.tfevents.1743950246.a35baae0787f.147.1:   0%|          | 0.00/42.2k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

In [35]:
# Update model card
trainer.create_model_card(
    tags=["audio-classification", "vads", "violin"],
    finetuned_from=model_id,
    dataset_tags="politeles/vads",
    dataset="vads",  # Or your specific dataset name
    # You can add more metadata here as needed, like license, model_name, tasks, etc.
)

In [36]:
kwargs = {
    "dataset_tags": "politeles/vads",
    "dataset": "VADS",
    "model_name": f"{model_name}-finetuned-vads",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}

In [37]:
trainer.push_to_hub()

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/politeles/distilhubert-finetuned-vads/commit/f6944fd181aa1c64f569f9957ac2727ff9e352dc', commit_message='End of training', commit_description='', oid='f6944fd181aa1c64f569f9957ac2727ff9e352dc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/politeles/distilhubert-finetuned-vads', endpoint='https://huggingface.co', repo_type='model', repo_id='politeles/distilhubert-finetuned-vads'), pr_revision=None, pr_num=None)