In [1]:
import wandb
import torch
import os
import pandas as pd
from datasets import load_dataset, Audio, DatasetDict, load_metric
from transformers import Wav2Vec2FeatureExtractor, HubertModel, HubertConfig, HubertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, Wav2Vec2Processor
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

Making sure CUDA is available!

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

Let's create a custom metadata given the following naming convention for our data files:

Example: 03-01-01-01-01-01-01.wav

Modality (01 = full-AV, 02 = video-only, 03 = audio-only).

Vocal channel (01 = speech, 02 = song).

Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).

Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.

Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").

Repetition (01 = 1st repetition, 02 = 2nd repetition).

Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

In [3]:
audio_folder = r'C:\Git Repos\s2t-wave2vec\data'

data = []

for filename in os.listdir(audio_folder):
    if filename.endswith('.wav'):
        file_name = filename
        modality = str(filename[0:2])
        vocal_channel = str(filename[3:5])
        emotion = str(filename[6:8])
        intensity = str(filename[9:11])
        statement = str(filename[12:14])
        repetition = str(filename[15:17])
        actor = str(filename[18:20])

    data.append([file_name, modality, vocal_channel, emotion, intensity, statement, repetition, actor])

column_names = ['file_name','Modality',"Vocal Channel","Emotion","Intensity","Statement","Repetition","Actor"]
metadata_df = pd.DataFrame(data, columns=column_names)

emotion_mapping = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

intensity_mapping = {
    '01':'normal',
    '02':'strong'
}

statement_mapping = {
    '01':'Kids are talking by the door',
    '02':'Dogs are sitting by the door'
}

repetition_mapping = {
    '01':'First',
    '02':'Second'
}

modality_mapping = {
    '03':'audio'
}

vocalchannel_mapping = {
    '01':"speech"
}

metadata_df.replace({
    'Modality': modality_mapping,
    'Vocal Channel':vocalchannel_mapping,
    'Emotion': emotion_mapping,
    'Intensity': intensity_mapping,
    'Statement': statement_mapping,
    'Repetition': repetition_mapping,
    # Add other column mappings if needed
}, inplace=True)

metadata_df['Gender'] = metadata_df['Actor'].apply(lambda x: 'Male' if int(x) % 2 == 1 else 'Female')

metadata_df.to_csv('data/metadata.csv', index=False)

label_list = [i for i in metadata_df['Emotion'].unique()]
label_list

['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

We can always repurpose this notebook for different tasks as we have multiple columns with possible labels!

Define Custom Data Collator

In [4]:
INPUT_FIELD = "input_values"
LABEL_FIELD = "labels"

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(
        self, examples: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:

        input_features = [
            {INPUT_FIELD: example[INPUT_FIELD]} for example in examples
        ]  # example is basically row0, row1, etc...
        labels = [example[LABEL_FIELD] for example in examples]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch[LABEL_FIELD] = torch.tensor(labels)

        return batch

Loading in Wave2Vec Assets and Custom Collator

In [5]:
model_name = "facebook/hubert-large-ls960-ft"
NUM_LABELS = 8

feature_extractor = Wav2Vec2FeatureExtractor().from_pretrained(model_name)
sampling_rate = feature_extractor.sampling_rate
hubert_base = HubertModel.from_pretrained(model_name)
hubert_config = HubertConfig.from_pretrained(model_name, num_labels=NUM_LABELS)
hubert_model = HubertForSequenceClassification.from_pretrained(model_name, config=hubert_config, ignore_mismatched_sizes=True)
data_collator = DataCollatorCTCWithPadding(processor=feature_extractor, padding=True)

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertModel: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from 

In [6]:
print(feature_extractor)

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}



In [7]:
print(hubert_base)

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertLayerNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x HubertLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=1024, bias=True)
    (dropout): Dropout(p=

In [8]:
print(hubert_model)

HubertForSequenceClassification(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertLayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (pro

In [9]:
print(data_collator)

DataCollatorCTCWithPadding(processor=Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
, padding=True, max_length=None, max_length_labels=None, pad_to_multiple_of=None, pad_to_multiple_of_labels=None)


Time to create our DatasetDict object.  This finds "metadata.csv" and maps its data to each file in the folder.

In [10]:
#wanted to make this general to try a few different fine tuning tasks in the future
#this allows me to change "Emotion" out for a different column, and have the script run no matter what it's called without changing the column name
#annoying that the Datasets package doesn't have a 'select_columns' functionality
select_columns = ['audio','Emotion']
output_column = select_columns[1]

#load dataset and remove undesired columns (see above)
dataset = load_dataset("audiofolder", data_dir="data")
cols_to_remove = [i for i in dataset['train'].column_names if i not in select_columns]
dataset = dataset.remove_columns(cols_to_remove)

#rename columns to what the trainer object expects
dataset = dataset.rename_column(select_columns[1],"label")

#casts audio column to sampling rate expected by feature_extractor
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

#extract arrays and remove nested audio column
audio_arrays = [np.array(item["array"]) for item in dataset['train']["audio"]]
dataset['train'] = dataset['train'].add_column("audio_arrays",audio_arrays)
dataset = dataset.remove_columns("audio")

#view dataset
dataset

Resolving data files:   0%|          | 0/1441 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to C:/Users/khamm/.cache/huggingface/datasets/audiofolder/default-3e24c60e40fc04af/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/1441 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to C:/Users/khamm/.cache/huggingface/datasets/audiofolder/default-3e24c60e40fc04af/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'audio_arrays'],
        num_rows: 1440
    })
})

Prepares Dataset by Utilizing Feature Extractor and Adding Extracted Features and Labels to DatasetDict

In [11]:
def prepare_dataset(batch, feature_extractor):
    audio_arr = batch["audio_arrays"]
    input = feature_extractor(
        audio_arr, sampling_rate=sampling_rate, padding=True, return_tensors="pt"
    )

    batch[INPUT_FIELD] = input.input_values[0]
    batch[LABEL_FIELD] = batch["label"]  # colname MUST be labels as Trainer will look for it by default

    return batch

# APPLY THE DATA PREP USING FEATURE EXTRACTOR TO ALL EXAMPLES
dataset = dataset.map(
    prepare_dataset,
    fn_kwargs={"feature_extractor": feature_extractor},
    num_proc=1,
)

dataset = dataset.remove_columns(["audio_arrays","label"])

dataset

Map:   0%|          | 0/1440 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 1440
    })
})

Encoding Class Labels and Test Train Split

In [12]:
#encode labels column
dataset['train'] = dataset['train'].class_encode_column('labels')

#train test split
train_testvalid = dataset['train'].train_test_split(shuffle=True, test_size=0.2)
test_valid = train_testvalid['test'].train_test_split(shuffle=True,test_size=0.5)
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'val': test_valid['train']
})

dataset

Casting to class labels:   0%|          | 0/1440 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 1152
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 144
    })
    val: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 144
    })
})

Trainer Config and Training Arguments

In [13]:
trainer_config = {
  "OUTPUT_DIR": "results",
  "TRAIN_EPOCHS": 3,
  "TRAIN_BATCH_SIZE": 4,
  "EVAL_BATCH_SIZE": 4,
  "GRADIENT_ACCUMULATION_STEPS": 4,
  "WARMUP_STEPS": 500,
  "DECAY": 0.01,
  "LOGGING_STEPS": 10,
  "MODEL_DIR": "models/slyle-test-hubert-model-ravdess",
  "SAVE_STEPS": 100
}

# Fine-Tuning with Trainer
training_args = TrainingArguments(
    output_dir=trainer_config["OUTPUT_DIR"],  # output directory
    gradient_accumulation_steps=trainer_config[
        "GRADIENT_ACCUMULATION_STEPS"
    ],  # accumulate the gradients before running optimization step
    num_train_epochs=trainer_config[
        "TRAIN_EPOCHS"
    ],  # total number of training epochs
    per_device_train_batch_size=trainer_config[
        "TRAIN_BATCH_SIZE"
    ],  # batch size per device during training
    per_device_eval_batch_size=trainer_config[
        "EVAL_BATCH_SIZE"
    ],  # batch size for evaluation
    warmup_steps=trainer_config[
        "WARMUP_STEPS"
    ],  # number of warmup steps for learning rate scheduler
    save_steps=trainer_config["SAVE_STEPS"], # save checkpoint every 100 steps
    weight_decay=trainer_config["DECAY"],  # strength of weight decay
    logging_steps=trainer_config["LOGGING_STEPS"],
    evaluation_strategy="epoch",  # report metric at end of each epoch
    report_to="wandb",  # enable logging to W&B
    fp16=True,
)

Compute Metrics Function

In [15]:
def compute_metrics(eval_pred):
    # DEFINE EVALUATION METRIC
    compute_accuracy_metric = load_metric("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return compute_accuracy_metric.compute(predictions=predictions, references=labels)

Creating Trainer Object

In [16]:
trainer = Trainer(
    model=hubert_model,  # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=dataset["train"],  # training dataset
    eval_dataset=dataset["val"],  # evaluation dataset
    compute_metrics=compute_metrics,
)

Connecting to WANDB Project

In [17]:
USER = 'khammitt1'
WANDB_PROJECT = "finetuning_wave2vec_emotion"
WANDB_NOTEBOOK_NAME = 'wave2vec.ipynb'
wandb.init(entity=USER, project=WANDB_PROJECT)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkhammitt1[0m. Use [1m`wandb login --relogin`[0m to force relogin


TRAINING TIME!

In [18]:
trainer.train()

# TO RESUME TRAINING FROM CHECKPOINT
# trainer.train("results/checkpoint-2000")



  0%|          | 0/216 [00:00<?, ?it/s]

{'loss': 2.0799, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.14}
{'loss': 2.0782, 'learning_rate': 1.5e-06, 'epoch': 0.28}
{'loss': 2.0828, 'learning_rate': 2.5e-06, 'epoch': 0.42}
{'loss': 2.0796, 'learning_rate': 3.4000000000000005e-06, 'epoch': 0.56}
{'loss': 2.0844, 'learning_rate': 4.4e-06, 'epoch': 0.69}
{'loss': 2.0828, 'learning_rate': 5.4e-06, 'epoch': 0.83}
{'loss': 2.0765, 'learning_rate': 6.4000000000000006e-06, 'epoch': 0.97}


  0%|          | 0/36 [00:00<?, ?it/s]

  compute_accuracy_metric = load_metric("accuracy")


{'eval_loss': 2.0726046562194824, 'eval_accuracy': 0.14583333333333334, 'eval_runtime': 25.6546, 'eval_samples_per_second': 5.613, 'eval_steps_per_second': 1.403, 'epoch': 1.0}
{'loss': 2.0782, 'learning_rate': 7.4e-06, 'epoch': 1.11}
{'loss': 2.0766, 'learning_rate': 8.400000000000001e-06, 'epoch': 1.25}
{'loss': 2.0699, 'learning_rate': 9.4e-06, 'epoch': 1.39}
{'loss': 2.0636, 'learning_rate': 1.04e-05, 'epoch': 1.53}
{'loss': 2.0652, 'learning_rate': 1.1400000000000001e-05, 'epoch': 1.67}
{'loss': 2.064, 'learning_rate': 1.24e-05, 'epoch': 1.81}
{'loss': 2.0592, 'learning_rate': 1.3300000000000001e-05, 'epoch': 1.94}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0374484062194824, 'eval_accuracy': 0.24305555555555555, 'eval_runtime': 50.0995, 'eval_samples_per_second': 2.874, 'eval_steps_per_second': 0.719, 'epoch': 2.0}
{'loss': 2.0443, 'learning_rate': 1.43e-05, 'epoch': 2.08}
{'loss': 2.0393, 'learning_rate': 1.53e-05, 'epoch': 2.22}
{'loss': 2.0401, 'learning_rate': 1.63e-05, 'epoch': 2.36}
{'loss': 2.0121, 'learning_rate': 1.73e-05, 'epoch': 2.5}
{'loss': 1.9817, 'learning_rate': 1.83e-05, 'epoch': 2.64}
{'loss': 1.9503, 'learning_rate': 1.93e-05, 'epoch': 2.78}
{'loss': 1.9151, 'learning_rate': 2.0200000000000003e-05, 'epoch': 2.92}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.8321126699447632, 'eval_accuracy': 0.3611111111111111, 'eval_runtime': 30.4222, 'eval_samples_per_second': 4.733, 'eval_steps_per_second': 1.183, 'epoch': 3.0}
{'train_runtime': 2827.2389, 'train_samples_per_second': 1.222, 'train_steps_per_second': 0.076, 'train_loss': 2.043682239673756, 'epoch': 3.0}


TrainOutput(global_step=216, training_loss=2.043682239673756, metrics={'train_runtime': 2827.2389, 'train_samples_per_second': 1.222, 'train_steps_per_second': 0.076, 'train_loss': 2.043682239673756, 'epoch': 3.0})

Here is where the example ends.  At this point, we see the model is steadily learning, and after 3 epochs it has already achieved 36% accuracy in 47 minutes of training time from a starting accuracy of a random guess (0.125).  You might be thinking, why did you do all this work for such measly accuracy when pre-trained models on this exact task exist on huggingface?  

    -It's just for practice, and now I can repurpose this work for other tasks.  
    -There will come a day when I will have custom data and a problem no one else has tackled, which will require fine tuning!  
    -I will use those pre-trained models when learning how to deploy and use RESTful API endpoints.  Hopefully I can host those locally, because I don't have $$  

However, we cannot extrapolate this performance to MANY further epochs (maybe a few), and have a few issues that we may run into with more epochs:

    -The model will eventually overfit as we only have about 1200 training samples and <100 samples per class  
    -My girlfriend will kill me if I continue to run our power bill through the roof  
    -I have maxed out my VRAM at a batch size of 4, using mixed precision (fp16), which may limit performance in further epochs (this depends on learning rate schedule, and may not occur)

If I were to continue the example, I would:

    -Reduce the # of warmup steps and increase learning rate, as it appeared to benefit as learning rate increased.  
    -Find a larger labeled dataset, although this may limit the granularity of our emotion labels (I could use the full RAVDESS dataset, as it is much larger than this subset)  
    -Increase learning rate  
    -Get a new graphics card and increase batch size (NVIDIA 4090 has 24GB which looks very appetizing right now, but is 1700 dollars and I have to pay rent)  
    -Try out other models in addition to HUBERT

Things to note:  

    -load_metrics will be deprecated soon, the code will have to be altered in the future in order to report metrics using evaluate.load  
    -I will need to figure out how to use torch.optim.AdamW as the current implementation of AdamW is deprecated  
    -Transformers v4.28.0 had to be used for this example, as future versions have been reported unstable (Partial State error occurs with v4.29.2 in TrainingArguments class).  It is unclear whether or not this error is in v4.29.0, but I used 4.28.0 at the recommendations of a forum post I saw.  
    -There has to be a more elegant way to use huggingface's dataset package, as you saw previously I had to really wrangle with it in order to get it in the correct state for training.  It felt awkward the whole way through

Things still to learn:

    -I need to get a grasp on how to separate this notebook into .py scripts, especially if I want to offload this to an Azure job
    -I need to practice Azure jobs, using simpler models.  I would really like to use this example, but I can't get my hands on compute that would outperform my 3080 (thanks a lot, Sam Altman)
    -I need to learn to deploy API endpoints, but these types of models would run me 400+ bucks per month on huggingface and probably about the same on Azure
    -We defined a custom DataCollator in this example, but I need to do more research to figure out if a prepackaged DataCollator from HuggingFace does the exact same thing