## init

In [1]:
%pip install -qqq transformers torchaudio datasets wandb

Note: you may need to restart the kernel to use updated packages.


In [2]:
# env variables
# the datapath is actually "/kaggle/input/iemocapfullrelease" but we use a symlink to get to the dataset
DATA_PATH = "/kaggle/working"

# we need a symling because torchaudio.datasets.IEMOCAP adds a "/IEMOCAP" to the data path, but the dataset is at /kaggle/input/iemocapfullrelease/IEMOCAP_full_release and the directory is read-only
!ln -s /kaggle/input/iemocapfullrelease/IEMOCAP_full_release /kaggle/working/IEMOCAP

In [3]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("wandb-api-token") 
wandb.login(key=token)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# **Use wav2vec2 for speech emotion recognition on IEMOCAP dataset**
---
- 🚀 **objective**: run wav2vec2 as a feature extractor on IEMOCAP dataset, requires the data preprocessing of IEMOCAP dataset  
- 🧯 **models**: wav2vec2
- 📚 **dataset**: IEMOCAP


Resources
- inspired by https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb#scrollTo=Fv62ShDsH5DZ

## ⚙️ configuration

In [4]:
import numpy as np

from transformers import TrainingArguments
import torch

In [5]:
model_name_or_path = "facebook/wav2vec2-base-960h"
feature_to_idx = {key: i for i, key in enumerate(["wav", "sampling_rate", "filename", "label", "speaker"])}
label_list = ["neu", "hap", "ang", "sad", "exc", "fru"]
num_labels = len(label_list)
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

pooling_mode = "max"
test_split_size = 0.2
target_sampling_rate = 16000

DEBUG_SIZE = 10

In [6]:
# training parameters
training_args = TrainingArguments(
    output_dir="/content/wav2vec2-iemocap-speech-emotion-recognition",
    label_names=label_list,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    evaluation_strategy="steps",
    num_train_epochs=1.0,
    fp16=torch.cuda.is_available(), # whether to use fp16 16-bit (mixed) precision training instead of 32-bit training
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-4,
    save_total_limit=2,
    report_to="wandb",
    half_precision_backend="auto", # shoud be 'cuda_amp' half precision backend in Colab
    gradient_checkpointing=True
)

## 📚 data
- ~torchaudio implemented a `dataset` to load IEMOCAP. Later in the script, we train the model with a `Trainer` from hugginface, therefore we prefer translating the pytorch dataset into a `transformers.Dataset` for convenience and compatibility.~
- the Trainer class expects an argument `train_dataset` to be of type torch.utils.data.Dataset (see [documentation](https://huggingface.co/docs/transformers/main_classes/trainer)) --> we use a torch dataset instead of a Hugginface dataset

In [7]:
# https://pytorch.org/audio/master/generated/torchaudio.datasets.IEMOCAP.html
from torchaudio.datasets import IEMOCAP

from transformers import Wav2Vec2Processor

import torch
from torch.utils.data import random_split, Dataset, DataLoader, SubsetRandomSampler

In [8]:
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate

Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [9]:
class CustomIEMOCAP(Dataset):
  def __init__(self, data, processor):
    self.data = data
    self.processor = processor

  def __getitem__(self, index):
    wav, _, _, label, _ = self.data[index]
    inputs = self.processor(wav.squeeze(), sampling_rate=target_sampling_rate)
    inputs["labels"] = label2id[label]

    return inputs

  def __len__(self):
    return len(self.data)

In [10]:
iemocap = IEMOCAP(root=DATA_PATH) # in function, path = root / "IEMOCAP"
dataset = CustomIEMOCAP(data=iemocap, processor=processor)
train_ds, test_ds = random_split(dataset, [test_split_size, 1-test_split_size], generator=torch.Generator().manual_seed(42))

dataset[0]

{'input_values': [array([-0.38823238, -0.3835091 , -0.2937664 , ..., -0.20402369,
       -0.24417175, -0.32210618], dtype=float32)], 'labels': 0}

## 🚜 model definition

In [11]:
from transformers import Wav2Vec2Model, Wav2Vec2PreTrainedModel
from transformers import AutoConfig

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

In [12]:
# model configuration
config = AutoConfig.from_pretrained(
  model_name_or_path,
  num_labels=num_labels,
  label2id=label2id,
  id2label=id2label,
  finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', pooling_mode)
config

Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base-960h",
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.1,
  "finetuning_task": "wa

In [13]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union, Tuple
from transformers.file_utils import ModelOutput

@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [14]:
class Wav2Vec2ClassificationHead(nn.Module):
  """Head for wav2vec classification task."""

  def __init__(self, config):
    super().__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.dropout = nn.Dropout(config.final_dropout)
    self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

  def forward(self, features, **kwargs):
    x = features
    x = self.dropout(x)
    x = self.dense(x)
    x = torch.tanh(x)
    x = self.dropout(x)
    x = self.out_proj(x)
    return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
  def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.pooling_mode = config.pooling_mode
    self.config = config

    self.wav2vec2 = Wav2Vec2Model(config)
    self.classifier = Wav2Vec2ClassificationHead(config)

    self.init_weights()

  def freeze_feature_extractor(self):
    self.wav2vec2.feature_extractor._freeze_parameters()

  def merged_strategy(
      
      self,
      hidden_states,
      mode="mean"
  ):
    if mode == "mean":
        outputs = torch.mean(hidden_states, dim=1)
    elif mode == "sum":
        outputs = torch.sum(hidden_states, dim=1)
    elif mode == "max":
        outputs = torch.max(hidden_states, dim=1)[0]
    else:
        raise Exception(
            "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

    return outputs

  def forward(
      self,
      input_values,
      attention_mask=None,
      output_attentions=None,
      output_hidden_states=None,
      return_dict=None,
      labels=None,
  ):
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    outputs = self.wav2vec2(
        input_values,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = outputs[0]
    hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
    logits = self.classifier(hidden_states)

    loss = None
    if labels is not None:
      loss_fct = CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    if not return_dict:
      output = (logits,) + outputs[2:]
      return ((loss,) + output) if loss is not None else output

    return SpeechClassifierOutput(
      loss=loss,
      logits=logits,
      hidden_states=outputs.hidden_states,
      attentions=outputs.attentions,
    )

In [15]:
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a

In [16]:
model.freeze_feature_extractor()
model 

Wav2Vec2ForSpeechClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (4): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=

## 🏃‍♀️ training routine

In [17]:
from dataclasses import dataclass
from typing import Any, Dict, Union, Tuple, Optional
from packaging import version
import numpy as np

import torch
from torch import nn

from transformers import Trainer, is_apex_available, Wav2Vec2Processor, EvalPrediction, ProcessorMixin

if is_apex_available():
    from apex import amp 
    # Apex is a PyTorch add-on package from NVIDIA with capabilities for automatic mixed precision (AMP) and distributed training.
    # https://www.ibm.com/docs/en/wmlce/1.6.1?topic=frameworks-getting-started-apex

if version.parse(torch.__version__) >= version.parse("1.6"):
  _is_native_amp_available = True
  from torch.cuda.amp import autocast

In [18]:
@dataclass
class DataCollatorCTCWithPadding:
  processor: Wav2Vec2Processor
  padding: Union[bool, str] = True
  max_length: Optional[int] = None
  max_length_labels: Optional[int] = None
  pad_to_multiple_of: Optional[int] = None
  pad_to_multiple_of_labels: Optional[int] = None

  def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
    input_features = [{"input_values": feature["input_values"][0]} for feature in features]
    label_features = [feature["labels"] for feature in features]

    d_type = torch.long if isinstance(label_features[0], int) else torch.float

    batch = self.processor.pad(
      input_features,
      padding=self.padding,
      max_length=self.max_length,
      pad_to_multiple_of=self.pad_to_multiple_of,
      return_tensors="pt",
    )

    batch["labels"] = torch.tensor(label_features, dtype=d_type)
    # print('batch', batch)
    return batch

In [19]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

In [20]:
class CTCTrainer(Trainer):
  def training_step(self, processor: ProcessorMixin, inputs) -> torch.Tensor:
    self.model.train()
    inputs = self._prepare_inputs(inputs)

    with autocast():
      loss = self.compute_loss(self.model, inputs)

    if self.args.gradient_accumulation_steps > 1:
      loss = loss / self.args.gradient_accumulation_steps

    self.scaler.scale(loss).backward()

    return loss.detach()

In [21]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [22]:
from transformers import DataCollatorWithPadding
trainer = CTCTrainer(
  model=model,
  data_collator=data_collator,
  args=training_args,
  compute_metrics=compute_metrics,
  train_dataset=train_ds,
  eval_dataset=test_ds,
  tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


## 🧪 experiments

In [23]:
trainer.train()

***** Running training *****
  Num examples = 1476
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 369
  Number of trainable parameters = 90766470
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33msmeelock[0m ([33mtsinghua-ser[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
10,1.8735,No log
20,1.7922,No log
30,1.8562,No log
40,1.7752,No log
50,1.678,No log
60,1.7477,No log
70,1.7775,No log
80,1.7779,No log
90,1.7685,No log
100,1.7288,No log


***** Running Evaluation *****
  Num examples = 5904
  Batch size = 4
Saving model checkpoint to /content/wav2vec2-iemocap-speech-emotion-recognition/checkpoint-10
Configuration saved in /content/wav2vec2-iemocap-speech-emotion-recognition/checkpoint-10/config.json
Model weights saved in /content/wav2vec2-iemocap-speech-emotion-recognition/checkpoint-10/pytorch_model.bin
Feature extractor saved in /content/wav2vec2-iemocap-speech-emotion-recognition/checkpoint-10/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 5904
  Batch size = 4
Saving model checkpoint to /content/wav2vec2-iemocap-speech-emotion-recognition/checkpoint-20
Configuration saved in /content/wav2vec2-iemocap-speech-emotion-recognition/checkpoint-20/config.json
Model weights saved in /content/wav2vec2-iemocap-speech-emotion-recognition/checkpoint-20/pytorch_model.bin
Feature extractor saved in /content/wav2vec2-iemocap-speech-emotion-recognition/checkpoint-20/preprocessor_config.json
***** Running 

TrainOutput(global_step=369, training_loss=1.7371736852134145, metrics={'train_runtime': 6072.429, 'train_samples_per_second': 0.243, 'train_steps_per_second': 0.061, 'total_flos': 1.0899090310430976e+17, 'train_loss': 1.7371736852134145, 'epoch': 1.0})