## init

In [None]:
# Install the Hugging Face Transformers and torchaudio packages into the kernel's environment.
# NOTE(review): versions are not pinned — consider pinning them so the notebook stays reproducible.
%pip install -qqq transformers torchaudio

In [None]:
from google.colab import drive
import os

In [None]:
# Location at which Google Drive is mounted in this runtime.
DRIVE_MOUNT_PATH = "/content/drive"
# Data folder on the mounted drive; used below as the root of the IEMOCAP dataset.
DATA_PATH = DRIVE_MOUNT_PATH + "/MyDrive/Shared/data"

In [None]:
# Mount Google Drive at DRIVE_MOUNT_PATH so that the files under DATA_PATH are reachable.
drive.mount(DRIVE_MOUNT_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Use wav2vec2 for speech emotion recognition on IEMOCAP dataset**
---
- 🚀 **objective**: run wav2vec2 as a feature extractor on the IEMOCAP dataset; this requires preprocessing the IEMOCAP data first  
- 🧯 **models**: wav2vec2
- 📚 **dataset**: IEMOCAP


Resources
- inspired by https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb#scrollTo=Fv62ShDsH5DZ

## ⚙️ configuration

In [None]:
import numpy as np

from transformers import TrainingArguments

In [None]:
# Pretrained checkpoint used for the processor, the model configuration and the model weights.
model_name_or_path = "facebook/wav2vec2-large-960h"

# Emotion classes; a label's position in this list is used as its class id.
label_list = ["neu", "hap", "ang", "sad", "exc", "fru"]
num_labels = len(label_list)

# Pooling applied over the time axis of the encoder output: "mean", "sum" or "max".
pooling_mode = "max"

# Fraction of the samples assigned to the training split; the rest is used for evaluation.
train_test_split = 0.8

# Sampling rate in Hz (not referenced by the other cells visible in this notebook).
target_sampling_rate = 16_000

In [None]:
# Training parameters handed to the Trainer further down.
# With a per-device batch of 4 and 2 accumulation steps, the effective train batch size is 8
# (matches "Total train batch size" in the training log).
training_args = TrainingArguments(
    output_dir="/content/wav2vec2-iemocap-speech-emotion-recognition",  # checkpoints are written here
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",  # evaluate every `eval_steps` optimisation steps
    num_train_epochs=1.0,
    fp16=True,  # mixed precision; the Trainer log reports the cuda_amp backend
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-4,
    save_total_limit=2,  # caps the number of checkpoints kept in output_dir
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## 📚 data

In [None]:
# https://pytorch.org/audio/master/generated/torchaudio.datasets.IEMOCAP.html
from torchaudio.datasets import IEMOCAP

from transformers import Wav2Vec2Processor

import torch
from torch.utils.data import random_split, Dataset, DataLoader, SubsetRandomSampler

In [None]:
# Load the processor that belongs to the chosen checkpoint; it is used by the dataset wrapper,
# the data collator and the Trainer below.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)

loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-large-960h/snapshots/bdeaacdf88f7a155f50a2704bc967aa81fbbb2ab/preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-large-960h/snapshots/bdeaacdf88f7a155f50a2704bc967aa81fbbb2ab/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-large-960h",
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "code

In [None]:
class CustomIEMOCAP(Dataset):
  """Dataset wrapper that runs every sample of `data` through `processor`.

  Each item of `data` is unpacked as the 5-tuple
  `(waveform, sampling_rate, filename, label, speaker)`; only the waveform,
  its sampling rate and the label are used.
  """

  def __init__(self, data, processor):
    self.processor = processor
    self.data = data

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    # Filename and speaker are part of the sample tuple but are not needed here.
    waveform, rate, _filename, emotion, _speaker = self.data[index]
    processed = self.processor(waveform, sampling_rate=rate)
    return processed, emotion

In [None]:
# Load IEMOCAP from the shared Drive folder; inside torchaudio the files are looked up under root / "IEMOCAP".
data = IEMOCAP(root=DATA_PATH) # in function, path = root / "IEMOCAP"
# Wrap the raw dataset so that every sample is passed through the wav2vec2 processor.
dataset = CustomIEMOCAP(data=data, processor=processor)
# Train/eval split by fraction; the seeded generator keeps the split identical between runs.
train_dataset, eval_dataset = random_split(dataset, [train_test_split, 1-train_test_split], generator=torch.Generator().manual_seed(42))

## 🚜 model definition

In [None]:
from transformers import Wav2Vec2Model, Wav2Vec2PreTrainedModel
from transformers import AutoConfig

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

In [None]:
# Model configuration: start from the checkpoint's config and attach the
# classification-specific settings (label maps, number of labels, task name).
config = AutoConfig.from_pretrained(
  model_name_or_path,
  num_labels=num_labels,
  label2id=dict(zip(label_list, range(len(label_list)))),
  id2label=dict(enumerate(label_list)),
  finetuning_task="wav2vec2_clf",
)
# Extra attribute read by the classification model to choose its pooling strategy.
config.pooling_mode = pooling_mode
config

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-large-960h/snapshots/bdeaacdf88f7a155f50a2704bc967aa81fbbb2ab/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-large-960h",
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extr

Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-large-960h",
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.0,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.1,
  "finetuning_task": "w

In [None]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union, Tuple
from transformers.file_utils import ModelOutput

@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Output container returned by the speech classification model's forward pass."""
    # Classification loss; stays None when no labels are given to the model.
    loss: Optional[torch.FloatTensor] = None
    # Scores produced by the classification head.
    logits: torch.FloatTensor = None
    # Passed through from the wav2vec2 encoder output.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Passed through from the wav2vec2 encoder output.
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [None]:
class Wav2Vec2ClassificationHead(nn.Module):
  """Classification head for wav2vec2 features.

  Applies dropout -> linear (hidden_size -> hidden_size) -> tanh -> dropout ->
  linear (hidden_size -> num_labels). Sizes and the dropout rate come from `config`.
  """

  def __init__(self, config):
    super().__init__()
    hidden = config.hidden_size
    self.dense = nn.Linear(hidden, hidden)
    self.dropout = nn.Dropout(config.final_dropout)
    self.out_proj = nn.Linear(hidden, config.num_labels)

  def forward(self, features, **kwargs):
    # Extra keyword arguments are accepted but not used.
    activated = torch.tanh(self.dense(self.dropout(features)))
    return self.out_proj(self.dropout(activated))


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
  """wav2vec2 encoder followed by pooling over dim 1 and a classification head.

  The pooling strategy is read from `config.pooling_mode` and the number of
  classes from `config.num_labels`.
  """

  def __init__(self, config):
    super().__init__(config)
    self.config = config
    self.num_labels = config.num_labels
    self.pooling_mode = config.pooling_mode

    self.wav2vec2 = Wav2Vec2Model(config)
    self.classifier = Wav2Vec2ClassificationHead(config)

    self.init_weights()

  def freeze_feature_extractor(self):
    """Call `_freeze_parameters()` on the wav2vec2 feature extractor."""
    self.wav2vec2.feature_extractor._freeze_parameters()

  def merged_strategy(self, hidden_states, mode="mean"):
    """Reduce `hidden_states` along dim 1 using `mode` ("mean", "sum" or "max").

    Raises:
      Exception: if `mode` is not one of the three supported strategies.
    """
    if mode == "mean":
      return hidden_states.mean(dim=1)
    if mode == "sum":
      return hidden_states.sum(dim=1)
    if mode == "max":
      # torch.max along a dim returns (values, indices); only the values are kept.
      return hidden_states.max(dim=1)[0]
    raise Exception(
        "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

  def forward(
      self,
      input_values,
      attention_mask=None,
      output_attentions=None,
      output_hidden_states=None,
      return_dict=None,
      labels=None,
  ):
    """Encode, pool and classify `input_values`; compute the loss when `labels` is given.

    Returns a `SpeechClassifierOutput` when `return_dict` resolves to true,
    otherwise a tuple `(loss?, logits, *encoder_outputs[2:])`.
    """
    if return_dict is None:
      return_dict = self.config.use_return_dict

    encoder_outputs = self.wav2vec2(
        input_values,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    # The first encoder output is pooled along dim 1 before classification.
    pooled = self.merged_strategy(encoder_outputs[0], mode=self.pooling_mode)
    logits = self.classifier(pooled)

    loss = None
    if labels is not None:
      criterion = CrossEntropyLoss()
      loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

    if return_dict:
      return SpeechClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
      )

    output = (logits,) + encoder_outputs[2:]
    return output if loss is None else (loss,) + output

In [None]:
# Instantiate the custom classification model from the pretrained checkpoint, using the
# configuration prepared above (label maps, number of labels, pooling mode).
# The loading log reports that the checkpoint's `lm_head` weights are not used.
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-large-960h/snapshots/bdeaacdf88f7a155f50a2704bc967aa81fbbb2ab/pytorch_model.bin
Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly init

In [None]:
# Freeze the wav2vec2 feature extractor via the model's `freeze_feature_extractor` helper.
model.freeze_feature_extractor()

## 🏃‍♀️ training routine

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, Union, Tuple, Optional
from packaging import version
import numpy as np

import torch
from torch import nn

from transformers import Trainer, is_apex_available, Wav2Vec2Processor, EvalPrediction, ProcessorMixin

if is_apex_available():
    from apex import amp 
    # Apex is a PyTorch add-on package from NVIDIA with capabilities for automatic mixed precision (AMP) and distributed training.
    # https://www.ibm.com/docs/en/wmlce/1.6.1?topic=frameworks-getting-started-apex

if version.parse(torch.__version__) >= version.parse("1.6"):
  _is_native_amp_available = True
  from torch.cuda.amp import autocast

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
  """Collate `(processor_output, label)` pairs into one padded batch.

  Attributes:
    processor: object whose `pad` method pads the collected `input_values`.
    padding: forwarded to `processor.pad`.
    max_length: forwarded to `processor.pad`.
    max_length_labels: kept for interface compatibility; not used.
    pad_to_multiple_of: forwarded to `processor.pad`.
    pad_to_multiple_of_labels: kept for interface compatibility; not used.
    label2id: optional mapping from string labels to class ids. When it is
      None and a string label is encountered, the mapping is derived from the
      notebook-level `label_list` (a label's index is its id), which is the
      same mapping that is stored in the model configuration.
  """
  processor: Wav2Vec2Processor
  padding: Union[bool, str] = True
  max_length: Optional[int] = None
  max_length_labels: Optional[int] = None
  pad_to_multiple_of: Optional[int] = None
  pad_to_multiple_of_labels: Optional[int] = None
  label2id: Optional[Dict[str, int]] = None

  def _label_to_id(self, label):
    """Return the class id for a string label; non-string labels pass through unchanged."""
    if not isinstance(label, str):
      return label
    mapping = self.label2id
    if mapping is None:
      # Fall back to the label order defined in the notebook's configuration cell.
      mapping = {name: idx for idx, name in enumerate(label_list)}
    if label not in mapping:
      raise ValueError(f"Unknown label {label!r}; known labels are {sorted(mapping)}")
    return mapping[label]

  def __call__(self, features: List[Tuple[Dict[str, Any], Union[int, float, str]]]) -> Dict[str, torch.Tensor]:
    """Pad the audio of every sample to a common length and stack the labels.

    Args:
      features: list of `(processor_output, label)` pairs as produced by the
        dataset; `processor_output["input_values"]` holds the audio values.

    Returns:
      The batch produced by `processor.pad`, with a `"labels"` tensor added
      (`torch.long` for integer class ids, `torch.float` otherwise).

    Raises:
      ValueError: if a string label is not present in the label mapping.
    """
    input_features = []
    label_features = []
    for wavs, label in features:
      # The dataset passes a (1, num_samples) waveform to the processor, so each
      # sample arrives as a list wrapping a 2-D array. `pad` needs one 1-D
      # sequence per sample, hence the singleton dimensions are dropped here.
      values = np.atleast_1d(np.squeeze(np.asarray(wavs["input_values"], dtype=np.float32)))
      input_features.append({"input_values": values})
      # String labels such as "neu" cannot be turned into a tensor; map them to ids.
      label_features.append(self._label_to_id(label))

    d_type = torch.long if isinstance(label_features[0], int) else torch.float

    batch = self.processor.pad(
      input_features,
      padding=self.padding,
      max_length=self.max_length,
      pad_to_multiple_of=self.pad_to_multiple_of,
      return_tensors="pt",
    )

    batch["labels"] = torch.tensor(label_features, dtype=d_type)

    return batch

In [None]:
def compute_metrics(p: EvalPrediction):
    """Compute classification accuracy from an evaluation prediction.

    `p.predictions` holds the class scores (or a tuple whose first element
    does); `p.label_ids` holds the reference class ids. Returns
    `{"accuracy": <float>}`.
    """
    scores = p.predictions
    if isinstance(scores, tuple):
        scores = scores[0]
    predicted_ids = np.argmax(scores, axis=1)
    hits = (predicted_ids == p.label_ids).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [None]:
class CTCTrainer(Trainer):
  """Trainer with a custom training step covering the AMP, apex, DeepSpeed and plain backward paths."""

  def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
    """Run one forward/backward pass on a batch and return the detached loss.

    The signature follows the base `Trainer.training_step(model, inputs)`
    contract: the training loop calls this method with exactly these two
    positional arguments, so an additional leading parameter would shift the
    model into the wrong slot and leave `inputs` unfilled.

    Args:
      model: the model being trained.
      inputs: the batch produced by the data collator.

    Returns:
      The loss of this step, scaled by the gradient accumulation factor and
      detached from the graph.
    """
    model.train()
    inputs = self._prepare_inputs(inputs)

    if self.use_amp:
      with autocast():
        loss = self.compute_loss(model, inputs)
    else:
      loss = self.compute_loss(model, inputs)

    # Scale the loss so that gradients accumulated over several steps average out.
    if self.args.gradient_accumulation_steps > 1:
      loss = loss / self.args.gradient_accumulation_steps

    # Pick the backward call that matches the active mixed-precision / distributed backend.
    if self.use_amp:
      self.scaler.scale(loss).backward()
    elif self.use_apex:
      with amp.scale_loss(loss, self.optimizer) as scaled_loss:
        scaled_loss.backward()
    elif self.deepspeed:
      self.deepspeed.backward(loss)
    else:
      loss.backward()

    return loss.detach()


In [None]:
# Collator that pads the samples of each batch with the processor loaded above.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Wire the model, collator, metrics and the two dataset splits into the custom trainer.
trainer = CTCTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # NOTE(review): presumably passed so the feature extractor is saved with checkpoints — confirm.
    tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


## 🧪 experiments

In [None]:
# Start fine-tuning with the arguments configured in `training_args`.
trainer.train()

***** Running training *****
  Num examples = 3397
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 425
  Number of trainable parameters = 312284294
  value = np.array(value)


[({'input_values': [array([[-0.7809634 , -0.8264922 , -0.78717184, ...,  0.04683269,
         0.27447662,  0.40278503]], dtype=float32)]}, 'neu'), ({'input_values': [array([[0.00324473, 0.05279194, 0.07329424, ..., 0.05620899, 0.05791752,
        0.06475162]], dtype=float32)]}, 'ang'), ({'input_values': [array([[-0.07990205, -0.20687743, -0.15963078, ...,  0.06331438,
         0.0485498 ,  0.03378522]], dtype=float32)]}, 'exc'), ({'input_values': [array([[ 0.10068248,  0.06985432, -0.036643  , ...,  0.13711578,
         0.00819798, -0.0800827 ]], dtype=float32)]}, 'neu')]


ValueError: ignored