## init

In [1]:
%pip install -qqq transformers torchaudio datasets wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
import os

In [3]:
# env variables
DRIVE_MOUNT_PATH = "/content/drive"
DATA_PATH = f"{DRIVE_MOUNT_PATH}/MyDrive/Shared/data"
OUTPUT_DIR = "/content/wav2vec2-iemocap-speech-emotion-recognition"
%env WANDB_WATCH=all
%env WANDB_LOG_MODEL=checkpoint

env: WANDB_WATCH=all
env: WANDB_LOG_MODEL=checkpoint


In [4]:
drive.mount(DRIVE_MOUNT_PATH)

Mounted at /content/drive


# **Use OpenAI's whisper for speech emotion recognition on IEMOCAP dataset**
---
- 🚀 **objective**: run whisper as a feature extractor on IEMOCAP dataset, requires the data preprocessing of IEMOCAP dataset  
- 🧯 **models**: whisper
- 📚 **dataset**: IEMOCAP

Whisper model card in HuggingFace https://huggingface.co/docs/transformers/model_doc/whisper

## ⚙️ configuration

In [5]:
import numpy as np

from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

import torch

In [6]:
model_name_or_path = "openai/whisper-base"
feature_to_idx = {key: i for i, key in enumerate(["wav", "sampling_rate", "filename", "label", "speaker"])}
label_list = ["neu", "hap", "ang", "sad", "exc", "fru"]
num_labels = len(label_list)
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

pooling_mode = "max"
test_split_size = 0.2
target_sampling_rate = 16000

DEBUG_SIZE = 0.1 # percentage of the whole dataset

keep_n_encoder_layers = 2

metrics = {
  "unweighted_accuracy": accuracy_score,
  "weighted_accuracy": balanced_accuracy_score,
  "micro_f1": lambda y_true, y_pred: f1_score(y_true, y_pred, average="micro"),
  "macro_f1": lambda y_true, y_pred: f1_score(y_true, y_pred, average="macro")
}

In [7]:
# training parameters
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    label_names=label_list,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    evaluation_strategy="steps", # should enable do_eval
    num_train_epochs=1.0,
    learning_rate=1e-4,
    fp16=torch.cuda.is_available(), # whether to use fp16 16-bit (mixed) precision training instead of 32-bit training
    save_steps=100,
    eval_steps=10,
    logging_steps=50,
#     report_to="wandb",
    report_to=[],
    half_precision_backend="auto", # shoud be 'cuda_amp' half precision backend
    gradient_checkpointing=True, # use gradient checkpointing to save memory at the expense of slower backward pass
)

## 📚 data
- ~torchaudio implemented a `dataset` to load IEMOCAP. Later in the script, we train the model with a `Trainer` from hugginface, therefore we prefer translating the pytorch dataset into a `transformers.Dataset` for convenience and compatibility.~
- the Trainer class expects an argument `train_dataset` to be of type torch.utils.data.Dataset (see [documentation](https://huggingface.co/docs/transformers/main_classes/trainer)) --> we use a torch dataset instead of a Hugginface dataset

In [8]:
# https://pytorch.org/audio/master/generated/torchaudio.datasets.IEMOCAP.html
from torchaudio.datasets import IEMOCAP

from transformers import WhisperProcessor

import torch
from torch.utils.data import random_split, Dataset, DataLoader, SubsetRandomSampler

In [9]:
processor = WhisperProcessor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

In [10]:
class CustomIEMOCAP(Dataset):
  def __init__(self, data, processor):
    self.data = data
    self.processor = processor

  def __getitem__(self, index):
    wav, _, _, label, _ = self.data[index]
    inputs = self.processor(wav.squeeze(), sampling_rate=target_sampling_rate)
    inputs["labels"] = label2id[label]

    return inputs

  def __len__(self):
    return len(self.data)

In [11]:
iemocap = IEMOCAP(root=DATA_PATH) # in function, path = root / "IEMOCAP"
iemocap = torch.utils.data.Subset(iemocap, range(int(DEBUG_SIZE * len(iemocap)))) # DEBUG

dataset = CustomIEMOCAP(data=iemocap, processor=processor)
train_ds, test_ds = random_split(dataset, [1-test_split_size, test_split_size], generator=torch.Generator().manual_seed(42))

dataset[0]

{'input_features': [array([[ 0.1641087 ,  0.26095778,  0.2273435 , ..., -1.0622692 ,
        -1.0622692 , -1.0622692 ],
       [-0.06023073,  0.24212158,  0.14617556, ..., -1.0622692 ,
        -1.0622692 , -1.0622692 ],
       [-0.08058894,  0.2320525 ,  0.11406708, ..., -1.0622692 ,
        -1.0622692 , -1.0622692 ],
       ...,
       [-0.9415262 , -1.0622692 , -1.0622692 , ..., -1.0622692 ,
        -1.0622692 , -1.0622692 ],
       [-1.0006719 , -1.0622692 , -1.0622692 , ..., -1.0622692 ,
        -1.0622692 , -1.0622692 ],
       [-1.013845  , -1.0622692 , -1.0622692 , ..., -1.0622692 ,
        -1.0622692 , -1.0622692 ]], dtype=float32)], 'labels': 0}

## 🚜 model definition

In [12]:
from transformers import WhisperModel, PreTrainedModel
from transformers import AutoConfig

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

In [None]:
# model configuration
config = AutoConfig.from_pretrained(
  model_name_or_path,
  num_labels=num_labels,
  label2id=label2id,
  id2label=id2label,
)
setattr(config, 'pooling_mode', pooling_mode)
setattr(config, 'keep_n_encoder_layers', keep_n_encoder_layers)
config

In [14]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union, Tuple
from transformers.file_utils import ModelOutput

@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [15]:
from transformers import WhisperPreTrainedModel

class WhisperClassificationHead(nn.Module):
  """Head for whisper classification task."""

  def __init__(self, config):
    super().__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.dropout = nn.Dropout(config.dropout)
    self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

  def forward(self, features, **kwargs):
    x = features
    x = self.dropout(x)
    x = self.dense(x)
    x = torch.tanh(x)
    x = self.dropout(x)
    x = self.out_proj(x)
    return x


class WhisperForSpeechClassification(WhisperPreTrainedModel):
  def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.pooling_mode = config.pooling_mode
    self.config = config

    self.encoder = WhisperModel(config).encoder

    # only keep first n encoding layers
    self.encoder.layers = self.encoder.layers[:config.keep_n_encoder_layers]
    self.classifier = WhisperClassificationHead(config)

    self.init_weights()

  def freeze_encoder(self):
    self.encoder._freeze_parameters()

  def merged_strategy(
      
      self,
      hidden_states,
      mode="mean"
  ):
    if mode == "mean":
        outputs = torch.mean(hidden_states, dim=1)
    elif mode == "sum":
        outputs = torch.sum(hidden_states, dim=1)
    elif mode == "max":
        outputs = torch.max(hidden_states, dim=1)[0]
    else:
        raise Exception(
            "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

    return outputs

  def forward(
      self,
      input_features,
      attention_mask=None,
      output_attentions=None,
      output_hidden_states=None,
      return_dict=None,
      labels=None,
  ):
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    outputs = self.whisper(
        input_features,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = outputs[0]
    hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
    logits = self.classifier(hidden_states)

    loss = None
    if labels is not None:
      loss_fct = CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    if not return_dict:
      output = (logits,) + outputs[2:]
      return ((loss,) + output) if loss is not None else output

    return SpeechClassifierOutput(
      loss=loss,
      logits=logits,
      hidden_states=outputs.hidden_states,
      attentions=outputs.attentions,
    )

In [16]:
model = WhisperForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Downloading pytorch_model.bin:   0%|          | 0.00/290M [00:00<?, ?B/s]

Some weights of the model checkpoint at openai/whisper-base were not used when initializing WhisperForSpeechClassification: ['model.decoder.layers.1.fc1.bias', 'model.encoder.layers.3.self_attn.out_proj.bias', 'model.decoder.layers.5.self_attn_layer_norm.bias', 'model.decoder.layers.2.fc2.weight', 'model.decoder.layers.0.fc1.weight', 'model.encoder.layers.2.fc2.bias', 'model.decoder.layer_norm.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.layers.4.fc1.bias', 'model.decoder.layers.0.self_attn.k_proj.weight', 'model.encoder.layers.4.fc2.weight', 'model.decoder.layers.4.encoder_attn.out_proj.bias', 'model.decoder.layers.3.self_attn_layer_norm.weight', 'model.decoder.layers.2.self_attn.v_proj.weight', 'model.encoder.layers.2.self_attn_layer_norm.bias', 'model.encoder.layers.0.fc2.weight', 'model.encoder.layers.2.fc1.bias', 'model.decoder.layers.2.encoder_attn_layer_norm.weight', 'model.encoder.layers.0.self_attn.out_proj.weight', 'model.decoder.layers.1.encoder_attn.out_proj

In [18]:
model.freeze_encoder()
model

WhisperForSpeechClassification(
  (whisper): WhisperEncoder(
    (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 512)
    (layers): ModuleList(
      (0): WhisperEncoderLayer(
        (self_attn): WhisperAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=Tr

## 🏃‍♀️ training routine

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, Union, Tuple, Optional
from packaging import version
import numpy as np

import torch
from torch import nn

from transformers import Trainer, is_apex_available, WhisperProcessor, EvalPrediction

if is_apex_available():
    from apex import amp 
    # Apex is a PyTorch add-on package from NVIDIA with capabilities for automatic mixed precision (AMP) and distributed training.
    # https://www.ibm.com/docs/en/wmlce/1.6.1?topic=frameworks-getting-started-apex

if version.parse(torch.__version__) >= version.parse("1.6"):
  _is_native_amp_available = True
  from torch.cuda.amp import autocast

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
  processor: WhisperProcessor
  padding: Union[bool, str] = True
  max_length: Optional[int] = None
  max_length_labels: Optional[int] = None
  pad_to_multiple_of: Optional[int] = None
  pad_to_multiple_of_labels: Optional[int] = None

  def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
    input_features = [{"input_features": feature["input_features"][0]} for feature in features]
    label_features = [feature["labels"] for feature in features]

    d_type = torch.long if isinstance(label_features[0], int) else torch.float

    batch = self.processor.feature_extractor.pad(
      input_features,
      padding=self.padding,
      max_length=self.max_length,
      pad_to_multiple_of=self.pad_to_multiple_of,
      return_tensors="pt",
    )

    batch["labels"] = torch.tensor(label_features, dtype=d_type)
    return batch

In [None]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    # (preds == p.label_ids).astype(np.float32).mean().item()
    return {k: metric(p.label_ids, preds) for k, metric in metrics.items()}

In [None]:
class CTCTrainer(Trainer):
  def training_step(self, model, inputs) -> torch.Tensor:
    model.train()
    inputs = self._prepare_inputs(inputs)

    with autocast():
      # loss = self.compute_loss(model, inputs)
      loss = model(**inputs).get("loss")

    if self.args.gradient_accumulation_steps > 1:
      loss = loss / self.args.gradient_accumulation_steps

    self.scaler.scale(loss).backward()

    return loss.detach()
  
  def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys) -> torch.Tensor: 
    model.eval()
    inputs = self._prepare_inputs(inputs)

    labels = inputs.get("labels")

    with autocast():
      outputs = model(**inputs)
      logits = outputs.get("logits")
      loss = outputs.get("loss")

    if self.args.gradient_accumulation_steps > 1:
      loss = loss / self.args.gradient_accumulation_steps

    self.scaler.scale(loss).backward()

    with torch.no_grad():
      torch.cuda.empty_cache()

    if prediction_loss_only:
      return loss.detach()
    return (loss.detach(), logits.detach(), labels.detach())

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
from transformers import DataCollatorWithPadding
trainer = CTCTrainer(
  model=model,
  data_collator=data_collator,
  args=training_args,
  compute_metrics=compute_metrics,
  train_dataset=train_ds,
  eval_dataset=test_ds,
  tokenizer=processor.feature_extractor,
)

## 🧪 experiments

In [24]:
trainer.train()



Step,Training Loss,Validation Loss,Unweighted Accuracy,Weighted Accuracy,Micro F1,Macro F1
10,No log,1.641102,0.369048,0.166667,0.369048,0.089855
20,No log,1.620774,0.369048,0.166667,0.369048,0.090643
30,No log,1.613851,0.369048,0.166667,0.369048,0.092262
40,No log,1.628781,0.369048,0.166667,0.369048,0.090643
50,1.544600,1.604792,0.369048,0.166667,0.369048,0.092262




Step,Training Loss,Validation Loss,Unweighted Accuracy,Weighted Accuracy,Micro F1,Macro F1
10,No log,1.641102,0.369048,0.166667,0.369048,0.089855
20,No log,1.620774,0.369048,0.166667,0.369048,0.090643
30,No log,1.613851,0.369048,0.166667,0.369048,0.092262
40,No log,1.628781,0.369048,0.166667,0.369048,0.090643
50,1.544600,1.604792,0.369048,0.166667,0.369048,0.092262
60,1.544600,1.545335,0.369048,0.166667,0.369048,0.093093
70,1.544600,1.517875,0.404762,0.191667,0.404762,0.131746
80,1.544600,1.511382,0.404762,0.191667,0.404762,0.131617




TrainOutput(global_step=85, training_loss=1.569033633961397, metrics={'train_runtime': 451.0184, 'train_samples_per_second': 0.754, 'train_steps_per_second': 0.188, 'total_flos': 9835244409600000.0, 'train_loss': 1.569033633961397, 'epoch': 1.0})

In [25]:
import wandb
wandb.finish()