In [1]:
import os
# Set before torch is imported in the next cell so the CUDA runtime sees it.
# NOTE(review): synchronous kernel launches are a debugging aid and usually slow
# GPU work down — confirm this is still wanted for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime
import warnings

#Dataset Imports
import csv
from IPython.display import Audio, display
import torchaudio
from transformers import Wav2Vec2FeatureExtractor
from torch.nn.utils.rnn import pad_sequence

# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings('ignore')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device: ", device)

  from .autonotebook import tqdm as notebook_tqdm


Device:  cuda


In [3]:
# Directory that holds the MSP audio files. The default is the original
# location; set the MSP_AUDIO_ROOT environment variable to point elsewhere.
# os.path.join(..., '') guarantees the trailing separator that plain string
# concatenation (AUDIO_ROOT + file name) elsewhere in the notebook relies on.
AUDIO_ROOT = os.path.join(
    os.environ.get('MSP_AUDIO_ROOT', '/data/NO-BACKUP/rvitiell-data/MSP/Audios/'), ''
)
# Directory that holds the labels CSV of the MSP dataset (override with
# MSP_LABELS_DIR); normalised the same way as AUDIO_ROOT.
LABELS_DIR = os.path.join(
    os.environ.get('MSP_LABELS_DIR', '/data/NO-BACKUP/rvitiell-data/MSP/labels/'), ''
)

In [4]:
# List the audio directory. Sorting keeps the file order aligned with the
# sorted label rows loaded below.
names = sorted(os.listdir(AUDIO_ROOT))
if not names:
    raise FileNotFoundError(f"No audio files found in {AUDIO_ROOT}")
data1 = os.path.join(AUDIO_ROOT, names[0])
# torchaudio.load needs an audio I/O backend; if it fails with
# 'No audio I/O backend is available', see
# https://stackoverflow.com/questions/62543843/cannot-import-torch-audio-no-audio-backend-is-available
# Load one file as a sanity check and show its tensor shape and sample rate.
waveform, sample_rate = torchaudio.load(data1)
print(waveform.shape)
print(sample_rate)

# Load the label CSV. newline='' is what the csv module expects from the file
# object so that quoted fields containing line breaks are parsed correctly.
with open(os.path.join(LABELS_DIR, 'labels_concensus.csv'), newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    labels = sorted(reader)

torch.Size([1, 167814])
16000


In [5]:
# Hugging Face Hub checkpoint id; used below to load the feature extractor and the model config.
model_name_or_path = "facebook/hubert-large-ls960-ft"
# Pooling strategy name; stored on the model config as `pooling_mode` further down.
pooling_mode = "mean"

In [6]:
from transformers import Wav2Vec2FeatureExtractor
# Feature extractor that belongs to the chosen checkpoint; the dataset class
# and the data collator both use it to prepare raw waveforms.
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path,)
#target_sampling_rate = processor.feature_extractor.sampling_rate
#print(f"The target sampling rate: {target_sampling_rate}")

In [7]:
# Emotion codes kept for classification, mapped to contiguous class ids in the
# order listed. An earlier mapping covered nine codes (A, S, H, U, F, D, C, N, O)
# numbered from 1; only five of them are kept here, re-indexed from 0.
EMOMAP = {code: class_id for class_id, code in enumerate(('A', 'S', 'H', 'D', 'N'))}

In [8]:
class MSPDataset(torch.utils.data.Dataset):
    """Audio clips from one MSP partition paired with integer emotion class ids.

    Only file paths and labels are collected up front; each waveform is read
    from disk and passed through the module-level ``processor`` when an item is
    requested, so the whole corpus never has to sit in memory.

    Args:
        train: Accepted for symmetry with the other flags; 'Train' is the
            partition used whenever none of the other flags is set.
        valid: Select the 'Validation' partition.
        test1: Select the 'Test1' partition.
        test2: Select the 'Test2' partition.

    If several flags are set, the first of ``valid``, ``test1``, ``test2`` wins.
    """

    # Column positions read from each row of the labels CSV.
    _COL_FILE = 0
    _COL_EMOTION = 1
    _COL_SPLIT = 7
    # Files with this prefix are skipped: torchaudio.load reports 'no data chunk' for them.
    _BROKEN_PREFIX = 'MSP-PODCAST_1904'

    @staticmethod
    def _split_name(valid, test1, test2):
        """Translate the constructor flags into the partition name used in the labels CSV."""
        if valid:
            return 'Validation'
        if test1:
            return 'Test1'
        if test2:
            return 'Test2'
        return 'Train'

    #Initialize the dataset based on the recommended split in MSP dataset.
    def __init__(self, train = False, valid = False, test1 = False, test2 = False):
        self.audio_dir = AUDIO_ROOT
        self.labels_dir = LABELS_DIR
        self.audio_names = sorted(names)
        self.labels_list = labels
        self.EMOMAP = EMOMAP

        self.audio = []
        self.labels = []

        #What type of dataset are we making
        setType = self._split_name(valid, test1, test2)
        print(setType)

        # Sanity check: every audio file needs exactly one label row.
        assert len(self.audio_names) == len(self.labels_list), (
            f"{len(self.audio_names)} audio files but {len(self.labels_list)} label rows"
        )

        for name, row in tqdm(zip(self.audio_names, self.labels_list), total=len(self.audio_names)):
            # Both lists are sorted, so row i must describe file i.
            assert name == row[self._COL_FILE], (
                f"audio file {name!r} does not match label row {row[self._COL_FILE]!r}"
            )
            emotion = row[self._COL_EMOTION]
            # Keep only rows of the requested partition whose emotion code is mapped.
            if row[self._COL_SPLIT] != setType or emotion == 'X' or emotion not in self.EMOMAP:
                continue
            if row[self._COL_FILE].startswith(self._BROKEN_PREFIX):
                continue
            # os.path.join works whether or not audio_dir ends with a separator.
            self.audio.append(os.path.join(self.audio_dir, name))
            self.labels.append(self.EMOMAP[emotion])

        self.length = len(self.audio)

    def __len__(self):
        """Number of clips selected for this partition."""
        return self.length

    def __getitem__(self, ind):
        """Load clip ``ind`` and return the processor output with its class id under 'labels'."""
        audio = self.audio[ind]
        label = self.labels[ind]
        # Audio is loaded lazily here; loading everything in __init__ exhausted the machine.
        waveform, sample_rate = torchaudio.load(audio)

        # The processor is told which rate the samples are at, so bring the clip
        # to the rate it expects instead of assuming every file is already 16 kHz.
        target_rate = getattr(processor, 'sampling_rate', 16000)
        if sample_rate != target_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

        features = processor(waveform, sampling_rate = target_rate, padding=True, device = device)
        features['labels'] = label

        return features

In [9]:
import gc 
# Force a garbage-collection pass; the cell displays the value collect() returns.
gc.collect()

0

In [10]:
# Create Dataset objects.
# train_data = MSPDataset(train = True) 
# val_data = MSPDataset(valid = True) 
# test_data = MSPDataset(test1 = True)

# print(train_data.__len__())
# print(val_data.__len__())
# print(test_data.__len__())

In [11]:
# train_data[0]

In [12]:
from datasets import load_dataset, Dataset
# train_dataset = Dataset.from_list(train_data)
# val_dataset = Dataset.from_list(val_data)
# train_dataset = load_dataset("dlproject/msp_train_hubert")

In [13]:
# train_dataset.push_to_hub("dlproject/msp_train_hubert_large")
# val_dataset.push_to_hub("dlproject/msp_val_hubert_large")

In [14]:
#val_dataset = Dataset.from_list(val_data)

In [15]:
# Fetch the validation data from its Hub dataset repository (a cached copy is reused when present).
# NOTE(review): presumably this holds the features produced by MSPDataset and pushed earlier — confirm
# it was built with the same feature extractor as the checkpoint used for the model.
val_dataset = load_dataset("dlproject/msp_val_hubert")

Using custom data configuration dlproject--msp_val_hubert-89eed54aad19cdf2
Found cached dataset parquet (/home/rvitiell/.cache/huggingface/datasets/dlproject___parquet/dlproject--msp_val_hubert-89eed54aad19cdf2/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 65.52it/s]


In [16]:
# Fetch the training data from its Hub dataset repository (a cached copy is reused when present).
# NOTE(review): presumably this holds the features produced by MSPDataset and pushed earlier — confirm
# it was built with the same feature extractor as the checkpoint used for the model.
train_dataset = load_dataset("dlproject/msp_train_hubert")

Using custom data configuration dlproject--msp_train_hubert-d9cb93d944f5240e
Found cached dataset parquet (/home/rvitiell/.cache/huggingface/datasets/dlproject___parquet/dlproject--msp_train_hubert-d9cb93d944f5240e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 10.85it/s]


In [17]:
# Each repository was loaded as a collection keyed by split name; keep the 'train' split of both.
# NOTE(review): this rebinds the same names, so the cell is not idempotent — re-running it would
# index the already-selected datasets with 'train' again. Reload the datasets before re-executing.
train_dataset = train_dataset['train']
val_dataset = val_dataset['train']

In [18]:
# Report the number of examples per dataset: training first, validation second, one per line.
print(len(train_dataset), len(val_dataset), sep='\n')

29939
5213


In [19]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch
import torch.nn.functional as F
import transformers
from transformers import Wav2Vec2Processor
from torch.nn.utils.rnn import pad_sequence


@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate a list of dataset examples into one padded batch placed on the module-level ``device``.

    Only the ``input_values`` and ``labels`` entries of each example are used.

    Args:
        processor:
            Object whose ``pad`` method pads the collected inputs. It is annotated as a
            ``Wav2Vec2Processor``; anything exposing a compatible ``pad`` method works.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded to ``processor.pad``:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (no padding
              when a single sequence is provided).
            * :obj:`'max_length'`: pad to ``max_length``, or to the maximum acceptable input length
              for the model when ``max_length`` is not given.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding, sequences may differ in length.
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned ``input_values``; forwarded to ``processor.pad``.
        max_length_labels (:obj:`int`, `optional`):
            Accepted for interface compatibility; not read by ``__call__``.
        pad_to_multiple_of (:obj:`int`, `optional`):
            When set, sequences are padded to a multiple of this value; forwarded to ``processor.pad``.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Accepted for interface compatibility; not read by ``__call__``.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad the examples' inputs to a common length and attach their labels as one tensor."""
        # Stored examples nest the samples two levels deep; [0][0] selects the innermost sequence.
        to_pad = []
        for example in features:
            to_pad.append({"input_values": example["input_values"][0][0]})

        targets = []
        for example in features:
            targets.append(example["labels"])

        batch = self.processor.pad(
            to_pad,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Both tensors are moved to / created on the module-level device before being returned.
        batch['input_values'] = batch['input_values'].to(device)
        batch["labels"] = torch.tensor(targets, device=device)

        # One label per padded input row.
        assert len(batch['input_values'])==len(batch["labels"])

        return batch

In [20]:
# Collator handed to the trainer below; padding=True pads every batch to its longest example.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [21]:
import numpy as np
from transformers import EvalPrediction
from torchmetrics import F1Score

def compute_metrics(p: EvalPrediction):
    """Compute accuracy and support-weighted F1 from a trainer evaluation result.

    Args:
        p: Evaluation output. ``p.predictions`` holds the logits (the first element is
            used when it is a tuple) and ``p.label_ids`` the reference class ids.

    Returns:
        Dict with plain Python floats under the keys ``"accuracy"`` and ``"f1"``.
    """
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(logits, axis=1)

    # Take the class count from the label map so the metric follows any change to EMOMAP.
    f1 = F1Score(num_classes=len(EMOMAP), average='weighted')
    f1_value = f1(torch.tensor(preds), torch.tensor(p.label_ids))

    return {
        "accuracy": (preds == p.label_ids).astype(np.float32).mean().item(),
        # .item() keeps both metrics the same type (float) instead of returning a tensor here.
        "f1": f1_value.item(),
    }

In [22]:
# config
from transformers import AutoConfig, Wav2Vec2Processor
# Start from the checkpoint's own configuration and describe the classification task:
# one output per emotion code, plus both directions of the code <-> class-id mapping.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=len(EMOMAP),
    label2id=EMOMAP,
    id2label=dict(zip(EMOMAP.values(), EMOMAP.keys())),
    finetuning_task="wav2vec2_clf",
)
# Extra attribute read by Wav2Vec2ForSpeechClassification to choose its pooling strategy.
config.pooling_mode = pooling_mode

In [23]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.file_utils import ModelOutput
from dataclasses import dataclass
from typing import Optional, Tuple
import torch

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Result container returned by ``Wav2Vec2ForSpeechClassification.forward`` when ``return_dict`` is truthy."""

    # Loss computed from logits and labels; stays None when no labels are passed.
    loss: Optional[torch.FloatTensor] = None
    # Output of the classification head applied to the pooled encoder states.
    logits: torch.FloatTensor = None
    # Passed through unchanged from the wrapped Wav2Vec2Model output.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Passed through unchanged from the wrapped Wav2Vec2Model output.
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class Wav2Vec2ClassificationHead(nn.Module):
    """Classification head: dropout, dense layer, tanh, dropout, projection to ``num_labels`` logits.

    Reads ``hidden_size``, ``final_dropout`` and ``num_labels`` from the given config.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Map ``features`` to logits; extra keyword arguments are accepted and ignored."""
        # The same dropout module is applied before the dense layer and again after the tanh.
        activated = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(activated))


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 encoder followed by pooling over dim 1 and a classification head.

    The config must provide ``num_labels`` and ``pooling_mode`` in addition to
    whatever ``Wav2Vec2Model`` and ``Wav2Vec2ClassificationHead`` read from it.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Freeze the parameters of the encoder's feature extractor."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Reduce ``hidden_states`` over dim 1 with the reduction named by ``mode``.

        Raises:
            Exception: if ``mode`` is not one of 'mean', 'sum' or 'max'.
        """
        if mode == "mean":
            return hidden_states.mean(dim=1)
        if mode == "sum":
            return hidden_states.sum(dim=1)
        if mode == "max":
            # torch.max over a dim returns (values, indices); only the values are kept.
            return torch.max(hidden_states, dim=1)[0]
        raise Exception(
            "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Encode, pool and classify ``input_values``; compute a loss when ``labels`` is given.

        Returns a ``SpeechClassifierOutput`` when ``return_dict`` resolves to a truthy
        value, otherwise a tuple ``(loss?, logits, *encoder_outputs[2:])`` where the
        loss is included only if it was computed.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled = self.merged_strategy(outputs[0], mode=self.pooling_mode)
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            # Infer the problem type once and store it on the config for later calls.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                loss = MSELoss()(logits.view(-1, self.num_labels), labels)
            elif problem_type == "single_label_classification":
                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                loss = BCEWithLogitsLoss()(logits, labels)

        if return_dict:
            return SpeechClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        output = (logits,) + outputs[2:]
        return output if loss is None else (loss,) + output


In [24]:
from transformers import HubertForSequenceClassification
# Load the pretrained checkpoint into a sequence-classification model configured by `config`.
# The checkpoint id is taken from model_name_or_path (the same variable used for the
# feature extractor and the config) so the three cannot drift apart.
# ignore_mismatched_sizes=True tolerates weights whose shapes differ from the checkpoint.
model = HubertForSequenceClassification.from_pretrained(
    model_name_or_path,
    config=config,
    ignore_mismatched_sizes=True,
)

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['classifier.weight', 'projector.bias', 'projector.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [25]:
# Keep the convolutional front end fixed during fine-tuning. Newer transformers
# releases expose this as freeze_feature_encoder and deprecate the old name, so
# prefer the new method and fall back for older installs.
if hasattr(model, "freeze_feature_encoder"):
    model.freeze_feature_encoder()
else:
    model.freeze_feature_extractor()

In [26]:

# # freeze all layers to begin with
# for param in model.parameters():
#     param.requires_grad = False

# layers_freeze_num = 2
# n_layers = (
#     4 + layers_freeze_num * 16
# )  # 4 refers to projector and classifier's weights and biases.
# for name, param in list(model.named_parameters())[-n_layers:]:
#     param.requires_grad = True


In [28]:
from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    # Checkpoints are written below this directory.
    output_dir="/data/NO-BACKUP/rvitiell-data/MSP/models/hubert-fixed-10epoch",
    # Two clips per forward pass; with 128 accumulation steps the effective
    # batch is 2 * 128 = 256 examples per optimizer step.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=128,
    # Evaluate and checkpoint once per epoch; the two strategies match, which
    # load_best_model_at_end below depends on.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10.0,
    fp16=True,
    # NOTE(review): an epoch is only ~116 optimizer steps in the recorded run, so logging every
    # 500 steps leaves the first epochs without a training-loss entry — consider a smaller value.
    logging_steps=500,
    learning_rate=1e-4,
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
    # NOTE(review): presumably disabled because the data collator already returns tensors
    # on the training device — confirm before re-enabling.
    dataloader_pin_memory=False,
    load_best_model_at_end=True
)

In [29]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    """Trainer whose training step spells out the mixed-precision forward and backward pass."""

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Run forward and backward for one batch and return the detached loss.

        Args:
            model (:obj:`nn.Module`):
                The model to train; it is switched to training mode first.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model, prepared with ``_prepare_inputs``
                before being handed to ``compute_loss``.

        Return:
            :obj:`torch.Tensor`: The training loss on this batch, already divided by the number
            of gradient accumulation steps when that number is greater than one.
        """
        model.train()
        batch = self._prepare_inputs(inputs)

        # Forward pass, under autocast when native CUDA AMP is active.
        amp_enabled = self.use_cuda_amp
        if amp_enabled:
            with autocast():
                step_loss = self.compute_loss(model, batch)
        else:
            step_loss = self.compute_loss(model, batch)

        # Scale so that gradients accumulated over several steps average rather than sum.
        accumulation_steps = self.args.gradient_accumulation_steps
        if accumulation_steps > 1:
            step_loss = step_loss / accumulation_steps

        # Backward pass through whichever backend is active; the order of checks matters.
        if amp_enabled:
            self.scaler.scale(step_loss).backward()
        elif self.use_apex:
            with amp.scale_loss(step_loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(step_loss)
        else:
            step_loss.backward()

        return step_loss.detach()


In [30]:
# Show which device was selected at the top of the notebook.
device

'cuda'

In [31]:
# Move the model to the selected device before the trainer is built.
model = model.to(device)

In [32]:
# Confirm where the model's parameters now live.
model.device

device(type='cuda', index=0)

In [33]:
# Wire the model, collator, hyper-parameters, metrics and both datasets into the custom trainer.
trainer = CTCTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    # Used for the per-epoch evaluation configured in training_args.
    eval_dataset=val_dataset,
)

Using cuda_amp half precision backend


In [34]:
# Shell escape: show GPU state and memory use before training starts.
!nvidia-smi

Tue Dec 13 23:47:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.86       Driver Version: 470.86       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:09:00.0 Off |                  Off |
| 30%   55C    P2    86W / 300W |   2424MiB / 48685MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [35]:
# Release cached, currently unused GPU memory held by PyTorch before the long training run.
torch.cuda.empty_cache()

In [36]:
import gc
# Force a garbage-collection pass before training; the cell displays the value collect() returns.
gc.collect()

0

In [37]:
# Run the full training loop; evaluation and checkpointing follow training_args.
# NOTE(review): in the recorded run the validation loss is lowest after the second epoch and
# rises afterwards while the training loss keeps falling — consider fewer epochs or early stopping.
trainer.train()

***** Running training *****
  Num examples = 29939
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 128
  Total optimization steps = 1160
  Number of trainable parameters = 311492229
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrosavitiello[0m ([33mvarf[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,1.101999,0.608479,0.553132
1,No log,1.052427,0.610205,0.56455
2,No log,1.085197,0.595818,0.56063
3,No log,1.103768,0.601573,0.565459
4,0.892200,1.195656,0.593708,0.572563
5,0.892200,1.212851,0.596585,0.574693
6,0.892200,1.274971,0.586802,0.56441
7,0.892200,1.330421,0.584884,0.568398
8,0.643500,1.388777,0.580664,0.562334
9,0.643500,1.435218,0.576635,0.559776


***** Running Evaluation *****
  Num examples = 5213
  Batch size = 2
Saving model checkpoint to /data/NO-BACKUP/rvitiell-data/MSP/models/hubert-fixed-10epoch/checkpoint-116
Configuration saved in /data/NO-BACKUP/rvitiell-data/MSP/models/hubert-fixed-10epoch/checkpoint-116/config.json
Model weights saved in /data/NO-BACKUP/rvitiell-data/MSP/models/hubert-fixed-10epoch/checkpoint-116/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5213
  Batch size = 2
Saving model checkpoint to /data/NO-BACKUP/rvitiell-data/MSP/models/hubert-fixed-10epoch/checkpoint-232
Configuration saved in /data/NO-BACKUP/rvitiell-data/MSP/models/hubert-fixed-10epoch/checkpoint-232/config.json
Model weights saved in /data/NO-BACKUP/rvitiell-data/MSP/models/hubert-fixed-10epoch/checkpoint-232/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5213
  Batch size = 2
Saving model checkpoint to /data/NO-BACKUP/rvitiell-data/MSP/models/hubert-fixed-10epoch/checkpoint-348
Configuration save

TrainOutput(global_step=1160, training_loss=0.7385576905875371, metrics={'train_runtime': 52515.7214, 'train_samples_per_second': 5.701, 'train_steps_per_second': 0.022, 'total_flos': 6.367269806597044e+19, 'train_loss': 0.7385576905875371, 'epoch': 9.99})