# Speaker-Identification

In [1]:
# Importing the libraries
import os
import pandas as pd
import re
import librosa
from datasets import Dataset

# Loading the dataset

In [2]:
# Corpus root (relative to this notebook) and the name of the spreadsheet inside it
# that holds the transcripts and per-clip metadata.
data_directory = "../Badaga_Corpus-v.0.1.0/"
tagged_file = "Badaga-v0.1.0.xlsx"

In [3]:
# Full path of the transcription spreadsheet.
tagged_file_path = os.path.join(data_directory, tagged_file)

# Load the transcription table and drop every row that has a missing value.
# dropna() returns a new frame instead of mutating in place, which keeps this
# cell idempotent on re-run.
data_frame = pd.read_excel(tagged_file_path).dropna()

# Turn the bare clip names into paths under <data_directory>/clips/.
data_frame["audio_file_name"] = data_frame["audio_file_name"].apply(
    lambda x: os.path.join(data_directory, "clips", x)
)

# Split on the "split_label" column: every row not tagged "test" is training data.
# .copy() makes each split an independent frame, so columns can be added to it
# later without triggering pandas' SettingWithCopyWarning.
train_df = data_frame[data_frame["split_label"] != "test"].copy()
test_df = data_frame[data_frame["split_label"] == "test"].copy()

In [4]:
# Preview the loaded table (the rendered output elides the middle rows).
data_frame

Unnamed: 0,items,translated_transcript,audio_file_name,translterated_script,user_id,gender,locale,split_label,duration
0,1.0,what is the recipe in home,../Badaga_Corpus-v.0.1.0/clips/F002_1_1.mp3,manaya aena udhaka,F002,F,ba,train,2.377187
1,2.0,who is there near you,../Badaga_Corpus-v.0.1.0/clips/F002_1_2.mp3,pakka dhara edhdharae,F002,F,ba,train,2.377187
2,3.0,what did you prepare for lunch,../Badaga_Corpus-v.0.1.0/clips/F002_1_3.mp3,hagulu hasuga aena maditha,F002,F,ba,train,2.351062
3,4.0,did you brush,../Badaga_Corpus-v.0.1.0/clips/F002_1_4.mp3,hallu ujjithaya,F002,F,ba,train,2.194313
4,5.0,did you eat,../Badaga_Corpus-v.0.1.0/clips/F002_1_5.mp3,nee thindhubutaya,F002,F,ba,train,2.272687
...,...,...,...,...,...,...,...,...,...
9832,595.0,what is price for gold,../Badaga_Corpus-v.0.1.0/clips/F004_595.mp3,chinna baellae aesaga vario,F004,F,ba,test,1.776375
9833,596.0,buffalo will be so black in color,../Badaga_Corpus-v.0.1.0/clips/F004_596.mp3,yaemmae appara kappa attra,F004,F,ba,test,1.907000
9834,597.0,today is my birthday,../Badaga_Corpus-v.0.1.0/clips/F004_597.mp3,endhdhu aenna utti jaenna,F004,F,ba,train,1.541250
9835,598.0,we are coming tomorrow.,../Badaga_Corpus-v.0.1.0/clips/F004_598.mp3,enga naaiga bannaeyo,F004,F,ba,train,1.724125


In [None]:
# Keep only the clip path and the speaker id, renamed to the "path" / "label"
# columns used by the rest of the notebook.
# Selecting and renaming produces new frames, so nothing is assigned onto a
# filtered slice of data_frame (which raises pandas' SettingWithCopyWarning),
# and the values and row order are unchanged.
_path_label_columns = {"audio_file_name": "path", "user_id": "label"}

train_df = train_df[list(_path_label_columns)].rename(columns=_path_label_columns)
test_df = test_df[list(_path_label_columns)].rename(columns=_path_label_columns)

In [6]:
# (rows, columns) of the training and test splits
train_df.shape, test_df.shape

((8365, 2), (1469, 2))

In [7]:
# Preview the training split; it now holds only the "path" and "label" columns.
train_df

Unnamed: 0,path,label
0,../Badaga_Corpus-v.0.1.0/clips/F002_1_1.mp3,F002
1,../Badaga_Corpus-v.0.1.0/clips/F002_1_2.mp3,F002
2,../Badaga_Corpus-v.0.1.0/clips/F002_1_3.mp3,F002
3,../Badaga_Corpus-v.0.1.0/clips/F002_1_4.mp3,F002
4,../Badaga_Corpus-v.0.1.0/clips/F002_1_5.mp3,F002
...,...,...
9830,../Badaga_Corpus-v.0.1.0/clips/F004_593.mp3,F004
9831,../Badaga_Corpus-v.0.1.0/clips/F004_594.mp3,F004
9834,../Badaga_Corpus-v.0.1.0/clips/F004_597.mp3,F004
9835,../Badaga_Corpus-v.0.1.0/clips/F004_598.mp3,F004


In [8]:
# Renumber the rows of each split from 0.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Write both splits as tab-separated files under files/.
# The directory is created first so a fresh checkout does not fail, and the test
# split goes to files/speaker_test.csv -- the path the dataset-loading cell reads.
# (The previous "filesspeaker_test.csv" was missing the path separator.)
os.makedirs("files", exist_ok=True)
train_df.to_csv("files/speaker_train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv("files/speaker_test.csv", sep="\t", encoding="utf-8", index=False)

In [9]:
# Names of the dataset columns holding the model input (audio file path) and the
# target (speaker id); the preprocessing and label-listing cells read these.
input_column = "path"
output_column = "label"

In [10]:
# Read the two tab-separated files back through the `datasets` CSV loader,
# exposing them as the "train" and "validation" splits.
from datasets import load_dataset, load_metric

data_files = {"train": "files/speaker_train.csv", "validation": "files/speaker_test.csv"}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

# One summary per split, each on its own line.
print(train_dataset, eval_dataset, sep="\n")

Using custom data configuration default-2409684a97212705


Downloading and preparing dataset csv/default to /home/ubuntu/.cache/huggingface/datasets/csv/default-2409684a97212705/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/csv/default-2409684a97212705/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['path', 'label'],
    num_rows: 8365
})
Dataset({
    features: ['path', 'label'],
    num_rows: 1469
})


In [11]:
# Distinct speaker ids found in the training split; sorting keeps the
# label -> index mapping deterministic across runs.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 11 classes: ['F001', 'F002', 'F003', 'F004', 'F005', 'F006', 'M001', 'M002', 'M003', 'M004', 'M005']


# Feature Extraction

In [12]:
# loading the feature extractor and processor from the transformers

from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor

# Loading the Wav2Vec Model

In [13]:
# Pretrained checkpoint to fine-tune, and how the classifier collapses the
# per-frame hidden states into one vector ("mean", "sum" or "max" are the modes
# accepted by Wav2Vec2ForSpeechClassification.merged_strategy).
model_name_or_path = "facebook/wav2vec2-large-xlsr-53"
pooling_mode = "mean"

In [14]:
# Model configuration: the pretrained settings plus the speaker label mappings.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={name: idx for idx, name in enumerate(label_list)},
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
)
# The classifier reads its pooling strategy from the config object.
config.pooling_mode = pooling_mode

In [15]:
# Load the feature extractor that ships with the pretrained checkpoint and record
# the sampling rate it reports; preprocessing passes this rate to the extractor.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path,)
target_sampling_rate = feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


In [16]:
# loading the audio data using librosa
import librosa
def speech_file_to_array_fn(path):
    """Load one audio file as a waveform at the feature extractor's sampling rate.

    Args:
        path: Path of an audio file that librosa can decode.

    Returns:
        The waveform array returned by ``librosa.load``.
    """
    # Resample to the rate the feature extractor reports (target_sampling_rate)
    # instead of a hard-coded 16000, so loading and feature extraction cannot
    # drift apart if a checkpoint with a different rate is used.
    waveform, _ = librosa.load(path, sr=target_sampling_rate)
    return waveform

def label_to_id(label, label_list):
    """Translate a label into its position within ``label_list``.

    Args:
        label: The label value to look up.
        label_list: Ordered list of known labels.

    Returns:
        The index of ``label`` in ``label_list``; ``-1`` when the label is not
        present; the label itself, unchanged, when ``label_list`` is empty.
    """
    # With no known labels there is nothing to map against: pass the value through.
    if len(label_list) == 0:
        return label

    try:
        return label_list.index(label)
    except ValueError:
        # Unknown label.
        return -1

# function for pre-processing 
def preprocess_function(examples):
    """Convert a batch of (path, label) rows into model inputs.

    Args:
        examples: A batch mapping column names to lists; the ``input_column``
            entry holds audio file paths and the ``output_column`` entry holds
            the matching labels.

    Returns:
        The feature extractor's output for the loaded waveforms, with an extra
        ``"labels"`` entry holding the integer id of each label.
    """
    # Decode every clip in the batch, then map each label to its index in label_list.
    waveforms = list(map(speech_file_to_array_fn, examples[input_column]))
    class_ids = [label_to_id(name, label_list) for name in examples[output_column]]

    encoded = feature_extractor(waveforms, sampling_rate=target_sampling_rate)
    encoded["labels"] = class_ids[:]

    return encoded

In [None]:
# Run the preprocessing over the train and test splits, in batches of 100 rows
# spread across 4 worker processes.
import torchaudio
train_dataset = train_dataset.map(preprocess_function, batched=True, batch_size=100, num_proc=4)
eval_dataset = eval_dataset.map(preprocess_function, batched=True, batch_size=100, num_proc=4)

In [18]:
# defining the dataclass for speech classifier
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Output container returned by the speech classifier's forward pass.

    Every field defaults to ``None``.
    """

    # Loss value; the classifier's forward leaves it as None when no labels are supplied.
    loss: Optional[torch.FloatTensor] = None
    # Classification head output. NOTE(review): annotated as non-Optional although the default is None.
    logits: torch.FloatTensor = None
    # Passed through from the wav2vec2 encoder output's `hidden_states` attribute.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Passed through from the wav2vec2 encoder output's `attentions` attribute.
    attentions: Optional[Tuple[torch.FloatTensor]] = None


# Fine-Tuning Wav2Vec Pretrained Model for Speaker-Identification

In [19]:
# defining the classifier class
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Classification head: dropout -> linear -> tanh -> dropout -> linear."""

    def __init__(self, config):
        """Build the layers from ``config``.

        Args:
            config: Object exposing ``hidden_size`` (width of the input and of
                the intermediate layer), ``final_dropout`` (dropout probability)
                and ``num_labels`` (width of the output).
        """
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Map input features to one score per label.

        Args:
            features: Tensor whose last dimension is ``config.hidden_size``.
            **kwargs: Accepted and ignored.

        Returns:
            Tensor whose last dimension is ``config.num_labels``.
        """
        # The same dropout module is applied before each of the two linear layers.
        activated = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(activated))


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 encoder followed by pooling and a classification head.

    The forward pass runs ``Wav2Vec2Model``, reduces its first output along
    dimension 1 with the configured pooling mode, and feeds the result to
    ``Wav2Vec2ClassificationHead``. When labels are given, a loss is computed
    according to ``config.problem_type``.
    """

    def __init__(self, config):
        """Create the encoder and the head from ``config``.

        Args:
            config: Model configuration; must expose ``num_labels`` and
                ``pooling_mode`` in addition to whatever ``Wav2Vec2Model`` and
                ``Wav2Vec2ClassificationHead`` read from it.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Call ``_freeze_parameters()`` on the encoder's ``feature_extractor`` submodule.

        Presumably this stops that submodule from being updated during
        training -- confirm against the transformers implementation.
        """
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Reduce ``hidden_states`` along dimension 1.

        Args:
            hidden_states: Tensor to pool; assumes dimension 1 is the frame/time
                axis -- TODO confirm.
            mode: One of ``"mean"``, ``"sum"`` or ``"max"``.

        Returns:
            The pooled tensor (dimension 1 removed).

        Raises:
            Exception: If ``mode`` is not one of the supported values.
        """
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            # torch.max with dim returns (values, indices); keep the values only.
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Encode, pool, classify and optionally compute a loss.

        Args:
            input_values: Passed to the wav2vec2 encoder as its first argument.
            attention_mask: Forwarded to the encoder unchanged.
            output_attentions: Forwarded to the encoder unchanged.
            output_hidden_states: Forwarded to the encoder unchanged.
            return_dict: Whether to return a ``SpeechClassifierOutput``; when
                ``None``, ``config.use_return_dict`` decides.
            labels: Optional targets; when given, a loss is computed.

        Returns:
            A ``SpeechClassifierOutput`` when ``return_dict`` is truthy, otherwise
            a tuple ``(loss?, logits, *encoder_outputs[2:])`` where the loss is
            present only if labels were given.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # First element of the encoder output -- presumably the last hidden state; TODO confirm.
        hidden_states = outputs[0]
        # NOTE(review): pooling receives only the hidden states, so attention_mask
        # plays no part in it; any padded frames are pooled along with real ones.
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # Infer the problem type once from num_labels and the label dtype, and
            # store it on the config so later calls reuse the same choice.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Tuple output: logits followed by the encoder outputs from index 2 on.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [20]:
# defining function for datacollator and padding
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor


@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate preprocessed examples into a single padded batch of tensors.

    Each example must provide ``"input_values"`` and ``"labels"``. The input
    values are padded through ``feature_extractor.pad`` and returned as PyTorch
    tensors; the labels are stacked into one tensor.

    Args:
        feature_extractor (:class:`~transformers.Wav2Vec2FeatureExtractor`)
            The feature extractor whose ``pad`` method builds the batch.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Forwarded to ``feature_extractor.pad`` as ``padding``.
        max_length (:obj:`int`, `optional`):
            Forwarded to ``feature_extractor.pad`` as ``max_length``.
        max_length_labels (:obj:`int`, `optional`):
            Stored on the collator; not read by ``__call__``.
        pad_to_multiple_of (:obj:`int`, `optional`):
            Forwarded to ``feature_extractor.pad`` as ``pad_to_multiple_of``.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Stored on the collator; not read by ``__call__``.
    """

    feature_extractor: Wav2Vec2FeatureExtractor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build one batch from a list of examples.

        Args:
            features: Examples, each a mapping with ``"input_values"`` and ``"labels"``.

        Returns:
            The padded batch from ``feature_extractor.pad`` with a ``"labels"``
            tensor added: ``torch.long`` when the first label is a Python
            ``int``, ``torch.float`` otherwise.
        """
        audio_inputs = [{"input_values": example["input_values"]} for example in features]
        targets = [example["labels"] for example in features]

        # The first label decides the dtype of the whole label tensor.
        if isinstance(targets[0], int):
            label_dtype = torch.long
        else:
            label_dtype = torch.float

        batch = self.feature_extractor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch["labels"] = torch.tensor(targets, dtype=label_dtype)

        return batch

In [21]:
# Speaker identification is a classification task, so the regression branch of
# compute_metrics stays disabled.
is_regression = False

In [22]:
# function for computing metrics for evaluation
import numpy as np
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Score one evaluation pass.

    Args:
        p: Evaluation result exposing ``predictions`` (an array, or a tuple whose
            first element is the array to score) and ``label_ids``.

    Returns:
        ``{"mse": float}`` when the module-level ``is_regression`` flag is set,
        otherwise ``{"accuracy": float}``.
    """
    raw_outputs = p.predictions
    if isinstance(raw_outputs, tuple):
        raw_outputs = raw_outputs[0]

    if is_regression:
        predicted_values = np.squeeze(raw_outputs)
        squared_error = (predicted_values - p.label_ids) ** 2
        return {"mse": squared_error.mean().item()}

    # Classification: the predicted class is the arg-max along axis 1.
    predicted_ids = np.argmax(raw_outputs, axis=1)
    hits = (predicted_ids == p.label_ids).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [23]:
# Instantiate the classifier from the pretrained checkpoint using the config built above.
# The load log reports the classifier head weights as newly initialized.
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForSpeechClassification: ['project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'project_q.bias', 'quantizer.weight_proj.weight', 'project_hid.weight', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['classifier.out_proj.bias', 'classifier.

In [24]:
# Freeze the encoder's feature_extractor submodule before training
# (see Wav2Vec2ForSpeechClassification.freeze_feature_extractor).
model.freeze_feature_extractor()

In [25]:
# Training configuration for the fine-tuning run.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="wav2vec2-rbg-badaga-speaker",  # checkpoints are written under this directory
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # the training log reports a total train batch size of 8
    evaluation_strategy="steps",
    num_train_epochs=1.0,
    fp16=True,
    # Checkpointing, evaluation and logging all fire every 10 steps.
    # NOTE(review): the log shows each evaluation covering all 1469 eval examples,
    # so this cadence is costly; consider larger values.
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-4,
    save_total_limit=2,  # the log shows older checkpoints being deleted because of this limit
)

In [26]:
# setting up the trainer function
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    """Trainer with an explicit training step covering the AMP, apex and deepspeed paths."""

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Run the forward and backward pass for one batch.

        Args:
            model (:obj:`nn.Module`):
                The model being trained; it is switched to train mode first.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The batch, passed through ``self._prepare_inputs`` and then handed to
                ``self.compute_loss`` together with the model.

        Return:
            :obj:`torch.Tensor`: The detached loss for this batch, already divided by
            ``gradient_accumulation_steps`` when that value is greater than 1.
        """
        model.train()
        batch = self._prepare_inputs(inputs)

        # Forward pass, under autocast only when native AMP is enabled.
        if not self.use_amp:
            loss = self.compute_loss(model, batch)
        else:
            with autocast():
                loss = self.compute_loss(model, batch)

        accumulation_steps = self.args.gradient_accumulation_steps
        if accumulation_steps > 1:
            loss = loss / accumulation_steps

        # Backward pass through whichever backend is active.
        if self.use_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()


In [27]:
# Collator that pads each batch's input_values through the feature extractor.
data_collator = DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True)

In [28]:
# Assemble the trainer from the model, collator, training arguments, metric
# function and the two dataset splits.
trainer = CTCTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # The feature extractor is passed in the tokenizer slot; the training log shows
    # a preprocessor_config.json being saved with every checkpoint.
    tokenizer=feature_extractor,
)

Using amp half precision backend


In [29]:
# Fine-tune the classifier.
trainer.train()

# Persist the final model to training_args.output_dir. The reload cell further
# down reads from "wav2vec2-rbg-badaga-speaker/" itself rather than from a
# checkpoint-N subfolder, and no other cell writes there, so without this call a
# fresh Restart & Run All cannot reload the model.
trainer.save_model()

The following columns in the training set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running training *****
  Num examples = 8365
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 1046
  return (input_length - kernel_size) // stride + 1


Step,Training Loss,Validation Loss,Accuracy
10,2.3638,2.365155,0.142954
20,2.3647,2.350315,0.150442
30,2.261,2.285996,0.197413
40,2.2664,2.258502,0.228046
50,2.2313,2.166689,0.33288
60,2.2044,2.043112,0.430905
70,2.0401,1.818119,0.479238
80,1.8275,1.754661,0.394146
90,1.6694,1.297313,0.687543
100,1.4839,1.202904,0.654187


The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-10
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-10/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-10/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-10/preprocessor_config.json
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-20
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-20/config.json
Model weights saved in wav2vec2-r

***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-80
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-80/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-80/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-80/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-60] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-90
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-90/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-90/pytorch_model.bin
Config

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-150
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-150/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-150/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-150/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-130] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-160
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-220
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-220/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-220/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-220/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-200] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-230
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-290
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-290/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-290/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-290/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-270] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-300
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-360
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-360/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-360/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-360/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-340] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-370
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-430
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-430/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-430/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-430/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-410] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-440
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-500
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-500/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-500/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-500/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-480] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-510
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-570
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-570/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-570/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-570/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-550] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-580
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-640
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-640/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-640/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-640/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-620] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-650
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-710
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-710/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-710/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-710/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-690] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-720
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-780
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-780/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-780/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-780/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-760] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-790
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-850
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-850/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-850/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-850/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-830] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-860
Confi

The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-920
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-920/config.json
Model weights saved in wav2vec2-rbg-badaga-speaker/checkpoint-920/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-speaker/checkpoint-920/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-speaker/checkpoint-900] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path.
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-speaker/checkpoint-930
Confi

KeyboardInterrupt: 

In [31]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [32]:

# Reload the config, feature extractor and fine-tuned weights from the training
# output directory and move the model to the selected device.
# NOTE(review): this rebinds model_name_or_path, config, feature_extractor and
# model, which earlier cells defined for the pretrained checkpoint; re-running
# those earlier cells afterwards will pick up the new values.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_or_path = "wav2vec2-rbg-badaga-speaker/"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)

loading configuration file wav2vec2-rbg-badaga-speaker/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "wav2vec2-rbg-badaga-speaker/",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSpeechClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_