In [12]:
import os
from deep_utils import warmup_cosine, dump_pickle, load_pickle
from datasets import load_dataset, Audio
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [13]:
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
feature_extractor

loading feature extractor configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/preprocessor_config.json from cache at /home/ai/.cache/huggingface/transformers/d4583dd9e59eb6295f8fe8b18833ae54d963a122d69aa1df7ecce6caafe18c8f.bc3155ca0bae3a39fc37fc6d64829c6a765f46480894658bb21c08db6155358d
loading configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/config.json from cache at /home/ai/.cache/huggingface/transformers/c7746642f045322fd01afa31271dd490e677ea11999e68660a92619ec7c892b4.ce1f96bfaf3d7475cb8187b9668c7f19437ade45fb9ceb78d2b06a2cec198015
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature"

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [14]:
train_path = "../data/train_gender.csv"
test_path = '../data/test_gender.csv'
dataset = load_dataset('csv', data_files={'train': train_path,
                                          'test': test_path})
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16_000))
dataset["train"][0]

Using custom data configuration default-7d492a43a82ad139


Downloading and preparing dataset csv/default to /home/ai/.cache/huggingface/datasets/csv/default-7d492a43a82ad139/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/ai/.cache/huggingface/datasets/csv/default-7d492a43a82ad139/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

{'audio_path': {'path': '/home/ai/projects/speech/dataset/asr/new-raw-dataset/samples_02/samples_02_02/wav_files/0920586_020_00_S00_female.wav',
  'array': array([   -0.17177,    -0.17367,    -0.17571, ...,      -0.188,    -0.18659,    -0.18542], dtype=float32),
  'sampling_rate': 16000},
 'label': 'female'}

In [15]:
import random
import IPython.display as ipd
import librosa
index = random.randint(0, len(dataset['train']))

path = dataset['train'][index]['audio_path']['path']
waveform, sr = librosa.load(path)
text = dataset['train'][index]['label']
print(text)
ipd.Audio(waveform, rate=sr, autoplay=True)

female


In [16]:
labels = set(dataset["train"]['label'])
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label
label2id

{'male': '0', 'female': '1'}

In [17]:
# save label2id to be used in test
os.makedirs("results/best", exist_ok=True)
dump_pickle("results/best/label2id.pkl", label2id)

In [18]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio_path"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
    )
    label = [int(label2id[x]) for x in examples["label"]]
    inputs["label"] = label
    return inputs

In [19]:
encoded_dataset = dataset.map(preprocess_function, remove_columns="audio_path", batched=True)
encoded_dataset['train'][0]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

{'label': 1,
 'input_values': [-0.25010964274406433,
  -0.28442907333374023,
  -0.321246474981308,
  -0.3433060646057129,
  -0.3382202982902527,
  -0.3138812780380249,
  -0.29707086086273193,
  -0.2925068736076355,
  -0.30026775598526,
  -0.3072969317436218,
  -0.2893325686454773,
  -0.25228801369667053,
  -0.21132244169712067,
  -0.1906547099351883,
  -0.21518272161483765,
  -0.2713311016559601,
  -0.33406245708465576,
  -0.3699530363082886,
  -0.34849798679351807,
  -0.2876116931438446,
  -0.22710171341896057,
  -0.2093534618616104,
  -0.2511386275291443,
  -0.3079020082950592,
  -0.33404070138931274,
  -0.31863585114479065,
  -0.2855857312679291,
  -0.2845032513141632,
  -0.3220859467983246,
  -0.3657554090023041,
  -0.38415685296058655,
  -0.3538425862789154,
  -0.29282188415527344,
  -0.2342064529657364,
  -0.19557541608810425,
  -0.19363492727279663,
  -0.21697455644607544,
  -0.2430688738822937,
  -0.2620891034603119,
  -0.2634586691856384,
  -0.262210875749588,
  -0.27603918313

In [20]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    recall = recall_score(labels, predictions, average="weighted")
    precision = precision_score(labels, predictions, average="weighted")

    return {"accuracy": acc, "f1-score": f1, "recall-score": recall, "precision-score": precision}

## TRAIN

In [21]:
import math
import torch
from transformers import EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

train_bs = 64 
epochs = 25
lr = 5e-5
lrf = lr
output_dir = "./results"
total_steps = int((np.ceil(encoded_dataset["train"].num_rows / train_bs) * epochs))

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=epochs,
    report_to="tensorboard",
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model='loss',
    per_device_train_batch_size = train_bs,
    per_device_eval_batch_size = 64,
    logging_steps=1,
)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, warmup_cosine(100,
                                                                       max_lr=lr,
                                                                       total_steps=total_steps,
                                                                       optimizer_lr=lr,
                                                                       min_lr=1e-6))
# reduce lr with a cosine annealing if total_steps is set to total_steps
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

trainer.train()
trainer.save_model(os.path.join(output_dir, "best"))

loading configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/config.json from cache at /home/ai/.cache/huggingface/transformers/c7746642f045322fd01afa31271dd490e677ea11999e68660a92619ec7c892b4.ce1f96bfaf3d7475cb8187b9668c7f19437ade45fb9ceb78d2b06a2cec198015
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": fal

Epoch,Training Loss,Validation Loss,Accuracy,F1-score,Recall-score,Precision-score
1,0.1086,0.564826,0.774464,0.734246,0.774464,0.831736
2,0.0338,0.132264,0.962721,0.96282,0.962721,0.963035
3,1.4245,0.107166,0.970177,0.970051,0.970177,0.970198
4,0.0182,0.121566,0.966449,0.966356,0.966449,0.966382
5,0.0411,0.095286,0.974837,0.974758,0.974837,0.974833
6,0.0498,0.106665,0.961789,0.961278,0.961789,0.963152
7,0.0085,0.146449,0.960857,0.96035,0.960857,0.962087
8,0.0114,0.10594,0.972973,0.972808,0.972973,0.973198
9,0.0073,0.169767,0.959925,0.959389,0.959925,0.961232
10,0.0071,0.148842,0.968313,0.968033,0.968313,0.968865


***** Running Evaluation *****
  Num examples = 1073
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-68
Configuration saved in ./results/checkpoint-68/config.json
Model weights saved in ./results/checkpoint-68/pytorch_model.bin
Feature extractor saved in ./results/checkpoint-68/preprocessor_config.json
Deleting older checkpoint [results/checkpoint-184] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1073
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-136
Configuration saved in ./results/checkpoint-136/config.json
Model weights saved in ./results/checkpoint-136/pytorch_model.bin
Feature extractor saved in ./results/checkpoint-136/preprocessor_config.json
Deleting older checkpoint [results/checkpoint-575] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1073
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-204
Configuration saved in ./results/checkpoint-204/config.json
Model wei

KeyboardInterrupt: 

## TEST

In [11]:
import torchaudio
import torch
import librosa
device = "cpu"
model = model.to(device)
waveform, sr = librosa.load("../audio_samples/man_02.mp4")
waveform = torch.from_numpy(waveform).unsqueeze(0)
waveform = torchaudio.transforms.Resample(sr, 16_000)(waveform)
inputs = feature_extractor(waveform, sampling_rate=feature_extractor.sampling_rate,
                           max_length=16000, truncation=True)
tensor = torch.tensor(inputs['input_values'][0]).to(device)
with torch.no_grad():
    output = model(tensor)
    logits = output['logits'][0]
    label_id = torch.argmax(logits).item()
label_name = id2label[str(label_id)]
print(label_name)

male
