# Model Training

In [9]:
from huggingface_hub import notebook_login

# notebook_login()

In [22]:
from datasets import load_dataset, Audio
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import pipeline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# Load every audio file found under ../data/Audios/ as a single "train" split,
# ignoring any metadata files.
dataset = load_dataset(
    "audiofolder",
    data_dir="../data/Audios/",
    drop_metadata=True,
    split="train",
)

Resolving data files:   0%|          | 0/1500 [00:00<?, ?it/s]

Found cached dataset audiofolder (/home/ramonperez/.cache/huggingface/datasets/audiofolder/default-3b4b1d5ee650a44b/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


In [24]:
# Display the loaded dataset's summary (feature names and row count).
dataset

Dataset({
    features: ['audio', 'label'],
    num_rows: 1500
})

In [25]:
# Inspect the label feature to see the class names attached to this dataset.
dataset.features["label"]

ClassLabel(names=['Bachata', 'Cumbia', 'Merengue', 'Salsa', 'Vallenato'], id=None)

In [26]:
# Hold out 20% of the examples for evaluation. A fixed seed makes the split
# reproducible across kernel restarts, and stratifying on the ClassLabel column
# keeps the class proportions the same in both partitions.
dataset = dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column="label")

In [27]:
# Display the split dataset (train/test partitions and their sizes).
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 300
    })
})

In [28]:
# Peek at one training example: the decoded audio (path, array, sampling_rate) and its integer label.
dataset["train"][0]

{'audio': {'path': '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Merengue/merengue0018.mp3',
  'array': array([0.        , 0.        , 0.        , ..., 0.37584227, 0.22729513,
         0.153633  ], dtype=float32),
  'sampling_rate': 44100},
 'label': 2}

In [29]:
# Build the two lookup tables handed to the model later on:
# class name -> id and id -> class name, with ids stored as strings.
labels = dataset["train"].features["label"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [30]:
# Sanity check: look up the class name stored under the string key "2".
id2label[str(2)]

'Merengue'

## Extract Features

In [31]:
# Load the feature extractor published with the facebook/wav2vec2-base checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")



In [23]:
# Re-declare the audio column with a 16 kHz sampling rate, then re-inspect the
# first training example to confirm the new rate.
# NOTE: this rebinds `dataset`, so every later cell sees the re-cast version.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
dataset["train"][0]

{'audio': {'path': '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0270.mp3',
  'array': array([-8.9345886e-10, -4.4023729e-10,  1.6287136e-09, ...,
         -3.2076925e-01, -3.9201927e-01,  0.0000000e+00], dtype=float32),
  'sampling_rate': 16000},
 'label': 0}

In [24]:
def preprocess_function(examples, max_length=16_000):
    """Convert a batch of decoded audio into model inputs.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``; its
            "audio" entry is a list of dicts, each holding an "array" waveform.
        max_length: Maximum number of samples kept per clip; longer clips are
            truncated. The default of 16,000 corresponds to one second of
            audio when the sampling rate is 16 kHz.

    Returns:
        Whatever the module-level ``feature_extractor`` returns for the batch.
    """
    audio_arrays = [x["array"] for x in examples["audio"]]
    return feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=max_length,
        truncation=True,
    )

In [25]:
%%time

# Apply preprocess_function to every split in batches and drop the raw "audio"
# column from the result.
encoded_latin = dataset.map(preprocess_function, remove_columns="audio", batched=True)

                                                                                                                                                                          

CPU times: user 1min 36s, sys: 2.73 s, total: 1min 39s
Wall time: 1min 39s




In [29]:
# Display the encoded splits (column names and row counts).
encoded_latin

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 300
    })
})

In [33]:
# Inspect the feature type of the "input_values" column.
# NOTE(review): the TypeError recorded under this cell looks stale — this
# expression does not subscript the returned feature; re-run to refresh the output.
encoded_latin["train"].features["input_values"]

TypeError: 'Sequence' object is not subscriptable

In [26]:
# Load the "accuracy" metric through the evaluate library; compute_metrics uses it.
accuracy = evaluate.load("accuracy")

Downloading builder script: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4.20k/4.20k [00:00<00:00, 1.28MB/s]


In [27]:
def compute_metrics(eval_pred):
    """Score one round of evaluation output with the module-level ``accuracy`` metric.

    ``eval_pred.predictions`` holds one row of per-class scores per example;
    the highest-scoring column of each row is taken as the predicted class id
    and compared against ``eval_pred.label_ids``.
    """
    predicted_ids = np.asarray(eval_pred.predictions).argmax(axis=1)
    return accuracy.compute(references=eval_pred.label_ids, predictions=predicted_ids)

## Train Model

In [35]:
# Load the pretrained wav2vec2-base checkpoint as an audio classifier sized to
# our label set, and store both label maps in the model config.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['project_q.bias', 'quantizer.weight_proj.bias', 'quantizer.codevectors', 'project_hid.bias', 'project_q.weight', 'quantizer.weight_proj.weight', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.bias', 'classifier.bias', 'classifier.w

In [37]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best "accuracy" when training finishes.
training_args = TrainingArguments(
    output_dir="../models",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,  # 32 * 4 = 128 examples per optimizer step, per device
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # presumably the key produced by compute_metrics — confirm
    # push_to_hub=True,
)

In [38]:
# Wire the model, training arguments, encoded splits and metric function into a
# Trainer. The held-out "test" split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_latin["train"],
    eval_dataset=encoded_latin["test"],
    tokenizer=feature_extractor,  # the feature extractor is handed over via the `tokenizer` argument
    compute_metrics=compute_metrics,
)

In [39]:
%%time

# Run fine-tuning with the Trainer configured earlier in the notebook.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
0,No log,1.598464,0.313333
2,1.605300,1.558419,0.403333
2,1.566500,1.44888,0.526667
4,1.474400,1.395545,0.463333
4,1.365400,1.308152,0.55
6,1.280900,1.249626,0.576667
6,1.217100,1.205484,0.57
8,1.136100,1.156103,0.593333
8,1.123800,1.173428,0.573333
9,1.060100,1.175543,0.566667


CPU times: user 5min 30s, sys: 17.5 s, total: 5min 48s
Wall time: 4min 54s


TrainOutput(global_step=90, training_loss=1.3144165992736816, metrics={'train_runtime': 294.4244, 'train_samples_per_second': 40.757, 'train_steps_per_second': 0.306, 'total_flos': 1.03279366918656e+17, 'train_loss': 1.3144165992736816, 'epoch': 9.47})

## Test Model

In [4]:
# Reload the whole audio folder (unsplit) and resample it to 16 kHz for manual
# inference checks.
# NOTE(review): this is the full dataset, so the examples used here may have
# been part of the training split — confirm this is acceptable for a smoke test.
dataset2 = load_dataset("audiofolder", data_dir="../data/Audios/", drop_metadata=True, split="train")
dataset2 = dataset2.cast_column("audio", Audio(sampling_rate=16000))
sampling_rate2 = dataset2.features["audio"].sampling_rate
print(sampling_rate2)
# Keep the file path of the first example; it is fed to the classifier pipeline.
audio_file2 = dataset2[0]["audio"]["path"]

Resolving data files:   0%|          | 0/1500 [00:00<?, ?it/s]

Found cached dataset audiofolder (/home/ramonperez/.cache/huggingface/datasets/audiofolder/default-3b4b1d5ee650a44b/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


16000


In [5]:
# Show the path of the audio file selected for the inference check.
audio_file2

'/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0000.mp3'

In [48]:
# Persist the fine-tuned model to the local "first_mod" directory so it can be
# reloaded by that name.
trainer.save_model("first_mod")

In [6]:
# Build an audio-classification pipeline from the saved "first_mod" directory.
classifier = pipeline("audio-classification", model="first_mod")

In [7]:
# Classify the selected file; the result lists a score for each label.
classifier(audio_file2)

[{'score': 0.37294140458106995, 'label': 'Bachata'},
 {'score': 0.3329851031303406, 'label': 'Vallenato'},
 {'score': 0.12372597306966782, 'label': 'Cumbia'},
 {'score': 0.0981556624174118, 'label': 'Merengue'},
 {'score': 0.07219197601079941, 'label': 'Salsa'}]

In [8]:
from IPython.display import Audio

In [21]:
Audio(dataset2[61]["audio"]["path"])

In [12]:
from transformers import AutoFeatureExtractor

# Reload the feature extractor from the saved "first_mod" directory
# (this rebinds `feature_extractor`).
feature_extractor = AutoFeatureExtractor.from_pretrained("first_mod")
# Encode the first example's waveform as PyTorch tensors.
# NOTE(review): no max_length/truncation is passed here, unlike in
# preprocess_function — confirm that the full-length input is intended.
inputs = feature_extractor(dataset2[0]["audio"]["array"], sampling_rate=sampling_rate2, return_tensors="pt")
inputs

{'input_values': tensor([[ 0.0003,  0.0003,  0.0003,  ...,  0.0866, -0.1515, -0.0898]])}

In [13]:
# Check the shape of the encoded input tensor.
inputs["input_values"].size()

torch.Size([1, 479835])

In [15]:
from transformers import AutoModelForAudioClassification
import torch


# Load the saved classifier and run a single forward pass with gradient
# tracking disabled, keeping only the raw class scores (logits).
model2 = AutoModelForAudioClassification.from_pretrained("first_mod")
with torch.no_grad():
    logits = model2(**inputs).logits
logits

tensor([[ 0.9065, -0.1545, -0.4021, -0.6993,  0.8430]])

In [16]:
# Take the arg-max along the class axis (last dimension) instead of over the
# flattened tensor, so the resulting index is always a valid class id.
predicted_class_ids = torch.argmax(logits, dim=-1).item()
# Translate the numeric class id back to its name via the saved model config.
predicted_label = model2.config.id2label[predicted_class_ids]
predicted_label

'Bachata'

In [36]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

In [37]:
# Create a client for a Qdrant instance at localhost, port 6333.
client = QdrantClient("localhost", port=6333)

In [None]:
# (Re)create "test_collection", configured for 4-dimensional vectors compared
# with the dot-product distance.
# NOTE(review): size=4 looks like a placeholder — confirm it matches the
# dimensionality of the vectors that will actually be stored.
client.recreate_collection(
    collection_name="test_collection",
    vectors_config=VectorParams(size=4, distance=Distance.DOT),
)