## 05 Pipeline

Load up all of the libraries we'll need.

In [12]:
from datasets import load_dataset
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, TrainingArguments, Trainer, pipeline
import evaluate
import torch
import numpy as np
from transformers import pipeline
from faker import Faker
from IPython.display import Audio as player


# Reload edited local modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

Make sure you have downloaded the dataset from [Kaggle](https://www.kaggle.com/datasets/carlossalazar65/tropical-genres-dataset), and unzipped it inside the `data` directory.

In [14]:
# Read the unzipped Kaggle folder with the "audiofolder" loader as a single train split.
dataset = load_dataset(
    path="audiofolder",
    data_dir="../data/Audios/",
    split="train",
)
dataset

Resolving data files:   0%|          | 0/1500 [00:00<?, ?it/s]

Found cached dataset audiofolder (/home/ramonperez/.cache/huggingface/datasets/audiofolder/default-937056b5ef3f06eb/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


Dataset({
    features: ['audio', 'label'],
    num_rows: 1500
})

A few of our functions will need explicit access to the `label` variable as both text and numbers, plus the number of classes we are trying to predict, so let's start by extracting these.

In [15]:
# Class names, in the order of their integer ids, as declared by the `label` feature.
labels = dataset.features["label"].names

# String-keyed lookups in both directions (the ids are stored as strings).
label2id = {name: str(position) for position, name in enumerate(labels)}
id2label = {str(position): name for position, name in enumerate(labels)}

num_labels = len(id2label)
# Integer-keyed copy of id2label, used to turn raw label ids back into genre names.
int_id_2_label = {int(key): name for key, name in id2label.items()}
int_id_2_label

{0: 'Bachata', 1: 'Cumbia', 2: 'Merengue', 3: 'Salsa', 4: 'Vallenato'}

Since we will want to play the recommendations we receive from Qdrant, we want to store the location of each file (for now) rather than loading the long string that represents each sound into Qdrant. Let's get the path for each audio file using pandas, and then convert the result to a list we'll need later.

In [18]:
# Gather every clip's file path, in dataset order, from the pandas view of the audio column.
paths = [
    entry["path"]
    for entry in dataset.select_columns("audio").to_pandas()["audio"]
]
paths[:5]

['/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0000.mp3',
 '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0001.mp3',
 '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0002.mp3',
 '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0003.mp3',
 '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0004.mp3']

Time to split the data. Feel free to change the ratio used for the `test_size` below.

In [19]:
# Hold out 20% of the clips for evaluation. The fixed seed makes the shuffle (and so
# the train/test membership and every downstream artefact) reproducible across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 300
    })
})

In [20]:
# Look at one training example: file path, decoded waveform, sampling rate and label id.
dataset['train'][0]

{'audio': {'path': '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Cumbia/cumbia0131.mp3',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.26368874,
          0.21336344,  0.684293  ], dtype=float32),
  'sampling_rate': 44100},
 'label': 1}

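Note that the sampling rate above is 44,100 hertz, while the `facebook/wav2vec2-base` checkpoint used below was pretrained on audio sampled at 16,000 hertz.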

In [None]:
from datasets import Audio as AudioFeature

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

# The clips are stored at 44.1 kHz, yet get_features() hands them to the extractor
# labelled with the extractor's own sampling rate. Re-cast the audio column so every
# clip is resampled to that rate when it is decoded; otherwise the model receives
# audio at the wrong speed.
dataset = dataset.cast_column(
    "audio", AudioFeature(sampling_rate=feature_extractor.sampling_rate)
)


def get_features(examples):
    """Turn a batch of decoded clips into model inputs.

    Args:
        examples: Batch passed by `dataset.map(..., batched=True)`;
            `examples["audio"]` is a list of dicts that each carry an
            "array" waveform.

    Returns:
        The feature extractor's output as PyTorch tensors, with each clip
        truncated to at most 16000 samples and padded to a common length.
    """
    audio_arrays = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        return_tensors="pt",
        max_length=16000,
        truncation=True,
        padding=True,
    )

In [None]:
%%time

encoded_latin = dataset.map(get_features, batched=True, batch_size=50)

In [None]:
# Show the splits and their columns after feature extraction.
encoded_latin

In [None]:
# Accuracy metric from the `evaluate` library; compute_metrics calls it.
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score evaluation predictions with the module-level accuracy metric.

    Args:
        eval_pred: Object exposing `predictions` (reduced with arg-max over
            axis 1) and `label_ids` (the reference labels).

    Returns:
        Whatever `accuracy.compute` returns for the predicted ids versus the
        reference ids.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return accuracy.compute(predictions=predicted_ids, references=reference_ids)

## Model Finetuning

In [None]:
# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Audio-classification model built from the wav2vec2 base checkpoint, configured
# with our label maps and moved to the chosen device.
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
model = model.to(device)

In [None]:
# Settings for the fine-tuning run; output goes under ../models.
training_args = TrainingArguments(
    output_dir="../models",
    # Schedule
    num_train_epochs=10,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    # Batching: 32 clips per device step, gradients accumulated over 4 steps
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    # Evaluate and save once per epoch; reload the best model (by accuracy) at the end
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=10,
    # push_to_hub=True,
)

In [None]:
# Bundle the model, arguments, encoded splits and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,  # the feature extractor goes in the tokenizer slot
    train_dataset=encoded_latin["train"],
    eval_dataset=encoded_latin["test"],
)

In [None]:
%%time

# Run the fine-tuning loop; %%time reports how long the cell took.
trainer.train()

In [None]:
# Save the trainer's model to the local "sec_mod" directory, which later cells load from.
trainer.save_model("sec_mod")

In [None]:
# Build an audio-classification pipeline from the model saved in "sec_mod".
classifier = pipeline("audio-classification", model="sec_mod")

In [None]:
from random import choice

# Pick a random training clip. The index is drawn over the split's actual length
# rather than a fixed 1000, so every row is eligible and a smaller split can never
# be indexed past its end.
audio_file = dataset["train"][choice(range(len(dataset["train"])))]["audio"]["path"]
audio_file

In [None]:
# Classify the randomly chosen clip, passing its file path to the pipeline.
classifier.predict(audio_file)

In [None]:
# Render an inline audio player for the same clip.
player(audio_file)

In [None]:
from datasets import concatenate_datasets

In [None]:
# Stack the encoded train and test splits back into a single dataset.
data = concatenate_datasets(
    [encoded_latin[split_name] for split_name in ("train", "test")]
)
data

In [None]:
from transformers import AutoModel

# Reload the fine-tuned checkpoint through AutoModel so the forward pass exposes
# `last_hidden_state`, then move it to the active device. This rebinds `model`.
model_ckpt = "sec_mod"
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)
type(model)

In [None]:
# Show the combined dataset again before extracting embeddings.
data

In [None]:
def extract_hidden_states(batch):
    """Embed a batch of clips with the module-level `model`.

    Args:
        batch: Mapping of column name to tensor. Only keys listed in
            `feature_extractor.model_input_names` are forwarded to the model.

    Returns:
        Dict with a single key, "hidden_state": a NumPy array holding index 0
        along the second axis of the model's last hidden state, per example.
    """
    inputs = {}
    for name, tensor in batch.items():
        if name in feature_extractor.model_input_names:
            inputs[name] = tensor.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # NOTE(review): only position 0 of the second axis is kept — presumably the
    # first time frame of each clip; confirm this is the intended pooling.
    first_position = outputs.last_hidden_state[:, 0]
    return {"hidden_state": first_position.cpu().numpy()}

In [None]:
# Have "label" and "input_values" come back as torch tensors when rows are read.
data.set_format("torch", columns=["label", "input_values"])

In [None]:
%%time

encoded_hidden = data.map(extract_hidden_states, batched=True, batch_size=50)

In [None]:
%%time

np.save('vectors_full.npy', np.array(encoded_hidden["hidden_state"]), allow_pickle=False)

In [None]:
# From here on the class column is called "genre". This rebinds `data`.
data = encoded_hidden.rename_column(
    original_column_name="label",
    new_column_name="genre",
)
data

In [None]:
# Play the first clip from the "bytes" entry of its audio record.
player(
    data.select_columns(["genre", "audio"]).to_pandas().head().loc[0, "audio"]["bytes"]
)

In [None]:
# Faker supplies the made-up artist names; the integer-keyed map turns label ids
# into genre names.
fake = Faker()
int_id_2_label = {int(key): name for key, name in id2label.items()}

In [None]:
# Convert to a pandas DataFrame. Note that this rebinds `dataset`.
dataset = data.to_pandas()

In [None]:
# Check that each row's audio record still carries its file path.
print(dataset.loc[0, 'audio']['path'])

In [None]:
from random import sample

# Take each row's path from its own audio record. The `paths` list was built before
# train_test_split shuffled the rows, so assigning it by position would attach the
# wrong file to most rows.
dataset["audio_path"] = dataset["audio"].apply(lambda audio: audio["path"])
# Draw the five-digit ids without replacement: they become the Qdrant point ids, and
# a repeated id would overwrite another point on upsert.
dataset["idx"] = sample(range(10_000, 100_000), len(dataset))
dataset["artist"] = [fake.name() for _ in range(len(dataset))]
# Replace the integer label ids with their genre names.
dataset["genre"] = dataset["genre"].map(int_id_2_label)
dataset.head()

In [None]:
# Write one JSON object per row — the layout that the later
# `pd.read_json("payload.json", orient="records")` call asks for.
dataset[["idx", "genre", "artist", "audio_path"]].to_json("payload.json", orient="records")

In [None]:
# `Dataset` is not among the notebook's top-level imports, so bring it in here.
from datasets import Dataset

# Convert the enriched DataFrame back into a Hugging Face dataset (rebinds `dataset`).
dataset = Dataset.from_pandas(dataset)
dataset

In [None]:
# Persist the dataset under ../data/audio_data.
# NOTE(review): this saves `data`, which lacks the idx/artist/audio_path columns
# added to `dataset` — confirm that is the intended object.
data.save_to_disk("../data/audio_data")

In [None]:
# Preview the columns that make up the Qdrant payload.
dataset.select_columns(["idx", 'genre', "artist", 'audio_path']).to_pandas().head()

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http import models
import pandas as pd

In [None]:
# Reload the saved embeddings and check the array's shape.
vectors = np.load('vectors_full.npy')
vectors.shape

In [None]:
# Connect to a Qdrant instance at localhost on port 6333.
client = QdrantClient("localhost", port=6333)

In [None]:
from pprint import pprint
from qdrant_client.http.models import CollectionStatus

# Create (or reset) the collection BEFORE inspecting it. On a fresh Qdrant instance
# the collection does not exist yet, and after an earlier upload it is no longer
# empty, so checking first fails in both cases.
# The vector size is read from the saved embeddings instead of a hard-coded 768.
client.recreate_collection(
    collection_name="test_collection",
    vectors_config=VectorParams(size=vectors.shape[1], distance=Distance.COSINE),
)

# A freshly recreated collection should be healthy and hold no points.
collection_info = client.get_collection(collection_name="test_collection")
assert collection_info.status == CollectionStatus.GREEN
assert collection_info.points_count == 0
collection_info

In [None]:
# Load the saved payload and index it by point id.
payload = pd.read_json("payload.json", orient="records")
payload = payload.set_index("idx")
payload.head(2)

In [None]:
# Preview a couple of payload dicts rather than echoing every record into the output.
payload.to_dict(orient="records")[:2]

In [None]:
# Upload ids, payloads and vectors together as one batch; the three lists are
# matched by position.
client.upsert(
    collection_name="test_collection",
    points=models.Batch(
        ids=payload.index.tolist(),
        payloads=payload.to_dict("records"),
        vectors=vectors.tolist(),
    ),
)

In [None]:
from diffusers import AudioLDMPipeline
from IPython.display import Audio as player

repo_id = "cvssp/audioldm"
pipe = AudioLDMPipeline.from_pretrained(repo_id)
# Fall back to the CPU when torch sees no GPU, the same way `device` is chosen
# elsewhere in this notebook, instead of failing on a hard-coded "cuda".
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")

prompt = "high quality bachata"

# Generate a 10-second clip from the text prompt and keep the first result.
audio = pipe(prompt=prompt, num_inference_steps=20, audio_length_in_s=10.0).audios[0]

# NOTE(review): rate=16000 is presumably the pipeline's output sampling rate — confirm.
player(audio, rate=16000)

In [None]:
# Rebuild the audio-classification pipeline from the model saved in "sec_mod".
classifier = pipeline("audio-classification", model="sec_mod")

In [None]:
# Classify the generated clip, passed in as a raw waveform.
classifier(audio)

In [None]:
# Load the feature extractor from "sec_mod" and encode the generated clip as
# PyTorch tensors, truncated to at most 16000 samples.
feature_extractor = AutoFeatureExtractor.from_pretrained("sec_mod")
inputs = feature_extractor(
    audio,
    return_tensors="pt",
    sampling_rate=feature_extractor.sampling_rate,
    truncation=True,
    max_length=16000,
)

In [None]:
from transformers import AutoModel

# Load the "sec_mod" checkpoint through AutoModel and move it to the active device.
model = AutoModel.from_pretrained("sec_mod")
model = model.to(device)

In [None]:
# Same device choice as earlier: GPU when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Embed the generated clip without tracking gradients, keeping index 0 along the
# second axis of the last hidden state.
with torch.no_grad():
    last_hidden_state = model(**inputs.to(device)).last_hidden_state[:, 0]
last_hidden_state.size()

In [None]:
# Drop the batch axis and hand the query embedding over as a NumPy vector.
vectr = last_hidden_state[0].cpu().numpy()

In [None]:
from pprint import pprint

# Ask Qdrant for the ten stored clips closest to the query vector, then show the
# genre recorded on the top hit.
results = client.search(
    collection_name="test_collection", query_vector=vectr, limit=10
)
results[0].payload["genre"]