## 05 Pipeline

In [83]:
from datasets import load_dataset, Audio, load_from_disk, Dataset, ClassLabel
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
import evaluate
import torch
import numpy as np
from transformers import pipeline
from faker import Faker
from random import randint, choice
from IPython.display import Audio as player


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
data = load_from_disk("../data/processed/")
data

DatasetDict({
    train: Dataset({
        features: ['audio', 'label', 'input_values'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['audio', 'label', 'input_values'],
        num_rows: 300
    })
})

In [85]:
dataset = load_dataset(path="audiofolder", data_dir="../data/Audios/", split="train")
# dataset.cleanup_cache_files()

Resolving data files:   0%|          | 0/1500 [00:00<?, ?it/s]

Found cached dataset audiofolder (/home/ramonperez/.cache/huggingface/datasets/audiofolder/default-937056b5ef3f06eb/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


In [None]:
dataset.save_to_disk()

In [87]:
labels = dataset.features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

num_labels = len(id2label)
int_id_2_label = {int(k): v for k, v in id2label.items()}
int_id_2_label

{0: 'Bachata', 1: 'Cumbia', 2: 'Merengue', 3: 'Salsa', 4: 'Vallenato'}

In [None]:
paths = dataset.select_columns("audio").to_pandas()['audio'].apply(lambda x: x['path']).tolist()
paths[:10]

In [None]:
dataset = dataset.train_test_split(test_size=0.2)
dataset

In [None]:
dataset['train'][0]

In [86]:
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")



In [None]:
def get_features(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    return feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt",
        max_length=16000, truncation=True, padding=True
    )

In [None]:
%%time

encoded_latin = dataset.map(get_features, batched=True, batch_size=50)

In [None]:
encoded_latin

In [None]:
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
).to(device)

In [None]:
training_args = TrainingArguments(
    output_dir="../models",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push_to_hub=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_latin["train"],
    eval_dataset=encoded_latin["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [None]:
%%time

trainer.train()

In [None]:
trainer.save_model("sec_mod")

In [None]:
classifier = pipeline("audio-classification", model="sec_mod")

In [None]:
from random import choice
audio_file = dataset["train"][choice(range(1000))]["audio"]["path"]
audio_file

In [None]:
classifier.predict(audio_file)

In [None]:
player(audio_file)

In [None]:
from datasets import concatenate_datasets

In [None]:
data = concatenate_datasets([encoded_latin['train'], encoded_latin['test']])
data

In [None]:
from transformers import AutoModel
model_ckpt = "sec_mod"
model = AutoModel.from_pretrained(model_ckpt).to(device)
type(model)

In [None]:
data

In [None]:
def extract_hidden_states(batch):
    inputs = {k: v.to(device) for k, v in batch.items() if k in feature_extractor.model_input_names}
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

In [None]:
data.set_format("torch", columns=["label", "input_values"])

In [None]:
%%time

encoded_hidden = data.map(extract_hidden_states, batched=True, batch_size=50)

In [None]:
%%time

np.save('vectors_full.npy', np.array(encoded_hidden["hidden_state"]), allow_pickle=False)

In [None]:
data = encoded_hidden.rename_column("label", "genre")
data

In [None]:
player(
data.select_columns(['genre', 'audio']).to_pandas().head()['audio'][0]["bytes"]
)

In [None]:
fake = Faker()
int_id_2_label = {int(k): v for k, v in id2label.items()}

In [None]:
dataset = data.to_pandas()

In [None]:
print(dataset.loc[0, 'audio']['path'])

In [None]:
dataset['audio_path'] = paths
dataset["idx"] = [randint(10_000, 99_999) for _ in range(len(dataset))]
dataset["artist"] = [fake.name() for _ in range(len(dataset))]
dataset['genre'] = dataset['genre'].map(int_id_2_label)
dataset.head()

In [None]:
dataset[["idx", 'genre', "artist", 'audio_path']].to_json("payload.json")

In [None]:
dataset = Dataset.from_pandas(dataset)
dataset

In [None]:
data.save_to_disk("../data/audio_data")

In [None]:
dataset.select_columns(["idx", 'genre', "artist", 'audio_path']).to_pandas().head()

In [22]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http import models
import pandas as pd

In [23]:
vectors = np.load('vectors_full.npy')
vectors.shape

(1500, 768)

In [24]:
client = QdrantClient("localhost", port=6333)

In [25]:
from pprint import pprint
collection_info = client.get_collection(collection_name="test_collection")
collection_info

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=0, indexed_vectors_count=0, points_count=0, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema={})

In [26]:
from qdrant_client.http.models import CollectionStatus

assert collection_info.status == CollectionStatus.GREEN
assert collection_info.vectors_count == 0

In [27]:
client.recreate_collection(
    collection_name="test_collection",
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

True

In [28]:
payload = pd.read_json("payload.json", orient="records").set_index("idx")
payload.head(2)

Unnamed: 0_level_0,genre,artist,audio_path
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24592,Cumbia,Brian Rowland,/home/ramonperez/Tresors/datascience/challenge...
13278,Vallenato,Sean Haynes,/home/ramonperez/Tresors/datascience/challenge...


In [29]:
# payload[['genre', 'artist', 'audio_path']]
payload.to_dict(orient="records")

[{'genre': 'Cumbia',
  'artist': 'Brian Rowland',
  'audio_path': '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0000.mp3'},
 {'genre': 'Vallenato',
  'artist': 'Sean Haynes',
  'audio_path': '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0001.mp3'},
 {'genre': 'Merengue',
  'artist': 'Tabitha Ortiz',
  'audio_path': '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0002.mp3'},
 {'genre': 'Merengue',
  'artist': 'Elizabeth Ayala',
  'audio_path': '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0003.mp3'},
 {'genre': 'Vallenato',
  'artist': 'Brandon Ruiz',
  'audio_path': '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0004.mp3'},
 {'genre': 'Cumbia',
  'artist': 'Sarah Lopez',
  'audio_path': '/home/ramonperez/Tresors/datascience/challenges/qdrant_chl/data/Audios/Bachata/bachata0005.mp3'},
 {'

In [30]:
client.upsert(
    collection_name="test_collection",
    points=models.Batch(
        ids=payload.index.to_list(),
        payloads=payload.to_dict(orient="records"),
        vectors=vectors.tolist()
    ),
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [41]:
from diffusers import AudioLDMPipeline

repo_id = "cvssp/audioldm"
pipe = AudioLDMPipeline.from_pretrained(repo_id)
pipe = pipe.to("cuda")

prompt = "high quality bachata"

audio = pipe(prompt=prompt, num_inference_steps=20, audio_length_in_s=10.0).audios[0]

from IPython.display import Audio as player

player(audio, rate=16000)

  0%|          | 0/20 [00:00<?, ?it/s]

In [42]:
classifier = pipeline("audio-classification", model="sec_mod")

In [33]:
classifier(audio)

[{'score': 0.32287564873695374, 'label': 'Cumbia'},
 {'score': 0.21248485147953033, 'label': 'Merengue'},
 {'score': 0.18366380035877228, 'label': 'Salsa'},
 {'score': 0.15386271476745605, 'label': 'Vallenato'},
 {'score': 0.1271129548549652, 'label': 'Bachata'}]

In [48]:
feature_extractor = AutoFeatureExtractor.from_pretrained("sec_mod")
inputs = feature_extractor(
    audio, sampling_rate=feature_extractor.sampling_rate, 
    return_tensors="pt", max_length=16000, truncation=True
)

torch.float32

In [52]:
from transformers import AutoModel
model = AutoModel.from_pretrained('sec_mod').to(device)

Some weights of the model checkpoint at sec_mod were not used when initializing Wav2Vec2Model: ['projector.weight', 'classifier.bias', 'projector.bias', 'classifier.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [53]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
with torch.no_grad():
    last_hidden_state = model(**inputs.to(device)).last_hidden_state[:, 0]
last_hidden_state.size()

torch.Size([1, 768])

In [57]:
vectr = last_hidden_state.cpu().numpy()[0, :]

In [82]:
from pprint import pprint
results = client.search(
    collection_name="test_collection",
    query_vector=vectr,
    limit=10
)
results[0].payload['genre']

'Salsa'