# IF5281 - UAS

## Initial Setup

In [None]:
import os

token = "redacted"
username = "stefanus-lamlo"
repo_name = "IF5281"

repo_url = f"https://{username}:{token}@github.com/{username}/{repo_name}.git"

!git clone {repo_url}

In [None]:
import glob
import os
import shutil

os.makedirs('dataset', exist_ok=True)

# Pure-Python replacement for `!mv *.wav dataset/`: when no .wav file is left in
# the working directory (e.g. the cell is run a second time) the loop simply does
# nothing instead of the shell reporting an unmatched glob, and it does not
# depend on a POSIX shell being available.
for wav_path in glob.glob('*.wav'):
    shutil.move(wav_path, os.path.join('dataset', wav_path))

In [None]:
!pip install datasets evaluate

## Import Dependencies

In [2]:
import os

import evaluate
import numpy as np
import torch
import torchaudio
from datasets import Dataset, DatasetDict
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer, AutoFeatureExtractor

## Define Model and Dataset

In [3]:
# Checkpoint identifier used to load the feature extractor below.
model_name = "facebook/wav2vec2-base"
# Feature extractor that turns raw waveforms into model inputs (used by preprocess_function).
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)



In [4]:
def preprocess_audio(file_path, target_sample_rate=16000):
    """Load an audio file as a mono waveform sampled at ``target_sample_rate``.

    Args:
        file_path: Path of the audio file passed to ``torchaudio.load``.
        target_sample_rate: Sample rate in Hz the returned waveform should have.

    Returns:
        A flat ``list`` of float samples (a single channel).
    """
    waveform, original_sample_rate = torchaudio.load(file_path)

    # The loaded tensor is laid out as (channels, time). Average over the channel
    # axis so a multi-channel file becomes one mono signal; the previous
    # ``squeeze()`` only removed the axis for single-channel audio and returned a
    # nested per-channel list for anything else.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    if original_sample_rate != target_sample_rate:
        resample_transform = torchaudio.transforms.Resample(
            orig_freq=original_sample_rate, new_freq=target_sample_rate
        )
        waveform = resample_transform(waveform)

    return waveform.tolist()

In [5]:
def load_custom_dataset(dataset_path, label_map, target_sample_rate=16000):
    """Build labelled examples from the ``.wav`` files in ``dataset_path``.

    The label of a file is looked up from the part of its name before the first
    ``-`` (``"alto-03.wav"`` -> ``"alto"``). Files whose prefix is not a key of
    ``label_map`` are ignored.

    Args:
        dataset_path: Directory that is scanned (non-recursively) for ``.wav`` files.
        label_map: Mapping from category prefix to integer label.
        target_sample_rate: Sample rate forwarded to ``preprocess_audio``.

    Returns:
        A list of ``{"input_values": ..., "label": ...}`` dicts, ordered by file name.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting makes the example
    # order (and therefore the seeded train/validation split) reproducible.
    for file_name in sorted(os.listdir(dataset_path)):
        if not file_name.endswith(".wav"):
            continue
        category = file_name.split('-')[0]
        if category not in label_map:
            # Check the label first so unlabelled files are never decoded.
            continue
        file_path = os.path.join(dataset_path, file_name)
        input_values = preprocess_audio(file_path, target_sample_rate)
        data.append({"input_values": input_values, "label": label_map[category]})
    return data

In [6]:
def ensure_list_of_floats(input_values):
    """Flatten arbitrarily nested lists into one flat list, preserving order.

    Only ``list`` instances are descended into; every other element is copied
    through unchanged (no conversion to ``float`` is performed).
    """
    flat = []
    # Stack of iterators: the top one is the (sub)list currently being walked.
    pending = [iter(input_values)]
    while pending:
        for elem in pending[-1]:
            if isinstance(elem, list):
                # Descend into the nested list; the parent iterator keeps its
                # position and is resumed once the child is exhausted.
                pending.append(iter(elem))
                break
            flat.append(elem)
        else:
            pending.pop()
    return flat

# Raw audio location and the label vocabulary (list position = integer label).
dataset_path = "dataset"
categories = ["sopran", "alto", "tenor", "bass"]
label_map = dict(zip(categories, range(len(categories))))

data = load_custom_dataset(dataset_path, label_map)

# Normalise every record in place: flat sample list and a plain-int label.
for item in data:
    item['label'] = int(item['label'])
    item['input_values'] = ensure_list_of_floats(item['input_values'])

# Stratified 80/20 split with a fixed seed so class proportions match in both parts.
stratify_labels = [item['label'] for item in data]
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    stratify=stratify_labels,
    random_state=42,
)

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset})

# Round-trip through disk. Note that this writes into the same "dataset"
# folder that holds the raw .wav files.
dataset_dict.save_to_disk("dataset")
loaded_dataset = DatasetDict.load_from_disk("dataset")

# From here on the names refer to the reloaded splits.
train_dataset, val_dataset = loaded_dataset["train"], loaded_dataset["validation"]

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")

Saving the dataset (0/1 shards):   0%|          | 0/92 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24 [00:00<?, ? examples/s]

Training examples: 92
Validation examples: 24


In [7]:
labels = train_dataset.features["label"]

# Build both lookup directions in one pass; ids are stored as strings.
label2id, id2label = {}, {}
for class_name, class_idx in label_map.items():
    label2id[class_name] = str(class_idx)
    id2label[str(class_idx)] = class_name

print("Label to ID mapping:", label2id)
print("ID to Label mapping:", id2label)

Label to ID mapping: {'sopran': '0', 'alto': '1', 'tenor': '2', 'bass': '3'}
ID to Label mapping: {'0': 'sopran', '1': 'alto', '2': 'tenor', '3': 'bass'}


In [8]:
# Display the column schema of the reloaded training split.
train_dataset.features

{'input_values': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None)}

In [9]:
def preprocess_function(examples):
    """Run the module-level ``feature_extractor`` over a batch of examples.

    Args:
        examples: Batch mapping with an ``"input_values"`` entry holding the
            raw waveforms.

    Returns:
        Whatever the feature extractor returns for the batch. The call requests
        truncation with ``max_length=16000`` and declares the extractor's own
        sampling rate for the inputs.
    """
    raw_waveforms = examples["input_values"]
    return feature_extractor(
        raw_waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [10]:
# Apply preprocess_function to the reloaded splits, one batch at a time (batched=True).
encoded_dataset = loaded_dataset.map(preprocess_function, batched=True)

In [11]:
# Show the splits with their columns and row counts after feature extraction.
print(encoded_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_values', 'label'],
        num_rows: 92
    })
    validation: Dataset({
        features: ['input_values', 'label'],
        num_rows: 24
    })
})


## Creating Trainer and Inference

In [12]:
# Load the "accuracy" metric through the evaluate library; compute_metrics calls its compute().
accuracy = evaluate.load("accuracy")

In [13]:
def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, classes
            along axis 1) and ``label_ids`` (reference labels).

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row.
    """
    class_scores = eval_pred.predictions
    reference_ids = eval_pred.label_ids
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=reference_ids)

In [15]:
num_labels = len(id2label)

# Load the checkpoint named by `model_name` (the same variable the feature
# extractor was loaded from) instead of repeating the string literal, so the
# model and its feature extractor cannot drift apart when the checkpoint changes.
model = AutoModelForAudioClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight', 'projector.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Restrict CUDA to the device with index 1 for this process.
# NOTE(review): presumably this only takes effect if it runs before CUDA is first
# initialised, and on a single-GPU machine index 1 presumably does not exist
# (no GPU would be visible) — confirm for the target environment.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [17]:
# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best "accuracy" when training finishes.
# NOTE(review): batch size 32 with 4 accumulation steps asks for more samples per
# optimizer step than the 92-example training split provides in one epoch —
# confirm this is intended.
# NOTE(review): newer transformers releases presumably rename `evaluation_strategy`
# to `eval_strategy` — verify against the installed version.
training_args = TrainingArguments(
    output_dir="vocal_classification",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=50,
    warmup_ratio=0.1,
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Empty list: no reporting integrations are requested.
    report_to=[] 
)

In [19]:
# Wire model, arguments, encoded splits and the metric callback into a Trainer.
# The feature extractor is passed through the `tokenizer` argument.
# NOTE(review): recent transformers versions presumably prefer `processing_class`
# for this — verify against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [20]:
# Run fine-tuning with the configuration above; the returned TrainOutput is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,1.0378,1.394785,0.208333
2,0.69,1.380538,0.25
3,0.3361,1.38068,0.291667
4,1.0008,1.383683,0.25
5,0.9809,1.375495,0.333333
6,0.6295,1.372625,0.375
7,0.304,1.362572,0.291667
8,0.8966,1.354226,0.291667
9,0.8761,1.333701,0.333333
10,0.5569,1.306512,0.541667




TrainOutput(global_step=50, training_loss=0.4865791082382202, metrics={'train_runtime': 123.7944, 'train_samples_per_second': 37.158, 'train_steps_per_second': 0.404, 'total_flos': 2.7853396326912e+16, 'train_loss': 0.4865791082382202, 'epoch': 33.33})

## Evaluation of Inference

In [21]:
def compute_metrics(pred):
    """Return ``{"accuracy": fraction of rows whose arg-max class equals the label}``.

    NOTE: this redefines the ``compute_metrics`` declared earlier in the
    notebook. The ``Trainer`` was constructed before this cell, so it still
    holds the earlier function; this version only takes effect where it is
    called or passed explicitly afterwards.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, classes on the
            last axis) and ``label_ids`` (reference labels).
    """
    predicted_ids = np.argmax(pred.predictions, axis=-1)
    is_correct = (predicted_ids == pred.label_ids).astype(float)
    return {"accuracy": np.mean(is_correct).item()}

In [22]:
def run_inference_on_dataset(trainer, dataset_dict, label2id, id2label):
    """Predict every split in ``dataset_dict`` and pair each file with its labels.

    Args:
        trainer: Object whose ``predict(dataset)`` returns ``predictions``
            (per-class scores) and ``label_ids``.
        dataset_dict: Mapping of split name to dataset; each dataset must
            provide a ``'filename'`` column.
        label2id: Accepted for symmetry with ``id2label``; not used here.
        id2label: Mapping from the string form of a class id to its label name.

    Returns:
        One dict per example, across all splits in iteration order, with keys
        ``"filename"``, ``"true_label"`` and ``"predicted_label"``.
    """
    def _rows_for(split_data):
        # Predict first, then read the file names, then line the three up.
        output = trainer.predict(split_data)
        predicted_ids = output.predictions.argmax(-1)
        file_names = split_data['filename']
        return [
            {
                "filename": file_name,
                "true_label": id2label[str(gold_id)],
                "predicted_label": id2label[str(guess_id)],
            }
            for file_name, gold_id, guess_id in zip(file_names, output.label_ids, predicted_ids)
        ]

    all_results = []
    for split in dataset_dict:
        all_results.extend(_rows_for(dataset_dict[split]))
    return all_results

In [23]:
# Attach a synthetic "<split>_<index>.wav" identifier to every example so the
# inference results can be listed per row. These are generated names, not the
# names of the original audio files.
for split in encoded_dataset:
    if "filename" in encoded_dataset[split].column_names:
        # Already added on a previous run of this cell; adding the same column
        # again would raise, so the cell stays safe to re-execute.
        continue
    filenames = [f"{split}_{i}.wav" for i in range(len(encoded_dataset[split]))]
    encoded_dataset[split] = encoded_dataset[split].add_column("filename", filenames)

In [24]:
# Predict every split contained in encoded_dataset and collect one record per example.
results = run_inference_on_dataset(trainer, encoded_dataset, label2id, id2label)

In [25]:
# (header title, key in each result dict) for the three table columns.
columns = [
    ("Filename", "filename"),
    ("True Label", "true_label"),
    ("Predicted Label", "predicted_label"),
]

# Each column is as wide as its header or its longest cell, whichever is larger.
# Sizing from the true labels alone left the headers wider than the cells, so the
# separators did not line up; seeding max() with the header length also keeps
# this working when `results` is empty.
column_widths = [
    max([len(title)] + [len(result[key]) for result in results])
    for title, key in columns
]

header = " | ".join(
    title.ljust(width) for (title, _), width in zip(columns, column_widths)
)
print(header)
print("-" * len(header))

for result in results:
    print(" | ".join(
        result[key].ljust(width) for (_, key), width in zip(columns, column_widths)
    ))

Filename          | True Label | Predicted Label
------------------------------------------------
train_0.wav       | bass   | bass  
train_1.wav       | alto   | sopran
train_2.wav       | bass   | bass  
train_3.wav       | bass   | bass  
train_4.wav       | sopran | sopran
train_5.wav       | sopran | sopran
train_6.wav       | bass   | bass  
train_7.wav       | alto   | sopran
train_8.wav       | tenor  | tenor 
train_9.wav       | sopran | sopran
train_10.wav      | tenor  | tenor 
train_11.wav      | alto   | sopran
train_12.wav      | bass   | bass  
train_13.wav      | tenor  | tenor 
train_14.wav      | tenor  | tenor 
train_15.wav      | bass   | tenor 
train_16.wav      | tenor  | tenor 
train_17.wav      | alto   | sopran
train_18.wav      | sopran | sopran
train_19.wav      | tenor  | tenor 
train_20.wav      | bass   | bass  
train_21.wav      | tenor  | bass  
train_22.wav      | tenor  | tenor 
train_23.wav      | alto   | alto  
train_24.wav      | bass   | bass  
tr

In [28]:
# The sklearn.metrics names used here are imported in the notebook's imports cell.
# NOTE: `results` holds every split that was passed to run_inference_on_dataset
# (train and validation), so the figures below are not held-out-only metrics.
true_labels = [result['true_label'] for result in results]
predicted_labels = [result['predicted_label'] for result in results]

# Take the union so a class that only ever appears as a prediction still gets
# its own row/column instead of being dropped from the matrix.
categories = sorted(set(true_labels) | set(predicted_labels))

cm = confusion_matrix(true_labels, predicted_labels, labels=categories)
print("Confusion Matrix:")
print(cm)

# Pass labels= together with target_names= so every display name is tied to the
# class it describes, rather than relying on the order sklearn infers itself.
report = classification_report(
    true_labels,
    predicted_labels,
    labels=categories,
    target_names=categories,
)

print("\nClassification Report:")
print(report)


Confusion Matrix:
[[11  2 16  0]
 [ 0 21  0  8]
 [ 1  4 24  0]
 [ 0  3  0 26]]

Classification Report:
              precision    recall  f1-score   support

        alto       0.92      0.38      0.54        29
        bass       0.70      0.72      0.71        29
      sopran       0.60      0.83      0.70        29
       tenor       0.76      0.90      0.83        29

    accuracy                           0.71       116
   macro avg       0.75      0.71      0.69       116
weighted avg       0.75      0.71      0.69       116



In [None]:
# Stage everything under the current directory, commit it and push.
# NOTE(review): `git add .` stages whatever is in the working directory —
# presumably including the dataset folder and the Trainer output directory;
# confirm that is intended before pushing.
!git add .
!git commit -m "Notebook"
!git push