In [1]:
import torchaudio
import torch
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset, Dataset
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
# Load model and processor
model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

In [3]:
# Load dataset
dataset = load_dataset("PolyAI/minds14", "en-US")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
# Preprocess the dataset
def preprocess_function(examples):
    audio = examples['audio']
    waveform, _ = torchaudio.load(audio['path'])  # Menggunakan torchaudio untuk memuat audio
    waveform = torchaudio.transforms.Resample(orig_freq=audio['sampling_rate'], new_freq=16000)(waveform)
    input_features = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features
    return {"input_features": input_features.squeeze().tolist(), "labels": examples["transcription"]}

In [5]:
# Apply the preprocessing to the dataset
processed_dataset = dataset.map(preprocess_function, remove_columns=["audio"])

Map:   0%|          | 0/563 [00:00<?, ? examples/s]

In [6]:
# Split dataset into train and test
train_test_split = processed_dataset['train'].train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [7]:
# Convert to PyTorch datasets
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        input_features = torch.tensor(item['input_features'], dtype=torch.float32)
        labels = item['labels']
        return {"input_features": input_features, "labels": labels}

In [8]:
train_dataset = CustomDataset(train_dataset)
test_dataset = CustomDataset(test_dataset)

In [9]:
# Define DataLoader
batch_size = 4
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

In [10]:
# Prepare optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=num_training_steps)

In [11]:
# Move model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [13]:
for batch in train_dataloader:
    print(batch)
    break

{'input_features': tensor([[[-0.5470, -0.5470, -0.5470,  ..., -0.5470, -0.5470, -0.5470],
         [-0.5470, -0.5470, -0.5470,  ..., -0.5470, -0.5470, -0.5470],
         [-0.5470, -0.5470, -0.5470,  ..., -0.5470, -0.5470, -0.5470],
         ...,
         [-0.5470, -0.5470, -0.5470,  ..., -0.5470, -0.5470, -0.5470],
         [-0.5470, -0.5470, -0.5470,  ..., -0.5470, -0.5470, -0.5470],
         [-0.5470, -0.5470, -0.5470,  ..., -0.5470, -0.5470, -0.5470]],

        [[-0.6561, -0.6561, -0.6561,  ..., -0.6561, -0.6561, -0.6561],
         [-0.6561, -0.6561, -0.6561,  ..., -0.6561, -0.6561, -0.6561],
         [-0.6561, -0.6356, -0.6561,  ..., -0.6561, -0.6561, -0.6561],
         ...,
         [-0.6561, -0.6561, -0.6561,  ..., -0.6561, -0.6561, -0.6561],
         [-0.6561, -0.6561, -0.6561,  ..., -0.6561, -0.6561, -0.6561],
         [-0.6561, -0.6561, -0.6561,  ..., -0.6561, -0.6561, -0.6561]],

        [[-0.7007, -0.6600, -0.6692,  ..., -0.7007, -0.7007, -0.7007],
         [-0.7007, -0.6793

In [14]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # Extract input_features and labels from the batch
        input_features = batch['input_features'].to(device)
        
        # Convert labels to tensor
        labels = batch['labels']
        
        # Encode labels if necessary (depends on the model's requirements)
        # For example, if the labels are strings and the model expects indices, you need to encode them
        # Assuming labels are strings and need to be converted to tensor of indices
        labels = processor.tokenizer(labels, return_tensors='pt', padding=True, truncation=True, max_length=128).input_ids
        labels = labels.squeeze(1).to(device)  # Adjust according to model requirements
        
        # Forward pass
        outputs = model(input_features=input_features, labels=labels)
        loss = outputs.loss
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1}/{num_epochs} completed. Loss: {loss.item()}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/3 completed. Loss: 2.8012444972991943
Epoch 2/3 completed. Loss: 0.8791717290878296
Epoch 3/3 completed. Loss: 0.19346804916858673


In [17]:
# Evaluation loop
model.eval()
y_true = []
y_pred = []

In [18]:
for batch in test_dataloader:
    # Extract input_features and move to device
    input_features = batch['input_features'].to(device)
    
    # Perform inference without gradient computation
    with torch.no_grad():
        outputs = model.generate(input_features)
    
    # Decode predictions into text
    predictions = processor.batch_decode(outputs, skip_special_tokens=True)
    
    # Extract labels
    labels = batch['labels']
    
    # Accumulate true labels and predictions
    y_true.extend(labels)
    y_pred.extend(predictions)

# After the loop, you can evaluate the predictions

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [22]:
# Print evaluation results
print("Classification Report:")
print(classification_report(y_true, y_pred))

Classification Report:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
# Calculate and print accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.07
