In [1]:
from datasets import Dataset
import random
import pandas as pd
import re
import json
import IPython.display as ipd
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor
import torch

In [2]:
reloaded_dataset = Dataset.load_from_disk("../results/WAV2VEC2_DATASET")  # Load the dataset from the disk
reloaded_dataset = reloaded_dataset.remove_columns(["filename"]) # Remove the filename column
reloaded_dataset

Dataset({
    features: ['audio', 'text'],
    num_rows: 2947
})

In [3]:
# Annotations to exclude
annotations_to_exclude = [
    "[*]", "[LAUGHTER]", "[SONANT]", "[MUSIC]", "[SYSTEM]", "[ENS]", "[UNK]", "+"
]

# Filter out annotations
filtered_dataset = reloaded_dataset.filter(
    lambda example: not any(annotation in example["text"] for annotation in annotations_to_exclude)
)

filtered_dataset

Filter:   0%|          | 0/2947 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'text'],
    num_rows: 2886
})

In [4]:
def convert_to_proper_case(text):
    # Replace hyphens with spaces
    text = re.sub(r'-', ' ', text)
    
    # Remove periods, commas, and apostrophes
    text = re.sub(r'[.,\']', '', text)

    # Use regular expression to match words
    text = re.sub(r'\b([A-Za-z]{2,})\b', lambda match: match.group(1).lower(), text)
    
    return text

filtered_dataset = filtered_dataset.map(
    lambda example: {**example, 'text': convert_to_proper_case(example['text'])}
)


Map:   0%|          | 0/2886 [00:00<?, ? examples/s]

In [5]:
df = pd.DataFrame(filtered_dataset)
df

Unnamed: 0,audio,text
0,"[-511.0000000000002, -515.9999999999998, -513....",halo mishela
1,"[-12.000000000000083, -11.99999999999995, -13....",gimana kabarnya di sana
2,"[-6.000000000000045, -5.939557236068103e-14, 4...",wes sendiri juga baik baik aja
3,"[-16.99999999999999, -19.000000000000046, -18....",terus
4,"[-8.000000000000075, -4.999999999999978, -4.99...",gimana
...,...,...
2881,"[-31.999999999998757, -11.000000000000234, -8....",iya
2882,"[-14.999999999999968, -25.000000000000497, -33...",oke
2883,"[32.999999999999645, 16.00000000000048, 20.000...",dah
2884,"[-44.00000000000009, -34.9999999999996, -36.99...",dah


In [6]:
# Define the function to compute vocab
def extract_all_chars(dataset):
    # Concatenate all text into a single string and get unique characters
    all_text = " ".join(dataset["text"])
    vocab = sorted(set(all_text))
    return vocab

# Extract vocabulary
vocab = extract_all_chars(filtered_dataset)
vocab_list = list(vocab)

vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{' ': 0,
 'A': 1,
 'C': 2,
 'D': 3,
 'E': 4,
 'F': 5,
 'G': 6,
 'H': 7,
 'I': 8,
 'K': 9,
 'M': 10,
 'N': 11,
 'O': 12,
 'P': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'U': 17,
 'V': 18,
 'W': 19,
 'Y': 20,
 'a': 21,
 'b': 22,
 'c': 23,
 'd': 24,
 'e': 25,
 'f': 26,
 'g': 27,
 'h': 28,
 'i': 29,
 'j': 30,
 'k': 31,
 'l': 32,
 'm': 33,
 'n': 34,
 'o': 35,
 'p': 36,
 'q': 37,
 'r': 38,
 's': 39,
 't': 40,
 'u': 41,
 'v': 42,
 'w': 43,
 'x': 44,
 'y': 45,
 'z': 46}

In [7]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
vocab_dict

{' ': 0,
 'A': 1,
 'C': 2,
 'D': 3,
 'E': 4,
 'F': 5,
 'G': 6,
 'H': 7,
 'I': 8,
 'K': 9,
 'M': 10,
 'N': 11,
 'O': 12,
 'P': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'U': 17,
 'V': 18,
 'W': 19,
 'Y': 20,
 'a': 21,
 'b': 22,
 'c': 23,
 'd': 24,
 'e': 25,
 'f': 26,
 'g': 27,
 'h': 28,
 'i': 29,
 'j': 30,
 'k': 31,
 'l': 32,
 'm': 33,
 'n': 34,
 'o': 35,
 'p': 36,
 'q': 37,
 'r': 38,
 's': 39,
 't': 40,
 'u': 41,
 'v': 42,
 'w': 43,
 'x': 44,
 'y': 45,
 'z': 46,
 '[UNK]': 47,
 '[PAD]': 48}

In [8]:
with open('../results/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [9]:
rand_int = random.randint(0, len(reloaded_dataset))

print(reloaded_dataset[rand_int]["text"])
ipd.Audio(data=np.asarray(reloaded_dataset[rand_int]["audio"]), autoplay=True, rate=16000)


terus menurut mu apa lagi


In [10]:
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False
)

tokenizer = Wav2Vec2CTCTokenizer(
    "../results/vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token=" "
)

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

def prepare_dataset(batch):
    
    audio = batch["audio"]
    
    # batched output is "un-batched" to ensure mapping is correct
    batch["input_values"] = processor(audio, sampling_rate=16000).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    
    return batch

# Apply the function with multiple processes
dataset = filtered_dataset.map(prepare_dataset)
dataset

Map:   0%|          | 0/2886 [00:00<?, ? examples/s]



Dataset({
    features: ['audio', 'text', 'input_values', 'labels'],
    num_rows: 2886
})

In [11]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels for separate padding
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input features
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad labels as target processor
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Handle label padding (-100 for ignored loss computation)
        if "attention_mask" in labels_batch:
            labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        else:
            labels = labels_batch["input_ids"].masked_fill(
                labels_batch["input_ids"] == self.processor.tokenizer.pad_token_id, -100
            )

        # Add labels to the batch
        batch["labels"] = labels

        return batch


In [12]:
import evaluate

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Evaluation metrics: WER, CER, and MER
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

print(wer_metric)
print(cer_metric)

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}

EvaluationModule(name: "wer", module_type: "metric", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Compute WER score of transcribed segments against references.

Args:
    references: List of references for each speech input.
    predictions: List of transcriptions to score.
    concatenate_texts (bool, default=False): Whether to concatenate all input texts or compute WER iteratively.

Returns:
    (float): the word error rate

Examples:

    >>> predictions = ["this is the prediction", "there is an other sample"]
    >>> references = ["this is the reference", "there is another one"]
    >>> wer = evaluate.load("wer")
    >>> wer_score = wer.compute(predictions=predictions, references=references)
    >>> print(wer_score)
    0.5
""", stored examples: 0)
EvaluationModule(name: "cer", module_type: "metric", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', i

In [13]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,      # Ensure pad token is set
    vocab_size=processor.tokenizer.vocab_size           # Match tokenizer vocab size
)

# Ensure blank token is the last ID
model.config.blank_token_id = processor.tokenizer.pad_token_id

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
model.freeze_feature_encoder()

In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="../results/trainingResults",
  group_by_length=True,
  per_device_train_batch_size=15,
  eval_strategy="steps",
  num_train_epochs=10,
  fp16=True,
  gradient_checkpointing=True, 
  save_steps=500,
  eval_steps=500,
  logging_steps=100,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  save_total_limit=2,
)




In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset,
    eval_dataset=dataset.select(range(501,600)),
    tokenizer=processor.feature_extractor,
)

  trainer = Trainer(


In [17]:
trainer.train()

  0%|          | 0/1930 [00:00<?, ?it/s]



{'loss': 16.056, 'grad_norm': 5.04547119140625, 'learning_rate': 1e-05, 'epoch': 0.52}
{'loss': 3.9266, 'grad_norm': 2.114565372467041, 'learning_rate': 2e-05, 'epoch': 1.04}
{'loss': 3.3656, 'grad_norm': 1.6932415962219238, 'learning_rate': 3e-05, 'epoch': 1.55}
{'loss': 3.1222, 'grad_norm': 0.7612019777297974, 'learning_rate': 4e-05, 'epoch': 2.07}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'loss': 3.0622, 'grad_norm': 1.3215218782424927, 'learning_rate': 5e-05, 'epoch': 2.59}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 3.106966257095337, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 19.7211, 'eval_samples_per_second': 5.02, 'eval_steps_per_second': 0.659, 'epoch': 2.59}




{'loss': 2.9822, 'grad_norm': 0.6667632460594177, 'learning_rate': 6e-05, 'epoch': 3.11}
{'loss': 2.9849, 'grad_norm': 1.7284296751022339, 'learning_rate': 7e-05, 'epoch': 3.63}
{'loss': 2.9444, 'grad_norm': 0.475057989358902, 'learning_rate': 8e-05, 'epoch': 4.15}
{'loss': 2.8906, 'grad_norm': 1.8983972072601318, 'learning_rate': 9e-05, 'epoch': 4.66}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'loss': 2.2207, 'grad_norm': 6.36513614654541, 'learning_rate': 0.0001, 'epoch': 5.18}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 1.623173713684082, 'eval_wer': 0.9963702359346642, 'eval_cer': 0.4419551934826884, 'eval_runtime': 20.3483, 'eval_samples_per_second': 4.865, 'eval_steps_per_second': 0.639, 'epoch': 5.18}




{'loss': 1.6034, 'grad_norm': 3.404778480529785, 'learning_rate': 8.924731182795699e-05, 'epoch': 5.7}
{'loss': 1.3315, 'grad_norm': 7.657089710235596, 'learning_rate': 7.849462365591398e-05, 'epoch': 6.22}
{'loss': 1.1388, 'grad_norm': 6.335774898529053, 'learning_rate': 6.774193548387096e-05, 'epoch': 6.74}
{'loss': 1.0105, 'grad_norm': 1.8491891622543335, 'learning_rate': 5.6989247311827965e-05, 'epoch': 7.25}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'loss': 0.8962, 'grad_norm': 1.3926045894622803, 'learning_rate': 4.6236559139784944e-05, 'epoch': 7.77}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6198934316635132, 'eval_wer': 0.6061705989110708, 'eval_cer': 0.14697895451459606, 'eval_runtime': 19.0176, 'eval_samples_per_second': 5.206, 'eval_steps_per_second': 0.684, 'epoch': 7.77}




{'loss': 0.8398, 'grad_norm': 1.906684398651123, 'learning_rate': 3.548387096774194e-05, 'epoch': 8.29}
{'loss': 0.7679, 'grad_norm': 2.2868077754974365, 'learning_rate': 2.4731182795698928e-05, 'epoch': 8.81}
{'loss': 0.7246, 'grad_norm': 2.1066315174102783, 'learning_rate': 1.3978494623655914e-05, 'epoch': 9.33}
{'loss': 0.6844, 'grad_norm': 1.978676438331604, 'learning_rate': 3.225806451612903e-06, 'epoch': 9.84}
{'train_runtime': 10991.2117, 'train_samples_per_second': 2.626, 'train_steps_per_second': 0.176, 'train_loss': 2.7357519337550347, 'epoch': 10.0}


TrainOutput(global_step=1930, training_loss=2.7357519337550347, metrics={'train_runtime': 10991.2117, 'train_samples_per_second': 2.626, 'train_steps_per_second': 0.176, 'total_flos': 7.102520233114181e+17, 'train_loss': 2.7357519337550347, 'epoch': 10.0})

In [18]:
# Example audio file
test_audio = dataset[800]["audio"]  # Replace with your test audio data
print(dataset[800]['text'])
# Process the audio input
input_values = processor(test_audio, sampling_rate=16000, return_tensors="pt").input_values

# Move input to GPU if available
if torch.cuda.is_available():
    input_values = input_values.to("cuda")
    model.to("cuda")
    print("There is cuda!")

# Get the model's predictions
with torch.no_grad():
    logits = model(input_values).logits

# Decode the predictions to text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

print("Transcription:", transcription)


ah aku si semua makanan suka ya
Transcription: ['kallaku si sema makanan suka ya']




In [23]:
trainer.save_model("../savedModel")
processor.save_pretrained("../savedModel")

[]