In [None]:
from datasets import load_dataset

# Common Voice 11.0, Arabic configuration: the train and validation splits are
# merged into one training set; the test split is loaded separately.
cv_dataset_id, cv_config = "mozilla-foundation/common_voice_11_0", "ar"

common_voice_train = load_dataset(cv_dataset_id, cv_config, split="train+validation")
common_voice_test = load_dataset(cv_dataset_id, cv_config, split="test")

In [None]:
import os

# Target language and the pretrained checkpoint this notebook starts from.
language_code, language_name = 'ar', 'arabic'
base_model = "facebook/wav2vec2-large-xlsr-53"

# Output locations under the Kaggle working directory. Only
# `new_output_models_dir` is created at the end of this cell.
output_models_dir = f"/kaggle/working/output_models3/{language_code}/wav2vec2-large-xlsr-{language_name}-demo"
new_output_models_dir = f"/kaggle/working/output_models3/{language_code}/wav2vec2-large-xlsr-{language_name}"

# Echo the configuration, one setting per line.
print(
    f"Base model: {base_model}",
    f"Output models directory: {output_models_dir}",
    f"New output models directory: {new_output_models_dir}",
    sep="\n",
)

# Create the output directory; an already existing directory is not an error.
os.makedirs(new_output_models_dir, exist_ok=True)


Base model: facebook/wav2vec2-large-xlsr-53
Output models directory: /kaggle/working/output_models3/ar/wav2vec2-large-xlsr-arabic-demo
New output models directory: /kaggle/working/output_models3/ar/wav2vec2-large-xlsr-arabic


In [None]:
# Metadata columns that no later cell in this notebook reads; drop them from both splits.
unused_columns = ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]

common_voice_train = common_voice_train.remove_columns(unused_columns)
common_voice_test = common_voice_test.remove_columns(unused_columns)

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()`` and indexing with a list of row
            indices; the indexed result is handed to ``pd.DataFrame``.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # random.sample draws distinct indices directly. It replaces a manual
    # draw-and-retry loop that did a linear membership scan per draw and
    # retried more and more often as num_examples approached len(dataset).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

# Preview ten random rows with the path and audio columns removed, so only the
# remaining columns are tabulated.
text_only_train = common_voice_train.remove_columns(["path", "audio"])
show_random_elements(text_only_train, num_examples=10)

Unnamed: 0,sentence
0,آسفة ، أنا مشغولة الآن.
1,قال الذين استكبروا إنا بالذي آمنتم به كافرون
2,.لم يكن لديّ وقت للأكل
3,غير مسموح لك أن تصعد إلى سيارته.
4,قَبَّحَهُ اللَّهُ أَتَرَوْنَهُ لَوْ زَادُوهُ فَعَلَ ؟ وَعَزَلَهُ
5,فَأَجَاءَهَا الْمَخَاضُ إِلَى جِذْعِ النَّخْلَةِ قَالَتْ يَا لَيْتَنِي مِتُّ قَبْلَ هَذَا وَكُنْتُ نَسْيًا مَنْسِيًّا
6,وَأَقْلَعَ عَنْ سَالِفِ زَلَلِهِ
7,كان ثمّة رجل آخر في حياة ليلى.
8,يجب ان يحبك
9,وَقَلْبٌ هَائِمٌ


In [None]:
import re
# Punctuation and symbols stripped from every transcript. This is a raw string
# so the regex escapes reach `re` unchanged instead of relying on Python
# passing unknown escape sequences through (which emits a warning).
chars_to_ignore_regex = r'[\,\؟\.\!\-\;\:\'\"\☭\«\»\؛\—\ـ\_\،\“\%\‘\”\�]'

# Arabic diacritics and tatweel, written as code points so the combining marks
# stay visible in source. Compiled once instead of on every example.
_ARABIC_DIACRITICS_RE = re.compile(
    "["
    "\u0651"  # Tashdid (shadda)
    "\u064e"  # Fatha
    "\u064b"  # Tanwin Fath
    "\u064f"  # Damma
    "\u064c"  # Tanwin Damm
    "\u0650"  # Kasra
    "\u064d"  # Tanwin Kasr
    "\u0652"  # Sukun
    "\u0640"  # Tatwil/Kashida
    "]"
)


def remove_special_characters(batch):
    """Normalize one example's transcript.

    Steps applied to ``batch["sentence"]``, in order:
      1. remove every character matched by ``chars_to_ignore_regex``;
      2. lowercase the text and append one trailing space;
      3. remove the Latin letters a-z;
      4. replace the alef variants with the bare alef;
      5. remove diacritics and tatweel.

    Args:
        batch: Mapping with a string stored under ``"sentence"``.

    Returns:
        The same mapping object, with ``"sentence"`` replaced by the normalized text.
    """
    text = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    text = re.sub('[a-z]', '', text)
    text = re.sub("[إأٱآا]", "ا", text)
    batch["sentence"] = _ARABIC_DIACRITICS_RE.sub('', text)
    return batch


# Normalize the transcripts of both splits, then preview a random sample of the
# cleaned training text.
common_voice_train, common_voice_test = (
    split.map(remove_special_characters)
    for split in (common_voice_train, common_voice_test)
)
show_random_elements(common_voice_train.remove_columns(["path","audio"]))

Map:   0%|          | 0/38481 [00:00<?, ? examples/s]

Map:   0%|          | 0/10440 [00:00<?, ? examples/s]

Unnamed: 0,sentence
0,هي تحتاجه اكثر مما يحتاجها
1,لا تنس نقودك
2,متى ينتهي الدوام المدرسي
3,من كان هنا فاما ان يسبح او يغرق
4,هل تحتاج حقا لكلب اضافي
5,انت هو حياتي يا بني
6,وذكر ان في التوراة مكتوبا يا ابن ادم اذكرني حين تغضب اذكرك حين اغضب
7,ولقد راه نزلة اخرى
8,هل عندك سيارة
9,الناس يخطئون


In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters used by a batch of transcripts.

    Args:
        batch: Mapping whose ``"sentence"`` entry is a list of strings.

    Returns:
        Dict with two single-element lists: ``"vocab"`` wraps the list of unique
        characters (in no particular order) and ``"all_text"`` wraps the
        transcripts joined by single spaces.
    """
    joined = " ".join(batch["sentence"])
    unique_chars = [*set(joined)]
    return dict(vocab=[unique_chars], all_text=[joined])

import json

# Run extract_all_chars over each split as one single batch (batch_size=-1),
# keeping only the columns the function returns.
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

# Sort the merged character set: set iteration order is not stable between
# interpreter runs, and sorting makes every run assign the same id to each
# character.
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_dict = {char: idx for idx, char in enumerate(vocab_list)}

# Hand the id of the space character over to "|", then append the two special
# tokens at the end of the id range.
vocab_dict["|"] = vocab_dict.pop(" ")
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

with open('vocab.json', 'w', encoding="utf-8") as vocab_file:
    json.dump(vocab_dict, vocab_file)

Map:   0%|          | 0/38481 [00:00<?, ? examples/s]

Map:   0%|          | 0/10440 [00:00<?, ? examples/s]

In [None]:
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor

# CTC tokenizer built from the vocabulary file on disk, with the same special
# tokens that were added to that vocabulary.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor configured for 16 kHz input, with normalization and
# attention masks enabled.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# Bundle both into a single processor object used by the rest of the notebook.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
# Show the file path stored for the first training example.
common_voice_train[0]["path"]

'/root/.cache/huggingface/datasets/downloads/extracted/03e3da23e5fa2d7de56094c40cc7ea529b3b571b215978f053d1fd26557901a6/ar_train_0/common_voice_ar_24082672.mp3'

In [None]:
from datasets import Audio
# Declare the audio column of both splits as 16 kHz, the sampling rate the
# feature extractor is configured with. A bare `common_voice_train[0]["audio"]`
# expression used to precede these lines; its value was discarded (it was not
# the cell's last expression), so it has been dropped.
common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16_000))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
# Inspect the first training example's audio entry (path, sample array, sampling rate).
common_voice_train[0]["audio"]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/03e3da23e5fa2d7de56094c40cc7ea529b3b571b215978f053d1fd26557901a6/ar_train_0/common_voice_ar_24082672.mp3',
 'array': array([ 8.73114914e-11, -4.36557457e-11,  1.67347025e-10, ...,
        -2.33121682e-08, -2.25205440e-07,  4.55183908e-08]),
 'sampling_rate': 16000}

In [None]:
import IPython.display as ipd
import numpy as np
import random

# Pick one random training example and fetch it once, instead of indexing the
# dataset separately for the transcript and for the audio.
rand_int = random.randint(0, len(common_voice_train)-1)
sample = common_voice_train[rand_int]

print(sample["sentence"])
# Play back at the sampling rate stored with the clip rather than a hard-coded 16000.
ipd.Audio(data=sample["audio"]["array"], autoplay=True, rate=sample["audio"]["sampling_rate"])

اعمل مع سالي في المكتب نفسه 


In [None]:
# Pick one random training example and fetch it once; the three values printed
# below all come from that single lookup instead of three separate ones.
rand_int = random.randint(0, len(common_voice_train)-1)
example = common_voice_train[rand_int]
example_audio = example["audio"]

print("Target text:", example["sentence"])
print("Input array shape:", example_audio["array"].shape)
print("Sampling rate:", example_audio["sampling_rate"])

Target text: بدت كانها سكرت 
Input array shape: (56832,)
Sampling rate: 16000


In [None]:
def prepare_dataset(batch):
    """Convert one example into model inputs and label ids.

    Args:
        batch: A single example with an ``"audio"`` entry (providing ``"array"``
            and ``"sampling_rate"``) and a ``"sentence"`` string.

    Returns:
        The same mapping, with ``"input_values"``, ``"input_length"`` and
        ``"labels"`` added.
    """
    audio = batch["audio"]

    # The processor returns a batch of one here; [0] unwraps it.
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Tokenize the transcript through the `text` argument. This replaces the
    # `as_target_processor()` context manager, which transformers has deprecated.
    batch["labels"] = processor(text=batch["sentence"]).input_ids
    return batch

# Run prepare_dataset over both splits; all pre-existing columns are dropped so
# only the fields it adds remain.
common_voice_train = common_voice_train.map(
    prepare_dataset,
    remove_columns=common_voice_train.column_names,
)
common_voice_test = common_voice_test.map(
    prepare_dataset,
    remove_columns=common_voice_test.column_names,
)

Map:   0%|          | 0/38481 [00:00<?, ? examples/s]



Map:   0%|          | 0/10440 [00:00<?, ? examples/s]

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
import numpy as np
from evaluate import load  # Using 'evaluate' library instead of 'datasets'
from transformers import Wav2Vec2Processor  # Assuming Wav2Vec2Processor is imported from transformers
from typing import List, Dict, Union, Optional
from dataclasses import dataclass

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and the labels it receives.

    Inputs (``input_values``) and labels are padded separately, each with its
    own maximum length and multiple-of setting, and padded label positions are
    replaced by -100.

    Args:
        processor (`transformers.Wav2Vec2Processor`):
            The processor used for processing the data.
        padding (`bool` or `str`, optional, defaults to `True`):
            Padding strategy passed to the processor for both inputs and labels:
            * `True` or `'longest'`: pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * `'max_length'`: pad to the maximum length given by `max_length` / `max_length_labels`, or to the
              maximum acceptable input length for the model if that argument is not provided.
            * `False` or `'do_not_pad'`: no padding (i.e., can output a batch with sequences of different
              lengths).
        max_length (`int`, optional):
            Maximum length of the `input_values` of the returned list and optionally padding length (see above).
        max_length_labels (`int`, optional):
            Maximum length of the `labels` returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, optional):
            If set, pads the `input_values` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute
            capability >= 7.0 (Volta).
        pad_to_multiple_of_labels (`int`, optional):
            Same as `pad_to_multiple_of`, applied to the `labels`.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch of tensors.

        Args:
            features: Examples, each providing ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` tensor in which
            padded positions are set to -100.
        """
        # Inputs and labels are padded by separate calls, so split them first.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Passing the label features through `labels=` replaces the
        # `as_target_processor()` context manager, which transformers has deprecated.
        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

# Word error rate metric, loaded through the `evaluate` library.
wer_metric = load("wer")

# Collator instance built around the notebook's processor (which must already
# be defined), with padding enabled.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

def compute_metrics(pred):
    """Compute the word error rate for a set of evaluation predictions.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max over the
            last axis gives the predicted token ids) and ``label_ids``
            (reference token ids, where -100 marks positions to ignore).

    Returns:
        Dict with a single ``"wer"`` entry holding the metric value.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 is not a real token id, so swap it for the pad id before decoding.
    # np.where builds a new array, leaving the caller's label_ids untouched
    # (the previous in-place assignment overwrote them).
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # References are decoded without grouping repeated tokens.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
import tempfile
from transformers import Wav2Vec2ForCTC

# Load the pretrained checkpoint named by `base_model` in the configuration
# cell (the checkpoint id was previously repeated here as a literal), sizing
# the CTC head and pad id from the processor's tokenizer.
model = Wav2Vec2ForCTC.from_pretrained(
    base_model,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

# Freeze the feature encoder. `freeze_feature_encoder()` is the current name of
# the deprecated `freeze_feature_extractor()`.
model.freeze_feature_encoder()

config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments

from transformers import Trainer

# Training configuration. Checkpoints go to `new_output_models_dir`; evaluation,
# checkpointing and logging all happen every 400 steps.
training_args = TrainingArguments(
    output_dir=new_output_models_dir,
    group_by_length=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    dataloader_num_workers=1,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="steps",
    num_train_epochs=5,
    fp16=True,
    save_steps=400,
    eval_steps=400,
    logging_steps=400,
    learning_rate=1e-4,
    warmup_steps=500,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # printed a deprecation warning repeatedly during training.
    processing_class=processor.feature_extractor,
)
trainer.train()

# Save the trained model
trainer.save_model("./wav2vec2-arabic-model")

# Save the processor (feature extractor + tokenizer)
processor.save_pretrained("./wav2vec2-arabic-processor")

print("Model and processor saved successfully.")


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  self.pid = os.fork()
  self.pid = os.fork()


Step,Training Loss,Validation Loss,Wer
400,9.6278,3.030787,1.0
800,2.2366,0.663298,0.768855
1200,0.7164,0.419083,0.588736
1600,0.539,0.358852,0.537523
2000,0.4755,0.336499,0.512864
2400,0.4381,0.31716,0.500657
2800,0.4115,0.314824,0.491042


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  self.pid = os.fork()
  self.pid = os.fork()
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  self.pid = os.fork()
  self.pid = os.fork()
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You s

Model and processor saved successfully.


In [None]:
import shutil

# Pack each saved directory into a .zip archive named after it.
for archive_base, source_dir in (
    ("wav2vec2-arabic-model", "./wav2vec2-arabic-model"),
    ("wav2vec2-arabic-processor", "./wav2vec2-arabic-processor"),
):
    shutil.make_archive(archive_base, 'zip', source_dir)

print("Model and processor zipped successfully.")


Model and processor zipped successfully.
