# **Fine-tuning Wav2Vec2 for Amharic ASR with Transformers**

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Run nvidia-smi through the IPython shell escape; its output lines are joined
# into a single string below.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The cell treats the word 'failed' in that output as "no GPU attached";
# otherwise it prints the full nvidia-smi report.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Mar 31 15:17:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
%%capture
!pip install transformers[torch]
!pip install datasets
!pip install jiwer

In [None]:
!pip install accelerate -U



In [None]:
# NOTE(review): `peft` is not among the packages installed by the pip cells of
# this notebook, and `prepare_model_for_int8_training` is not referenced in the
# visible cells -- confirm whether this import is still needed.
from peft import prepare_model_for_int8_training

ModuleNotFoundError: No module named 'peft'

In [None]:
%%capture
!apt install git-lfs

## Prepare Data, Tokenizer, Feature Extractor

### Create Wav2Vec2CTCTokenizer

Let's start by loading the dataset and taking a look at its structure.

In [None]:
# cp -r /content/gdrive/MyDrive/NewModel/ALFFAAmharic

In [None]:
from datasets import load_dataset, load_metric
# amha = load_dataset("Samuael/ALFFAAmharic", data_dir="/content/gdrive/MyDrive/NewModel/CommonTTest/")
# amha = load_dataset("Samuael/ALFFAAmharic", data_dir="/content/gdrive/MyDrive/NewModel/CommonVoiceTigreTest/")
amha = load_dataset("Samuael/ALFFAAmharic", data_dir="/content/gdrive/MyDrive/NewModel/ALFFAAmharicTest/")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
import IPython.display as ipd
import numpy as np
import random

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random selection of distinct rows of ``dataset`` as an HTML table.

    Args:
        dataset: Sized, indexable dataset. It is indexed with a list of
            integer positions and the result is handed to ``pd.DataFrame``.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, which replaces the hand-written
    # "retry until the index is new" loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(amha["train"].remove_columns(["audio", "file"]), num_examples=4)

In [None]:
#@title Removing Special Characters
import re
# Punctuation removed from every transcript.
# Raw string: the backslashes belong to the regex, not to Python string escapes
# (the non-raw form relies on invalid escape sequences and triggers a warning).
chars_to_ignore_regex = r'[\,\?\.\-\;\:\"]'

def remove_special_characters(batch):
    """Clean the transcript of one example.

    Removes every character matched by ``chars_to_ignore_regex`` from
    ``batch["text"]``, lowercases the result and appends a single trailing
    space.

    Args:
        batch: Mapping with a string under the key ``"text"``.

    Returns:
        The same ``batch`` object with ``"text"`` overwritten.
    """
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch

amha = amha.map(remove_special_characters)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/359 [00:00<?, ? examples/s]

In [None]:
show_random_elements(amha["train"].remove_columns(["audio", "file"]))

In [None]:
def extract_all_chars(batch):
  """Gather the distinct characters used in a batch of transcripts.

  The strings in ``batch["text"]`` are joined with single spaces. The result
  holds that joined text and the list of its unique characters, each wrapped
  in a one-element list.
  """
  joined_text = " ".join(batch["text"])
  unique_chars = [*set(joined_text)]
  summary = {}
  summary["vocab"] = [unique_chars]
  summary["all_text"] = [joined_text]
  return summary

In [None]:
vocabs = amha.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=amha.column_names["train"])

Map:   0%|          | 0/268 [00:00<?, ? examples/s]

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Now, we create the union of all distinct letters in the training dataset and test dataset and convert the resulting list into an enumerated dictionary.

In [None]:
#@title Vocab List
# Union of the characters seen in the train and test splits. Sorting makes the
# character -> id assignment reproducible; iterating a bare set of strings can
# yield a different order from one interpreter session to the next.
vocab_list = sorted(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
vocab_dict = {char: idx for idx, char in enumerate(vocab_list)}
vocab_dict

In [None]:
!cp /content/gdrive/MyDrive/NewModel/ALFFAAmharic/vocab_map_new.json .
!cp /content/gdrive/MyDrive/NewModel/ALFFAAmharic/phoneme_map.json .
!cp /content/gdrive/MyDrive/NewModel/ALFFAAmharic/char_map.json .
!cp /content/gdrive/MyDrive/NewModel/ALFFAAmharic/testing_chars.txt .

In a final step, we use the json file to instantiate an object of the `Wav2Vec2CTCTokenizer` class.

In [None]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("/content/vocab_map_new.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [None]:
repo_name = "asr-amharic-phoneme-based-38"

In [None]:
tokenizer.push_to_hub(repo_id="Samuael/"+repo_name)

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Samuael/asr-amharic-phoneme-based-38/commit/105bbf28add4067dd1215e7e8f127f6e909e4dcb', commit_message='Upload tokenizer', commit_description='', oid='105bbf28add4067dd1215e7e8f127f6e909e4dcb', pr_url=None, pr_revision=None, pr_num=None)

### Create Wav2Vec2 Feature Extractor

In [None]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [None]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

Next, we can prepare the dataset.

In [None]:
!cd /content/gdrive/MyDrive/NewModel/ALFFAAmharic/train/wav/ && ls -1 | wc -l

^C


In [None]:
!cd /content/gdrive/MyDrive/NewModel/ALFFAAmharic/train/wav_old/ && ls -1 | wc -l

^C


In [None]:
amha["train"][0]["file"]

'/content/gdrive/MyDrive/NewModel/ALFFAAmharicTest/train/wav/tr_10000_tr097082.wav'

In [None]:
amha["train"][0]["audio"]

{'path': '/content/gdrive/MyDrive/NewModel/ALFFAAmharicTest/train/wav/tr_10000_tr097082.wav',
 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.12573242,
        -0.12634277, -0.12634277]),
 'sampling_rate': 16000}

In [None]:
# random.randint includes both endpoints, so the upper bound must be the last
# valid index; using len(...) itself could raise IndexError.
rand_int = random.randint(0, len(amha["train"]) - 1)

print(amha["train"][rand_int]["text"])
ipd.Audio(data=np.asarray(amha["train"][rand_int]["audio"]["array"]), autoplay=True, rate=16000)

ይኧ  ጥኧ ምኧ ን ጅኣ  ትኧ ክኡ ስ  ትኧ ክኧ ፍኧ ትኧ  ንኣ  ኣ ርኣ ት  ይኧ ኤ ር ት ርኣ  ውኧ ትኣ ድኧ ርኦ ች  ትኧ ግኧ ድኧ ልኡ 


NameError: name 'ipd' is not defined

In [None]:
# random.randint includes both endpoints, so the upper bound must be the last
# valid index; using len(...) itself could raise IndexError.
rand_int = random.randint(0, len(amha["train"]) - 1)

# Fetch the example once instead of re-indexing the dataset for every field.
example = amha["train"][rand_int]
print("Target text:", example["text"])
print("Input array shape:", np.asarray(example["audio"]["array"]).shape)
print("Sampling rate:", example["audio"]["sampling_rate"])

In [None]:
def prepare_dataset(batch):
    """Convert one dataset example into model inputs and label ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"``) and
    ``batch["text"]``; writes ``"input_values"``, ``"input_length"`` and
    ``"labels"`` back into ``batch`` and returns it.
    """
    waveform = batch["audio"]["array"]
    rate = batch["audio"]["sampling_rate"]

    # The processor output is batched; keep only its first (single) entry.
    audio_features = processor(waveform, sampling_rate=rate)
    batch["input_values"] = audio_features.input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Within this context the processor is applied to the target transcript.
    with processor.as_target_processor():
        encoded_text = processor(batch["text"])
    batch["labels"] = encoded_text.input_ids
    return batch

In [None]:
amha = amha.map(prepare_dataset, remove_columns=amha.column_names["train"], num_proc=16)

num_proc must be <= 4. Reducing num_proc to 4 for dataset of size 4.


## Training & Evaluation


### Set-up Trainer

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate CTC examples into one padded batch.

    Audio inputs and label sequences are padded in two separate
    ``processor.pad`` calls; label positions whose attention mask is not 1
    are then overwritten with -100.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            Processor whose ``pad`` method is applied to inputs and labels.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Forwarded unchanged as the ``padding`` argument of both
            ``processor.pad`` calls.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Inputs and labels have different lengths and need different padding,
        # so they are separated before being padded.
        audio_items = []
        target_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
            target_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")

        with self.processor.as_target_processor():
            padded_targets = self.processor.pad(target_items, padding=self.padding, return_tensors="pt")

        # Padded label positions become -100 so the loss ignores them.
        is_padding = padded_targets.attention_mask.ne(1)
        batch["labels"] = padded_targets["input_ids"].masked_fill(is_padding, -100)
        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")

  wer_metric = load_metric("wer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


The model will return a sequence of logit vectors:
$\mathbf{y}_1, \ldots, \mathbf{y}_m$ with $\mathbf{y}_1 = f_{\theta}(x_1, \ldots, x_n)[0]$ and $n \gg m$.

A logit vector $\mathbf{y}_i$ contains the log-odds for each token in the vocabulary we defined earlier, thus $\text{len}(\mathbf{y}_i) =$ `config.vocab_size`. We are interested in the most likely prediction of the model and thus take the `argmax(...)` of the logits. Also, we transform the encoded labels back to the original string by replacing `-100` with the `pad_token_id` and decoding the ids while making sure that consecutive tokens are **not** grouped to the same token in CTC style ${}^1$.

In [None]:
def map_to_result(batch):
  """Transcribe one example with the global ``model`` and decode its labels.

  Adds two keys to ``batch`` and returns it:
    * ``"pred_str"``: the argmax of the model logits, decoded by ``processor``.
    * ``"text"``: ``batch["labels"]`` decoded with ``group_tokens=False``.
  """
  with torch.no_grad():
    # Put the input on whatever device the model lives on instead of assuming
    # CUDA, so the function also works when the model is on the CPU.
    input_values = torch.tensor(batch["input_values"], device=model.device).unsqueeze(0)
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)
  return batch

In [None]:
# tigre_phonemeMap= {"ህኡኣ": "ኋ",    "ልኡኣ": "ሏ",    "ምኡኣ": "ሟ",    "ርኡኣ": "ሯ",    "ስኡኣ": "ሷ",    "ሽኡኣ": "ሿ",    "ቅኡኣ": "ቋ",    "ብኡኣ": "ቧ",    "ቭኡኣ": "ቯ",    "ትኡኣ": "ቷ",    "ችኡኣ": "ቿ",    "ንኡኣ": "ኗ",    "ኝኡኣ": "ኟ",    "ክኡኣ": "ኳ",    "ዝኡኣ": "ዟ",    "ዥኡኣ": "ዧ",    "ድኡኣ": "ዷ",    "ጅኡኣ": "ጇ",    "ግኡኣ": "ጓ",    "ጥኡኣ": "ጧ",    "ጭኡኣ": "ጯ",    "ጵኡኣ": "ጷ",    "ጽኡኣ": "ጿ",    "ፍኡኣ": "ፏ",    "ፕኡኣ": "ፗ",    "ግኡኤ": "ጔ",    "ክኡኤ": "ኴ",    "ህኡኤ": "ኌ",    "ቅኡኤ": "ቌ",    "ግኡኢ": "ጒ",    "ክኡኢ": "ኲ",    "ቅኡኢ": "ቊ",    "ፍኡኢ": "ᎉ",    "ግኡእ": "ጕ",    "ህኡኢ": "ዂ",    "ክኡእ": "ኵ",    "ቅኡእ": "ቍ",    "ምኢኣ": "ፙ",    "ፍኢኣ": "ፚ",    "ህኡእ": "ዅ",    "ፍኡኤ": "ᎊ",    "ፍኡኧ": "ᎋ",    "ርኢኣ": "ፘ",    "ፕኡኧ": "ᎌ",    "ፕኡኢ": "ᎍ",    "ትኡኤ": "ᎎ",    "ፕኡኤ": "ᎏ",    "ቅህኧ": "ቐ",    "ቅህኡ": "ቑ",    "ቅህኢ": "ቒ",    "ቅህኣ": "ቓ","ቅህኤ": "ቔ","ቅህእ": "ቕ", "ቅህኦ": "ቖ",    "ቅህኡኢ":"ቚ", "ቅህኡኣ":"ቛ", "ቅህኡኤ":"ቜ",    "ህ": "ህ",    "ል": "ል",    "ም": "ም",    "ር": "ር",    "ስ": "ስ",    "ሽ": "ሽ",    "ቅ": "ቅ",    "ብ": "ብ",    "ቭ": "ቭ",    "ት": "ት",    "ች": "ች",    "ን": "ን",    "ኝ": "ኝ",    "ክ": "ክ",    "ው": "ው",    "ዝ": "ዝ",    "ዥ": "ዥ",    "ይ": "ይ",    "ድ": "ድ",    "ጅ": "ጅ",    "ግ": "ግ",    "ጥ": "ጥ",    "ጭ": "ጭ",    "ጵ": "ጵ",    "ጽ": "ጽ",    "ፍ": "ፍ",    "ፕ": "ፕ",    "ኧ": "ኧ",    "ኣ": "አ",    "ኡ": "ኡ",    "ኢ": "ኢ",    "ኤ": "ኤ",    "እ": "እ",    "ኦ": "ኦ",    "ህኧ": "ኸ",    "ህኣ": "ሀ",    "ህኡ": "ሁ",    "ህኢ": "ሂ",    "ህኤ": "ሄ",    "ህኦ": "ሆ",    "ልኧ": "ለ",    "ልኡ": "ሉ",    "ልኢ": "ሊ",    "ልኣ": "ላ",    "ልኤ": "ሌ",    "ልኦ": "ሎ",    "ምኧ": "መ",    "ምኡ": "ሙ",    "ምኢ": "ሚ",    "ምኣ": "ማ",    "ምኤ": "ሜ",    "ምኦ": "ሞ",    "ርኧ": "ረ",    "ርኡ": "ሩ",    "ርኢ": "ሪ",    "ርኣ": "ራ",    "ርኤ": "ሬ",    "ርኦ": "ሮ",    "ስኧ": "ሰ",    "ስኡ": "ሱ",    "ስኢ": "ሲ",    "ስኣ": "ሳ",    "ስኤ": "ሴ",    "ስኦ": "ሶ",    "ሽኧ": "ሸ",    "ሽኡ": "ሹ",    "ሽኢ": "ሺ",    "ሽኣ": "ሻ",    "ሽኤ": "ሼ",    "ሽኦ": "ሾ",    "ቅኧ": "ቀ",    "ቅኡ": "ቁ",    "ቅኢ": "ቂ",    "ቅኣ": "ቃ",    "ቅኤ": "ቄ",    "ቅኦ": "ቆ",    "ብኧ": "በ",    "ብኡ": "ቡ",    "ብኢ": "ቢ",    "ብኣ": "ባ",    "ብኤ": "ቤ",    "ብኦ": "ቦ",    "ቭኧ": "ቨ",    "ቭኡ": "ቩ",   
 "ቭኢ": "ቪ",    "ቭኣ": "ቫ",    "ቭኤ": "ቬ",    "ቭኦ": "ቮ",    "ትኧ": "ተ",    "ትኡ": "ቱ",    "ትኢ": "ቲ",    "ትኣ": "ታ",    "ትኤ": "ቴ",    "ትኦ": "ቶ",    "ችኧ": "ቸ",    "ችኡ": "ቹ",    "ችኢ": "ቺ",    "ችኣ": "ቻ",    "ችኤ": "ቼ",    "ችኦ": "ቾ",    "ንኧ": "ነ",    "ንኡ": "ኑ",    "ንኢ": "ኒ",    "ንኣ": "ና",    "ንኤ": "ኔ",    "ንኦ": "ኖ",    "ኝኧ": "ኘ",    "ኝኡ": "ኙ",    "ኝኢ": "ኚ",    "ኝኣ": "ኛ",    "ኝኤ": "ኜ",    "ኝኦ": "ኞ",    "ክኧ": "ከ",    "ክኡ": "ኩ",    "ክኢ": "ኪ",    "ክኣ": "ካ",    "ክኤ": "ኬ",    "ክኦ": "ኮ",    "ውኧ": "ወ",    "ውኡ": "ዉ",    "ውኢ": "ዊ",    "ውኣ": "ዋ",    "ውኤ": "ዌ",    "ውኦ": "ዎ",    "ዝኧ": "ዘ",    "ዝኡ": "ዙ",    "ዝኢ": "ዚ",    "ዝኣ": "ዛ",    "ዝኤ": "ዜ",    "ዝኦ": "ዞ",    "ዥኧ": "ዠ",    "ዥኡ": "ዡ",    "ዥኢ": "ዢ",    "ዥኣ": "ዣ",    "ዥኤ": "ዤ",    "ዥኦ": "ዦ",    "ይኧ": "የ",    "ይኡ": "ዩ",    "ይኢ": "ዪ",    "ይኣ": "ያ",    "ይኤ": "ዬ",    "ይኦ": "ዮ",    "ድኧ": "ደ",    "ድኡ": "ዱ",    "ድኢ": "ዲ",    "ድኣ": "ዳ",    "ድኤ": "ዴ",    "ድኦ": "ዶ",    "ጅኧ": "ጀ",    "ጅኡ": "ጁ",    "ጅኢ": "ጂ",    "ጅኣ": "ጃ",    "ጅኤ": "ጄ",    "ጅኦ": "ጆ",    "ግኧ": "ገ",    "ግኡ": "ጉ",    "ግኢ": "ጊ",    "ግኣ": "ጋ",    "ግኤ": "ጌ",    "ግኦ": "ጐ",    "ጥኧ": "ጠ",    "ጥኡ": "ጡ",    "ጥኢ": "ጢ",    "ጥኣ": "ጣ",    "ጥኤ": "ጤ",    "ጥኦ": "ጦ",    "ጭኧ": "ጨ",    "ጭኡ": "ጩ",    "ጭኢ": "ጪ",    "ጭኣ": "ጫ",    "ጭኤ": "ጬ",    "ጭኦ": "ጮ",    "ጵኧ": "ጰ",    "ጵኡ": "ጱ",    "ጵኢ": "ጲ",    "ጵኣ": "ጳ",    "ጵኤ": "ጴ",    "ጵኦ": "ጶ",    "ጽኧ": "ጸ",    "ጽኡ": "ጹ",    "ጽኢ": "ጺ",    "ጽኣ": "ጻ",    "ጽኤ": "ጼ",    "ጽኦ": "ጾ",    "ፍኧ": "ፈ",    "ፍኡ": "ፉ",    "ፍኢ": "ፊ",    "ፍኣ": "ፋ",    "ፍኤ": "ፌ",    "ፍኦ": "ፎ",    "ፕኧ": "ፐ",    "ፕኡ": "ፑ",    "ፕኢ": "ፒ",    "ፕኣ": "ፓ",    "ፕኤ": "ፔ",    "ፕኦ": "ፖ"}
phonemeToCharMap = {
    "ቕኡኢ":"ቚ", "ቕኡኣ":"ቛ", "ቕኡኤ":"ቜ", "ህኡኣ": "ኋ",    "ልኡኣ": "ሏ",    "ምኡኣ": "ሟ",    "ርኡኣ": "ሯ",    "ስኡኣ": "ሷ",    "ሽኡኣ": "ሿ",    "ቅኡኣ": "ቋ",    "ብኡኣ": "ቧ",    "ቭኡኣ": "ቯ",    "ትኡኣ": "ቷ",    "ችኡኣ": "ቿ",    "ንኡኣ": "ኗ",    "ኝኡኣ": "ኟ",    "ክኡኣ": "ኳ",    "ዝኡኣ": "ዟ",    "ዥኡኣ": "ዧ",    "ድኡኣ": "ዷ",    "ጅኡኣ": "ጇ",    "ግኡኣ": "ጓ",    "ጥኡኣ": "ጧ",    "ጭኡኣ": "ጯ",    "ጵኡኣ": "ጷ",    "ጽኡኣ": "ጿ",    "ፍኡኣ": "ፏ",    "ፕኡኣ": "ፗ",    "ግኡኤ": "ጔ",    "ክኡኤ": "ኴ",    "ህኡኤ": "ኌ",    "ቅኡኤ": "ቌ",    "ግኡኢ": "ጒ",    "ክኡኢ": "ኲ",    "ቅኡኢ": "ቊ",    "ፍኡኢ": "ᎉ",    "ግኡእ": "ጕ",    "ህኡኢ": "ዂ",    "ክኡእ": "ኵ",    "ቅኡእ": "ቍ",    "ምኢኣ": "ፙ",    "ፍኢኣ": "ፚ",    "ህኡእ": "ዅ",    "ፍኡኤ": "ᎊ",    "ፍኡኧ": "ᎋ",    "ርኢኣ": "ፘ",    "ፕኡኧ": "ᎌ",    "ፕኡኢ": "ᎍ",    "ትኡኤ": "ᎎ",    "ፕኡኤ": "ᎏ",    "ቕኧ": "ቐ",    "ቕኡ": "ቑ",    "ቕኢ": "ቒ",    "ቕኣ": "ቓ","ቕኤ": "ቔ","ቕ": "ቕ", "ቕኦ": "ቖ",    "ህ": "ህ",    "ል": "ል",    "ም": "ም",    "ር": "ር",    "ስ": "ስ",    "ሽ": "ሽ",    "ቅ": "ቅ",    "ብ": "ብ",    "ቭ": "ቭ",    "ት": "ት",    "ች": "ች",    "ን": "ን",    "ኝ": "ኝ",    "ክ": "ክ",    "ው": "ው",    "ዝ": "ዝ",    "ዥ": "ዥ",    "ይ": "ይ",    "ድ": "ድ",    "ጅ": "ጅ",    "ግ": "ግ",    "ጥ": "ጥ",    "ጭ": "ጭ",    "ጵ": "ጵ",    "ጽ": "ጽ",    "ፍ": "ፍ",    "ፕ": "ፕ",    "ህኧ": "ኸ",    "ህኣ": "ሀ",    "ህኡ": "ሁ",    "ህኢ": "ሂ",    "ህኤ": "ሄ",    "ህኦ": "ሆ",    "ልኧ": "ለ",    "ልኡ": "ሉ",    "ልኢ": "ሊ",    "ልኣ": "ላ",    "ልኤ": "ሌ",    "ልኦ": "ሎ",    "ምኧ": "መ",    "ምኡ": "ሙ",    "ምኢ": "ሚ",    "ምኣ": "ማ",    "ምኤ": "ሜ",    "ምኦ": "ሞ",    "ርኧ": "ረ",    "ርኡ": "ሩ",    "ርኢ": "ሪ",    "ርኣ": "ራ",    "ርኤ": "ሬ",    "ርኦ": "ሮ",    "ስኧ": "ሰ",    "ስኡ": "ሱ",    "ስኢ": "ሲ",    "ስኣ": "ሳ",    "ስኤ": "ሴ",    "ስኦ": "ሶ",    "ሽኧ": "ሸ",    "ሽኡ": "ሹ",    "ሽኢ": "ሺ",    "ሽኣ": "ሻ",    "ሽኤ": "ሼ",    "ሽኦ": "ሾ",    "ቅኧ": "ቀ",    "ቅኡ": "ቁ",    "ቅኢ": "ቂ",    "ቅኣ": "ቃ",    "ቅኤ": "ቄ",    "ቅኦ": "ቆ",    "ብኧ": "በ",    "ብኡ": "ቡ",    "ብኢ": "ቢ",    "ብኣ": "ባ",    "ብኤ": "ቤ",    "ብኦ": "ቦ",    "ቭኧ": "ቨ",    "ቭኡ": "ቩ",    "ቭኢ": "ቪ",    "ቭኣ": "ቫ",    "ቭኤ": "ቬ",    "ቭኦ": "ቮ",    "ትኧ": "ተ",    "ትኡ": "ቱ",    "ትኢ": "ቲ",    "ትኣ": "ታ",    "ትኤ": 
"ቴ",    "ትኦ": "ቶ",    "ችኧ": "ቸ",    "ችኡ": "ቹ",    "ችኢ": "ቺ",    "ችኣ": "ቻ",    "ችኤ": "ቼ",    "ችኦ": "ቾ",    "ንኧ": "ነ",    "ንኡ": "ኑ",    "ንኢ": "ኒ",    "ንኣ": "ና",    "ንኤ": "ኔ",    "ንኦ": "ኖ",    "ኝኧ": "ኘ",    "ኝኡ": "ኙ",    "ኝኢ": "ኚ",    "ኝኣ": "ኛ",    "ኝኤ": "ኜ",    "ኝኦ": "ኞ",    "ክኧ": "ከ",    "ክኡ": "ኩ",    "ክኢ": "ኪ",    "ክኣ": "ካ",    "ክኤ": "ኬ",    "ክኦ": "ኮ",    "ውኧ": "ወ",    "ውኡ": "ዉ",    "ውኢ": "ዊ",    "ውኣ": "ዋ",    "ውኤ": "ዌ",    "ውኦ": "ዎ",    "ዝኧ": "ዘ",    "ዝኡ": "ዙ",    "ዝኢ": "ዚ",    "ዝኣ": "ዛ",    "ዝኤ": "ዜ",    "ዝኦ": "ዞ",    "ዥኧ": "ዠ",    "ዥኡ": "ዡ",    "ዥኢ": "ዢ",    "ዥኣ": "ዣ",    "ዥኤ": "ዤ",    "ዥኦ": "ዦ",    "ይኧ": "የ",    "ይኡ": "ዩ",    "ይኢ": "ዪ",    "ይኣ": "ያ",    "ይኤ": "ዬ",    "ይኦ": "ዮ",    "ድኧ": "ደ",    "ድኡ": "ዱ",    "ድኢ": "ዲ",    "ድኣ": "ዳ",    "ድኤ": "ዴ",    "ድኦ": "ዶ",    "ጅኧ": "ጀ",    "ጅኡ": "ጁ",    "ጅኢ": "ጂ",    "ጅኣ": "ጃ",    "ጅኤ": "ጄ",    "ጅኦ": "ጆ",    "ግኧ": "ገ",    "ግኡ": "ጉ",    "ግኢ": "ጊ",    "ግኣ": "ጋ",    "ግኤ": "ጌ",    "ግኦ": "ጐ",    "ጥኧ": "ጠ",    "ጥኡ": "ጡ",    "ጥኢ": "ጢ",    "ጥኣ": "ጣ",    "ጥኤ": "ጤ",    "ጥኦ": "ጦ",    "ጭኧ": "ጨ",    "ጭኡ": "ጩ",    "ጭኢ": "ጪ",    "ጭኣ": "ጫ",    "ጭኤ": "ጬ",    "ጭኦ": "ጮ",    "ጵኧ": "ጰ",    "ጵኡ": "ጱ",    "ጵኢ": "ጲ",    "ጵኣ": "ጳ",    "ጵኤ": "ጴ",    "ጵኦ": "ጶ",    "ጽኧ": "ጸ",    "ጽኡ": "ጹ",    "ጽኢ": "ጺ",    "ጽኣ": "ጻ",    "ጽኤ": "ጼ",    "ጽኦ": "ጾ",    "ፍኧ": "ፈ",    "ፍኡ": "ፉ",    "ፍኢ": "ፊ",    "ፍኣ": "ፋ",    "ፍኤ": "ፌ",    "ፍኦ": "ፎ",    "ፕኧ": "ፐ",    "ፕኡ": "ፑ",    "ፕኢ": "ፒ",    "ፕኣ": "ፓ",    "ፕኤ": "ፔ",    "ፕኦ": "ፖ", "ኧ": "ኧ",    " ኣ": "አ","ኣ": "አ",    " ኡ": "ኡ",    " ኢ": "ኢ",    " ኤ": "ኤ",    " እ": "እ",    " ኦ": "ኦ"
}

tigrePhonemeToCharMap = {
 "ቕኡኢ":"ቚ",  "ቕኡኣ":"ቛ",  "ቕኡኤ":"ቜ",  "ህኡኣ": "ኋ",     "ልኡኣ": "ሏ",     "ምኡኣ": "ሟ",     "ርኡኣ": "ሯ",     "ስኡኣ": "ሷ",     "ሽኡኣ": "ሿ",     "ቅኡኣ": "ቋ",     "ብኡኣ": "ቧ",     "ቭኡኣ": "ቯ",     "ትኡኣ": "ቷ",     "ችኡኣ": "ቿ",     "ንኡኣ": "ኗ",     "ኝኡኣ": "ኟ",     "ክኡኣ": "ኳ",     "ዝኡኣ": "ዟ",     "ዥኡኣ": "ዧ",     "ድኡኣ": "ዷ",     "ጅኡኣ": "ጇ",     "ግኡኣ": "ጓ",     "ጥኡኣ": "ጧ",     "ጭኡኣ": "ጯ",     "ጵኡኣ": "ጷ",     "ጽኡኣ": "ጿ",     "ፍኡኣ": "ፏ",     "ፕኡኣ": "ፗ",     "ግኡኤ": "ጔ",     "ክኡኤ": "ኴ",     "ህኡኤ": "ኌ",     "ቅኡኤ": "ቌ",     "ግኡኢ": "ጒ",     "ክኡኢ": "ኲ",     "ቅኡኢ": "ቊ",     "ፍኡኢ": "ᎉ",     "ግኡእ": "ጕ",     "ህኡኢ": "ዂ",     "ክኡእ": "ኵ",     "ቅኡእ": "ቍ",     "ምኢኣ": "ፙ",     "ፍኢኣ": "ፚ",     "ህኡእ": "ዅ",     "ፍኡኤ": "ᎊ",     "ፍኡኧ": "ᎋ",     "ርኢኣ": "ፘ",     "ፕኡኧ": "ᎌ",     "ፕኡኢ": "ᎍ",     "ትኡኤ": "ᎎ",     "ፕኡኤ": "ᎏ",     "ቕኧ": "ቐ",     "ቕኡ": "ቑ",     "ቕኢ": "ቒ",     "ቕኣ": "ቓ", "ቕኤ": "ቔ", "ቕ": "ቕ",  "ቕኦ": "ቖ",     "ህ": "ህ",     "ል": "ል",     "ም": "ም",     "ር": "ር",     "ስ": "ስ",     "ሽ": "ሽ",     "ቅ": "ቅ",     "ብ": "ብ",     "ቭ": "ቭ",     "ት": "ት",     "ች": "ች",     "ን": "ን",     "ኝ": "ኝ",     "ክ": "ክ",     "ው": "ው",     "ዝ": "ዝ",     "ዥ": "ዥ",     "ይ": "ይ",     "ድ": "ድ",     "ጅ": "ጅ",     "ግ": "ግ",     "ጥ": "ጥ",     "ጭ": "ጭ",     "ጵ": "ጵ",     "ጽ": "ጽ",     "ፍ": "ፍ",     "ፕ": "ፕ",     "ህኧ": "ኸ",     "ህኣ": "ሓ",     "ህኡ": "ሑ",     "ህኢ": "ሒ",     "ህኤ": "ሔ",     "ህኦ": "ሖ",     "ልኧ": "ለ",     "ልኡ": "ሉ",     "ልኢ": "ሊ",     "ልኣ": "ላ",     "ልኤ": "ሌ",     "ልኦ": "ሎ",     "ምኧ": "መ",     "ምኡ": "ሙ",     "ምኢ": "ሚ",     "ምኣ": "ማ",     "ምኤ": "ሜ",     "ምኦ": "ሞ",     "ርኧ": "ረ",     "ርኡ": "ሩ",     "ርኢ": "ሪ",     "ርኣ": "ራ",     "ርኤ": "ሬ",     "ርኦ": "ሮ",     "ስኧ": "ሰ",     "ስኡ": "ሱ",     "ስኢ": "ሲ",     "ስኣ": "ሳ",     "ስኤ": "ሴ",     "ስኦ": "ሶ",     "ሽኧ": "ሸ",     "ሽኡ": "ሹ",     "ሽኢ": "ሺ",     "ሽኣ": "ሻ",     "ሽኤ": "ሼ",     "ሽኦ": "ሾ",     "ቅኧ": "ቀ",     "ቅኡ": "ቁ",     "ቅኢ": "ቂ",     "ቅኣ": "ቃ",     "ቅኤ": "ቄ",     "ቅኦ": "ቆ",     "ብኧ": "በ",     "ብኡ": "ቡ",     "ብኢ": "ቢ",     "ብኣ": "ባ",     "ብኤ": "ቤ",     "ብኦ": "ቦ",     "ቭኧ": "ቨ",     
"ቭኡ": "ቩ",     "ቭኢ": "ቪ",     "ቭኣ": "ቫ",     "ቭኤ": "ቬ",     "ቭኦ": "ቮ",     "ትኧ": "ተ",     "ትኡ": "ቱ",     "ትኢ": "ቲ",     "ትኣ": "ታ",     "ትኤ": "ቴ",     "ትኦ": "ቶ",     "ችኧ": "ቸ",     "ችኡ": "ቹ",     "ችኢ": "ቺ",     "ችኣ": "ቻ",     "ችኤ": "ቼ",     "ችኦ": "ቾ",     "ንኧ": "ነ",     "ንኡ": "ኑ",     "ንኢ": "ኒ",     "ንኣ": "ና",     "ንኤ": "ኔ",     "ንኦ": "ኖ",     "ኝኧ": "ኘ",     "ኝኡ": "ኙ",     "ኝኢ": "ኚ",     "ኝኣ": "ኛ",     "ኝኤ": "ኜ",     "ኝኦ": "ኞ",     "ክኧ": "ከ",     "ክኡ": "ኩ",     "ክኢ": "ኪ",     "ክኣ": "ካ",     "ክኤ": "ኬ",     "ክኦ": "ኮ",     "ውኧ": "ወ",     "ውኡ": "ዉ",     "ውኢ": "ዊ",     "ውኣ": "ዋ",     "ውኤ": "ዌ",     "ውኦ": "ዎ",     "ዝኧ": "ዘ",     "ዝኡ": "ዙ",     "ዝኢ": "ዚ",     "ዝኣ": "ዛ",     "ዝኤ": "ዜ",     "ዝኦ": "ዞ",     "ዥኧ": "ዠ",     "ዥኡ": "ዡ",     "ዥኢ": "ዢ",     "ዥኣ": "ዣ",     "ዥኤ": "ዤ",     "ዥኦ": "ዦ",     "ይኧ": "የ",     "ይኡ": "ዩ",     "ይኢ": "ዪ",     "ይኣ": "ያ",     "ይኤ": "ዬ",     "ይኦ": "ዮ",     "ድኧ": "ደ",     "ድኡ": "ዱ",     "ድኢ": "ዲ",     "ድኣ": "ዳ",     "ድኤ": "ዴ",     "ድኦ": "ዶ",     "ጅኧ": "ጀ",     "ጅኡ": "ጁ",     "ጅኢ": "ጂ",     "ጅኣ": "ጃ",     "ጅኤ": "ጄ",     "ጅኦ": "ጆ",     "ግኧ": "ገ",     "ግኡ": "ጉ",     "ግኢ": "ጊ",     "ግኣ": "ጋ",     "ግኤ": "ጌ",     "ግኦ": "ጐ",     "ጥኧ": "ጠ",     "ጥኡ": "ጡ",     "ጥኢ": "ጢ",     "ጥኣ": "ጣ",     "ጥኤ": "ጤ",     "ጥኦ": "ጦ",     "ጭኧ": "ጨ",     "ጭኡ": "ጩ",     "ጭኢ": "ጪ",     "ጭኣ": "ጫ",     "ጭኤ": "ጬ",     "ጭኦ": "ጮ",     "ጵኧ": "ጰ",     "ጵኡ": "ጱ",     "ጵኢ": "ጲ",     "ጵኣ": "ጳ",     "ጵኤ": "ጴ",     "ጵኦ": "ጶ",     "ጽኧ": "ጸ",     "ጽኡ": "ጹ",     "ጽኢ": "ጺ",     "ጽኣ": "ጻ",     "ጽኤ": "ጼ",     "ጽኦ": "ጾ",     "ፍኧ": "ፈ",     "ፍኡ": "ፉ",     "ፍኢ": "ፊ",     "ፍኣ": "ፋ",     "ፍኤ": "ፌ",     "ፍኦ": "ፎ",     "ፕኧ": "ፐ",     "ፕኡ": "ፑ",     "ፕኢ": "ፒ",     "ፕኣ": "ፓ",     "ፕኤ": "ፔ",     "ፕኦ": "ፖ",  "ኧ": "ኧ",     " ኣ": "ኣ",     " ኡ": "ኡ",     " ኢ": "ኢ",     " ኤ": "ኤ",     " እ": "እ",     " ኦ": "ኦ"
}

In [None]:
def mapBack(sent:str, tigre=False) -> str:
  """Convert a phoneme-level transcript back to characters.

  Every key of the selected mapping is replaced by its value wherever it
  occurs in the text. Replacements are applied in the mapping's iteration
  order, so the order in which the mapping lists its entries affects the
  result.

  Args:
    sent: Phoneme-level text; leading and trailing whitespace is ignored.
    tigre: Use ``tigrePhonemeToCharMap`` when true, otherwise
      ``phonemeToCharMap``.

  Returns:
    The converted text with surrounding whitespace stripped.
  """
  theMap = tigrePhonemeToCharMap if tigre else phonemeToCharMap
  sent = sent.strip()
  # str.replace is a no-op when the key is absent, so no membership test is needed.
  for phoneme, char in theMap.items():
    sent = sent.replace(phoneme, char)
  return sent.strip()

In [None]:
sample = "ኣግኣፍኣርኢ ውኦችኣችኧው ቕኧ ብኧ  ኣድኣንኡ ም  ኣቅኡኣምኣችኧው ልኧ  እንኧርስኡ ህኣርችኦ ንኣ ምኡብትኧው ክኧ ስኧጥኡት ድግኣፍ  ኣይ ልኧይም"
print(mapBack(sample))

አጋፋሪ ዎቻቸው ቐ በ አዳኑ ም አቋማቸው ለ እነርሱ ሀርቾ ና ሙብተው ከ ሰጡት ድጋፍ አይ ለይም


In [None]:
sample = "ኣግኣፍኣርኢ ውኦችኣችኧው ቕኧ ብኧ  ኣድኣንኡ ም  ኣቅኡኣምኣችኧው ልኧ  እንኧርስኡ ህኣርችኦ ንኣ ምኡብትኧው ክኧ ስኧጥኡት ድግኣፍ  ኣይ ልኧይም"
print(mapBack(sample, tigre=True))

ኣጋፋሪ ዎቻቸው ቐ በ ኣዳኑ ም ኣቋማቸው ለ እነርሱ ሓርቾ ና ሙብተው ከ ሰጡት ድጋፍ ኣይ ለይም


In [None]:
# Reference transcripts that compute_metrics compares the converted predictions
# against, read from val.spa. readlines() keeps each line's trailing newline.
# The with-block closes the file even if reading fails.
with open("val.spa", "r", encoding="utf-8") as fille:
    true_value = fille.readlines()

In [None]:
def compute_metrics(pred):
    """Compute WER and CER figures for a set of evaluation predictions.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``.
            ``label_ids`` is modified in place: entries equal to -100 are
            replaced with the tokenizer's pad token id.

    Returns:
        Dict with ``"wer"`` and ``"phoneme_cer"`` (decoded predictions against
        decoded labels) and ``"cer"`` (predictions converted with ``mapBack``
        against the module-level ``true_value`` references).
    """
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 label entries for the pad token id before decoding.
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(predicted_ids)
    # Labels are decoded without grouping tokens for the metric computation.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    metrics = {
        "wer": wer_metric.compute(predictions=pred_str, references=label_str),
        "phoneme_cer": cer_metric.compute(predictions=pred_str, references=label_str),
    }

    converted = [mapBack(text) for text in pred_str]
    metrics["cer"] = cer_metric.compute(predictions=converted, references=true_value)
    return metrics

In [None]:
import gc
gc.collect()

23

In [None]:
import torch
from transformers import Wav2Vec2ForCTC

In [None]:
model = Wav2Vec2ForCTC.from_pretrained(
    "Samuael/asr-amharic-phoneme-based-38",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)
# model.lm_head=  torch.nn.Sequential(
#     torch.nn.Linear(in_features=1024, out_features=2048, bias=True),
#     torch.nn.LayerNorm((2048,), eps=1e-05, elementwise_affine=True),
#     torch.nn.LeakyReLU(),
#     torch.nn.Dropout(0.15),
#     torch.nn.Linear(in_features=2048, out_features=224, bias=True),
#     torch.nn.LayerNorm((224,), eps=1e-05, elementwise_affine=True),
#     torch.nn.LeakyReLU(),
#     torch.nn.Dropout(0.15),
#     torch.nn.Linear(in_features=224, out_features=37, bias=True),
# )
# model.vocab_size=37
# model.eos_token_id=37
# model.config.vocab_size=37

In [None]:
# Append one extra output row to the CTC head, filled with 1e-4. The row takes
# its width, dtype and device from the existing weight instead of assuming a
# hidden size of 1024 on the CPU in float32.
head_weight = model.lm_head.weight
extra_row = torch.full((1, head_weight.shape[1]), 0.0001, dtype=head_weight.dtype, device=head_weight.device)
model.lm_head.weight = torch.nn.Parameter(torch.vstack((head_weight, extra_row)))

In [None]:
model.lm_head.weight.shape

torch.Size([38, 1024])

In [None]:
model.config.vocab_size=38

In [None]:
model.lm_head.weight

In [None]:
# Record the new vocabulary size of 38 on the model and its config.
# NOTE(review): with 38 output rows the valid token ids would be 0..37, so
# eos_token_id=38 looks out of range -- confirm. Presumably only
# model.config.vocab_size is consulted by the library; verify whether the
# attributes set directly on `model` have any effect.
model.vocab_size=38
model.eos_token_id=38
model.config.vocab_size=38

In [None]:
model.lm_head.bias = torch.nn.Parameter(torch.hstack((model.lm_head.bias, torch.tensor([0.0001]))))

In [None]:
model.lm_head.weight.shape

torch.Size([38, 1024])

In [None]:
# The bias must keep one entry per row of lm_head.weight. It is deliberately
# not reset to an empty torch.nn.Parameter(): a zero-length bias cannot be
# added to the logits, and the model is pushed to the Hub right after this.
assert model.lm_head.bias.shape[0] == model.lm_head.weight.shape[0], (
    "lm_head.bias and lm_head.weight disagree on the number of output tokens"
)

In [None]:
model.push_to_hub("Samuael/asr-amharic-phoneme-based-38")

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Samuael/asr-amharic-phoneme-based-38/commit/3dc05692df48f9251a5180e0a7f6636b743f6f2d', commit_message='Upload Wav2Vec2ForCTC', commit_description='', oid='3dc05692df48f9251a5180e0a7f6636b743f6f2d', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
model.freeze_feature_encoder()

In [None]:
from transformers import TrainingArguments
import torch
from transformers import Trainer

In [None]:
training_args = TrainingArguments(
  output_dir="/content/gdrive/MyDrive/Wav2Vec_model/"+repo_name,
  group_by_length=True,
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=20,
  # bf16=True,
  fp16=True,
  gradient_checkpointing=True,
  save_steps=200,
  eval_steps=200,
  # logging_steps=2,
  learning_rate=3e-5,
  weight_decay=0.0005,
  warmup_steps=100,
  save_total_limit=6,
)

In [None]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=amha["train"],
    eval_dataset=amha["test"],
    tokenizer=processor.feature_extractor,
)

### Training

In [None]:
# amha["train"][0]

In [None]:
# Override settings on the already-constructed trainer: the learning rate is
# set to 3e-5 again and the per-device batch size is raised from the 16 given
# to TrainingArguments to 32.
# NOTE(review): `use_retrant` looks like a misspelling of `use_reentrant`; as
# written this line only sets an attribute named `use_retrant` on the trainer
# object -- confirm what was intended.
trainer.args.learning_rate = 3e-5
trainer.args.per_device_train_batch_size=32
trainer.use_retrant=False

In [None]:
import gc

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()
# trainer.use_retrant=False
# trainer.train()

In [None]:
#@title Remove model
del model
del trainer

import gc
gc.collect()

### Evaluate

In the final part, we run our model on some of the validation data to get a feeling for how well it works.

Let's load the `processor` and `model`.

In [None]:
processor = Wav2Vec2Processor.from_pretrained("Samuael/asr-amharic-phoneme-based-233")

In [None]:
model = Wav2Vec2ForCTC.from_pretrained("Samuael/asr-amharic-phoneme-based-233").cuda()

In [None]:
model=model.cuda()

In [None]:
results = amha["test"].map(map_to_result,) # remove_columns=amha["train"].column_names)

In [None]:
print("Test Phoneme CER: {:.3f}".format(cer_metric.compute(predictions=results["pred_str"], references=results["text"])))

In [None]:
print("Test Phoneme WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Compute WER now.

In [None]:
print("Test Phoneme CER: {:.3f}".format(cer_metric.compute(predictions=results["pred_str"], references=results["text"])))

In [None]:
print("Test Phoneme WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))