<h1 style="text-align: center;">Fine-Tuning Wav2Vec2 for Badaga ASR</h1>


* **Wav2Vec2** is a pretrained model for Automatic Speech Recognition (ASR)  
* *Facebook AI* presented a multi-lingual version of Wav2Vec2, called XLSR. XLSR stands for *cross-lingual speech representations* and refers to the model's ability to learn speech representations that are useful across multiple languages.


* Similar to BERT's masked language modeling objective, XLSR learns contextualized speech representations by randomly masking feature vectors before passing them to a transformer network during self-supervised pre-training.


![wav2vec2_structure](https://raw.githubusercontent.com/patrickvonplaten/scientific_images/master/xls_r.png)



###  Checking GPU Availability

In [1]:
import torch

# Report CUDA status. Device details are only queried when a GPU is present,
# because torch.cuda.current_device() / get_device_name() raise on CPU-only machines.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should return True if GPU is available
if cuda_available:
    current_gpu_index = torch.cuda.current_device()
    print(current_gpu_index)  # Should show the current GPU index
    print(torch.cuda.get_device_name(current_gpu_index))  # Should display the GPU name


True
0
NVIDIA GeForce RTX 3050 Laptop GPU


###  Data Loading and Preprocessing

In [2]:
# Importing the libraries
import os
import pandas as pd
import re
import librosa
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Define absolute paths
# NOTE(review): this is a machine-specific Windows path; point it at the local
# copy of the corpus before running the notebook.
data_directory = r"C:\Users\T H E J\Desktop\Badaga_Corpus-v.0.1.0"
# File name of the Excel workbook with the transcriptions (joined onto data_directory below).
tagged_file = "Badaga-v0.1.0.xlsx"

In [4]:

# Locations of the transcription workbook and of the audio clips folder
tagged_file_path = os.path.join(data_directory, tagged_file)
clips_directory = os.path.join(data_directory, "clips")

# Read the workbook and discard every row that has a missing value
data_frame = pd.read_excel(tagged_file_path).dropna()

# Turn the bare clip file names into full paths inside the "clips" folder
data_frame["audio_file_name"] = [
    os.path.join(clips_directory, clip_name) for clip_name in data_frame["audio_file_name"]
]

# Rows labelled "test" form the test split; everything else is used for training
is_test_row = data_frame["split_label"] == "test"
train_df = data_frame[~is_test_row]
test_df = data_frame[is_test_row]

In [5]:
# printing the shape of the train and test
train_df.shape, test_df.shape

((8365, 9), (1469, 9))

### Audio Loading and Dataset Preparation

In [6]:
# loading the audio using librosa
def load_audio_records(file_paths, sampling_rate=16000):
    """Load every audio file in ``file_paths`` with librosa.

    Args:
        file_paths: iterable of paths to audio files.
        sampling_rate: target sampling rate passed to ``librosa.load`` as ``sr``.

    Returns:
        A list with one dict per file, holding the file ``'path'``, the loaded
        waveform under ``'array'`` and the rate returned by librosa under
        ``'sampling_rate'``.
    """
    records = []
    for file_path in file_paths:
        waveform, loaded_rate = librosa.load(file_path, sr=sampling_rate)
        records.append({
            'path': file_path,
            'array': waveform,
            'sampling_rate': loaded_rate
        })
    return records


# One list of audio records per split (train -> audio, test -> audio1)
audio = load_audio_records(list(train_df["audio_file_name"]))
audio1 = load_audio_records(list(test_df["audio_file_name"]))
    



In [7]:
# Build the column dictionaries for both splits: a running integer id per clip,
# the loaded audio record, and the corresponding transliterated sentence.
train_size = len(audio)
test_size = len(audio1)

train_dict = {
    'client_id': list(range(0, train_size)),
    'audio': audio,
    "sentence": list(train_df["translterated_script"]),
}
test_dict = {
    # test ids continue where the train ids stop
    'client_id': list(range(train_size, train_size + test_size)),
    'audio': audio1,
    "sentence": list(test_df["translterated_script"]),
}

In [8]:
# creating a dataset for the above created dictionary
# (each dictionary key becomes a dataset column: client_id, audio, sentence)
train_dataset = Dataset.from_dict(train_dict)
test_dataset = Dataset.from_dict(test_dict)

In [9]:
# printing the column names
train_dataset.column_names

['client_id', 'audio', 'sentence']

In [10]:
# printing the first example (row) using its index
train_dataset[0]

{'client_id': 0,
 'audio': {'array': [-4.381633131409629e-13,
   -4.547211402989443e-13,
   -3.9910945642124274e-13,
   -3.477597441785274e-13,
   -5.892689764960823e-13,
   -7.35186759560913e-13,
   -1.9754379067867672e-13,
   -6.340125906917349e-13,
   -3.9618181239993444e-13,
   1.3508462469273563e-13,
   -3.499395326463095e-14,
   -4.2430406519029795e-13,
   1.7334330043410062e-13,
   -4.451782367222157e-13,
   -1.756699440861459e-13,
   -4.5253256979356504e-13,
   -8.702850723060762e-13,
   1.7466076870399638e-14,
   1.5608089094180239e-13,
   -5.420372267347928e-13,
   -3.7012819024961896e-13,
   3.2685109501752463e-13,
   -6.801431976215938e-14,
   -3.582878893744479e-13,
   -1.0914449140482507e-13,
   -3.098158936429979e-13,
   -4.894797132719286e-13,
   -6.201866277477652e-13,
   -9.631275811605722e-13,
   -1.0823816615126297e-13,
   -4.2352512013947574e-13,
   -6.471780576722264e-13,
   -7.760326127363715e-14,
   3.445004698177101e-13,
   -4.1060592968246434e-13,
   -9.605296

### Text Cleaning: Removing Special Characters

In [None]:
# function to remove the special characters
# Raw string: the backslashes reach the regex engine untouched. As a plain literal,
# `\,`, `\?`, ... are invalid string escape sequences and trigger warnings.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'


def remove_special_characters(batch):
    """Normalize the transcription of one example.

    Removes every character matched by ``chars_to_ignore_regex`` from
    ``batch["sentence"]``, upper-cases the result and appends a single
    trailing space.

    Args:
        batch: dict with a ``"sentence"`` string; it is updated in place.

    Returns:
        The same dict with the cleaned ``"sentence"``.
    """
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).upper() + " "
    return batch

In [12]:
# removing the special characters from train and test using map function
# (map calls remove_special_characters on one example at a time)
train_dataset = train_dataset.map(remove_special_characters)
test_dataset = test_dataset.map(remove_special_characters)

Map: 100%|██████████████████████████████████████████████| 8365/8365 [00:01<00:00, 5172.07 examples/s]
Map: 100%|██████████████████████████████████████████████| 1469/1469 [00:00<00:00, 9757.52 examples/s]


In [13]:
# checking the dataset after removing special characters
train_dataset[0]

{'client_id': 0,
 'audio': {'array': [-4.381633131409629e-13,
   -4.547211402989443e-13,
   -3.9910945642124274e-13,
   -3.477597441785274e-13,
   -5.892689764960823e-13,
   -7.35186759560913e-13,
   -1.9754379067867672e-13,
   -6.340125906917349e-13,
   -3.9618181239993444e-13,
   1.3508462469273563e-13,
   -3.499395326463095e-14,
   -4.2430406519029795e-13,
   1.7334330043410062e-13,
   -4.451782367222157e-13,
   -1.756699440861459e-13,
   -4.5253256979356504e-13,
   -8.702850723060762e-13,
   1.7466076870399638e-14,
   1.5608089094180239e-13,
   -5.420372267347928e-13,
   -3.7012819024961896e-13,
   3.2685109501752463e-13,
   -6.801431976215938e-14,
   -3.582878893744479e-13,
   -1.0914449140482507e-13,
   -3.098158936429979e-13,
   -4.894797132719286e-13,
   -6.201866277477652e-13,
   -9.631275811605722e-13,
   -1.0823816615126297e-13,
   -4.2352512013947574e-13,
   -6.471780576722264e-13,
   -7.760326127363715e-14,
   3.445004698177101e-13,
   -4.1060592968246434e-13,
   -9.605296

###  Vocabulary File for CTC Tokenizer

We have created a JSON vocabulary file (`stt-vocab.json`) for use with the `Wav2Vec2CTCTokenizer`.  
This file maps each character or token to a unique integer ID. Here's a breakdown of its contents:

---

###  Character-Level Vocabulary (For CTC)

The vocabulary includes:

#### 1. **Special Tokens (for CTC and Transformers)**
| Token     | Description                                                                 |
|-----------|-----------------------------------------------------------------------------|
| `<pad>`   | Padding token — used to align sequences of varying lengths.                 |
| `<s>`     | Start of sentence — often required for seq2seq models (not used in CTC).    |
| `</s>`    | End of sentence — same as above, reserved but not always used.              |
| `<unk>`   | Unknown token — used when the model encounters an unseen character.         |
| `|`       | Word delimiter token — replaces whitespace between words.                   |

#### 2. **Punctuation/Other Symbols**
| Symbol | Use Case                        |
|--------|----------------------------------|
| `'`    | Apostrophe in contractions.     |
| `-`    | Hyphen in compound words.       |

#### 3. **Uppercase Letters A-Z**
- Each letter from `A` to `Z` is treated as a **distinct token**.
- The model will output these characters during inference.

```json
{
  "<pad>": 0,
  "<s>": 1,
  "</s>": 2,
  "<unk>": 3,
  "|": 4,
  "'": 5,
  "-": 6,
  "A": 7,
  "B": 8,
  ...
  "Z": 32
}
```


### Initializing the Wav2Vec2 CTC Tokenizer

We initialize the `Wav2Vec2CTCTokenizer` with a custom vocabulary JSON file for CTC-based speech recognition.  
Special tokens like `<unk>`, `<pad>`, and `|` are defined for unknown characters, padding, and word separation.


In [14]:
from transformers import Wav2Vec2CTCTokenizer

# The vocabulary file lives in the corpus folder, so derive its path from
# data_directory instead of repeating the absolute path.
vocab_file = os.path.join(data_directory, "stt-vocab.json")
tokenizer = Wav2Vec2CTCTokenizer(vocab_file, unk_token="<unk>", pad_token="<pad>", word_delimiter_token="|")

### Creating `Wav2Vec2FeatureExtractor`

The `Wav2Vec2FeatureExtractor` is used to preprocess raw audio input for the Wav2Vec2 model.

- `feature_size=1`: Wav2Vec2 expects a single feature from raw audio.
- `sampling_rate=16000`: The audio sampling rate used during training.
- `padding_value=0.0`: Used to pad shorter audio sequences for batching.
- `do_normalize=True`: Normalizes the audio for better model performance.
- `return_attention_mask=True`: Enables attention masking, especially needed for XLS-R models.


In [15]:
# Build the feature extractor that prepares raw waveforms for the model
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,  # same rate the clips were loaded with
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

### Using `Wav2Vec2Processor`

* The `Wav2Vec2Processor` combines both the tokenizer and feature extractor into a single class.  
* This simplifies training and inference by allowing you to use just the `processor` along with the model.


In [16]:
# loading the processor which takes the tokenizer and feature extractor to process
from transformers import Wav2Vec2Processor

# Single object wrapping both components; used below for preprocessing,
# padding in the data collator, and decoding predictions.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [17]:
# inspecting the first example again; the dataset is not transformed until `prepare_dataset` is mapped over it below
train_dataset[0]

{'client_id': 0,
 'audio': {'array': [-4.381633131409629e-13,
   -4.547211402989443e-13,
   -3.9910945642124274e-13,
   -3.477597441785274e-13,
   -5.892689764960823e-13,
   -7.35186759560913e-13,
   -1.9754379067867672e-13,
   -6.340125906917349e-13,
   -3.9618181239993444e-13,
   1.3508462469273563e-13,
   -3.499395326463095e-14,
   -4.2430406519029795e-13,
   1.7334330043410062e-13,
   -4.451782367222157e-13,
   -1.756699440861459e-13,
   -4.5253256979356504e-13,
   -8.702850723060762e-13,
   1.7466076870399638e-14,
   1.5608089094180239e-13,
   -5.420372267347928e-13,
   -3.7012819024961896e-13,
   3.2685109501752463e-13,
   -6.801431976215938e-14,
   -3.582878893744479e-13,
   -1.0914449140482507e-13,
   -3.098158936429979e-13,
   -4.894797132719286e-13,
   -6.201866277477652e-13,
   -9.631275811605722e-13,
   -1.0823816615126297e-13,
   -4.2352512013947574e-13,
   -6.471780576722264e-13,
   -7.760326127363715e-14,
   3.445004698177101e-13,
   -4.1060592968246434e-13,
   -9.605296

###  Preparing Audio and Transcriptions for Wav2Vec2 Training using `Wav2Vec2Processor`

* Finally, we can leverage `Wav2Vec2Processor` to process the data to the format expected by `Wav2Vec2ForCTC` for training. To do so let's make use of Dataset's 'map' function.

* First, we read the audio data from `batch["audio"]`; it was already loaded and resampled to 16 kHz with `librosa` when the dataset was built.
* Second, we extract the `input_values` from the loaded audio file. In our case, the `Wav2Vec2Processor` only normalizes the data.

* Third, we encode the transcriptions to label ids.

* **Note**: In "normal" context, calling `processor(...)` is redirected to `Wav2Vec2FeatureExtractor`'s call method. When wrapping the processor into the `as_target_processor` context, however, the same method is redirected to `Wav2Vec2CTCTokenizer`'s call method. The `as_target_processor` context manager is deprecated in recent versions of `transformers`; calling `processor.tokenizer(...)` directly (or `processor(text=...)`) tokenizes the transcription in the same way.

In [18]:
# function to pass the audio files in batch
def prepare_dataset(batch):
    """Convert one dataset example into model inputs and CTC labels.

    Args:
        batch: a single example with an ``"audio"`` dict (``"array"`` and
            ``"sampling_rate"`` keys) and a ``"sentence"`` string.

    Returns:
        The same dict with two added keys: ``"input_values"`` (the processor's
        output for the waveform) and ``"labels"`` (token ids of the sentence).
    """
    audio = batch["audio"]

    # the processor returns a batch of size one; [0] "un-batches" it
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]

    # Encode the transcription with the tokenizer directly rather than through
    # the deprecated `processor.as_target_processor()` context manager.
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

In [19]:
# Map function with progress bar
# remove_columns drops the original columns (client_id, audio, sentence) so only
# the keys added by prepare_dataset (input_values, labels) remain.
train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names, desc="Processing Train Dataset")
test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names, desc="Processing Test Dataset")


Processing Train Dataset: 100%|███████████████████████████| 8365/8365 [02:34<00:00, 54.01 examples/s]
Processing Test Dataset: 100%|████████████████████████████| 1469/1469 [00:26<00:00, 56.01 examples/s]


### Training Setup

### Set the data collator to handle and pass the data to the model

In [20]:
# defining the function for data collator
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs and the label sequences of a batch.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data. Its feature extractor pads the
            ``input_values`` and its tokenizer pads the ``labels``.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, pads the ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set, pads the ``labels`` to a multiple of the provided value.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch of tensors.

        Args:
            features: examples carrying ``"input_values"`` and ``"labels"``.

        Returns:
            The padded batch from the feature extractor, with a ``"labels"``
            tensor added in which padded positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Audio is padded by the feature extractor ...
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # ... and label ids by the tokenizer. Calling it directly replaces the
        # deprecated `processor.as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [21]:
# data collator
# padding=True pads every batch to its longest sequence (see the class docstring)
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

### Defining the evaluation metrics

In [22]:
# Import the correct metric loading function
import evaluate

# Load Word Error Rate (WER) metric
# (called from compute_metrics with the decoded predictions and references)
wer_metric = evaluate.load("wer")


### Function for evaluation

In [23]:
# function for computing the WER 
import numpy as np
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: prediction object exposing ``predictions`` (logits) and
            ``label_ids`` (label ids with -100 at padded positions).

    Returns:
        A dict ``{"wer": value}`` as computed by ``wer_metric``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 marks padded label positions; map them back to the pad token so the
    # labels can be decoded. np.where builds a new array, so the caller's
    # label_ids are not modified.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

### Loading the pre-trained model
* Here we use the `facebook/wav2vec2-large-xlsr-53` checkpoint (XLSR-53).

In [24]:
# Check if GPU is available
import torch
print("GPU Available:", torch.cuda.is_available())
print("Using GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

# Load Wav2Vec2 model and move to GPU
from transformers import Wav2Vec2ForCTC

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    # the CTC head must match the custom tokenizer: same pad id and vocabulary size
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
).to(device)  # Move model to GPU

# Freeze the feature encoder so it is not updated during fine-tuning.
# `freeze_feature_encoder` replaces the deprecated `freeze_feature_extractor`.
model.freeze_feature_encoder()

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()


GPU Available: True
Using GPU: NVIDIA GeForce RTX 3050 Laptop GPU


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


### Defining training parameters

In [25]:
# setting up the arguments for training
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="wav2vec2-badaga",
    # batching
    group_by_length=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # optimisation schedule
    num_train_epochs=10,
    learning_rate=3e-4,
    warmup_steps=500,
    fp16=True,
    # evaluation, checkpointing and logging cadence (in steps)
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
)



In [26]:
# setting up the trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor.feature_extractor,
)

# The Trainer only receives the feature extractor, so its checkpoints contain
# preprocessor_config.json but no tokenizer/vocabulary files. Save the complete
# processor into the output directory so it can be reloaded for inference with
# Wav2Vec2Processor.from_pretrained(training_args.output_dir).
processor.save_pretrained(training_args.output_dir)

  trainer = Trainer(


### Training

In [27]:
# training
# Runs the fine-tuning loop; with the arguments above, evaluation and
# checkpointing happen every 100 steps.
trainer.train()



Step,Training Loss,Validation Loss,Wer
100,3.7828,3.359624,1.0
200,2.9114,2.877742,1.0
300,2.803,2.808927,1.0
400,2.8088,2.712787,1.0
500,2.5665,2.359767,1.0
600,1.7207,1.329751,0.958451
700,1.6214,1.025979,0.789267
800,1.5909,0.922109,0.686497
900,1.2445,0.864935,0.650929
1000,1.1531,0.774906,0.564684




TrainOutput(global_step=10460, training_loss=0.68324225282236, metrics={'train_runtime': 104636.5165, 'train_samples_per_second': 0.799, 'train_steps_per_second': 0.1, 'total_flos': 6.082338111187655e+18, 'train_loss': 0.68324225282236, 'epoch': 10.0})

Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-4100/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-3900] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-4200
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-4200/config.json
Model weights saved in wav2vec2-rbg-badaga-stt/checkpoint-4200/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-4200/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-4000] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-4300
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-4300/config.json
Model weights saved i

Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-4900/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-4700] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-5000
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-5000/config.json
Model weights saved in wav2vec2-rbg-badaga-stt/checkpoint-5000/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-5000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-4800] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-5100
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-5100/config.json
Model weights saved i

Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-5700/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-5500] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-5800
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-5800/config.json
Model weights saved in wav2vec2-rbg-badaga-stt/checkpoint-5800/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-5800/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-5600] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-5900
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-5900/config.json
Model weights saved i

Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-6500/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-6300] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-6600
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-6600/config.json
Model weights saved in wav2vec2-rbg-badaga-stt/checkpoint-6600/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-6600/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-6400] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-6700
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-6700/config.json
Model weights saved i

Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-7300/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-7100] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-7400
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-7400/config.json
Model weights saved in wav2vec2-rbg-badaga-stt/checkpoint-7400/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-7400/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-7200] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-7500
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-7500/config.json
Model weights saved i

Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-8100/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-7900] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-8200
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-8200/config.json
Model weights saved in wav2vec2-rbg-badaga-stt/checkpoint-8200/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-8200/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-8000] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-8300
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-8300/config.json
Model weights saved i

Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-8900/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-8700] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-9000
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-9000/config.json
Model weights saved in wav2vec2-rbg-badaga-stt/checkpoint-9000/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-9000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-8800] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-9100
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-9100/config.json
Model weights saved i

Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-9700/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-9500] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-9800
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-9800/config.json
Model weights saved in wav2vec2-rbg-badaga-stt/checkpoint-9800/pytorch_model.bin
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-9800/preprocessor_config.json
Deleting older checkpoint [wav2vec2-rbg-badaga-stt/checkpoint-9600] due to args.save_total_limit
  return (input_length - kernel_size) // stride + 1
***** Running Evaluation *****
  Num examples = 1469
  Batch size = 4
Saving model checkpoint to wav2vec2-rbg-badaga-stt/checkpoint-9900
Configuration saved in wav2vec2-rbg-badaga-stt/checkpoint-9900/config.json
Model weights saved i

TrainOutput(global_step=10460, training_loss=0.6890156001821303, metrics={'train_runtime': 25906.7641, 'train_samples_per_second': 3.229, 'train_steps_per_second': 0.404, 'total_flos': 6.123033781781698e+18, 'train_loss': 0.6890156001821303, 'epoch': 10.0})

### Load Fine-Tuned Wav2Vec2 Model for Inference

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch

# Path to your fine-tuned Wav2Vec2 model checkpoint
# NOTE(review): machine-specific path. The Trainer above was given only
# `processor.feature_extractor`, so a checkpoint folder may lack the tokenizer
# files the processor needs — confirm they are present in this directory.
model_path = r"C:\Users\T H E J\Downloads\wav2vec2-badaga\checkpoint-10460"

# Load the processor and model
processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path)

# Move the model to the appropriate device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


### Perform Inference with Fine-Tuned Wav2Vec2 Model on Audio Sample

In [11]:
import torchaudio

# Load the audio clip; torchaudio returns the waveform tensor and the file's sampling rate
speech_array, sampling_rate = torchaudio.load(
    r"C:\Users\T H E J\Desktop\Badaga_Corpus-v.0.1.0\clips\F001_1_52.mp3"
)

# Resample if needed (the model was fine-tuned on 16 kHz audio)
if sampling_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
    speech_array = resampler(speech_array)

# Mono channel: average over the channel axis to get a 1-D waveform.
# For a single-channel file this equals squeeze(); for a stereo file squeeze()
# would leave a 2-D tensor that the processor would treat as a batch of two clips.
speech_array = speech_array.mean(dim=0)

# Extract model inputs and move them to the model's device
inputs = processor(speech_array, return_tensors="pt", sampling_rate=16000)
inputs = {key: val.to(model.device) for key, val in inputs.items()}

# Inference
with torch.no_grad():
    logits = model(**inputs).logits

# Decode: most likely token per frame, then CTC decoding to text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])

print("🗣️ Transcription:", transcription)


🗣️ Transcription: E BANDI ALLIGA OORA
