In [None]:
from datasets import load_dataset, load_metric, Audio
import evaluate

In [None]:
dataset=load_dataset("mozilla-foundation/common_voice_17_0", "ta", split="validation")

In [None]:
import noisereduce as nr
import librosa
import numpy as np

def reduce_noise(audio_file):
    """Load an audio file and run it through ``noisereduce``.

    Args:
        audio_file: Audio source accepted by ``librosa.load``.

    Returns:
        A ``(denoised_signal, sampling_rate)`` tuple: the output of
        ``nr.reduce_noise`` and the sampling rate reported by ``librosa.load``.
    """
    # sr=None is forwarded to librosa.load, so no target rate is imposed here.
    waveform, sample_rate = librosa.load(audio_file, sr=None)
    denoised = nr.reduce_noise(y=waveform, sr=sample_rate)
    return denoised, sample_rate

In [None]:
import scipy.signal
import librosa.display
import matplotlib.pyplot as plt

def spectral_noise_reduction(y, sr, noise_floor=0.02):
    """Denoise a signal by subtracting a constant floor from its STFT magnitude.

    Args:
        y: Time-domain audio signal passed to ``librosa.stft``.
        sr: Sampling rate. Not used by the computation; kept so existing
            callers that pass it keep working.
        noise_floor: Constant subtracted from every magnitude bin. Defaults to
            0.02, the value that used to be hard-coded.

    Returns:
        The time-domain signal rebuilt by ``librosa.istft`` from the reduced
        magnitudes and the original phase.
    """
    spectrum = librosa.stft(y)
    magnitude, phase = librosa.magphase(spectrum)
    # Clamp at zero so the subtraction never produces negative magnitudes.
    magnitude_denoised = np.maximum(magnitude - noise_floor, 0)
    y_denoised = librosa.istft(magnitude_denoised * phase)
    return y_denoised

In [None]:
import soundfile as sf

def clean_audio(sample):
    """Denoise one dataset example and save the result as a separate WAV file.

    Args:
        sample: Mapping with a ``"path"`` entry naming the audio file to clean.

    Returns:
        The same ``sample`` with an added ``"clean_path"`` entry naming the
        denoised file that was written.
    """
    import os

    source_path = sample["path"]
    y, sr = librosa.load(source_path, sr=None)
    y_clean = nr.reduce_noise(y=y, sr=sr)
    # Build "<stem>_clean.wav" from whatever extension the source has. The old
    # str.replace(".wav", ...) left non-WAV paths untouched (this dataset's
    # paths end in ".mp3"), so the write below targeted the source file itself.
    stem, _extension = os.path.splitext(source_path)
    output_path = f"{stem}_clean.wav"
    sf.write(output_path, y_clean, sr)
    sample["clean_path"] = output_path
    return sample

dataset = dataset.map(clean_audio)

In [None]:
dataset.column_names

['client_id',
 'path',
 'audio',
 'sentence',
 'up_votes',
 'down_votes',
 'age',
 'gender',
 'accent',
 'locale',
 'segment',
 'variant']

In [None]:
dataset[0]

{'client_id': '1611748d32f931a24aa2a413c4e253820572fda3c02685d7ffbd5418294318a02bb1fcef6f5d4a281fd152f874b4b891d6a2077f30ff5b8df3bc6e1f031f68a1',
 'path': 'C:\\Users\\ruebe\\.cache\\huggingface\\datasets\\downloads\\extracted\\efd40ab9e02674a61eef74595700f2bffee2d8c43dfef96d0b87847ca004bd0f\\ta_dev_0/common_voice_ta_24959711.mp3',
 'audio': {'path': 'C:\\Users\\ruebe\\.cache\\huggingface\\datasets\\downloads\\extracted\\efd40ab9e02674a61eef74595700f2bffee2d8c43dfef96d0b87847ca004bd0f\\ta_dev_0/common_voice_ta_24959711.mp3',
  'array': array([ 2.84217094e-14,  2.72848411e-12,  3.58113539e-12, ...,
         -6.10014467e-07,  3.47161767e-06,  3.22641426e-06]),
  'sampling_rate': 48000},
 'sentence': 'பாலாஜி என்பது பாலன் என்ற பொருளுடையது.',
 'up_votes': 2,
 'down_votes': 0,
 'age': '',
 'gender': '',
 'accent': '',
 'locale': 'ta',
 'segment': '',
 'variant': ''}

In [None]:
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
common_voice_train = split_dataset["train"]
common_voice_test = split_dataset["test"]

In [None]:
len(common_voice_train)

9676

In [None]:
len(common_voice_test)

2419

In [None]:
common_voice_train = common_voice_train.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
common_voice_test = common_voice_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random selection of distinct rows from a dataset as an HTML table.

    Args:
        dataset: Indexable dataset; ``dataset[list_of_ints]`` must return
            something ``pandas.DataFrame`` can consume.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` exceeds ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, which replaces the manual
    # "re-draw until unseen" loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(common_voice_train.remove_columns(["path", "audio"]), num_examples=10)

Unnamed: 0,sentence,variant
0,முருகன் திடீரென்று அவனுக்கு எச்சரிக்கை எதுவும் செய்யாமல் கொன்றுவிடவில்லை.,
1,"வயல்களில் வேலைசெய்வதற்கும், வேட்டையில் துணை புரிவதற்கும் அரிசனங்களையே அமர்த்தியிருக்கின்றனர்.",
2,ஹேதம்மாளுக்குப் பல இடங்களில்கோயில்கள் எடுக்கப்பட்டுள்ளன.,
3,மிகை நகை தகைமை அன்று.,
4,"இன்னொரு குறை என்னவெனில், பெரும்பாலான மாதிரிகள் புகையை உமிழ்கின்றன.",
5,சேலம் மாவட்டத்திலுள்ள நீர்வீழ்ச்சிகளை நல்லமுறையில் பயன்படுத்திக் கொண்டால் இது ஒன்றும் நமக்குக் கடினமான செயல் அல்ல.,
6,ஆனால் இறைவனுடைய பெருமையைக் கூறும் புராணச் செய்திகளில் அவ்விரண்டும் இயல்பாகவே இணைந்து நிற்கின்றன.,
7,"ஒரு அரசன் தன் மாளிகையில் கம்பளத்தை விரித்து அதன்மேல் மாவைத் தூவி, அதன் மேல் பல பெண்களை நடக்கவிடுவானாம்.",
8,கடவுளுக்கு யார் சேவை செய்வது?,
9,என் உள்ளம் முன்பே புண்பட்டிருக்கிறது.,


In [None]:
import re
# Character class of punctuation stripped from transcripts.
# Written as a raw string: the previous non-raw literal relied on invalid
# escape sequences such as "\," and "\?", which newer Python versions warn
# about. The set of matched characters is unchanged.
# NOTE(review): the vocabulary printed later in this notebook still contains
# symbols such as '(', ')', '–', '…' and '•' — consider whether they should be
# stripped as well.
chars_to_remove_regex = r"""[,?.!\-;:"“%‘”�']"""

def remove_special_characters(batch):
    """Strip the configured punctuation from ``batch["sentence"]`` and lowercase it.

    Args:
        batch: Mapping with a string under the ``"sentence"`` key.

    Returns:
        The same mapping, with ``"sentence"`` replaced by the cleaned text.
    """
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

In [None]:
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

In [None]:
def extract_all_chars(batch):
    """Gather the distinct characters that occur in a batch of sentences.

    Args:
        batch: Mapping whose ``"sentence"`` entry is an iterable of strings.

    Returns:
        A dict with two single-element lists: ``"vocab"`` holds the list of
        distinct characters (in no particular order) and ``"all_text"`` holds
        the sentences joined by single spaces.
    """
    joined_text = " ".join(batch["sentence"])
    unique_chars = list(set(joined_text))
    return dict(vocab=[unique_chars], all_text=[joined_text])

In [None]:
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=False, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=False, remove_columns=common_voice_test.column_names)

In [None]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [None]:
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
vocab_dict

{' ': 0,
 '(': 1,
 ')': 2,
 '\\': 3,
 'a': 4,
 'b': 5,
 'c': 6,
 'd': 7,
 'e': 8,
 'f': 9,
 'g': 10,
 'h': 11,
 'i': 12,
 'j': 13,
 'k': 14,
 'l': 15,
 'm': 16,
 'n': 17,
 'o': 18,
 'p': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'y': 26,
 'z': 27,
 '·': 28,
 '¾': 29,
 'ஃ': 30,
 'அ': 31,
 'ஆ': 32,
 'இ': 33,
 'ஈ': 34,
 'உ': 35,
 'ஊ': 36,
 'எ': 37,
 'ஏ': 38,
 'ஐ': 39,
 'ஒ': 40,
 'ஓ': 41,
 'ஔ': 42,
 'க': 43,
 'ங': 44,
 'ச': 45,
 'ஜ': 46,
 'ஞ': 47,
 'ட': 48,
 'ண': 49,
 'த': 50,
 'ந': 51,
 'ன': 52,
 'ப': 53,
 'ம': 54,
 'ய': 55,
 'ர': 56,
 'ற': 57,
 'ல': 58,
 'ள': 59,
 'ழ': 60,
 'வ': 61,
 'ஷ': 62,
 'ஸ': 63,
 'ஹ': 64,
 'ா': 65,
 'ி': 66,
 'ீ': 67,
 'ு': 68,
 'ூ': 69,
 'ெ': 70,
 'ே': 71,
 'ை': 72,
 'ொ': 73,
 'ோ': 74,
 'ௌ': 75,
 '்': 76,
 '–': 77,
 '—': 78,
 '’': 79,
 '•': 80,
 '…': 81,
 '◯': 82}

In [None]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [None]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

85

In [None]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [None]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [None]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [None]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16_000))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(common_voice_train)-1)

print(common_voice_train[rand_int]["sentence"])
ipd.Audio(data=common_voice_train[rand_int]["audio"]["array"], autoplay=True, rate=16000)

எனது பழைய நண்பராகிய ச ராஜகணபதி முதலியார் பாலநேசர் வேடம் பூண்டு அதற்கேற்றபடி நடித்தார்


In [None]:
rand_int = random.randint(0, len(common_voice_train)-1)

print("Target text:", common_voice_train[rand_int]["sentence"])
print("Input array shape:", common_voice_train[rand_int]["audio"]["array"].shape)
print("Sampling rate:", common_voice_train[rand_int]["audio"]["sampling_rate"])

Target text: வேண்டிய பணம் அனுப்புகிறேன்
Input array shape: (63936,)
Sampling rate: 16000


In [None]:
def prepare_dataset(batch):
    """Turn one example into model inputs and label ids.

    Args:
        batch: A single example with an ``"audio"`` dict (``"array"`` and
            ``"sampling_rate"``) and a ``"sentence"`` string.

    Returns:
        The same example with ``"input_values"``, ``"input_length"`` and
        ``"labels"`` added.
    """
    audio = batch["audio"]

    # The processor returns a batch; index 0 takes its only element.
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Passing text= sends the transcript to the tokenizer directly; this
    # replaces the deprecated processor.as_target_processor() context manager.
    batch["labels"] = processor(text=batch["sentence"]).input_ids
    return batch

In [None]:
# Cap each split at the sizes seen in the recorded run (9676 / 2419 rows).
# min() keeps the index range valid if a split ever holds fewer rows, where a
# fixed range would make select() fail.
common_voice_train = common_voice_train.select(range(min(9676, len(common_voice_train))))
common_voice_test = common_voice_test.select(range(min(2419, len(common_voice_test))))

In [None]:
common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names)

In [None]:
max_input_length_in_sec = 5.0
common_voice_train = common_voice_train.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """Collate prepared examples into a padded batch of tensors.

    Inputs and labels are padded separately because they go through different
    components of the processor (feature extractor vs. tokenizer).
    """

    # Processor whose ``pad`` method handles both audio inputs and label ids.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return a batch with a ``"labels"`` tensor.

        Args:
            features: Examples carrying ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with ``"labels"`` added; padded label
            positions are set to -100.
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # labels= routes padding to the tokenizer; this replaces the
        # deprecated processor.as_target_processor() context manager.
        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Positions where the label attention mask is not 1 are padding; mark
        # them with -100 (compute_metrics maps -100 back to the pad token id).
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
from datasets import load_metric

In [None]:
import jiwer

In [None]:
# datasets.load_metric is deprecated in favour of the `evaluate` library, which
# this notebook already imports; the returned metric is still used through
# .compute(predictions=..., references=...).
wer_metric = evaluate.load("wer")

  wer_metric = load_metric("wer",trust_remote_code=True)


In [None]:
def compute_metrics(pred):
    """Compute word error rate for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``.

    Returns:
        A dict ``{"wer": <value returned by wer_metric.compute>}``.
    """
    # Greedy choice: most likely token id at each position.
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 padding marker back to the pad token id on a fresh array,
    # so the caller's pred.label_ids is no longer modified in place.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # group_tokens=False is used for references only — presumably so repeated
    # characters in the ground-truth text are not merged; confirm.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.1,
    mask_time_prob=0.15,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# freeze_feature_extractor() is a deprecated alias; freeze_feature_encoder()
# is the current name for the same operation.
model.freeze_feature_encoder()



In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
  # Checkpoints go to the notebook's working directory (same place vocab.json is written).
  output_dir="./",
  group_by_length=True,
  # 16 samples per device x 2 accumulation steps = 32 samples per optimizer step per device.
  per_device_train_batch_size=16,
  gradient_accumulation_steps=2,
  eval_strategy="epoch",
  num_train_epochs=110,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=1000,
  # NOTE(review): eval_strategy is "epoch" above, so eval_steps presumably has no
  # effect (the recorded log shows one evaluation per epoch) — confirm and drop if so.
  eval_steps=1000,
  logging_steps=500,
  learning_rate=3e-4,
  warmup_ratio=0.01,
  save_total_limit=2,
  push_to_hub=False,
  # NOTE(review): presumably requires the bitsandbytes package — confirm it is installed.
  optim="adamw_bnb_8bit",
  report_to="none"
)

In [None]:
from transformers import Trainer

# The recorded output of this notebook warns that the `tokenizer` argument is
# deprecated and names `processing_class` as its replacement, so the feature
# extractor is passed under the new keyword.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    processing_class=processor.feature_extractor,
)

  trainer = Trainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Wer
1,No log,3.264173,1.0
2,No log,3.185013,1.0
3,No log,3.096417,1.0
4,No log,0.914146,0.898987
5,4.309500,0.572919,0.744081
6,4.309500,0.494053,0.661071
7,4.309500,0.45706,0.626165
8,4.309500,0.427107,0.587959
9,0.740600,0.427068,0.583618
10,0.740600,0.406135,0.559247


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=13530, training_loss=0.3399673966769369, metrics={'train_runtime': 66678.1231, 'train_samples_per_second': 6.49, 'train_steps_per_second': 0.203, 'total_flos': 5.2282745546454024e+19, 'train_loss': 0.3399673966769369, 'epoch': 110.0})

In [None]:
save_directory = "./asr_saved_model_ta_v6"
model.save_pretrained(save_directory)
processor.save_pretrained(save_directory)
print(f"Model and processor saved to {save_directory}")

Model and processor saved to ./asr_saved_model_ta_v6


In [None]:
import torch
from transformers import Wav2Vec2ForCTC
from torchviz import make_dot

# NOTE(review): this loads "asr_saved_model_50000v2", not the
# "./asr_saved_model_ta_v6" directory written by the save cell earlier in this
# notebook, and it rebinds `model` — confirm this is the intended checkpoint.
model = Wav2Vec2ForCTC.from_pretrained("asr_saved_model_50000v2")
# Random waveform of 16000 samples, used only to drive one forward pass.
dummy_input = torch.randn(1, 16000, requires_grad=True)
output = model(dummy_input)
# Draw the autograd graph of the logits and render it as a PNG named "wav2vec2_model".
make_dot(output.logits, params=dict(model.named_parameters())).render("wav2vec2_model", format="png", view=True)


In [None]:
import torch
from transformers import Wav2Vec2ForCTC
from torchviz import make_dot

model = Wav2Vec2ForCTC.from_pretrained("asr_saved_model_50000v2")
print(model)

In [None]:
# Load the fine-tuned model and its processor for inference.
# NOTE(review): this reads "asr_saved_model_ta_v4", whereas the save cell
# earlier in this notebook writes "./asr_saved_model_ta_v6" — confirm which
# checkpoint is intended.
from transformers import Wav2Vec2ForCTC
from transformers import Wav2Vec2Processor
model = Wav2Vec2ForCTC.from_pretrained("asr_saved_model_ta_v4")
processor = Wav2Vec2Processor.from_pretrained("asr_saved_model_ta_v4")

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [None]:
def transcribe_audio(file_path, sampling_rate=16000, min_length=16000):
    """Transcribe one audio file with the loaded model and processor.

    Args:
        file_path: Audio file to load with ``librosa.load``.
        sampling_rate: Rate the audio is loaded at and reported to the
            processor. Defaults to 16000, the value previously hard-coded.
        min_length: Minimum number of samples; shorter clips are zero-padded
            at the end. Defaults to 16000, the value previously hard-coded.

    Returns:
        The decoded transcription string for the file.
    """
    audio_input, _ = librosa.load(file_path, sr=sampling_rate)
    # Zero-pad short clips at the end up to min_length samples.
    shortfall = min_length - len(audio_input)
    if shortfall > 0:
        audio_input = np.pad(audio_input, (0, shortfall), mode='constant')
    inputs = processor(audio_input, sampling_rate=sampling_rate, return_tensors="pt")
    input_values = inputs.input_values.to(device)
    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy decoding: most likely token at each frame, then let the processor
    # turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription

In [None]:
import librosa
sample_audio_path = "common_voice_ta_19137808.mp3"
try:
    transcription = transcribe_audio(sample_audio_path)
    print("Transcription:", transcription)
except Exception as e:
    print("Error during inference (check your audio file path):", e)

Transcription: ஏறினின்று பாரடா எங்கும்
