In [1]:
from datasets import load_dataset, load_metric, Audio
import evaluate

In [2]:
dataset=load_dataset("mozilla-foundation/common_voice_17_0", "hi", split="validated")

In [3]:
print(dataset.column_names)

['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant']


In [4]:
import noisereduce as nr
import librosa
import numpy as np

def reduce_noise(audio_file):
    y, sr = librosa.load(audio_file, sr=None)
    reduced_audio = nr.reduce_noise(y=y, sr=sr)
    return reduced_audio, sr

In [None]:
import scipy.signal
import librosa.display
import matplotlib.pyplot as plt

def spectral_noise_reduction(y, sr):
    D = librosa.stft(y)
    D_magnitude, D_phase = librosa.magphase(D)
    D_denoised = np.maximum(D_magnitude - 0.02, 0)
    y_denoised = librosa.istft(D_denoised * D_phase)
    return y_denoised

In [6]:
import soundfile as sf

def clean_audio(sample):
    y, sr = librosa.load(sample["audio"]["path"], sr=None)
    y_clean = nr.reduce_noise(y=y, sr=sr)
    output_path = sample["audio"]["path"].replace(".wav", "_clean.wav")
    sf.write(output_path, y_clean, sr)
    sample["clean_path"] = output_path
    return sample

In [7]:
dataset = dataset.map(clean_audio)

In [8]:
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
common_voice_train = split_dataset["train"]
common_voice_test = split_dataset["test"]

In [9]:
len(common_voice_train)

8263

In [10]:
len(common_voice_test)

2066

In [11]:
common_voice_train = common_voice_train.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
common_voice_test = common_voice_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])

In [12]:
common_voice_train.column_names

['path', 'audio', 'sentence', 'variant', 'clean_path']

In [13]:
common_voice_test.column_names

['path', 'audio', 'sentence', 'variant', 'clean_path']

In [14]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [15]:
show_random_elements(common_voice_train.remove_columns(["path", "audio"]), num_examples=10)

Unnamed: 0,sentence,variant,clean_path
0,"कोहली से पाकिस्तानी फैन ने कहा- यहां आकर खेलें क्रिकेट, तो मिला ऐसा जवाब",,C:\Users\ruebe\.cache\huggingface\datasets\downloads\extracted\c90d974073b4b757eca5257c9dee07d2980e604b6ef8532d9ca83c4792a96340\hi_validated_0/common_voice_hi_27478477.mp3
1,क्या तुम उसे पूछने वाले हो?,,C:\Users\ruebe\.cache\huggingface\datasets\downloads\extracted\c90d974073b4b757eca5257c9dee07d2980e604b6ef8532d9ca83c4792a96340\hi_validated_0/common_voice_hi_25242682.mp3
2,बदल रहे हैं अनाज के कटोरे,,C:\Users\ruebe\.cache\huggingface\datasets\downloads\extracted\c90d974073b4b757eca5257c9dee07d2980e604b6ef8532d9ca83c4792a96340\hi_validated_0/common_voice_hi_27005364.mp3
3,यह कलम मेरी है।,,C:\Users\ruebe\.cache\huggingface\datasets\downloads\extracted\c90d974073b4b757eca5257c9dee07d2980e604b6ef8532d9ca83c4792a96340\hi_validated_0/common_voice_hi_25935935.mp3
4,सियाही अभी भी गीली है।,,C:\Users\ruebe\.cache\huggingface\datasets\downloads\extracted\c90d974073b4b757eca5257c9dee07d2980e604b6ef8532d9ca83c4792a96340\hi_validated_0/common_voice_hi_25728159.mp3
5,पंजाब सरकार की सुन नहीं रहे किसान!,,C:\Users\ruebe\.cache\huggingface\datasets\downloads\extracted\c90d974073b4b757eca5257c9dee07d2980e604b6ef8532d9ca83c4792a96340\hi_validated_0/common_voice_hi_26354163.mp3
6,अभी हम एक साथ रहते हैं।,,C:\Users\ruebe\.cache\huggingface\datasets\downloads\extracted\c90d974073b4b757eca5257c9dee07d2980e604b6ef8532d9ca83c4792a96340\hi_validated_0/common_voice_hi_26007444.mp3
7,"कुछ लड़के मछलियाँ पकड़ रहे हैं, और बाकी तैर रहे हैं।",,C:\Users\ruebe\.cache\huggingface\datasets\downloads\extracted\c90d974073b4b757eca5257c9dee07d2980e604b6ef8532d9ca83c4792a96340\hi_validated_0/common_voice_hi_26236288.mp3
8,फर्जी डिग्री विवाद: जितेंद्र तोमर ने केजरीवाल को पत्र लिखकर दी सफाई,,C:\Users\ruebe\.cache\huggingface\datasets\downloads\extracted\c90d974073b4b757eca5257c9dee07d2980e604b6ef8532d9ca83c4792a96340\hi_validated_0/common_voice_hi_26116157.mp3
9,समाज की कशमकश बयां करती 'धरा अंकुराई',,C:\Users\ruebe\.cache\huggingface\datasets\downloads\extracted\c90d974073b4b757eca5257c9dee07d2980e604b6ef8532d9ca83c4792a96340\hi_validated_0/common_voice_hi_26060243.mp3


In [16]:
import re
chars_to_remove_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\']'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

In [17]:
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

In [18]:
def extract_all_chars(batch):
    all_text = " ".join(batch["sentence"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

In [19]:
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=False, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=False, remove_columns=common_voice_test.column_names)

In [20]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [21]:
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
vocab_dict

{' ': 0,
 '&': 1,
 'a': 2,
 'b': 3,
 'c': 4,
 'd': 5,
 'e': 6,
 'f': 7,
 'g': 8,
 'h': 9,
 'i': 10,
 'j': 11,
 'k': 12,
 'l': 13,
 'm': 14,
 'n': 15,
 'o': 16,
 'p': 17,
 'q': 18,
 'r': 19,
 's': 20,
 't': 21,
 'u': 22,
 'v': 23,
 'w': 24,
 'x': 25,
 'y': 26,
 'z': 27,
 '|': 28,
 'ँ': 29,
 'ं': 30,
 'ः': 31,
 'अ': 32,
 'आ': 33,
 'इ': 34,
 'ई': 35,
 'उ': 36,
 'ऊ': 37,
 'ऋ': 38,
 'ए': 39,
 'ऐ': 40,
 'ऑ': 41,
 'ओ': 42,
 'औ': 43,
 'क': 44,
 'ख': 45,
 'ग': 46,
 'घ': 47,
 'च': 48,
 'छ': 49,
 'ज': 50,
 'झ': 51,
 'ञ': 52,
 'ट': 53,
 'ठ': 54,
 'ड': 55,
 'ढ': 56,
 'ण': 57,
 'त': 58,
 'थ': 59,
 'द': 60,
 'ध': 61,
 'न': 62,
 'प': 63,
 'फ': 64,
 'ब': 65,
 'भ': 66,
 'म': 67,
 'य': 68,
 'र': 69,
 'ल': 70,
 'व': 71,
 'श': 72,
 'ष': 73,
 'स': 74,
 'ह': 75,
 '़': 76,
 'ा': 77,
 'ि': 78,
 'ी': 79,
 'ु': 80,
 'ू': 81,
 'ृ': 82,
 'ॅ': 83,
 'े': 84,
 'ै': 85,
 'ॉ': 86,
 'ो': 87,
 'ौ': 88,
 '्': 89,
 'क़': 90,
 'ख़': 91,
 'ग़': 92,
 'ज़': 93,
 'ड़': 94,
 'ढ़': 95,
 'फ़': 96,
 '।': 97,
 '–': 98,
 '’': 99}

In [22]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [23]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

101

In [24]:
import json
with open('vocab_hi.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [25]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [26]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [27]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [28]:
common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16_000))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16_000))

In [29]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(common_voice_train)-1)

print(common_voice_train[rand_int]["sentence"])
ipd.Audio(data=common_voice_train[rand_int]["audio"]["array"], autoplay=True, rate=16000)

हैरतअंगेजः ऑस्ट्रेलिया की झाड़ियों में लगी आग का जिम्मेदार भारतीय मॉनसून


In [30]:
rand_int = random.randint(0, len(common_voice_train)-1)

print("Target text:", common_voice_train[rand_int]["sentence"])
print("Input array shape:", common_voice_train[rand_int]["audio"]["array"].shape)
print("Sampling rate:", common_voice_train[rand_int]["audio"]["sampling_rate"])

Target text: टेस्ट श्रृंखला बचाने के मकसद से उतरेगी टीम इंडिया
Input array shape: (87552,)
Sampling rate: 16000


In [31]:
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

In [32]:
common_voice_train=common_voice_train.select(range(8263))
common_voice_test=common_voice_test.select(range(2066))

In [33]:
common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names)

In [34]:
max_input_length_in_sec = 5.0
common_voice_train = common_voice_train.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

In [35]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

In [36]:
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

In [37]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [38]:
from datasets import load_metric

In [39]:
import jiwer

In [40]:
wer_metric = load_metric("wer",trust_remote_code=True)

  wer_metric = load_metric("wer",trust_remote_code=True)


In [41]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [42]:
#old model

from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m", 
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.1,
    mask_time_prob=0.25,
    layerdrop=0.1,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#new model
from transformers import Wav2Vec2ForCTC
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m", 
    attention_dropout=0.75,
    hidden_dropout=0.75,
    feat_proj_dropout=0.75, 
    mask_time_prob=0.5,
    layerdrop=0.5,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

In [43]:
model.freeze_feature_extractor()



In [44]:
#oldtrainargs

from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./",
  group_by_length=True,
  per_device_train_batch_size=32,
  gradient_accumulation_steps=4,
  eval_strategy="epoch",
  num_train_epochs=50,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=100,
  learning_rate=5e-4,
  warmup_ratio=0.1,
  save_total_limit=2,
  push_to_hub=False,
  optim="adamw_bnb_8bit"
)

In [None]:
#newtrainargs

from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./",
  group_by_length=True,
  per_device_train_batch_size=16,
  gradient_accumulation_steps=8,
  eval_strategy="epoch",
  num_train_epochs=100,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=100,
  learning_rate=3e-5,
  warmup_ratio=0.1,
  save_total_limit=2,
  push_to_hub=False,
  optim="adamw_hf",
)


In [45]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,
)

  trainer = Trainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [46]:
trainer.train()



Epoch,Training Loss,Validation Loss,Wer
0,No log,7.746066,1.0
1,No log,3.956023,1.0
2,10.162000,3.544405,1.0
3,10.162000,3.548426,1.0
4,3.558400,3.441729,1.0
5,3.558400,3.40184,1.0
6,3.558400,3.307279,1.0
7,3.404600,2.877306,1.0
8,3.404600,1.526032,0.893214
9,2.262400,1.04326,0.80955


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=2000, training_loss=1.3500585947036743, metrics={'train_runtime': 282798.8185, 'train_samples_per_second': 0.916, 'train_steps_per_second': 0.007, 'total_flos': 2.8584367024167567e+19, 'train_loss': 1.3500585947036743, 'epoch': 49.98765432098765})

In [47]:
save_directory = "./asr_saved_model_hi_v3"
model.save_pretrained(save_directory)
processor.save_pretrained(save_directory)
print(f"Model and processor saved to {save_directory}")

Model and processor saved to ./asr_saved_model_hi_v3


In [None]:
#load model
model = Wav2Vec2ForCTC.from_pretrained("asr_saved_model_50000v2")
processor = Wav2Vec2Processor.from_pretrained("asr_saved_model_50000v2")

In [48]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [49]:
def transcribe_audio(file_path):
    audio_input, sr = librosa.load(file_path, sr=16000)
    min_length = 16000
    if len(audio_input) < min_length:
        audio_input = np.pad(audio_input, (0, min_length - len(audio_input)), mode='constant')
    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt")
    input_values = inputs.input_values.to(device)
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription

In [50]:
sample_audio_path = "common_voice_hi_23828685.mp3"
try:
    transcription = transcribe_audio(sample_audio_path)
    print("Transcription:", transcription)
except Exception as e:
    print("Error during inference (check your audio file path):", e)

Transcription: बच्चों को माबाप की बात माननी चाहिए।
