# **Open AI Whisper Large V3 Pre-trianed Model**

### **Required Libs**

In [None]:
!pip install transformers  datasets accelerate

# transformers  : Access to a large collection of pre-trained models for various NLP tasks

# datasets      : simplify the process of downloading, preparing, and using datasets

# accelerate    : distributed training support , designed to accelerate training and inference on GPUs and TPUs.

In [None]:
!pip install jiwer

# jiwer  : provides metrics to evaluate ASR systems

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Mounts drive to colab-notebook

In [None]:
import torch

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

from datasets import load_dataset

import pandas as pd

import jiwer

import re

import os



# imported necessary modules

# torch: Used for tensor operations and neural networks.

# AutoModelForSpeechSeq2Seq: Loads pre-trained speech-to-sequence models.

# AutoProcessor: Loads pre-trained tokenizers/processors associated with specific models.

# pipeline: Simplifies the use of pre-trained models for specific NLP tasks, like text speech2text

# datasets: Library for loading and working with datasets commonly used in machine learning tasks

# pandas: Data manipulation purpose

# jiwer: Library for calculating Word Error Rate (WER) and other metrics used in evaluating ASR systems

# re : regular expression module : pattern matching in string

In [None]:
import warnings


warnings.filterwarnings("ignore")

# Avoids unwanted errors

### **Process Code**

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)

model.to(device)

processor = AutoProcessor.from_pretrained(model_id)


# line 1 : Assigns the device as GPU (cuda:0) if available, otherwise falls back to CPU.

# line 2 : Sets the torch data type to float16 for GPU if available, or float32 for CPU.

# line 3 : Specifies the identifier for the Whisper large version 3 model on Hugging Face Model Hub.

# line 4 : Loads the model with specified data type, low CPU memory usage, and safety tensor usage.

# line 5 : Moves the loaded model to the specified device (GPU or CPU).

# line 6 : initializes the processor for the specified model, which includes tokenization and other necessary pre-processing steps.

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
pipe = pipeline(

    "automatic-speech-recognition",                                              # Specifies the task for the pipeline.

    model=model,                                                                 # Associates the ASR pipeline with the loaded Whisper model.

    tokenizer=processor.tokenizer,                                               # Uses the tokenizer from the initialized processor.

    feature_extractor=processor.feature_extractor,                               # utilizes the feature extractor from the processor.

    max_new_tokens=128,                                                          # Sets the maximum number of new tokens.

    chunk_length_s=30,                                                           # Defines the length of audio chunks in seconds.

    batch_size=16,                                                               # Specifies the batch size for ASR processing.

    return_timestamps=True,                                                      # Requests the pipeline to return timestamps along with recognized text.

    torch_dtype=torch_dtype,                                                     # Passes the specified torch data type to the pipeline.

    device=device,                                                               # Sets the device (GPU or CPU) for processing.
)

In [None]:
dataset = load_dataset("/content/drive/MyDrive/Akaike")                          # Loads a dataset from this path ("/content/drive/MyDrive/Akaike")

c  = dataset['train']                                                            # Retrieves the 'train' split from the loaded dataset and stores data in The variable c.

Resolving data files:   0%|          | 0/1471 [00:00<?, ?it/s]

In [None]:
def drop_special_chars(text):

    pattern = '[,\|\?\.\!\-\;\:\"\“\%\‘\”\�।]'                                  # Defines a regular expression pattern that includes a set of characters to be matched and removed

    cleaned_text = re.sub(pattern, '', text).strip()                            # Uses the re.sub function to substitute all occurrences of the characters matched by the pattern with an empty string in the given text.

    return cleaned_text.strip()                                                 # Returns the final cleaned text with remvoing extra spaces

In [None]:
file_list = os.listdir('/content/drive/MyDrive/Akaike')                         # This function returns a list containing the names of the entries in the specified directory. In this case, it retrieves the list of files in the directory '/content/drive/MyDrive/Akaike'.

print(len(file_list))                                                           # Prints the number of files in the directory by getting the length of the list obtained from os.listdir.

1816


### **Test Data Provided By Akaike (trans.txt)**

In [None]:
test_data = pd.read_table('/content/trans.txt',delimiter='\t',names=['audio','text'])   # Imports content in table format

In [None]:
test_data.head()

Unnamed: 0,audio,text
0,common_voice_mr_32645351.wav,स्वेच्छानिवृतीचा पध्दत मोठ्या प्रमाणात अंमलात ...
1,common_voice_mr_32127661.wav,चार चार विषय आहेत
2,common_voice_mr_32127660.wav,क्रोष्टु सहस्रजित् नल अंतिक व लघु अशी त्याच्या...
3,common_voice_mr_32126825.wav,त्यातील एर्देनि हे रत् नाचे व सुबाशिदि हे सुभा...
4,common_voice_mr_32126698.wav,आदरार्थी शब्दांनी केला जात असे लहानपणी गंगेजवळ...


### **Word Error Rate**

**Dictionary for Dataframe**

In [None]:
result_ = { 'audio' : [] , 'generated_text' : [] , 'timestamp':[]}

result_

{'audio': [], 'generated_text': [], 'timestamp': []}

In [None]:
for i in range(len(file_list)):

   result  = pipe(f'/content/drive/MyDrive/Akaike/{file_list[i]}',return_timestamps=True)

   result_['audio'].append(file_list[i])

   result_['generated_text'].append(drop_special_chars(result['text']))

   result_['timestamp'].append(result['chunks'][0]['timestamp'])




# The code processes a list of audio files (file_list) using automatic speech recognition (pipe).

# For each audio file, it extracts information such as the file name, generated text (after removing special characters), and timestamp from the first chunk.

# All these details are stored in the result_ dictionary, providing comprehensive information about each processed audio file

In [None]:
print(result_)

{'audio': ['common_voice_mr_30957329.wav', 'common_voice_mr_30956920.wav', 'common_voice_mr_30954288.wav', 'common_voice_mr_30901892.wav', 'common_voice_mr_30899572.wav', 'common_voice_mr_30898879.wav', 'common_voice_mr_31496343.wav', 'common_voice_mr_31487897.wav', 'common_voice_mr_31485043.wav', 'common_voice_mr_31474938.wav', 'common_voice_mr_30992387.wav', 'common_voice_mr_30899384.wav', 'common_voice_mr_31138451.wav', 'common_voice_mr_31372846.wav', 'common_voice_mr_31433167.wav', 'common_voice_mr_31046553.wav', 'common_voice_mr_31209818.wav', 'common_voice_mr_31274908.wav', 'common_voice_mr_31383893.wav', 'common_voice_mr_31311294.wav', 'common_voice_mr_31119979.wav', 'common_voice_mr_31209644.wav', 'common_voice_mr_31472772.wav', 'common_voice_mr_31472769.wav', 'common_voice_mr_31046804.wav', 'common_voice_mr_30898880.wav', 'common_voice_mr_31475494.wav', 'common_voice_mr_31069673.wav', 'common_voice_mr_31472800.wav', 'common_voice_mr_30900116.wav', 'common_voice_mr_31329891.wav

In [None]:
prediction = pd.DataFrame(result_)                                               # Converts the result_  dictionary into dataframe

prediction

In [None]:
merged_df = pd.merge(prediction,test_data, on='audio', how='inner')              # merges two DataFrames, prediction and test_data, on the 'audio' column using an inner join.

merged_df

Unnamed: 0,audio,generated_text,text
0,common_voice_mr_30772761.wav,बारत आच्या प्राचीन इतिहा सातिल प्रसिद्ध कुरो क...,भारताच्या प्राचीन इतिहासातील प्रसिद्ध कुरू कुल...
1,common_voice_mr_30893941.wav,रच्च्ना कालाचा अप बेसा साटी बाशीऻी सोरुप भा पा...,रचनाकालाच्या अभ्यासासाठी भाषेचे स्वरूप हा पाया...
2,common_voice_mr_30728132.wav,शिवाजी महराजान्ते जन्मस्थान मनजे शिवनीरी किल्ड...,शिवाजी महाराजांचे जन्मस्थान म्हणजे शिवनेरी किल...
3,common_voice_mr_30773633.wav,चरा संदाने क्रुषनाचा आरोपान्ना समर्पक उत्टरे दिली,जरासंधाने कृष्णाच्या आरोपांना समर्पक उत्तरे दिली
4,common_voice_mr_30709390.wav,लूनार प्रोस्पेक्तस गुर्ट्वाकर्षन अब्यासा मदे क...,लुनार प्रोस्पेक्टर गुरुत्वाकर्षण अभ्यासामध्ये ...
...,...,...,...
195,common_voice_mr_30887009.wav,सत्ते बामा कुशनाची पत्तिनी हुती,सत्यभामा कृष्णाची पत्नी होती
196,common_voice_mr_31195437.wav,विश्म प्रतिद्न्या देवव्रद हा प्रष्तिना पुरासा ...,भीष्मप्रतिज्ञा देवव्रत हा हस्तिनापुराचा राजा श...
197,common_voice_mr_30887838.wav,ते सन्त्त तुकारा महारा जान्ना आपले गुरू मानित,ते संत तुकाराम महाराजांना आपले गुरू मानीत
198,common_voice_mr_30772765.wav,ब्रितिशान नि इराक दिरगगा काल आपले ताभेद थेवले ...,ब्रिटिशांनी इराक दीर्घकाल आपल्या ताब्यात ठेवले...


In [None]:
word_error_rate = []

In [None]:
for i in range(merged_df.shape[0]):

  gen,auc = merged_df.loc[i,['generated_text','text']].values                   # This code calculates the Word Error Rate (WER) for each pair of generated text ('generated_text') and actual text ('text') in the merged_df DataFrame.

  wer = jiwer.wer(gen,auc)                                                      # It iterates through each row, retrieves the generated and actual texts, computes the WER using the jiwer library, and appends the WER (multiplied by 100 for percentage) to the 'word_error_rate' list

  word_error_rate.append(wer*100)

In [None]:
len(word_error_rate)

1816

In [None]:
merged_df['wer'] = word_error_rate

In [None]:
merged_df.to_csv('Wer_whisper_large_v3.csv',index=False)

In [None]:
merged_df

Unnamed: 0,audio,generated_text,timestamp,text,wer
0,common_voice_mr_30957329.wav,रुक्मिनी सत्यभामा चामवती सत्या लक्ष्मणा कालिंद...,"(0.0, 9.52)",रुक्मिणी सत्यभामा जाम्बवती सत्या लक्ष्मणा कालि...,75.000000
1,common_voice_mr_30956920.wav,पुस्तके कृष्णाला साधारण पणे श्री कृष्ण मनतात,"(0.0, 5.64)",पुस्तके कृष्णाला साधारणपणे श्रीकृष्ण म्हणतात,71.428571
2,common_voice_mr_30954288.wav,पंडूला कुंटी पसुन जालेले युधिष्ठीर भीम व अर्जु...,"(0.0, 10.0)",पंडूला कुंतीपासून झालेले युधिष्ठिर भीम व अर्जु...,46.153846
3,common_voice_mr_30901892.wav,नेवाशा शेद्रात अपल्या गुरुण्चा प्रपाश्चिर्वादा...,"(0.0, 9.0)",नेवासा क्षेत्रात आपल्या गुरूंच्या कृपाशीर्वादा...,100.000000
4,common_voice_mr_30899572.wav,या नंतर कुंती अपल्या पाचही पुत्रान्ना घेऊन हस्...,"(0.0, 5.88)",यानंतर कुंति आपल्या पाचही पुत्रांना घेऊन हस्ति...,80.000000
...,...,...,...,...,...
1811,common_voice_mr_27762428.wav,ਯੈ ਦਰਮੈਆਨ ਯਾਨਾਵਰੇਲ ਇਂਜੇਨ ਅਡੀਚ ਕੀ ਮੀਨਟਾ ਸਾਥੀ ਸਾ...,"(0.0, 4.46)",यादरम्यान यानावरील इंजीन अडीच मिनिटांसाठी चालव...,100.000000
1812,common_voice_mr_27762396.wav,यानी वड्नुकान मदे अण्णाद्रमुकाचा मोठा पराबो जाला,"(0.0, 4.02)",या निवडणुकांमध्ये अण्णाद्रमुकचा मोठा पराभव झाला,85.714286
1813,common_voice_mr_27703243.wav,नोक्री आनी शिक्षानी आण्चा शी त्यानना जगडावे लागले,"(0.0, 4.06)",नोकरी आणि शिक्षण यांच्याशी त्यांना झगडावे लागले,87.500000
1814,common_voice_mr_27703240.wav,ओलिंपिक सर्क्या केला महोच सवांचा उदय जाला,"(0.0, 4.0)",ऑलिंपिकसारख्या खेळा महोत्सवांचा उदय झाला,85.714286


**Average Word Error Rate**

In [None]:
import numpy as np

np.mean(merged_df['wer'])

73.82864136519082

### **Audio Seconds**

In [None]:
from datasets import load_dataset                                                # The code loads the "Akaike" dataset, calculates audio lengths, and identifies files exceeding 30 seconds, providing the count in a formatted string.


dataset = load_dataset("/content/drive/MyDrive/Akaike")

c  = dataset['train']

def get_audio_length(sample):

    return len(sample['audio']['array'])/  sample['audio']['sampling_rate']

audio_lengths = [get_audio_length(c[i]) for i in range(0, c.num_rows)]

x = [i for i in audio_lengths if i>=30.00]

f'No of the audio files goes above 30 seconds : {x}'

Resolving data files:   0%|          | 0/1220 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

'No of the audio files goes above 30 seconds : []'

# **Whisper Open AI Small Pre-trained Model**



```
# Note : all the necessary package imported likewise large-v3
```



### **Required Libs**

In [None]:
import torch

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

from datasets import load_dataset

import pandas as pd

import jiwer

import os

In [None]:
file_list = os.listdir('/content/drive/MyDrive/Akaike')

print(len(file_list))

1220


### **Load Pre-trained check points**

In [None]:
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [None]:
result_ = { 'audio' : [] , 'generated_text' : []}

result_

{'audio': [], 'generated_text': []}

### **Audio Files to text generation**

In [None]:
for i in range(200):

   result  = pipe(f'/content/drive/MyDrive/Akaike/{file_list[i]}',generate_kwargs={"language": "marathi"})

   result_['audio'].append(file_list[i])

   result_['generated_text'].append(drop_special_chars(result['text']))

In [None]:
prediction = pd.DataFrame(result_)

In [None]:
prediction.to_csv('small_prediction.csv',index=False)

In [None]:
test_data = pd.read_table('/content/trans.txt',delimiter='\t',names=['audio','text'])

merged_df = pd.merge(prediction,test_data, on='audio', how='inner')

merged_df

Unnamed: 0,audio,generated_text,text
0,common_voice_mr_30772761.wav,बारत आच्या प्राचीन इतिहा सातिल प्रसिद्ध कुरो क...,भारताच्या प्राचीन इतिहासातील प्रसिद्ध कुरू कुल...
1,common_voice_mr_30893941.wav,रच्च्ना कालाचा अप बेसा साटी बाशीऻी सोरुप भा पा...,रचनाकालाच्या अभ्यासासाठी भाषेचे स्वरूप हा पाया...
2,common_voice_mr_30728132.wav,शिवाजी महराजान्ते जन्मस्थान मनजे शिवनीरी किल्ड...,शिवाजी महाराजांचे जन्मस्थान म्हणजे शिवनेरी किल...
3,common_voice_mr_30773633.wav,चरा संदाने क्रुषनाचा आरोपान्ना समर्पक उत्टरे दिली,जरासंधाने कृष्णाच्या आरोपांना समर्पक उत्तरे दिली
4,common_voice_mr_30709390.wav,लूनार प्रोस्पेक्तस गुर्ट्वाकर्षन अब्यासा मदे क...,लुनार प्रोस्पेक्टर गुरुत्वाकर्षण अभ्यासामध्ये ...
...,...,...,...
195,common_voice_mr_30887009.wav,सत्ते बामा कुशनाची पत्तिनी हुती,सत्यभामा कृष्णाची पत्नी होती
196,common_voice_mr_31195437.wav,विश्म प्रतिद्न्या देवव्रद हा प्रष्तिना पुरासा ...,भीष्मप्रतिज्ञा देवव्रत हा हस्तिनापुराचा राजा श...
197,common_voice_mr_30887838.wav,ते सन्त्त तुकारा महारा जान्ना आपले गुरू मानित,ते संत तुकाराम महाराजांना आपले गुरू मानीत
198,common_voice_mr_30772765.wav,ब्रितिशान नि इराक दिरगगा काल आपले ताभेद थेवले ...,ब्रिटिशांनी इराक दीर्घकाल आपल्या ताब्यात ठेवले...


### **Average Word Error Rate**

In [None]:
word_error_rate = []

for i in range(merged_df.shape[0]):
  gen,auc = merged_df.loc[i,['generated_text','text']].values
  wer = jiwer.wer(gen,auc)
  word_error_rate.append(wer*100)

merged_df['wer'] = word_error_rate

merged_df.to_excel('Wer_whisper_small.xlsx',index=False)

print('Mean Word Error Rate Using Whisper Small:',merged_df['wer'].mean())

Mean Word Error Rate Using Whisper Small: 93.31367132824177
