# Use the `conda_env` environment to run this notebook

In [29]:
import os
import csv
# NOTE(review): `sr` is not referenced in the cells visible here - confirm it is still needed.
import speech_recognition as sr
from evaluate import load
# NOTE: the name `wer` bound here (the metric object returned by evaluate's
# `load`) is rebound by the jiwer import on the next line, so from that point
# on `wer` refers to jiwer's function and this object is no longer reachable.
wer = load("wer")
from jiwer import wer




In [30]:
# Function to read CSV file and process data
def read_csv_file(csv_file):
    """Read a transcript CSV and return its rows as ``(number, text)`` tuples.

    Args:
        csv_file: Path to a CSV file whose header row contains at least the
            columns ``Number`` and ``Text``.

    Returns:
        A list of ``(number_part, text_part)`` tuples in file order.
        ``text_part`` has leading and trailing whitespace stripped;
        ``number_part`` is returned exactly as stored.

    Raises:
        KeyError: If the header lacks a ``Number`` or ``Text`` column.
    """
    # newline='' hands raw line endings to the csv module, as its documentation
    # requires; without it, line breaks embedded in quoted fields are rewritten
    # by Python's universal-newline translation before the parser sees them.
    with open(csv_file, 'r', newline='') as infile:
        reader = csv.DictReader(infile)
        return [(row['Number'], row['Text'].strip()) for row in reader]


## Using the pretrained Wav2Vec2 model from Hugging Face's Transformers library
### The model is saved in models\word_vec

In [31]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import soundfile as sf

In [32]:

# Function to convert wav file to text

# Loaded (model, tokenizer) pairs keyed by the directories they came from, so
# the weights are read from disk once instead of once per audio file.
_WAV2VEC2_CACHE = {}


def _load_wav2vec2(model_dir, tokenizer_dir):
    """Return the ``(model, tokenizer)`` pair for the given directories, loading it on first use."""
    key = (model_dir, tokenizer_dir)
    if key not in _WAV2VEC2_CACHE:
        _WAV2VEC2_CACHE[key] = (
            Wav2Vec2ForCTC.from_pretrained(model_dir),
            Wav2Vec2Tokenizer.from_pretrained(tokenizer_dir),
        )
    return _WAV2VEC2_CACHE[key]


def convert_wav_to_text(audio_file_path,
                        model_dir="../models/word_vec",
                        tokenizer_dir="../models/word_vec/tokenizer"):
    """Transcribe one audio file with the locally saved Wav2Vec2 CTC model.

    Args:
        audio_file_path: Path of the audio file to transcribe (read with
            ``soundfile.read``).
        model_dir: Directory passed to ``Wav2Vec2ForCTC.from_pretrained``.
        tokenizer_dir: Directory passed to ``Wav2Vec2Tokenizer.from_pretrained``.

    Returns:
        The decoded transcription string for the file.

    Raises:
        Whatever ``soundfile.read`` or ``from_pretrained`` raise when the audio
        file or the model directories cannot be read.
    """
    # Load pre-trained model and tokenizer (cached after the first call)
    model, tokenizer = _load_wav2vec2(model_dir, tokenizer_dir)

    # Load audio file
    # NOTE(review): the sample rate reported by soundfile is neither checked nor
    # forwarded; confirm the audio matches the rate the checkpoint expects.
    audio_input, _sample_rate = sf.read(audio_file_path)

    # Tokenize and convert to input features
    input_values = tokenizer(audio_input, return_tensors="pt").input_values

    # Transcribe audio; gradients are not needed for inference
    with torch.no_grad():
        logits = model(input_values).logits

    # Decode the transcription: pick the highest-scoring token at each step
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.batch_decode(predicted_ids)[0]


In [33]:

import string
def normalize_text(text):
    """Return ``text`` lowercased with every ASCII punctuation character removed."""
    lowered = text.lower()
    # Keep only the characters that are not listed in string.punctuation.
    return "".join(ch for ch in lowered if ch not in string.punctuation)


In [34]:

# Function to process data and generate WER
def process_data_and_generate_wer(data,input_path, output_file):
    """Transcribe each listed recording and write its word error rate to a CSV.

    Args:
        data: Iterable of ``(number_part, expected_text)`` pairs; the audio for
            a pair is looked up as ``<input_path>/<number_part>.wav``.
        input_path: Folder that contains the ``.wav`` files.
        output_file: Path of the CSV report to write. The file is overwritten
            and its parent folder is created when missing.

    Returns:
        None. One row (``Number``, ``Expected Text``, ``Generated Text``,
        ``WER``) is written per pair and a progress line is printed for it.
    """
    # open(..., 'w') does not create missing folders, so make sure the
    # destination exists before starting the (slow) transcription loop.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['Number', 'Expected Text', 'Generated Text', 'WER'])

        for number_part, expected_text in data:
            wav_file = os.path.join(input_path, f"{number_part}.wav")  # audio file path
            generated_text = convert_wav_to_text(wav_file)
            # Both texts are normalized the same way before scoring; the call
            # passes the expected text first and the transcription second.
            error_rate = wer(normalize_text(expected_text), normalize_text(generated_text))
            # The report keeps the original (un-normalized) texts.
            writer.writerow([number_part, expected_text, generated_text, error_rate])
            print(f"Processed {wav_file} - WER: {error_rate}")

### Process the noise-added .wav files

In [35]:
# Set the path to your main dataset folder
# Paths are built with os.path.join so this cell runs on any OS; on Windows the
# resulting strings are identical to the previous backslash-separated literals.
dataset_folder = os.path.join("..", "dataset", "LibriSpeech", "test-clean")
output_folder = os.path.join("..", "outputs", "libri_dataset_outputs", "noise_word_vec", "without_group")

# The report files are opened with mode 'w', which does not create folders.
os.makedirs(output_folder, exist_ok=True)

# Walk the two directory levels below the dataset folder. Listings are sorted
# so the processing order does not depend on the file system.
for level1_name in sorted(f.name for f in os.scandir(dataset_folder) if f.is_dir()):
    level1 = os.path.join(dataset_folder, level1_name)

    for level2_name in sorted(f.name for f in os.scandir(level1) if f.is_dir()):
        # Each second-level folder is expected to hold an 'output' sub-folder
        # with the transcript CSV and a 'noise_added' folder of .wav files.
        level2 = os.path.join(level1, level2_name, 'output')
        if not os.path.isdir(level2):
            # Skip folders that were never pre-processed instead of crashing.
            print(f"Skipping {level2} - folder not found")
            continue
        input_path = os.path.join(level2, 'noise_added')

        # To score the files without added noise, use this instead:
        # input_path = level2

        for allfiles in sorted(f.name for f in os.scandir(level2)):
            # Only the .csv file(s) in the folder are of interest here.
            if not allfiles.endswith('.csv'):
                continue
            csv_file = os.path.join(level2, allfiles)

            # Parse the CSV into (number, text) pairs.
            data = read_csv_file(csv_file)

            # One report per <level1>-<level2> pair in the outputs directory.
            # NOTE: if a folder holds several CSV files they all map to this
            # same report name, so the last one processed wins.
            output_file_without_path = f"{level1_name}-{level2_name}.csv"
            output_file = os.path.join(output_folder, output_file_without_path)

            # data: (number, text) pairs; input_path: folder with the .wav
            # files; output_file: where the final report is stored.
            process_data_and_generate_wer(data, input_path, output_file)

Processed ..\dataset\LibriSpeech\test-clean\1089\134686\output\noise_added\1089-134686-0000.wav - WER: 0.39285714285714285
Processed ..\dataset\LibriSpeech\test-clean\1089\134686\output\noise_added\1089-134686-0001.wav - WER: 0.5
Processed ..\dataset\LibriSpeech\test-clean\1089\134686\output\noise_added\1089-134686-0002.wav - WER: 0.3333333333333333
Processed ..\dataset\LibriSpeech\test-clean\1089\134686\output\noise_added\1089-134686-0003.wav - WER: 0.42857142857142855
Processed ..\dataset\LibriSpeech\test-clean\1089\134686\output\noise_added\1089-134686-0004.wav - WER: 0.2727272727272727


KeyboardInterrupt: 