In [3]:
import os
import csv
import speech_recognition as sr
from evaluate import load
# NOTE(review): the object returned by evaluate's load("wer") is bound to `wer`
# here, but the very next line rebinds `wer` to jiwer's function. Every later
# `wer(reference, hypothesis)` call therefore uses jiwer, and this binding is
# never used in the visible code. Keep the order of these two lines if both stay.
wer = load("wer")
from jiwer import wer
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import soundfile as sf
from scipy.io import wavfile
import numpy as np
from scipy.signal import wiener
import librosa





HE BEGAN A CONFUSED COMPLAINT AGAINST THE WIZARD WHO HAD VANISHED BEHIND THE CURTAIN ON THE LY


In [None]:
# Function to read the transcript CSV into (number, text) pairs
def read_csv_file(csv_file):
    """Read ``(Number, Text)`` pairs from a CSV file that has a header row.

    Args:
        csv_file: Path to a CSV file containing ``Number`` and ``Text`` columns.

    Returns:
        A list of ``(number, text)`` tuples in file order. ``text`` has its
        leading and trailing whitespace stripped; ``number`` is returned as read.

    Raises:
        KeyError: If the file has no ``Number`` or ``Text`` column.
        OSError: If the file cannot be opened.
    """
    # newline='' lets the csv module do its own line-ending handling; without
    # it, "\r\n" inside a quoted field is silently rewritten to "\n".
    with open(csv_file, 'r', newline='') as infile:
        return [
            (row['Number'], row['Text'].strip())
            for row in csv.DictReader(infile)
        ]




In [None]:
# Function to apply a Wiener filter and denoise audio
def apply_weiner_filter(audio_file_path, output_file_path):
    """Denoise a WAV file with SciPy's Wiener filter and save it as int16 PCM.

    The samples are peak-normalised to [-1, 1], filtered with
    ``scipy.signal.wiener`` (default window), rescaled to the int16 range and
    written to ``output_file_path`` at the input's sample rate.

    Args:
        audio_file_path: Path of the WAV file to read.
        output_file_path: Path the denoised WAV file is written to.

    Returns:
        ``output_file_path``, unchanged, so the call can be chained.
    """
    sample_rate, audio_data = wavfile.read(audio_file_path)

    # Convert to float *before* taking the absolute value: on int16 data
    # np.abs(-32768) overflows back to -32768 and corrupts the peak estimate.
    audio_data = audio_data.astype(np.float32)
    peak = float(np.max(np.abs(audio_data))) if audio_data.size else 0.0

    if peak > 0.0:
        denoised_audio = wiener(audio_data / peak)
        # Keep the samples inside [-1, 1] so the int16 cast below cannot wrap.
        denoised_audio = np.clip(denoised_audio, -1.0, 1.0)
    else:
        # Empty or all-zero (silent) audio: normalising would divide by zero and
        # the filter would return NaNs, so pass the silence through untouched.
        denoised_audio = audio_data

    denoised_audio = (denoised_audio * np.iinfo(np.int16).max).astype(np.int16)
    wavfile.write(output_file_path, sample_rate, denoised_audio)
    return output_file_path


In [None]:

# Speech-to-text: transcribe a wav file with a pre-trained Wav2Vec2 CTC model

# Loaded (model, tokenizer) pairs keyed by their source paths, so the weights
# are read from disk once per kernel session instead of once per audio file.
_WAV2VEC2_CACHE = {}


def _load_wav2vec2(model_path, tokenizer_path):
    """Return the ``(model, tokenizer)`` pair for the paths, loading it on first use."""
    key = (model_path, tokenizer_path)
    if key not in _WAV2VEC2_CACHE:
        _WAV2VEC2_CACHE[key] = (
            Wav2Vec2ForCTC.from_pretrained(model_path),
            Wav2Vec2Tokenizer.from_pretrained(tokenizer_path),
        )
    return _WAV2VEC2_CACHE[key]


def convert_wav_to_text(audio_file_path,
                        model_path="../models/word_vec",
                        tokenizer_path="../models/word_vec/tokenizer"):
    """Transcribe one audio file with a Wav2Vec2 CTC model.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        model_path: Directory passed to ``Wav2Vec2ForCTC.from_pretrained``.
        tokenizer_path: Directory passed to ``Wav2Vec2Tokenizer.from_pretrained``.

    Returns:
        The decoded transcription string for the file.
    """
    # Load pre-trained model and tokenizer (cached after the first call)
    model, tokenizer = _load_wav2vec2(model_path, tokenizer_path)

    # Load audio file, resampled to 16 kHz
    audio_input, _ = librosa.load(audio_file_path, sr=16000)

    # Tokenize and convert to input features
    input_values = tokenizer(audio_input, return_tensors="pt").input_values

    # Transcribe audio; no gradients are needed for inference
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: take the most likely token at every frame
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)[0]

    return transcription



In [None]:
def normalize_text(text):
    """Lower-case ``text`` and remove all ASCII punctuation characters.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with every character in ``string.punctuation``
        removed. Whitespace is left as is.
    """
    # Imported locally because the visible import cell never brings in
    # `string`; without this the translate() call raised NameError.
    import string

    lowered = text.lower()
    return lowered.translate(str.maketrans('', '', string.punctuation))



In [None]:
# Function to denoise, transcribe and score every utterance, writing a CSV report
def process_data_and_generate_wer(data, input_path, output_file):
    """Denoise and transcribe each utterance and record its word error rate.

    For every ``(number, expected_text)`` pair, the file ``<number>.wav`` in
    ``input_path`` is filtered into ``denoised_<number>.wav`` (same directory),
    transcribed, and compared with the expected text after both strings have
    been normalised. One row per utterance is written to ``output_file`` and a
    progress line is printed.

    Args:
        data: Iterable of ``(number, expected_text)`` pairs.
        input_path: Directory containing the ``<number>.wav`` files.
        output_file: Path of the CSV report to create (overwritten if present).
    """
    header = ['Number', 'Expected Text', 'Generated Text', 'WER']

    with open(output_file, 'w', newline='') as report:
        rows = csv.writer(report)
        rows.writerow(header)

        for utt_id, reference in data:
            source_wav = os.path.join(input_path, f"{utt_id}.wav")  # audio file path

            # Wiener-filter the audio into a sibling "denoised_" file
            cleaned_wav = apply_weiner_filter(
                source_wav,
                os.path.join(input_path, f"denoised_{utt_id}.wav"),
            )

            # Transcribe the denoised audio
            hypothesis = convert_wav_to_text(cleaned_wav)

            # Score on normalised text; the raw strings go into the report
            score = wer(normalize_text(reference), normalize_text(hypothesis))
            rows.writerow([utt_id, reference, hypothesis, score])
            print(f"Processed {source_wav} - WER: {score}")



In [None]:
# Set the path to your main dataset folder.
# Built with os.path.join so the same cell works on Windows and POSIX; the
# previous raw strings used backslashes, which are not separators elsewhere.
dataset_folder = os.path.join("..", "dataset", "LibriSpeech", "test-clean")
output_folder = os.path.join(
    "..", "outputs", "libri_dataset_outputs", "noise_word_vec", "without_group"
)

# open(..., 'w') does not create missing parent directories, so make sure the
# report folder exists before the first report is written.
os.makedirs(output_folder, exist_ok=True)

# Expected layout: <dataset_folder>/<level1>/<level2>/output/ holds the
# transcript CSV, and its noise_added/ sub-folder holds the .wav files.
for level1 in [f.name for f in os.scandir(dataset_folder) if f.is_dir()]:
    level1_name = level1
    level1 = os.path.join(dataset_folder, level1)

    for level2 in [f.name for f in os.scandir(level1) if f.is_dir()]:
        level2_name = level2
        level2 = os.path.join(level1, level2)
        level2 = os.path.join(level2, 'output')
        input_path = os.path.join(level2, 'noise_added')

        # to check normal file wer
        # input_path=level2

        # A folder without an 'output' directory would make os.scandir raise
        # and abort the whole run; skip it and carry on with the rest.
        if not os.path.isdir(level2):
            print(f"Skipping {level2} - directory not found")
            continue

        for allfiles in [f.name for f in os.scandir(level2)]:
            if allfiles.endswith('.csv'):
                csv_file = os.path.join(level2, allfiles)

                # Interpret the CSV file and get number and text part
                data = read_csv_file(csv_file)

                # Creating output file to save in outputs directory.
                # NOTE(review): the name depends only on the two folder names, so
                # a second CSV in the same folder would overwrite this report.
                output_file_without_path = f"{level1_name}-{level2_name}.csv"
                output_file = os.path.join(output_folder, output_file_without_path)

                # We need data [number part, text part], input_path [which contains all .wav files], output_file [to store the final outputs]
                process_data_and_generate_wer(data, input_path, output_file)