In [1]:
import csv
import os
import glob

In [2]:
def calculate_cer(reference, hypothesis):
    """
    Calculate the Character Error Rate (CER) between two strings.

    CER is the character-level edit (Levenshtein) distance between
    ``hypothesis`` and ``reference`` divided by the number of characters in
    ``reference``. The comparison is case-insensitive (both strings are
    lower-cased first).

    Args:
        reference: The expected (ground-truth) text.
        hypothesis: The generated text to score against ``reference``.

    Returns:
        The CER as a float fraction (0.0 means identical). The value can
        exceed 1.0 when the hypothesis is much longer than the reference.
        For an empty reference the ratio is undefined, so this returns 0.0
        when the hypothesis is empty as well and 1.0 otherwise.
    """
    reference = reference.lower()
    hypothesis = hypothesis.lower()

    # Guard against division by zero: an empty reference has no characters
    # to normalise by.
    if not reference:
        return 1.0 if hypothesis else 0.0

    # Only the previous row of the edit-distance table is needed to compute
    # the next one, so keep two rows instead of the full matrix.
    # Row 0: turning an empty reference prefix into hypothesis[:j] costs j.
    previous_row = list(range(len(hypothesis) + 1))

    for i, ref_char in enumerate(reference, start=1):
        # Column 0: turning reference[:i] into an empty string costs i.
        current_row = [i]
        for j, hyp_char in enumerate(hypothesis, start=1):
            if ref_char == hyp_char:
                current_row.append(previous_row[j - 1])
            else:
                current_row.append(1 + min(current_row[j - 1],    # Insert
                                           previous_row[j],       # Remove
                                           previous_row[j - 1]))  # Replace
        previous_row = current_row

    # Normalise the edit distance by the reference length.
    return previous_row[-1] / len(reference)


In [3]:
def _sum_wer_and_cer_in_file(csv_file, encoding):
    """Return ``(wer_sum, cer_sum, row_count)`` for one CSV read with ``encoding``.

    The file must have a header row containing the columns 'WER',
    'Expected Text' and 'Generated Text'. The sums are kept local so that a
    decoding error part-way through the file discards the partial result
    instead of leaking it into the caller's totals.

    Raises:
        UnicodeDecodeError: If the file cannot be decoded with ``encoding``.
        ValueError: If a required column is missing or a WER cell is not a number.
    """
    wer_sum = 0
    cer_sum = 0
    row_count = 0

    with open(csv_file, 'r', newline='', encoding=encoding) as file:
        reader = csv.reader(file)
        header = next(reader, None)  # Read header row
        if header is None:
            # Completely empty file: nothing to accumulate.
            print(f'File {csv_file} is empty, skipping.')
            return wer_sum, cer_sum, row_count

        wer_index = header.index('WER')  # Find index of 'WER' column
        expected_text_index = header.index('Expected Text')
        generated_text_index = header.index('Generated Text')

        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            wer_sum += float(row[wer_index])
            cer_sum += calculate_cer(row[expected_text_index],
                                     row[generated_text_index])
            row_count += 1

    return wer_sum, cer_sum, row_count


def calculate_average_wer_and_cer(csv_files):
    """
    Average the WER column and the computed CER over every row of the given CSV files.

    Each file is read as UTF-8 first; if that raises ``UnicodeDecodeError``
    the whole file is re-read as ISO-8859-1. A file's rows are added to the
    running totals only after the file has been read successfully, so rows
    seen before a decoding error are never counted twice.

    Args:
        csv_files: Iterable of paths to CSV files with 'WER', 'Expected Text'
            and 'Generated Text' columns.

    Returns:
        A tuple ``(average_wer, average_cer)``, or ``(0, 0)`` when no data
        rows were found.
    """
    total_wer = 0
    total_cer = 0
    num_samples = 0

    for csv_file in csv_files:
        try:
            file_wer, file_cer, file_rows = _sum_wer_and_cer_in_file(csv_file, 'utf-8')
        except UnicodeDecodeError:
            print(f'UnicodeDecodeError encountered in file {csv_file}, trying with ISO-8859-1 encoding.')
            file_wer, file_cer, file_rows = _sum_wer_and_cer_in_file(csv_file, 'ISO-8859-1')

        total_wer += file_wer
        total_cer += file_cer
        num_samples += file_rows

    if num_samples > 0:
        return total_wer / num_samples, total_cer / num_samples
    return 0, 0

In [4]:
def append_average_wer_and_cer_to_csv(pattern, average_wer, average_cer, output_csv):
    """
    Append one ``[pattern, average_wer, average_cer]`` row to ``output_csv``.

    The header row ('Pattern', 'Average WER', 'Average CER') is written first
    when the file does not exist yet or exists but is empty. The directory
    that should contain ``output_csv`` is created if it is missing.

    Args:
        pattern: Label for the row (the file-name prefix that was averaged).
        average_wer: Average WER value to record.
        average_cer: Average CER value to record.
        output_csv: Path of the CSV file to append to.
    """
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        # Opening in append mode creates the file but not its parent directory.
        os.makedirs(output_dir, exist_ok=True)

    # Decide before opening: opening in 'a' mode would create the file.
    needs_header = not os.path.isfile(output_csv) or os.path.getsize(output_csv) == 0

    with open(output_csv, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(['Pattern', 'Average WER', 'Average CER'])
        writer.writerow([pattern, average_wer, average_cer])

def process_files_with_pattern(pattern, input_path, output_csv):
    """
    Average WER/CER over every '<pattern>-*.csv' file in ``input_path`` and
    append the result to ``output_csv``.

    Prints a notice and returns without writing anything when no file in
    ``input_path`` matches the pattern.

    Args:
        pattern: File-name prefix; files named '<pattern>-*.csv' are selected.
        input_path: Directory searched for the matching CSV files.
        output_csv: Summary CSV that receives the averaged row.
    """
    csv_paths = glob.glob(os.path.join(input_path, f'{pattern}-*.csv'))

    if csv_paths:
        mean_wer, mean_cer = calculate_average_wer_and_cer(csv_paths)
        append_average_wer_and_cer_to_csv(pattern, mean_wer, mean_cer, output_csv)
        print(f'Average WER and CER for pattern {pattern} saved to {output_csv}')
    else:
        print(f'No files found for pattern {pattern}')


In [5]:

# Example usage: write one averaged WER/CER row per immediate subdirectory of
# dataset_folder, using the subdirectory name as the CSV file-name prefix.
# Paths are built with os.path.join so the cell is not tied to Windows
# separators; on Windows the resulting strings are the same as before.
# NOTE(review): the saved output of this cell mentions noise_word_vec paths,
# not the denoised_word_vec paths configured here -- re-run the cell so the
# stored output matches the code.
input_path = os.path.join(
    os.pardir, "outputs", "libri_dataset_outputs", "denoised_word_vec", "without_group")
output_csv = os.path.join(
    os.pardir, "outputs", "libri_dataset_outputs", "denoised_word_vec", "grouped",
    "denoised_word_vec_folder_grouped_WER_CER_output.csv")
dataset_folder = os.path.join(os.pardir, "dataset", "LibriSpeech", "test-clean")

# Close the directory iterator promptly and sort the names so the rows are
# appended in the same order on every platform.
with os.scandir(dataset_folder) as entries:
    top_level_dirs = sorted(entry.name for entry in entries if entry.is_dir())

for pattern in top_level_dirs:
    process_files_with_pattern(pattern, input_path, output_csv)

Average WER and CER for pattern 1089 saved to ..\outputs\libri_dataset_outputs\noise_word_vec\grouped\noise_word_vec_folder_grouped_WER_CER_output.csv
Average WER and CER for pattern 1188 saved to ..\outputs\libri_dataset_outputs\noise_word_vec\grouped\noise_word_vec_folder_grouped_WER_CER_output.csv
Average WER and CER for pattern 121 saved to ..\outputs\libri_dataset_outputs\noise_word_vec\grouped\noise_word_vec_folder_grouped_WER_CER_output.csv
Average WER and CER for pattern 1221 saved to ..\outputs\libri_dataset_outputs\noise_word_vec\grouped\noise_word_vec_folder_grouped_WER_CER_output.csv
Average WER and CER for pattern 1284 saved to ..\outputs\libri_dataset_outputs\noise_word_vec\grouped\noise_word_vec_folder_grouped_WER_CER_output.csv
Average WER and CER for pattern 1320 saved to ..\outputs\libri_dataset_outputs\noise_word_vec\grouped\noise_word_vec_folder_grouped_WER_CER_output.csv
Average WER and CER for pattern 1580 saved to ..\outputs\libri_dataset_outputs\noise_word_vec\g