In [15]:
import os
import pandas as pd

In [16]:
def calculate_cer(reference, hypothesis):
    """
    Calculate the Character Error Rate (CER) between two strings.

    CER is the Levenshtein (edit) distance between the lower-cased reference
    and the lower-cased hypothesis, divided by the number of characters in the
    lower-cased reference. The comparison is therefore case-insensitive.

    Args:
        reference: Ground-truth text (str).
        hypothesis: Text to evaluate against the reference (str).

    Returns:
        float: The error rate as a fraction (not a percentage). 0.0 means the
        strings match; the value can exceed 1.0 when the hypothesis is much
        longer than the reference. For an empty reference the ratio is
        undefined, so by convention 0.0 is returned when the hypothesis is
        empty as well and 1.0 otherwise.
    """
    reference = reference.lower()
    hypothesis = hypothesis.lower()

    # Guard the normalisation below: dividing by len("") would raise
    # ZeroDivisionError.
    if not reference:
        return 0.0 if not hypothesis else 1.0

    # Rolling-row Levenshtein: only the previous row of the distance table is
    # ever read, so there is no need to keep the full matrix in memory.
    # previous_row[j] = distance between the empty prefix and hypothesis[:j].
    previous_row = list(range(len(hypothesis) + 1))

    for i, ref_char in enumerate(reference, start=1):
        # Column 0: turning reference[:i] into "" takes i removals.
        current_row = [i]
        for j, hyp_char in enumerate(hypothesis, start=1):
            if ref_char == hyp_char:
                # Matching characters cost nothing extra.
                current_row.append(previous_row[j - 1])
            else:
                current_row.append(1 + min(current_row[j - 1],    # Insert
                                           previous_row[j],       # Remove
                                           previous_row[j - 1]))  # Replace
        previous_row = current_row

    # Normalise the edit distance by the reference length.
    return previous_row[-1] / len(reference)



In [17]:
# Directory holding the result CSVs to annotate. Built with os.path.join so the
# notebook also runs on non-Windows hosts; on Windows this yields the same
# backslash-separated path as before.
input_directory = os.path.join(
    "..", "outputs", "libri_dataset_outputs", "noise_word_vec", "without_group"
)

# Columns each CSV must provide for the CER computation.
required_columns = ('Expected Text', 'Generated Text')

# Iterate through all CSV files in the directory (sorted for a reproducible order)
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_directory, filename)

    # Try utf-8 first and fall back to latin1. 'ISO-8859-1' is an alias of
    # latin1 in Python, and latin1 can decode any byte sequence, so a third
    # entry would never be reached.
    encodings = ['utf-8', 'latin1']
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding)
            break
        except UnicodeDecodeError:
            print(f"Failed to decode {filename} with {encoding} encoding.")
    else:
        print(f"Unable to decode {filename} with any of the provided encodings.")
        continue

    # Skip files without the expected schema instead of aborting the whole
    # batch with a KeyError after some files have already been rewritten.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Skipping {filename}: missing column(s) {missing_columns}.")
        continue

    # Empty CSV cells are read as NaN; str(NaN) would be scored as the literal
    # text "nan". Treat missing text as an empty string instead.
    expected_text = df['Expected Text'].fillna('').astype(str)
    generated_text = df['Generated Text'].fillna('').astype(str)

    # Add a new column for CER. CER is undefined for an empty reference, so
    # those rows get NaN rather than a made-up score.
    df['CER'] = [
        calculate_cer(reference, hypothesis) if reference else float('nan')
        for reference, hypothesis in zip(expected_text, generated_text)
    ]

    # NOTE: the output path is the input path, so the source CSV is overwritten
    # in place with the extra CER column (written with to_csv's default
    # encoding, utf-8).
    output_file_path = file_path
    df.to_csv(output_file_path, index=False)

    print(f"CER calculation completed and saved to '{output_file_path}'")

CER calculation completed and saved to '..\outputs\libri_dataset_outputs\noise_word_vec\without_group\1089-134686.csv'
CER calculation completed and saved to '..\outputs\libri_dataset_outputs\noise_word_vec\without_group\1089-134691.csv'
CER calculation completed and saved to '..\outputs\libri_dataset_outputs\noise_word_vec\without_group\1188-133604.csv'
CER calculation completed and saved to '..\outputs\libri_dataset_outputs\noise_word_vec\without_group\121-121726.csv'
CER calculation completed and saved to '..\outputs\libri_dataset_outputs\noise_word_vec\without_group\121-123852.csv'
CER calculation completed and saved to '..\outputs\libri_dataset_outputs\noise_word_vec\without_group\121-123859.csv'
CER calculation completed and saved to '..\outputs\libri_dataset_outputs\noise_word_vec\without_group\121-127105.csv'
CER calculation completed and saved to '..\outputs\libri_dataset_outputs\noise_word_vec\without_group\1221-135766.csv'
CER calculation completed and saved to '..\outputs\l