In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def get_file_lengths(folder_path):
    """Map each regular file in ``folder_path`` to its length in characters.

    Directory entries are visited in sorted name order; anything that is not
    a regular file (e.g. a sub-directory) is ignored. Files are decoded as
    UTF-8 and the length is the number of decoded characters, not bytes.

    Args:
        folder_path: Path of the directory to scan (not recursive).

    Returns:
        dict: ``{filename: character_count}``, inserted in sorted name order.
    """
    char_counts = {}
    for name in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, name)
        # Skip sub-directories and other non-file entries.
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r', encoding='utf-8') as handle:
            char_counts[name] = len(handle.read())
    return char_counts

In [None]:
def compare_folder_lengths(folder1, folder2):
    """Return paired character lengths for the same-named files of two folders.

    Both folders are scanned with ``get_file_lengths``. They must contain
    exactly the same set of filenames.

    Args:
        folder1: Path of the first folder.
        folder2: Path of the second folder.

    Returns:
        tuple[list, list]: Lengths from ``folder1`` and from ``folder2``, both
        ordered by sorted filename so that index ``i`` in each list refers to
        the same file.

    Raises:
        ValueError: If the two folders do not hold the same set of filenames.
    """
    first = get_file_lengths(folder1)
    second = get_file_lengths(folder2)

    # Key views compare like sets, so ordering differences do not matter here.
    if first.keys() != second.keys():
        raise ValueError("Folders do not contain the same files.")

    # The key sets are equal, so one sorted name list serves both folders.
    ordered_names = sorted(first)
    return (
        [first[name] for name in ordered_names],
        [second[name] for name in ordered_names],
    )

In [None]:
# The two folders whose same-named files are compared.
folder1, folder2 = "test/kaminski-v", "test/kaminski-nyt"

# Per-file character counts, aligned by sorted filename; these names are
# reused by the plotting cell further down.
lengths1, lengths2 = compare_folder_lengths(folder1=folder1, folder2=folder2)

print("Folder 1 lengths:", lengths1)
print("Folder 2 lengths:", lengths2)

In [None]:
def _count_lengths_per_bin(lengths, upper_bounds):
    """Count how many lengths fall into each right-inclusive bin.

    Bin ``i`` covers ``(upper_bounds[i-1], upper_bounds[i]]``; the first bin
    covers everything up to and including ``upper_bounds[0]``. Values above the
    last bound are capped so they land in the last bin.
    """
    capped = np.minimum(np.asarray(lengths, dtype=float), upper_bounds[-1])
    # right=True makes each upper bound inclusive (bins[i-1] < x <= bins[i]).
    bin_index = np.digitize(capped, upper_bounds, right=True)
    return np.bincount(bin_index, minlength=len(upper_bounds))


def plot_binned_file_lengths(lengths1, lengths2, folder1_label='Cleaned', folder2_label='Generated'):
    """
    Plot a grouped grayscale bar chart of file lengths in fixed bins.

    Bins (inclusive on both ends, matching the tick labels):
    - [0–128], [129–256], [257–512], [513–5120]

    Lengths greater than 5120 are counted in the last bin. Previously the
    counts came from ``np.histogram``, whose bins are half-open ``[lo, hi)``,
    so a length of exactly 128, 256 or 512 was counted one bin too high
    relative to the labels.

    Args:
        lengths1: Iterable of non-negative character counts for the first series.
        lengths2: Iterable of non-negative character counts for the second series.
        folder1_label: Legend label for ``lengths1``.
        folder2_label: Legend label for ``lengths2``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Inclusive upper bound of each bin, and the matching tick labels
    bin_upper_bounds = [128, 256, 512, 5120]
    bin_labels = ['0–128', '129–256', '257–512', '513–5120']

    # Get per-bin counts (values above the last bound go into the last bin)
    counts1 = _count_lengths_per_bin(lengths1, bin_upper_bounds)
    counts2 = _count_lengths_per_bin(lengths2, bin_upper_bounds)

    # X-axis positions (equidistant, independent of the actual bin widths)
    x = np.arange(len(bin_labels))
    bar_width = 0.4

    # Plot the two series side by side around each tick
    plt.figure(figsize=(10, 6))
    plt.bar(x - bar_width / 2, counts1, width=bar_width, label=folder1_label, color='lightgray')
    plt.bar(x + bar_width / 2, counts2, width=bar_width, label=folder2_label, color='dimgray')

    # Labeling and styling
    plt.xlabel('Character Count Range', fontsize=16)
    plt.ylabel('Number of Files', fontsize=16)
    # plt.title('Distribution of File Lengths (Grayscale Histogram)', fontsize=16)
    plt.xticks(x, bin_labels, rotation=0, fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(fontsize=16)
    plt.grid(axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()

    plt.show()

In [None]:
# Draw the binned comparison using the function's default legend labels.
plot_binned_file_lengths(lengths1=lengths1, lengths2=lengths2)