In [None]:
import os

# HuggingFace downloads have been failing on filename length, because an 80-character hash is used for intermediate filenames while downloading.
# Unfortunately, the only workaround is to shorten our own filenames, which is what this cell does.

# Keywords to remove from filenames.
# shorten_filenames() strips every occurrence of each keyword from ".json" filenames.
KEYWORDS = [
    "custom_sae_",
    "AprilUpdate",
    "EleutherAI_",
    "google_",
    "Trainer",
    "ctx1024",
]

# Set to True to actually rename files, False to just preview changes.
# NOTE: this is currently True, so running the cell renames files on disk right away.
PERFORM_RENAME = True  # <-- Set this to False to preview the changes without renaming


def shorten_filenames(start_path=".", keywords=None, perform_rename=None):
    """Strip known keywords from ``.json`` filenames under ``start_path``.

    Walks ``start_path`` recursively. For every file whose name ends with
    ``.json``, each keyword is removed from the filename (the directory part is
    never touched). Files whose name would not change are ignored.

    A rename is skipped (and reported) when the shortened name is already taken
    in the same directory, either by an existing entry or by another file
    handled earlier in this run. Without that check ``os.rename`` would
    silently overwrite the existing file on POSIX systems.

    Args:
        start_path: Root directory of the walk.
        keywords: Substrings to remove. Defaults to the module-level ``KEYWORDS``.
        perform_rename: ``True`` to rename on disk, ``False`` to only print what
            would happen. Defaults to the module-level ``PERFORM_RENAME``.

    Returns:
        None. A summary (longest name before/after and number of files
        renamed or that would be renamed) is printed.
    """
    if keywords is None:
        keywords = KEYWORDS
    if perform_rename is None:
        perform_rename = PERFORM_RENAME

    count = 0
    max_begin_length = 0
    max_end_length = 0

    for root, dirs, files in os.walk(start_path):
        # Every name currently used in this directory, plus names claimed by
        # renames in this run, so collisions are caught in preview mode too.
        taken = set(files) | set(dirs)

        for filename in files:
            # Only process JSON files
            if not filename.endswith(".json"):
                continue

            new_filename = filename
            for keyword in keywords:
                new_filename = new_filename.replace(keyword, "")

            if new_filename == filename:
                continue

            if new_filename in taken:
                print(
                    f"Skipping {filename}: target name {new_filename} "
                    f"already exists in {root}"
                )
                continue

            old_path = os.path.join(root, filename)
            # Only the filename is modified, root path stays the same
            new_path = os.path.join(root, new_filename)

            if perform_rename:
                try:
                    os.rename(old_path, new_path)
                except OSError as e:
                    # A failed rename must not be counted as a renamed file.
                    print(f"Error renaming {filename}: {e}")
                    continue
            else:
                print(f"In directory: {root}")
                print(f"Would rename: {filename}\n         -> {new_filename}")
                print(
                    f"Old length: {len(filename)}, New length: {len(new_filename)}\n"
                )

            taken.add(new_filename)
            max_begin_length = max(max_begin_length, len(filename))
            max_end_length = max(max_end_length, len(new_filename))
            count += 1

    print(f"Max filename length before: {max_begin_length}, after: {max_end_length}")
    print(
        f"\nTotal files {'renamed' if perform_rename else 'that would be renamed'}: {count}"
    )


# Example usage: walk the tree rooted at `path` and shorten matching ".json" filenames.
# Whether files are really renamed or only previewed is controlled by PERFORM_RENAME.
path = "."  # Adjust this path as needed
shorten_filenames(path)

In [None]:
import shutil


def organize_files_in_folders(folder_paths, prefixes=None):
    """Move prefixed ``.json`` files into per-prefix subfolders.

    For each folder in ``folder_paths``, every ``.json`` file directly inside it
    whose name starts with one of ``prefixes`` is moved to
    ``<folder>/<prefix>/<file>``. The subfolder is created when missing.

    All paths are built with ``os.path.join`` instead of changing the working
    directory, so the process working directory is never modified, even when a
    folder is nested or a move fails.

    Args:
        folder_paths: Iterable of folder paths to process.
        prefixes: Filename prefixes to group by. Defaults to the single
            SAE Bench pythia-160m prefix used by this notebook.

    Raises:
        ValueError: If a folder contains ``.json`` files but none of them
            starts with any of the prefixes.
    """
    if prefixes is None:
        # Define the prefixes we're looking for
        prefixes = [
            "saebench_pythia-160m-deduped_width-2pow14_date-0108",
            # 'sae_bench_gemma-2-2b_vanilla_width-2pow16_date-1109'
        ]

    for folder in folder_paths:
        if not os.path.exists(folder):
            print(f"Folder {folder} does not exist. Skipping.")
            continue

        # Get all json files inside the folder being processed
        files = [f for f in os.listdir(folder) if f.endswith(".json")]
        if len(files) == 0:
            print(f"No json files found in {folder}. Skipping.")
            continue

        if not any(f.startswith(prefix) for f in files for prefix in prefixes):
            raise ValueError(f"No files with prefix {prefixes} found in {folder}.")

        print(f"Processing folder: {folder}")

        # Create folders if they don't exist
        for prefix in prefixes:
            os.makedirs(os.path.join(folder, prefix), exist_ok=True)

        # Move files to appropriate folders (first matching prefix wins)
        for file in files:
            for prefix in prefixes:
                if file.startswith(prefix):
                    shutil.move(
                        os.path.join(folder, file),
                        os.path.join(folder, prefix, file),
                    )
                    print(f"Moved {file} to {prefix}/")
                    break


# In a notebook kernel __name__ is "__main__", so this block runs whenever the cell is executed.
if __name__ == "__main__":
    # List of folders to process (relative paths, resolved against the working directory)
    folder_paths = [
        "absorption",
        "autointerp",
        "core",
        "scr",
        "sparse_probing",
        "tpp",
        "unlearning",
    ]

    organize_files_in_folders(folder_paths)

In [None]:


def copy_autointerp(source="autointerp", destination="autointerp_with_generations"):
    """Make a full recursive copy of ``source`` at ``destination``.

    Args:
        source: Folder to copy. Defaults to ``"autointerp"``.
        destination: Path of the copy. Defaults to
            ``"autointerp_with_generations"``. It must not exist yet.

    Returns:
        None. When ``source`` does not exist, a message is printed and nothing
        is copied. A failure during the copy itself is printed, not raised.

    Raises:
        FileExistsError: If ``destination`` already exists, so that an earlier
            copy is never overwritten.
    """
    if not os.path.exists(source):
        print(f"Source folder '{source}' does not exist. Aborting.")
        return

    if os.path.exists(destination):
        raise FileExistsError(
            f"Destination folder '{destination}' already exists. Aborting."
        )

    try:
        shutil.copytree(source, destination)
        print(f"Copied '{source}' to '{destination}'.")
    except OSError as e:
        # shutil.Error is an OSError subclass, so partial-copy errors land here too.
        print(f"Failed to copy '{source}' to '{destination}': {e}")


# In a notebook kernel __name__ is "__main__", so the copy runs whenever this cell is executed.
if __name__ == "__main__":
    copy_autointerp()

In [None]:
# purpose: remove the llm generations, which is over 99% of the file size

import json


def process_json_files(directory, key="eval_result_unstructured"):
    """Remove a top-level key from every JSON file under ``directory``.

    Walks ``directory`` recursively. Each ``.json`` file whose top-level value
    is an object containing ``key`` is rewritten in place without that key.
    Files that do not contain the key, or whose top-level value is not an
    object, are left untouched on disk.

    Args:
        directory: Root directory of the walk.
        key: Top-level key to delete. Defaults to ``"eval_result_unstructured"``.

    Returns:
        The number of files that were rewritten.

    Errors while reading, parsing or writing a file are printed and the walk
    continues with the next file.
    """
    count = 0
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".json"):
                continue

            filepath = os.path.join(root, file)
            try:
                # Read the JSON file
                with open(filepath, encoding="utf-8") as f:
                    data = json.load(f)

                # Only a JSON object can carry the key; skip anything else and
                # avoid rewriting files that need no change.
                if not isinstance(data, dict) or key not in data:
                    continue

                del data[key]

                # Write back the modified data
                with open(filepath, "w", encoding="utf-8") as f:
                    json.dump(data, f)

                # Count only after the file was written successfully.
                count += 1

            except Exception as e:
                print(f"Error processing {filepath}: {str(e)}")

    return count


# Process files starting from the "autointerp" folder (relative to the working directory).
# Files are modified in place; the copy cell earlier in this notebook is what
# preserves the originals in "autointerp_with_generations".
starting_dir = "autointerp"
files_modified = process_json_files(starting_dir)
print(f"\nCompleted! Modified {files_modified} files.")

In [None]:


def copy_autointerp(source="core", destination="core_with_feature_statistics"):
    """Make a full recursive copy of ``source`` at ``destination``.

    NOTE: despite its name, this definition copies the ``core`` folder by
    default. It redefines the function of the same name from an earlier cell.

    Args:
        source: Folder to copy. Defaults to ``"core"``.
        destination: Path of the copy. Defaults to
            ``"core_with_feature_statistics"``. It must not exist yet.

    Returns:
        None. When ``source`` does not exist, a message is printed and nothing
        is copied. A failure during the copy itself is printed, not raised.

    Raises:
        FileExistsError: If ``destination`` already exists, so that an earlier
            copy is never overwritten.
    """
    if not os.path.exists(source):
        print(f"Source folder '{source}' does not exist. Aborting.")
        return

    if os.path.exists(destination):
        raise FileExistsError(
            f"Destination folder '{destination}' already exists. Aborting."
        )

    try:
        shutil.copytree(source, destination)
        print(f"Copied '{source}' to '{destination}'.")
    except OSError as e:
        # shutil.Error is an OSError subclass, so partial-copy errors land here too.
        print(f"Failed to copy '{source}' to '{destination}': {e}")


# In a notebook kernel __name__ is "__main__", so the copy runs whenever this cell is executed.
# The copy_autointerp defined in this cell copies "core", not "autointerp".
if __name__ == "__main__":
    copy_autointerp()

In [None]:
# purpose: remove the feature statistics, which is over 99% of the file size



def process_json_files(directory, key="eval_result_details"):
    """Remove a top-level key from every JSON file under ``directory``.

    Walks ``directory`` recursively. Each ``.json`` file whose top-level value
    is an object containing ``key`` is rewritten in place without that key.
    Files that do not contain the key, or whose top-level value is not an
    object, are left untouched on disk.

    NOTE: this redefines the function of the same name from an earlier cell;
    the only difference is the default ``key``.

    Args:
        directory: Root directory of the walk.
        key: Top-level key to delete. Defaults to ``"eval_result_details"``.

    Returns:
        The number of files that were rewritten.

    Errors while reading, parsing or writing a file are printed and the walk
    continues with the next file.
    """
    count = 0
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".json"):
                continue

            filepath = os.path.join(root, file)
            try:
                # Read the JSON file
                with open(filepath, encoding="utf-8") as f:
                    data = json.load(f)

                # Only a JSON object can carry the key; skip anything else and
                # avoid rewriting files that need no change.
                if not isinstance(data, dict) or key not in data:
                    continue

                del data[key]

                # Write back the modified data
                with open(filepath, "w", encoding="utf-8") as f:
                    json.dump(data, f)

                # Count only after the file was written successfully.
                count += 1

            except Exception as e:
                print(f"Error processing {filepath}: {str(e)}")

    return count


# Process files starting from the "core" folder (relative to the working directory).
# Files are modified in place; the copy cell earlier in this notebook is what
# preserves the originals in "core_with_feature_statistics".
starting_dir = "core"
files_modified = process_json_files(starting_dir)
print(f"\nCompleted! Modified {files_modified} files.")

In [None]:
import re

from tqdm import tqdm

# Root under which all result folders live.
local_dir = "."

# Every result folder visited by main(), including the two full-size copies.
folders = [
    f"{local_dir}/{folder_name}"
    for folder_name in (
        "absorption",
        "autointerp",
        "core",
        "scr",
        "sparse_probing",
        "tpp",
        "unlearning",
        "autointerp_with_generations",
        "core_with_feature_statistics",
    )
]


def get_sae_bench_train_tokens(filename) -> int:
    """
    Derive the number of training tokens from an SAE Bench result filename.

    A ``step_<N>`` marker in ``filename`` means N steps; a name without any
    "step" text uses the fixed step count of 244140. Steps are multiplied by
    the batch size of 2048.

    Raises:
        ValueError: If "saebench" is not part of ``filename``, or if the name
            contains "step" but no ``step_<digits>`` marker.
    """
    if "saebench" not in filename:
        raise ValueError("This function is only for SAE Bench releases")

    tokens_per_step = 2048

    step_marker = re.search(r"step_(\d+)", filename)
    if step_marker is not None:
        return int(step_marker.group(1)) * tokens_per_step

    # "step" without a parsable number is treated as an error, not as the default.
    if "step" in filename:
        raise ValueError("No step match found")

    return 244140 * tokens_per_step


def process_file(filename: str):
    """
    Add ``training_tokens`` to the ``sae_cfg_dict`` of one JSON result file.

    The file is loaded, the token count derived from ``filename`` is stored
    under ``sae_cfg_dict["training_tokens"]``, and the file is written back
    with 4-space indentation.

    Returns True on success. Returns False (after printing the error) when the
    JSON is invalid, ``sae_cfg_dict`` is missing, or a ValueError is raised.
    """
    try:
        with open(filename) as source:
            payload = json.load(source)

        if "sae_cfg_dict" not in payload:
            raise KeyError("sae_cfg_dict not found in the JSON file")

        # Compute first so that a failure leaves the file on disk untouched.
        token_count = get_sae_bench_train_tokens(filename)
        payload["sae_cfg_dict"]["training_tokens"] = token_count

        with open(filename, "w") as target:
            json.dump(payload, target, indent=4)
    except (json.JSONDecodeError, KeyError, ValueError) as e:
        print(f"Error processing file {filename}: {e}")
        return False

    return True


def main():
    """Run process_file on every ``.json`` file under each entry of ``folders``.

    Folders that do not exist are reported and skipped. At the end, the number
    of JSON files seen and the number successfully updated are printed.
    """
    seen = 0
    updated = 0

    for folder in tqdm(folders, desc="Processing folders"):
        if not os.path.exists(folder):
            print(f"Folder {folder} does not exist. Skipping.")
            continue

        for dirpath, _, names in os.walk(folder):
            json_names = [name for name in names if name.endswith(".json")]
            for name in json_names:
                seen += 1
                if process_file(os.path.join(dirpath, name)):
                    updated += 1

    print(f"Total files: {seen}, Total updated: {updated}")


# In a notebook kernel __name__ is "__main__", so main() runs whenever this cell is executed.
if __name__ == "__main__":
    main()

In [None]:
# Deliberate stop: raising here interrupts a top-to-bottom run of the notebook,
# presumably so the upload cell below only runs when executed explicitly.
raise Exception("Stop here")

In [None]:
from huggingface_hub import HfApi

# Hub client; no token is passed here, so credentials presumably come from the
# local Hugging Face login/environment (TODO confirm).
api = HfApi()

# Upload the working directory to the dataset repository on the Hugging Face Hub.
api.upload_large_folder(
    folder_path=".",
    # path_in_repo="",
    repo_id="adamkarvonen/new_sae_bench_results",
    repo_type="dataset",
    # Restricts the upload to result files; exact glob semantics are defined by
    # huggingface_hub. NOTE(review): confirm this pattern matches files in subfolders.
    allow_patterns="*eval_results.json",
    ignore_patterns=[".DS_Store"],
)