# Merge biological replicates of image batches

In [15]:
import os
import re
import shutil
from tqdm import tqdm
import numpy as np

# Helper function to get allele file mapping for a directory.
def get_allele_files(dir_path):
    """
    Returns a dict mapping allele -> {file_type: full_path}.
    Expected file pattern: {allele}_{file_type}.npy, where file_type is label, pro, or nuc.
    """
    allele_files = {}
    pattern = re.compile(r'^(?P<allele>.+)_(?P<type>label|pro|nuc)\.npy$')
    for filename in os.listdir(dir_path):
        if filename.endswith('.npy'):
            match = pattern.match(filename)
            if match:
                allele = match.group("allele")
                file_type = match.group("type")
                allele_files.setdefault(allele, {})[file_type] = os.path.join(dir_path, filename)
            else:
                print(f"Warning: {filename} in {dir_path} does not match expected pattern")
    return allele_files

# List of directories to merge
dirs_to_merge = [
    "../inputs/1_model_input/2024_01_23_Batch_7",
    "../inputs/1_model_input/2024_02_06_Batch_8",
    "../inputs/1_model_input/2025_01_27_Batch_13",
    "../inputs/1_model_input/2025_01_28_Batch_14",
    "../inputs/1_model_input/2025_03_17_Batch_15",
    "../inputs/1_model_input/2025_03_17_Batch_16",
]

merged_dir = "../inputs/1_model_input/2024_05_B78-1314-1516"
# Ensure merged_dir exists
os.makedirs(merged_dir, exist_ok=True)

## Merge across multiple directories

In [None]:
# final_total_cells = 0
# for batch_dir in dirs_to_merge:
#     # Get file mappings from both directories.
#     allele_files = get_allele_files(batch_dir)
#     alleles = set(allele_files.keys())
#     # Expected file types for each allele
#     expected_types = ["label", "pro", "nuc"]
#     total_cells = 0
#     for allele in sorted(alleles):
#         for file_type in expected_types:
#             path1 = allele_files[allele][file_type]
#             arr1 = np.load(path1, allow_pickle=True)
#             if file_type == "label":
#                 cell_num_1 = arr1.shape[0]
#                 assert arr1.shape[1] == 2
#             else:
#                 assert arr1.shape[0] == cell_num_1
#         total_cells += arr1.shape[0]
#     print(batch_dir, total_cells)
#     final_total_cells += total_cells
# print("Final total cells:", final_total_cells)

In [17]:
# Collect allele files from all directories
allele_files_list = [get_allele_files(dir_path) for dir_path in dirs_to_merge]

# Determine the union of alleles across all directories
all_alleles = set()
for allele_files in allele_files_list:
    all_alleles.update(allele_files.keys())

# Expected file types for each allele
expected_types = ["label", "pro", "nuc"]

# Initialize counters
total_cells = 0
cells_per_dir = [0] * len(dirs_to_merge)

# Merge files for each allele
for allele in tqdm(sorted(all_alleles)):
    merged_data = {file_type: [] for file_type in expected_types}

    # Iterate through each directory and collect data for the current allele
    for i, allele_files in enumerate(allele_files_list):
        if allele in allele_files:
            for file_type in expected_types:
                if file_type in allele_files[allele]:
                    path = allele_files[allele][file_type]
                    arr = np.load(path, allow_pickle=True)
                    merged_data[file_type].append(arr)
                    if file_type == "label":
                        cell_num = arr.shape[0]
                        cells_per_dir[i] += cell_num

    # Concatenate data across all directories for the current allele
    for file_type, arrays in merged_data.items():
        if arrays:  # Only process if data exists for this file type
            merged_arr = np.concatenate(arrays, axis=0)
            out_filename = f"{allele}_{file_type}.npy"
            out_path = os.path.join(merged_dir, out_filename)

            if file_type == "label":
                assert merged_arr.shape[1] == 2, f"{allele} label shape mismatch"
            else:
                assert merged_arr.shape[1:] == (100, 100), f"{allele} crop dim mismatch"

            np.save(out_path, merged_arr)
            # print(f"Merged {file_type} for allele {allele} into {out_path}")

    # Update total cell count
    total_cells += merged_arr.shape[0]

# Print summary
print(f"Total cells merged: {total_cells}")
for i, dir_path in enumerate(dirs_to_merge):
    print(f"Cells from {dir_path}: {cells_per_dir[i]}")

  0%|          | 0/1568 [00:00<?, ?it/s]

100%|██████████| 1568/1568 [13:02<00:00,  2.00it/s]  

Total cells merged: 2378341
Cells from ../inputs/1_model_input/2024_01_23_Batch_7: 727589
Cells from ../inputs/1_model_input/2024_02_06_Batch_8: 119471
Cells from ../inputs/1_model_input/2025_01_27_Batch_13: 457223
Cells from ../inputs/1_model_input/2025_01_28_Batch_14: 436435
Cells from ../inputs/1_model_input/2025_03_17_Batch_15: 293390
Cells from ../inputs/1_model_input/2025_03_17_Batch_16: 344233





## Examine the number of cell crops per each batch (biological replicates)

In [None]:
# Get file mappings from both directories.
allele_files = get_allele_files(dir_1)
alleles = set(allele_files.keys())
# Expected file types for each allele
expected_types = ["label", "pro", "nuc"]

total_cells = 0
for allele in sorted(alleles):
    for file_type in expected_types:
        path1 = allele_files[allele][file_type]
        arr1 = np.load(path1, allow_pickle=True)
        
        if file_type == "label":
            cell_num_1 = arr1.shape[0]
            assert arr1.shape[1] == 2
        else:
            assert arr1.shape[0] == cell_num_1
    total_cells += arr1.shape[0]
print(total_cells)

705310


In [None]:
# Get file mappings from both directories.
allele_files = get_allele_files(dir_2)
alleles = set(allele_files.keys())
# Expected file types for each allele
expected_types = ["label", "pro", "nuc"]

total_cells = 0
for allele in sorted(alleles):
    for file_type in expected_types:
        path1 = allele_files[allele][file_type]
        arr1 = np.load(path1, allow_pickle=True)
        
        if file_type == "label":
            cell_num_1 = arr1.shape[0]
            assert arr1.shape[1] == 2
        else:
            assert arr1.shape[0] == cell_num_1
    total_cells += arr1.shape[0]
print(total_cells)

110912


### Merge two batches

In [None]:
# Get file mappings from both directories.
allele_files1 = get_allele_files(dir_1)
allele_files2 = get_allele_files(dir_2)

# Determine the union of alleles across both directories.
alleles = set(allele_files1.keys()) | set(allele_files2.keys())

# Expected file types for each allele
expected_types = ["label", "pro", "nuc"]

print(allele_files1)
print(allele_files2)

cells_dir1, cells_dir2, total_cells = 0, 0, 0

for allele in sorted(alleles):
    # Check for missing files in each directory that contains the allele.
    if allele in allele_files1:
        files1 = allele_files1[allele]
        missing = expected_types - files1.keys()
        if missing:
            raise Exception(f"Allele {allele} in {dir_1} is missing files: {missing}")
    if allele in allele_files2:
        files2 = allele_files2[allele]
        missing = expected_types - files2.keys()
        if missing:
            raise Exception(f"Allele {allele} in {dir_2} is missing files: {missing}")
    
    # If allele exists in both directories, stack the arrays.
    if allele in allele_files1 and allele in allele_files2:
        for file_type in expected_types:
            path1 = allele_files1[allele][file_type]
            path2 = allele_files2[allele][file_type]
            arr1 = np.load(path1, allow_pickle=True)
            arr2 = np.load(path2, allow_pickle=True)
            merged_arr = np.concatenate([arr1, arr2], axis=0)
            out_filename = f"{allele}_{file_type}.npy"
            out_path = os.path.join(merged_dir, out_filename)
            
            if file_type == "label":
                cell_num_1, cell_num_2 = arr1.shape[0], arr2.shape[0]
                assert merged_arr.shape[1] == 2
                print(f"Stacked {os.path.basename(path1)} {os.path.basename(path2)} to {out_path}")
            else:
                assert merged_arr.shape[1] == 100 and merged_arr.shape[2] == 100, f"{allele} cell crop dim. doesn't match"

            assert merged_arr.shape[0] == (cell_num_1+cell_num_2), f"{allele} cell number doesn't match"
            np.save(out_path, merged_arr)
        
        cells_dir1 += cell_num_1
        cells_dir2 += cell_num_2
        total_cells += merged_arr.shape[0]

    # If allele exists only in one directory, copy the files over.
    else:
        src_dir = dir_1 if allele in allele_files1 else dir_2
        allele_files = allele_files1.get(allele, allele_files2.get(allele))
        for file_type in expected_types:
            src_path = allele_files[file_type]
            arr1 = np.load(src_path, allow_pickle=True)
            if file_type == "label":
                cell_num = arr1.shape[0]
                print(f"Copied {os.path.basename(src_path)} from {src_dir} to {dest_path}")
                    
            dest_path = os.path.join(merged_dir, os.path.basename(src_path))
            shutil.copy(src_path, dest_path)
            
        if src_dir == dir_1:
            cells_dir1 += cell_num
        else:
            cells_dir2 += cell_num

        total_cells += cell_num

{'KRT6A-Ile178Asn': {'label': '../model_inputs/2024_01_23_Batch_7/KRT6A-Ile178Asn_label.npy', 'nuc': '../model_inputs/2024_01_23_Batch_7/KRT6A-Ile178Asn_nuc.npy', 'pro': '../model_inputs/2024_01_23_Batch_7/KRT6A-Ile178Asn_pro.npy'}, 'PKP2-Ser227Arg': {'pro': '../model_inputs/2024_01_23_Batch_7/PKP2-Ser227Arg_pro.npy', 'label': '../model_inputs/2024_01_23_Batch_7/PKP2-Ser227Arg_label.npy', 'nuc': '../model_inputs/2024_01_23_Batch_7/PKP2-Ser227Arg_nuc.npy'}, 'RNF135-Arg286His': {'pro': '../model_inputs/2024_01_23_Batch_7/RNF135-Arg286His_pro.npy', 'nuc': '../model_inputs/2024_01_23_Batch_7/RNF135-Arg286His_nuc.npy', 'label': '../model_inputs/2024_01_23_Batch_7/RNF135-Arg286His_label.npy'}, 'CRYAB-Gly154Ser': {'nuc': '../model_inputs/2024_01_23_Batch_7/CRYAB-Gly154Ser_nuc.npy', 'pro': '../model_inputs/2024_01_23_Batch_7/CRYAB-Gly154Ser_pro.npy', 'label': '../model_inputs/2024_01_23_Batch_7/CRYAB-Gly154Ser_label.npy'}, 'STXBP1-Cys354Arg': {'label': '../model_inputs/2024_01_23_Batch_7/STXBP

Copied ACSF3-Arg471Trp_label.npy from ../model_inputs/2024_01_23_Batch_7/ to ../model_inputs/2024_01-02_Batch_7-8/ACSF3-Ala197Thr_nuc.npy
Copied ACSF3-Arg558Trp_label.npy from ../model_inputs/2024_01_23_Batch_7/ to ../model_inputs/2024_01-02_Batch_7-8/ACSF3-Arg471Trp_nuc.npy
Copied ACSF3-Asp457Asn_label.npy from ../model_inputs/2024_01_23_Batch_7/ to ../model_inputs/2024_01-02_Batch_7-8/ACSF3-Arg558Trp_nuc.npy
Copied ACSF3-Glu359Lys_label.npy from ../model_inputs/2024_01_23_Batch_7/ to ../model_inputs/2024_01-02_Batch_7-8/ACSF3-Asp457Asn_nuc.npy
Copied ACSF3-Gly225Arg_label.npy from ../model_inputs/2024_01_23_Batch_7/ to ../model_inputs/2024_01-02_Batch_7-8/ACSF3-Glu359Lys_nuc.npy
Copied ACSF3-Ile200Met_label.npy from ../model_inputs/2024_01_23_Batch_7/ to ../model_inputs/2024_01-02_Batch_7-8/ACSF3-Gly225Arg_nuc.npy
Copied ACSF3-Met198Arg_label.npy from ../model_inputs/2024_01_23_Batch_7/ to ../model_inputs/2024_01-02_Batch_7-8/ACSF3-Ile200Met_nuc.npy
Copied ACSF3-Met266Val_label.npy f

In [None]:
# Get file mappings from both directories.
allele_files = get_allele_files(merged_dir)
alleles = set(allele_files.keys())

# Expected file types for each allele
expected_types = ["label", "pro", "nuc"]

total_cells = 0
for allele in sorted(alleles):
    for file_type in expected_types:
        path1 = allele_files[allele][file_type]
        arr1 = np.load(path1, allow_pickle=True)
        
        if file_type == "label":
            cell_num_1 = arr1.shape[0]
            assert arr1.shape[1] == 2
        else:
            assert arr1.shape[0] == cell_num_1
    total_cells += arr1.shape[0]
print(total_cells)

816222


In [None]:
# 816222 - (669302 + 66931) # + 66931
# 79989 / 816222
# 66931 / 816222