In [2]:
import pandas as pd
import os
import glob
import subprocess
from google.colab import drive

# ======================================================================
# Cell 1: Install All Dependencies & Mount Drive
# ======================================================================
print("Installing required libraries...")
!pip install -q pandas numpy torch torchvision transformers timm pillow tqdm scikit-learn umap-learn plotly awscli openpyxl pyarrow

print("\nMounting Google Drive...")
try:
    drive.mount('/content/drive')
    print("Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting drive: {e}")

# ======================================================================
# Cell 2: Clone Repo
# ======================================================================
print("\nCloning GitHub repository...")
# Clean up previous clone if it exists, just in case
!rm -rf 2024_Chandrasekaran_NatureMethods_CPJUMP1
!git clone https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1.git

try:
    %cd 2024_Chandrasekaran_NatureMethods_CPJUMP1
    print(f"Current directory set to: {os.getcwd()}")
except FileNotFoundError:
    print("ERROR: git clone failed. Cannot change directory.")
    # Stop the script if cloning failed
    raise SystemExit("Stopping script due to clone failure.")

# ======================================================================
# Cell 3: Find Batch Names (A quick pre-load)
# ======================================================================
barcode_files_temp = glob.glob("metadata/platemaps/*/barcode_platemap.csv")
all_experiments_temp = []
for f in barcode_files_temp:
    timepoint_name = f.split('/')[2]
    temp_df = pd.read_csv(f)
    temp_df['Timepoint'] = timepoint_name
    all_experiments_temp.append(temp_df)
main_df_temp = pd.concat(all_experiments_temp, ignore_index=True)

# *** THIS IS THE FIX (Part 1) ***
# Get ALL batch names, not just the first one
all_batch_names = main_df_temp['Timepoint'].unique()
print(f"Found {len(all_batch_names)} batches to download: {all_batch_names}")

# ======================================================================
# Cell 4: Download Pre-Computed Profiles (Now loops)
# ======================================================================

# *** THIS IS THE FIX (Part 2) ***
# Loop through every batch and download its profiles
for batch_to_download in all_batch_names:
    print(f"\n--- Downloading profiles for batch: {batch_to_download} ---")
    s3_path = f"s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/backend/{batch_to_download}/"
    local_dir = f"profiles/{batch_to_download}"
    os.makedirs(local_dir, exist_ok=True)

    # Download command (excluding giant .sqlite files)
    aws_command = [
        "aws", "s3", "cp", "--no-sign-request", "--recursive",
        s3_path, local_dir, "--exclude", "*.sqlite"
    ]
    try:
        print(f"Running command: {' '.join(aws_command)}")
        subprocess.run(aws_command, check=True)
        print(f"--- Successfully downloaded profiles to {local_dir} ---")
    except Exception as e:
        print(f"\n--- WARNING: Download failed for batch {batch_to_download}. --- {e}")
        print("This is OK, the batch might be empty. Continuing...")

# ======================================================================
# Cell 5: Load, Merge, and Save Master Checkpoint
# ======================================================================
print("\n--- Starting Data Load & Merge Process ---")

# --- Load PERTURBATION DETAILS (Unchanged) ---
folder_path = "metadata/external_metadata"
pert_details = {}
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path):
        try:
            if filename.endswith(".tsv"):
                pert_details[filename] = pd.read_csv(file_path, sep='\t')
            elif filename.endswith(".xlsx"):
                pert_details[filename] = pd.read_excel(file_path)
        except Exception:
            pass

compound_df = pert_details["JUMP-Target-1_compound_metadata.tsv"].rename(columns={'pert_iname': 'perturbation'})
crispr_df = pert_details["JUMP-Target-1_crispr_metadata.tsv"]
orf_df = pert_details["JUMP-Target-1_orf_metadata.tsv"]
crispr_df['perturbation'] = crispr_df['gene']
orf_df['perturbation'] = orf_df['gene']
all_pert_details_df = pd.concat([compound_df, crispr_df, orf_df], ignore_index=True)
print("--- Perturbation details loaded ---")

# --- Load PLATEMAPS (Unchanged) ---
platemap_files = glob.glob("metadata/platemaps/*/platemap/*.txt")
all_platemaps = []
for f in platemap_files:
    map_.name = os.path.basename(f).replace('.txt', '')
    temp_df = pd.read_csv(f, sep='\t')
    temp_df['Plate_Map_Name'] = map_name
    all_platemaps.append(temp_df)
platemap_df = pd.concat(all_platemaps, ignore_index=True)
platemap_df = platemap_df.rename(columns={'well': 'Metadata_Well', 'broad_sample': 'Metadata_Broad_Sample'})
print("--- Platemaps loaded ---")

# --- Load EXPERIMENT DATA (Timepoints) (Unchanged) ---
main_df = main_df_temp # Use the one we loaded in Cell 3
print("--- Experiment/Timepoint data loaded ---")

# --- Load DOWNLOADED PROFILES ---
# *** THIS IS THE FIX (Part 3) ***
# Search *all* subdirectories in 'profiles/' for .csv files
search_path = f"profiles/**/*.csv"
all_csv_files = glob.glob(search_path, recursive=True)

# Check if we found any files at all
if not all_csv_files:
    print("\n\n--- CRITICAL ERROR ---")
    print("No .csv files were found in any downloaded batch folder.")
    print("Please check the 'aws' command output.")
    raise SystemExit("Stopping script: No data to load.")

print(f"\nFound {len(all_csv_files)} total profile files to load.")
all_profiles_list = []
for f in all_csv_files:
    try:
        temp_df = pd.read_csv(f)
        barcode = f.split('/')[-2] # Get plate barcode from folder name
        temp_df['Assay_Plate_Barcode'] = barcode
        all_profiles_list.append(temp_df)
    except Exception as e:
        print(f"Skipping file {f}, error: {e}")

# This check prevents the ValueError
if not all_profiles_list:
    print("\n\n--- CRITICAL ERROR ---")
    print("Failed to load any of the found .csv files.")
    raise SystemExit("Stopping script: No data to concatenate.")

all_profiles_df = pd.concat(all_profiles_list, ignore_index=True)
print(f"--- Downloaded profiles loaded ---")

# --- MERGE EVERYTHING ---
print("Merging...")
meta = pd.merge(main_df, platemap_df, on="Plate_Map_Name")
meta = pd.merge(meta, all_pert_details_df, left_on="Metadata_Broad_Sample", right_on="broad_sample")
final_master_df = pd.merge(meta, all_profiles_df, left_on=["Assay_Plate_Barcode", "well_position"], right_on=["Assay_Plate_Barcode", "Metadata_Well"])
print("--- ðŸŽ‰ COMPLETE MASTER DATAFRAME CREATED ---")
print(f"Total rows: {len(final_master_df)}")

# --- SAVE CHECKPOINT ---
print("\nSaving master DataFrame to Google Drive...")
save_path = "/content/drive/My Drive/CPJUMP1_master_data.parquet"
final_master_df.to_parquet(save_path)
print(f"--- ðŸŽ‰ Checkpoint Saved to {save_path} ---")
print("You can now open a new notebook and run the analysis cells.")

Installing required libraries...

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive mounted successfully.

Cloning GitHub repository...
Cloning into '2024_Chandrasekaran_NatureMethods_CPJUMP1'...
remote: Enumerating objects: 6390, done.[K
remote: Counting objects: 100% (2400/2400), done.[K
remote: Compressing objects: 100% (1394/1394), done.[K
remote: Total 6390 (delta 1031), reused 2217 (delta 982), pack-reused 3990 (from 1)[K
Receiving objects: 100% (6390/6390), 822.18 MiB | 31.39 MiB/s, done.
Resolving deltas: 100% (1833/1833), done.
Updating files: 100% (1757/1757), done.
Downloading load_data_csv/2020_11_04_CPJUMP1/BR00116991/load_data.csv.gz (143 KB)
Error downloading object: load_data_csv/2020_11_04_CPJUMP1/BR00116991/load_data.csv.gz (33782fc): Smudge error: Error downloading load_data_csv/2020_11_04_CPJUMP1/BR00116991/load_data.csv.gz (33782fca8602a7a0d7ec71aa6a093

NameError: name 'map_' is not defined