In [1]:
import pandas as pd
import os
import glob
import subprocess
from google.colab import drive

#Installing dependencies
print("Installing required libraries")
!pip install -q pandas numpy scikit-learn plotly awscli openpyxl pyarrow

print("\nMounting Google Drive")
try:
    drive.mount('/content/drive')
    print("Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting drive: {e}")

#Cloning the official dataset
print("\nCloning GitHub repository...")
%cd /content/
!rm -rf 2024_Chandrasekaran_NatureMethods_CPJUMP1
!git clone https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1.git

try:
    %cd /content/2024_Chandrasekaran_NatureMethods_CPJUMP1
    print(f"Current directory set to: {os.getcwd()}")
except FileNotFoundError:
    print("ERROR: git clone failed. Cannot change directory.")
    raise SystemExit("Stopping script due to clone failure.")

# Download the profile files of a single batch from the public S3 bucket.
batch_to_download = "2020_12_02_CPJUMP1_2WeeksTimePoint"
print(f"\n Downloading profiles for ONE batch: {batch_to_download}")

s3_path = f"s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/backend/{batch_to_download}/"
local_dir = f"profiles/{batch_to_download}"
os.makedirs(local_dir, exist_ok=True)

# --no-sign-request: the request is sent without AWS credentials.
# --exclude "*.sqlite": skip the SQLite files found under the batch prefix.
aws_command = [
    "aws", "s3", "cp", "--no-sign-request", "--recursive",
    s3_path, local_dir, "--exclude", "*.sqlite"
]
try:
    print(f"Running command: {' '.join(aws_command)}")
    subprocess.run(aws_command, check=True, capture_output=True, text=True)
    print(f" Successfully downloaded profiles to {local_dir}")
except subprocess.CalledProcessError as e:
    # str(e) only carries the command and exit status; because the output is
    # captured, the actual AWS CLI error message lives in e.stderr.
    print(f"\n ERROR: Download failed.  {e}")
    if e.stderr:
        print(e.stderr.strip())
    raise SystemExit("Stopping script due to download failure.")
except Exception as e:
    # Anything else, e.g. the `aws` executable not being on PATH.
    print(f"\n ERROR: Download failed.  {e}")
    raise SystemExit("Stopping script due to download failure.")

# Loading metadata
print("\n Starting Data Load & Metadata Preparation ")

# Load perturbation data: every .tsv / .xlsx file in the external metadata
# folder is read into `pert_details`, keyed by its file name.
folder_path = "metadata/external_metadata"
pert_details = {}

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue
    try:
        if filename.endswith(".tsv"):
            pert_details[filename] = pd.read_csv(file_path, sep='\t')
        elif filename.endswith(".xlsx"):
            pert_details[filename] = pd.read_excel(file_path)
    except Exception as e:
        # Keep going (other files may still load), but say what went wrong
        # instead of silently dropping the file.
        print(f" WARNING: could not read {file_path}: {e}")

# The three tables below are mandatory; report exactly which ones are missing
# rather than failing with a bare KeyError on the first lookup.
required_files = [
    "JUMP-Target-1_compound_metadata.tsv",
    "JUMP-Target-1_crispr_metadata.tsv",
    "JUMP-Target-1_orf_metadata.tsv",
]
missing_files = [name for name in required_files if name not in pert_details]
if missing_files:
    raise KeyError(
        f"Required perturbation metadata not loaded from {folder_path}: {missing_files}"
    )

# Give all three tables a common 'perturbation' column: the compound table's
# 'pert_iname' is renamed, while CRISPR and ORF copy their 'gene' column.
compound_df = pert_details["JUMP-Target-1_compound_metadata.tsv"].rename(columns={'pert_iname': 'perturbation'})
crispr_df = pert_details["JUMP-Target-1_crispr_metadata.tsv"]
orf_df = pert_details["JUMP-Target-1_orf_metadata.tsv"]
crispr_df['perturbation'] = crispr_df['gene']
orf_df['perturbation'] = orf_df['gene']
all_pert_details_df = pd.concat([compound_df, crispr_df, orf_df], ignore_index=True)
print(" Perturbation details loaded ")


# Load platemaps: one file per plate map, stored as
# metadata/platemaps/<batch>/platemap/<plate map name>.txt
platemap_files = glob.glob("metadata/platemaps/*/platemap/*.txt")
all_platemaps = []

for f in platemap_files:
    # splitext strips only the extension; str.replace would also remove a
    # '.txt' occurring in the middle of a name.
    map_name = os.path.splitext(os.path.basename(f))[0]
    temp_df = pd.read_csv(f, sep='\t')
    temp_df['Plate_Map_Name'] = map_name
    # Record which batch folder the platemap came from. Files in different
    # batch folders can share the same base name, so the name alone does not
    # identify a platemap.
    temp_df['Timepoint'] = os.path.basename(os.path.dirname(os.path.dirname(f)))
    all_platemaps.append(temp_df)
platemap_df = pd.concat(all_platemaps, ignore_index=True)
platemap_df = platemap_df.rename(columns={'well': 'Metadata_Well', 'broad_sample': 'Metadata_Broad_Sample'})
print(" Platemaps loaded ")

# Load experimental data: each batch folder has a barcode_platemap.csv; the
# folder name is stored in the 'Timepoint' column.
barcode_files = glob.glob("metadata/platemaps/*/barcode_platemap.csv")
all_experiments = []

for f in barcode_files:
    timepoint_name = os.path.basename(os.path.dirname(f))
    temp_df = pd.read_csv(f)
    temp_df['Timepoint'] = timepoint_name
    all_experiments.append(temp_df)
main_df = pd.concat(all_experiments, ignore_index=True)
print(" Experiment/Timepoint data loaded ")


print("Creating base metadata...")
# Join on the batch as well as the plate map name. Joining on the name alone
# pairs a plate with every same-named platemap from every batch folder, which
# duplicates each of its wells once per batch.
# NOTE(review): assumes a batch's barcode_platemap.csv refers to platemaps
# stored in that same batch folder -- confirm against the repository layout.
meta = pd.merge(main_df, platemap_df, on=["Plate_Map_Name", "Timepoint"])
meta = pd.merge(meta, all_pert_details_df, left_on="Metadata_Broad_Sample", right_on="broad_sample")
print(" Base 'meta' DataFrame created ")



# Load the downloaded profiles one plate at a time and attach the metadata.
print(f"\n Loading Profiles Plate by Plate for Batch: {batch_to_download} ")
search_path = f"{local_dir}/**/*.csv"
all_csv_files = glob.glob(search_path, recursive=True)

merged_plate_data = []

for csv_path in all_csv_files:
    try:
        # The name of the folder containing the CSV is used as the plate barcode.
        barcode = csv_path.split('/')[-2]
        print(f"  Processing plate: {barcode}")

        # Read this plate's profiles and tag every row with the barcode.
        plate_profiles = pd.read_csv(csv_path).assign(Assay_Plate_Barcode=barcode)

        # Keep only the metadata rows that belong to this plate.
        plate_meta = meta.loc[meta['Assay_Plate_Barcode'] == barcode]

        # Match metadata wells to profile wells within the plate.
        plate_merged = plate_meta.merge(
            plate_profiles,
            left_on=["Assay_Plate_Barcode", "well_position"],
            right_on=["Assay_Plate_Barcode", "Metadata_Well"],
        )

        # Collect the per-plate result for the later concatenation.
        merged_plate_data.append(plate_merged)

        print(f"    -> Merged {len(plate_merged)} wells.")

    except Exception as err:
        # A plate that fails to load or merge is reported and skipped.
        print(f"    -> Skipping file {csv_path}, error: {err}")

# Combine the per-plate results into one DataFrame and checkpoint it to Drive.
if merged_plate_data:
    print("\nConcatenating all processed plates")
    # Stack every plate's merged rows into a single frame with a fresh index.
    final_master_df = pd.concat(merged_plate_data, ignore_index=True)
    print(" COMPLETE MASTER DATAFRAME CREATED ")
    print(f"Total rows (from one batch): {len(final_master_df)}")

    print("\nSaving master DataFrame to Google Drive")
    save_path = "/content/drive/My Drive/CPJUMP1_master_data_BATCH_1.parquet"
    final_master_df.to_parquet(save_path)
    print(f" Checkpoint Saved to {save_path} ")
else:
    # No plate produced a merged frame, so there is nothing to write.
    print("\n\n--- CRITICAL ERROR ---")
    print("Failed to process any plates. No data to save.")


Installing required libraries

Mounting Google Drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive mounted successfully.

Cloning GitHub repository...
/content
Cloning into '2024_Chandrasekaran_NatureMethods_CPJUMP1'...
remote: Enumerating objects: 6390, done.[K
remote: Counting objects: 100% (2400/2400), done.[K
remote: Compressing objects: 100% (1394/1394), done.[K
remote: Total 6390 (delta 1031), reused 2217 (delta 982), pack-reused 3990 (from 1)[K
Receiving objects: 100% (6390/6390), 822.19 MiB | 29.70 MiB/s, done.
Resolving deltas: 100% (1833/1833), done.
Updating files: 100% (1757/1757), done.
Downloading load_data_csv/2020_11_04_CPJUMP1/BR00116991/load_data.csv.gz (143 KB)
Error downloading object: load_data_csv/2020_11_04_CPJUMP1/BR00116991/load_data.csv.gz (33782fc): Smudge error: Error downloading load_data_csv/2020_11_04_CPJUMP1/BR00116991/load_data.csv.gz (33782fca8602a7a0d7ec71aa6a