In [1]:
'''Download and prepare data.'''
from cytotable import convert
from parsl.config import Config
from parsl.executors import ThreadPoolExecutor
import random

COLUMNS = (
    "TableNumber",
    "ImageNumber",
    "ObjectNumber",
    "Metadata_Well",
    "Metadata_Plate",
    "Parent_Cells",
    "Parent_Nuclei",
    "Cytoplasm_Parent_Cells",
    "Cytoplasm_Parent_Nuclei",
)

COMMANDS =  """
            WITH Image_Filtered AS (
                SELECT
                    Metadata_TableNumber,
                    Metadata_ImageNumber,
                    Metadata_Well,
                    Metadata_Plate
                FROM
                    read_parquet('image.parquet')
                )
            SELECT
                *
            FROM
                Image_Filtered AS image
            JOIN read_parquet('cytoplasm.parquet') AS cytoplasm ON
                cytoplasm.Metadata_TableNumber = image.Metadata_TableNumber
                AND cytoplasm.Metadata_ImageNumber = image.Metadata_ImageNumber
            JOIN read_parquet('cells.parquet') AS cells ON
                cells.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
                AND cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
                AND cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
            JOIN read_parquet('nuclei.parquet') AS nuclei ON
                nuclei.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
                AND nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
                AND nuclei.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
            """



def convert_parquet(
    input_file,
    output_file,
    cols=COLUMNS,
    chunk_size=150000,
    joins=COMMANDS,
    thread=2,
):
    """Convert sqlite profiles to parquet"""

    hash_str = str(random.getrandbits(128))
    parsl_config = Config(
                        executors=[
                            ThreadPoolExecutor(
                                max_threads=thread
                            )
                        ],
                        run_dir=f'./runinfo/{hash_str}'
                    )
   
    convert(
        source_path=input_file,
        dest_path=output_file,
        identifying_columns=cols,
        dest_datatype='parquet',
        chunk_size=chunk_size,
        preset="cell-health-cellprofiler-to-cytominer-database",
        joins=joins,
        reload_parsl_config=True,
        parsl_config=parsl_config
    )

In [2]:
import os
batch = '2024_02_06_Batch_8'
plates = os.listdir(f"../inputs/single_cell_profiles/{batch}/")

In [3]:
for plate in plates:
    input_path = f"../inputs/single_cell_profiles/{batch}/{plate}/{plate}.sqlite"
    output_path = f"../outputs/single_cell_profiles/{batch}/{plate}_raw_.parquet"
    print(input_path)
    if input_path != "../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_02_02_B8A1R2_P3T2/2024_02_02_B8A1R2_P3T2.sqlite":
        continue
    if not os.path.isfile(output_path):
        threads = 64
        convert_parquet(input_path, output_path, thread=threads)

../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_02_05_B8A1R2_P4T3/2024_02_05_B8A1R2_P4T3.sqlite
../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_02_02_B8A1R2_P3T2/2024_02_02_B8A1R2_P3T2.sqlite
../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_02_01_B8A1R2_P2T4/2024_02_01_B8A1R2_P2T4.sqlite
../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_02_01_B8A1R2_P2T3/2024_02_01_B8A1R2_P2T3.sqlite
../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_01_31_B8A1R2_P1T2/2024_01_31_B8A1R2_P1T2.sqlite
../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_02_02_B8A1R2_P3T4/2024_02_02_B8A1R2_P3T4.sqlite
../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_01_31_B8A1R2_P1T4/2024_01_31_B8A1R2_P1T4.sqlite
../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_02_02_B8A1R2_P3T1/2024_02_02_B8A1R2_P3T1.sqlite
../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_02_05_B8A1R2_P4T2/2024_02_05_B8A1R2_P4T2.sqlite
../inputs/single_cell_profiles/2024_02_06_Batch_8/2024_02_05_B8A1R2_P4T4/

In [None]:
# aws s3 cp s3://cellpainting-gallery/cpg0020-varchamp/broad/workspace/backend/2024_02_06_Batch_8/2024_02_02_B8A1R2_P3T2/2024_02_02_B8A1R2_P3T2.sqlite /home/shenrunx/igvf/varchamp/2021_09_01_VarChAMP/8.2_updated_snakemake_pipeline/inputs/single_cell_profiles/2024_02_06_Batch_8/2024_02_02_B8A1R2_P3T2/2024_02_02_B8A1R2_P3T2.sqlite