In [1]:
import pathlib

import pandas as pd

In [2]:
# Load the compiled training data; the first CSV column holds the saved row index.
training_data_path = pathlib.Path("../2.format_training_data/results/training_data__ic.csv.gz")
training_data_ic = pd.read_csv(training_data_path, compression="gzip", index_col=0)

# Save a 10-row preview. The output folder is created first so the cell also
# runs on a fresh checkout where "raw_feature_previews" does not exist yet.
training_preview_path = pathlib.Path("raw_feature_previews/training_data_ic_head.csv")
training_preview_path.parent.mkdir(parents=True, exist_ok=True)
training_data_ic.head(10).to_csv(training_preview_path)
training_data_ic.head()

Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.037779,-0.136591,-0.217011,0.019583,-0.192538,0.60485,0.391432,-0.119948,-0.00202,0.747988
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.115009,-0.255369,-0.229094,0.020964,-0.158187,0.672871,-0.057893,-0.060406,-0.166688,0.467811
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.062292,0.077288,-0.211343,0.403338,-0.183594,0.302867,-0.040703,-0.109285,-0.165469,0.998986
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-0.134947,-0.187267,-0.206022,0.343387,-0.163575,0.552434,-0.036164,-0.097822,-0.23264,1.216684
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-0.059909,-0.188595,-0.200547,-0.128233,-0.201621,0.768074,-0.041732,-0.15365,-0.161041,1.531892


In [3]:
# Show every column name of the training data, one name per line.
for column_name in training_data_ic.columns.to_list():
    print(column_name)

Mitocheck_Phenotypic_Class
Cell_UUID
Location_Center_X
Location_Center_Y
Metadata_Plate
Metadata_Well
Metadata_Frame
Metadata_Site
Metadata_Plate_Map_Name
Metadata_DNA
Metadata_Gene
Metadata_Gene_Replicate
CP__AreaShape_Area
CP__AreaShape_BoundingBoxArea
CP__AreaShape_BoundingBoxMaximum_X
CP__AreaShape_BoundingBoxMaximum_Y
CP__AreaShape_BoundingBoxMinimum_X
CP__AreaShape_BoundingBoxMinimum_Y
CP__AreaShape_Center_X
CP__AreaShape_Center_Y
CP__AreaShape_Compactness
CP__AreaShape_ConvexArea
CP__AreaShape_Eccentricity
CP__AreaShape_EquivalentDiameter
CP__AreaShape_EulerNumber
CP__AreaShape_Extent
CP__AreaShape_FormFactor
CP__AreaShape_MajorAxisLength
CP__AreaShape_MaxFeretDiameter
CP__AreaShape_MaximumRadius
CP__AreaShape_MeanRadius
CP__AreaShape_MedianRadius
CP__AreaShape_MinFeretDiameter
CP__AreaShape_MinorAxisLength
CP__AreaShape_Orientation
CP__AreaShape_Perimeter
CP__AreaShape_Solidity
CP__AreaShape_Zernike_0_0
CP__AreaShape_Zernike_1_1
CP__AreaShape_Zernike_2_0
CP__AreaShape_Zernike_2

In [4]:
def compile_mitocheck_batch_data(
    data_path: pathlib.Path,
    dataset: str = "CP_and_DP",
    max_batches: "int | None" = 1,
) -> pd.DataFrame:
    """
    compile batch data from a mitocheck idrstream merged features run

    Parameters
    ----------
    data_path : pathlib.Path
        path to folder with saved batches (gzip-compressed CSVs, "batch_0.csv.gz" must exist)
        these batches must be merged (have CP and DP features)
    dataset : str, optional
        which dataset columns to load in (in addition to metadata),
        can be "CP" or "DP" or by default "CP_and_DP"
        (any value other than "CP" or "DP" keeps both feature sets)
    max_batches : int or None, optional
        maximum number of batch files to load, taken in sorted file name order,
        by default 1 (only a subset of the data is compiled);
        pass None to compile every batch in the folder

    Returns
    -------
    pd.DataFrame
        compiled batch dataframe with a fresh 0..n-1 index
        (empty dataframe if no batch was loaded)

    Raises
    ------
    ValueError
        if max_batches is negative
    """

    if max_batches is not None and max_batches < 0:
        raise ValueError(f"max_batches must be None or >= 0, got {max_batches}")

    data_path = pathlib.Path(data_path)

    # determine which cols to use for loading (depending on dataset)
    # load in first row to get all column names
    # (index_col=0 keeps the saved index column out of the column list)
    batch_0_row_0 = pd.read_csv(
        data_path / "batch_0.csv.gz",
        compression="gzip",
        index_col=0,
        low_memory=False,
        nrows=1,
    )
    all_cols = batch_0_row_0.columns.to_list()

    # remove unnecessary DP column that isn't part of features
    cols_to_remove = {"DP__Metadata_Model"}

    # Some CP columns are related to things besides the features we want (ex. location measurements)
    # We only want to get CP data from the feature modules below (__ ensures it is found as module name)
    cp_feature_modules = (
        "__AreaShape_",
        "__Granularity_",
        "__Intensity",
        "__Neighbors",
        "__RadialDistribution",
        "__Texture",
    )
    # remove CP columns that don't have a feature module as a substring
    cols_to_remove.update(
        col
        for col in all_cols
        if "CP__" in col
        and not any(feature_module in col for feature_module in cp_feature_modules)
    )

    # keep the wanted columns (ordered filter instead of a set difference,
    # so the list stays in file order and is reproducible between runs)
    cols_to_load = [col for col in all_cols if col not in cols_to_remove]

    # remove DP or CP features from columns to load depending on desired dataset
    if dataset == "CP":
        cols_to_load = [col for col in cols_to_load if "DP__" not in col]
    elif dataset == "DP":
        cols_to_load = [col for col in cols_to_load if "CP__" not in col]

    # sorted glob gives a deterministic batch order (iterdir order is OS dependent)
    # and skips anything in the folder that is not a gzip-compressed CSV
    batch_paths = sorted(data_path.glob("*.csv.gz"))
    if max_batches is not None:
        batch_paths = batch_paths[:max_batches]

    print(f"Loading data from {data_path}...")
    batches = []
    for batch_path in batch_paths:
        batch = pd.read_csv(
            batch_path,
            compression="gzip",
            low_memory=True,
            usecols=cols_to_load,
        )

        # split well_frame into well and frame columns
        batch[["Metadata_Well", "Metadata_Frame"]] = batch["Metadata_Well"].str.split(
            "_", expand=True
        )
        # move the new frame column to position 5 (directly after Metadata_Well
        # in the previewed column layout)
        batch.insert(5, "Metadata_Frame", batch.pop("Metadata_Frame"))

        batches.append(batch)

    if not batches:
        return pd.DataFrame()

    # concatenate once at the end instead of growing the dataframe inside the loop;
    # zero-row batches are left out so they cannot influence the resulting dtypes
    non_empty_batches = [batch for batch in batches if not batch.empty]
    if not non_empty_batches:
        return batches[-1].reset_index(drop=True)
    return pd.concat(non_empty_batches, ignore_index=True)

# Compile the negative control features from the merged idrstream batches.
negative_control_data_path = pathlib.Path("../1.idr_streams/extracted_features/negative_control_data__ic/merged_features")
negative_control_data = compile_mitocheck_batch_data(negative_control_data_path)

# Save a 10-row preview. The output folder is created first so the cell also
# runs on a fresh checkout where "raw_feature_previews" does not exist yet.
negative_control_preview_path = pathlib.Path("raw_feature_previews/negative_control_data_ic_head.csv")
negative_control_preview_path.parent.mkdir(parents=True, exist_ok=True)
negative_control_data.head(10).to_csv(negative_control_preview_path)
negative_control_data.head()

Loading data from ../1.idr_streams/extracted_features/negative_control_data__ic/merged_features...


Unnamed: 0,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,f81c4fdf-b10b-4de1-ad12-93cd64556450,311,15,LT0070_39,304,39,1,LT0070_39_304,LT0070_39/LT0070_39_304_39.tif,negative control,...,-0.064656,-0.030192,-0.113592,-0.18985,-0.177927,-0.012188,-0.047461,-0.119937,-0.108912,-0.079671
1,c735868b-c6b3-464a-9497-94cedd084a53,22,30,LT0070_39,304,39,1,LT0070_39_304,LT0070_39/LT0070_39_304_39.tif,negative control,...,-0.044193,0.075131,-0.146837,-0.180861,-0.232069,-0.177789,-0.059121,-0.117794,-0.0758,0.662999
2,542c49cb-5d8d-424f-8d79-53d775681c46,965,24,LT0070_39,304,39,1,LT0070_39_304,LT0070_39/LT0070_39_304_39.tif,negative control,...,-0.02027,-0.226483,-0.108335,-0.161705,-0.200425,0.290491,-0.039992,-0.172161,-0.098577,0.749005
3,4cddd66a-45f1-407c-afa9-2777570fe6e6,158,24,LT0070_39,304,39,1,LT0070_39_304,LT0070_39/LT0070_39_304_39.tif,negative control,...,-0.047678,0.001401,0.053046,-0.025151,-0.221727,-0.108918,-0.072394,-0.090956,-0.072095,0.123148
4,02b97db9-f8b8-4892-89dd-7e2be7150cec,111,29,LT0070_39,304,39,1,LT0070_39_304,LT0070_39/LT0070_39_304_39.tif,negative control,...,-0.041636,-0.076882,-0.098817,0.078745,-0.214188,0.03889,-0.041608,-0.113545,-0.090873,0.27706


In [5]:
# Show every column name of the negative control data, one name per line.
for column_name in negative_control_data.columns.to_list():
    print(column_name)

Cell_UUID
Location_Center_X
Location_Center_Y
Metadata_Plate
Metadata_Well
Metadata_Frame
Metadata_Site
Metadata_Plate_Map_Name
Metadata_DNA
Metadata_Gene
Metadata_Gene_Replicate
CP__AreaShape_Area
CP__AreaShape_BoundingBoxArea
CP__AreaShape_BoundingBoxMaximum_X
CP__AreaShape_BoundingBoxMaximum_Y
CP__AreaShape_BoundingBoxMinimum_X
CP__AreaShape_BoundingBoxMinimum_Y
CP__AreaShape_Center_X
CP__AreaShape_Center_Y
CP__AreaShape_Compactness
CP__AreaShape_ConvexArea
CP__AreaShape_Eccentricity
CP__AreaShape_EquivalentDiameter
CP__AreaShape_EulerNumber
CP__AreaShape_Extent
CP__AreaShape_FormFactor
CP__AreaShape_MajorAxisLength
CP__AreaShape_MaxFeretDiameter
CP__AreaShape_MaximumRadius
CP__AreaShape_MeanRadius
CP__AreaShape_MedianRadius
CP__AreaShape_MinFeretDiameter
CP__AreaShape_MinorAxisLength
CP__AreaShape_Orientation
CP__AreaShape_Perimeter
CP__AreaShape_Solidity
CP__AreaShape_Zernike_0_0
CP__AreaShape_Zernike_1_1
CP__AreaShape_Zernike_2_0
CP__AreaShape_Zernike_2_2
CP__AreaShape_Zernike_3_