In [1]:
import pathlib

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from joblib import dump
from pycytominer.operations import Spherize

import sys

sys.path.append("../utils")
from load_utils import compile_mitocheck_batch_data, split_data
from normalization_utils import get_normalization_scaler, get_normalized_mitocheck_data

In [5]:
# feature types to fit a separate normalization scaler for
# (presumably CP = CellProfiler, DP = DeepProfiler -- TODO confirm)
feature_types = [
    "CP",
    "DP",
]

In [6]:
# Fit one Spherize normalization scaler per feature type on the negative control
# features (the normalization population) and save each fitted scaler to disk.
print("Getting normalization scaler...")
negative_control_data_path = pathlib.Path(
    "../1.idr_streams/extracted_features/negative_control_data/merged_features/"
)

# Spherize settings are identical for every feature type, so define them once.
# NOTE: `spherize_center` must be a plain bool; a trailing comma after `True`
# would silently turn it into the tuple `(True,)`.
spherize_center = True
spherize_method = "ZCA-cor"
spherize_epsilon = 1e-6

for feature_type in feature_types:
    # get normalization population
    norm_pop_data = compile_mitocheck_batch_data(negative_control_data_path, feature_type)

    # get normalization population feature data (first return value is not needed here)
    _, norm_pop_feature_data = split_data(norm_pop_data, feature_type)

    # define the normalization scaler and fit it to the feature data
    normalization_scaler = Spherize(
        center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
    )
    normalization_scaler.fit(norm_pop_feature_data)

    # save normalization scaler (one file per feature type)
    norm_scaler_save_path = pathlib.Path(
        f"scaler/normalization_sphere_scaler_{feature_type}.joblib"
    )
    norm_scaler_save_path.parent.mkdir(parents=True, exist_ok=True)
    dump(normalization_scaler, norm_scaler_save_path)

# NOTE(review): `normalization_scaler` is rebound on every iteration, so after the
# loop it refers only to the scaler fit on the last entry of `feature_types`.

Getting normalization scaler...


['scaler/normalization_scaler_spherized.joblib']

In [7]:
# locations of the data used in this cell
training_data_path = pathlib.Path(
    "../2.format_training_data/results/training_data.csv.gz"
)
negative_control_data_path = pathlib.Path(
    "../1.idr_streams/extracted_features/negative_control_data/merged_features/"
)

# output directory for normalized data; created if it does not already exist
results_dir = pathlib.Path("normalized_data/")
results_dir.mkdir(parents=True, exist_ok=True)

# load the training data and normalize it
# NOTE(review): `normalization_scaler` is not defined in this cell; it is whatever the
# scaler-fitting loop left bound (the scaler for the last feature type). Confirm this is
# the intended scaler for training data that contains several feature types.
print("Normalizing training data...")
training_data = pd.read_csv(training_data_path, compression="gzip", index_col=0)
normalized_data = get_normalized_mitocheck_data(training_data, normalization_scaler)

# write the normalized training data as a gzip-compressed CSV
save_path = pathlib.Path(f"{results_dir}/training_data_spherized.csv.gz")
normalized_data.to_csv(path_or_buf=save_path, compression="gzip")

Normalizing training data...


In [5]:
# preview the spherized training data saved to normalized_data/
pd.read_csv(
    filepath_or_buffer="normalized_data/training_data_spherized.csv.gz",
    index_col=0,
)

Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-1.240288,-0.217985,-1.712067,0.677377,-0.372362,1.127049,-0.451140,-1.025599,-0.555240,-2.760476
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-1.262342,-0.210863,-1.709567,0.558519,-0.363424,1.121141,-0.453032,-1.065441,-0.525243,-2.798944
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-1.309594,-0.209424,-1.692875,0.750167,-0.356478,1.104706,-0.497522,-1.036138,-0.589183,-2.759416
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.322861,-0.199119,-1.729906,0.749667,-0.296960,1.121299,-0.463361,-1.016528,-0.469660,-2.464292
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.295659,-0.199460,-1.695019,0.685430,-0.358787,1.245714,-0.494874,-1.019021,-0.550026,-2.375086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,OutOfFocus,380728fc-28b0-423f-b8a7-07be1af590d9,383,219,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,-1.129018,-0.063595,-1.623186,0.028274,-0.388594,0.865124,-0.486238,-1.031701,-0.664306,-2.610898
2912,OutOfFocus,30ed67c7-8de2-4d78-bce9-3fa1aff28565,975,294,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-1.243983,-0.180107,-1.667914,0.722376,-0.379664,1.010047,-0.475212,-0.997723,-0.581540,-2.054563
2913,OutOfFocus,2960b13e-6090-4592-b2a9-d1c4c1b24b50,898,302,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-1.295627,-0.081472,-1.698076,0.689539,-0.368386,0.940399,-0.469888,-1.021884,-0.610717,-2.414817
2914,OutOfFocus,fbc9ce6a-2b29-4115-b218-4ee5b8c50ac1,946,281,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-1.245852,-0.086796,-1.680316,0.787216,-0.384423,1.058181,-0.447113,-1.031586,-0.609996,-2.145411


In [7]:
# preview the non-spherized normalized training data for comparison
# NOTE(review): no cell visible here writes this file; presumably it comes from an
# earlier normalization run -- confirm it exists before running on a fresh checkout
pd.read_csv(
    filepath_or_buffer="normalized_data/training_data.csv.gz",
    index_col=0,
)

Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,1.526493,-0.388909,-0.715202,-0.939279,-0.077689,1.965509,18.685819,0.061676,2.641369,-0.086854
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.482883,-1.354858,-0.856680,-0.934949,0.725091,2.255450,-0.565433,1.628086,-0.605625,-0.748135
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.888706,1.350431,-0.648841,0.264205,0.131341,0.678315,0.171044,0.342206,-0.581597,0.505556
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.001625,-0.801021,-0.586539,0.076197,0.599191,1.742090,0.365520,0.643759,-1.906097,1.019370
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,0.950706,-0.811825,-0.522427,-1.402842,-0.289940,2.661250,0.126978,-0.824945,-0.494285,1.763332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,OutOfFocus,380728fc-28b0-423f-b8a7-07be1af590d9,383,219,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,0.549654,8.142944,1.619399,-1.521878,-0.182734,-1.608294,-0.783477,-2.613400,0.442609,1.977761
2912,OutOfFocus,30ed67c7-8de2-4d78-bce9-3fa1aff28565,975,294,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,0.358861,6.294227,1.827482,-0.997080,-0.614779,-1.270435,-1.335869,-0.560155,0.836314,3.473351
2913,OutOfFocus,2960b13e-6090-4592-b2a9-d1c4c1b24b50,898,302,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,0.570003,10.106912,1.130243,-1.288302,-0.956321,-1.409762,-0.058448,-0.025529,0.628679,1.657651
2914,OutOfFocus,fbc9ce6a-2b29-4115-b218-4ee5b8c50ac1,946,281,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.023441,11.088221,2.068912,-0.977407,-1.108647,-1.399433,-2.744383,-2.037700,0.667556,2.438798
