In [1]:
import pathlib

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from joblib import load, dump
from pycytominer.operations import Spherize

import sys

sys.path.append("../utils")
from load_utils import compile_mitocheck_batch_data, split_data
from normalization_utils import get_normalization_scaler, get_normalized_mitocheck_data

In [2]:
# feature types handled by this notebook; columns of each type are
# selected later by looking for the "<feature_type>__" marker in their names
feature_types = [
    "CP",
    "DP",
]

In [3]:
# Fit one Spherize scaler per feature type on the negative control features
# (the normalization population) and save each fitted scaler with joblib.
negative_control_data_path = pathlib.Path(
    "../1.idr_streams/extracted_features/negative_control_data/merged_features/"
)

# Spherize settings shared by every feature type (loop-invariant, so set once).
# NOTE: `spherize_center` used to be written `True,` -- the stray trailing comma
# made it the 1-tuple `(True,)` rather than a boolean.
spherize_center = True
spherize_method = "ZCA-cor"
spherize_epsilon = 1e-6

for feature_type in feature_types:
    print(f"Getting {feature_type} normalization scaler...")
    # get normalization population
    norm_pop_data = compile_mitocheck_batch_data(negative_control_data_path, feature_type)

    # derive normalization scaler
    # get normalization population feature data (first return value of split_data is unused here)
    _, norm_pop_feature_data = split_data(norm_pop_data, feature_type)

    # define normalization scaler to use (a fresh instance per feature type)
    normalization_scaler = Spherize(
        center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
    )

    # fit this scaler to feature data
    normalization_scaler.fit(norm_pop_feature_data)

    # save normalization scaler
    norm_scaler_save_path = pathlib.Path(f"scaler/normalization_sphere_scaler_{feature_type}.joblib")
    norm_scaler_save_path.parent.mkdir(parents=True, exist_ok=True)
    dump(normalization_scaler, norm_scaler_save_path)

Getting normalization scaler...


In [7]:
# Apply the saved Spherize scalers to the training data and write the result
# next to the input as training_data_spherized.csv.gz.

# make results dir if it does not already exist
results_dir = pathlib.Path("normalized_data/")
results_dir.mkdir(parents=True, exist_ok=True)

# training data to transform (read from the same directory the results go to)
training_data_path = pathlib.Path(
    "normalized_data/training_data.csv.gz"
)

# normalize training data
print("Normalizing training data...")
training_data = pd.read_csv(training_data_path, compression="gzip", index_col=0)

# find columns that belong to metadata, CP, and DP features
# (any column whose name lacks "P__" is treated as metadata)
all_cols = training_data.columns.to_list()
metadata_cols = [col_name for col_name in all_cols if "P__" not in col_name]
metadata = training_data[metadata_cols]

# final dataframe with metadata and normalized features
normalized_data = [metadata]

for feature_type in feature_types:

    print(f"Normalizing {feature_type} features...")

    # load normalization scaler for this feature type
    norm_scaler_save_path = pathlib.Path(f"scaler/normalization_sphere_scaler_{feature_type}.joblib")
    normalization_scaler = load(norm_scaler_save_path)

    # select features for the particular feature type
    feature_cols = [col_name for col_name in all_cols if f"{feature_type}__" in col_name]
    features = training_data[feature_cols].to_numpy()

    # NOTE(review): only DP features are actually transformed; CP features are
    # passed through unchanged even though the log line above says they are
    # being normalized and a CP scaler is loaded. Confirm this is intentional.
    if feature_type == "DP":
        features = normalization_scaler.transform(features)

    # make features a dataframe so it can be combined with metadata.
    # Reuse the training data index: concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would misalign rows whenever the CSV index differs.
    features = pd.DataFrame(
        np.asarray(features), columns=feature_cols, index=training_data.index
    )

    # add this features dataframe to final normalized dataframe
    normalized_data.append(features)

normalized_data = pd.concat(normalized_data, axis=1)

# save normalized training data
save_path = results_dir / "training_data_spherized.csv.gz"
normalized_data.to_csv(save_path, compression="gzip")

Normalizing training data...
Normalizing CP features...
Normalizing DP features...


In [8]:
# display the combined dataframe (metadata columns followed by CP and DP feature columns)
normalized_data

Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,40.461777,16.605322,9.124169,0.857989,9.036922,-16.015566,894.383759,-5.039979,35.547440,5.834666
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-24.073016,-1.402141,-3.418374,16.072593,9.543252,-10.255761,-38.300326,12.463685,4.254238,-1.992848
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,11.726268,9.890864,1.620144,-3.882739,6.063389,-4.739647,-6.095064,3.210283,5.357496,2.990269
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-21.705518,-1.020671,2.206824,10.365674,10.867355,-2.512517,15.319852,3.016064,-11.462653,-29.966338
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,18.175144,-3.070057,-1.487462,19.234720,-6.368348,-6.103120,5.946294,-10.073768,1.582111,-48.791122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,OutOfFocus,380728fc-28b0-423f-b8a7-07be1af590d9,383,219,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,8.018418,63.152362,-5.525855,-8.134043,-6.653163,16.484993,-28.237609,-26.512904,-1.512545,-46.310617
2912,OutOfFocus,30ed67c7-8de2-4d78-bce9-3fa1aff28565,975,294,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,15.260877,47.314268,9.545845,-3.674123,-3.848211,-2.645845,-50.215900,-6.916798,-16.323272,-93.026610
2913,OutOfFocus,2960b13e-6090-4592-b2a9-d1c4c1b24b50,898,302,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,5.821101,90.891923,-1.408638,0.899574,0.961952,3.719442,-8.516932,-2.192268,-12.270919,-48.300575
2914,OutOfFocus,fbc9ce6a-2b29-4115-b218-4ee5b8c50ac1,946,281,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-5.493540,105.619768,-12.174146,-0.527002,-17.130701,12.690915,-113.906727,-29.095839,9.255931,-70.304702
