In [1]:
import pathlib

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from joblib import load, dump
from pycytominer.operations import Spherize

import sys

sys.path.append("../utils")
from load_utils import compile_mitocheck_batch_data, split_data
from normalization_utils import get_normalization_scaler, get_normalized_mitocheck_data

In [2]:
# feature types handled by this notebook; the cells below loop over this list
feature_types = [
    "CP",
    "DP",
]

In [3]:
# Fit one Spherize normalization scaler per feature type on the negative
# control features (the normalization population) and save each scaler to disk.
negative_control_data_path = pathlib.Path(
    "../1.idr_streams/extracted_features/negative_control_data/merged_features/"
)

# Spherize settings shared by every feature type (loop-invariant, so defined once).
# These must be plain scalars: a trailing comma after `True` would silently
# turn the value into the tuple `(True,)`, which is only truthy by accident.
spherize_center = True
spherize_method = "ZCA-cor"
spherize_epsilon = 1e-6

for feature_type in feature_types:
    print(f"Getting {feature_type} normalization scaler...")

    # compile the normalization population for this feature type
    norm_pop_data = compile_mitocheck_batch_data(negative_control_data_path, feature_type)

    # keep only the feature data (second value returned by split_data)
    _, norm_pop_feature_data = split_data(norm_pop_data, feature_type)

    # define the normalization scaler to use
    normalization_scaler = Spherize(
        center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
    )

    # fit this scaler to the feature data
    normalization_scaler.fit(norm_pop_feature_data)

    # save the fitted scaler, creating the output directory if needed
    norm_scaler_save_path = pathlib.Path(f"scaler/normalization_sphere_scaler_{feature_type}.joblib")
    norm_scaler_save_path.parent.mkdir(parents=True, exist_ok=True)
    dump(normalization_scaler, norm_scaler_save_path)

Getting normalization scaler...


In [4]:
# Apply the saved per-feature-type scalers to the training data and write the
# spherized result next to the input file.

# make results dir if it does not already exist
results_dir = pathlib.Path("normalized_data/")
results_dir.mkdir(parents=True, exist_ok=True)

# path to the training data that will be normalized
training_data_path = pathlib.Path(
    "normalized_data/training_data.csv.gz"
)

# load training data (the first CSV column is used as the index)
print("Normalizing training data...")
training_data = pd.read_csv(training_data_path, compression="gzip", index_col=0)

# find columns that belong to metadata (everything without a "P__" marker,
# i.e. neither "CP__" nor "DP__" feature columns)
all_cols = training_data.columns.to_list()
metadata_cols = [col_name for col_name in all_cols if "P__" not in col_name]
metadata = training_data[metadata_cols]

# final dataframe pieces: metadata first, then normalized features per type
normalized_data = [metadata]

for feature_type in feature_types:
    print(f"Normalizing {feature_type} features...")

    # load normalization scaler for this feature type
    norm_scaler_save_path = pathlib.Path(f"scaler/normalization_sphere_scaler_{feature_type}.joblib")
    normalization_scaler = load(norm_scaler_save_path)

    # normalize features for the particular feature type
    feature_cols = [col_name for col_name in all_cols if f"{feature_type}__" in col_name]
    features = training_data[feature_cols].to_numpy()
    features = normalization_scaler.transform(features)

    # make features a dataframe so it can be combined with metadata.
    # Reuse the training data index: pd.concat(axis=1) aligns rows by index,
    # so a fresh default RangeIndex would misalign rows (and introduce NaNs)
    # whenever the loaded index is not exactly 0..n-1.
    features = pd.DataFrame(features, columns=feature_cols, index=training_data.index)

    # add this normalized features dataframe to final normalized dataframe
    normalized_data.append(features)

normalized_data = pd.concat(normalized_data, axis=1)

# save normalized training data
save_path = results_dir / "training_data_spherized.csv.gz"
normalized_data.to_csv(save_path, compression="gzip")

Normalizing training data...
Normalizing CP features...
Normalizing DP features...
