In [None]:
import pathlib

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from joblib import dump
from pycytominer.operations import Spherize

import sys

sys.path.append("../utils")
from load_utils import compile_mitocheck_batch_data, split_data
from normalization_utils import get_normalization_scaler, get_normalized_mitocheck_data

In [None]:
# feature type passed to compile_mitocheck_batch_data and split_data below
# NOTE(review): "CP" presumably stands for CellProfiler features -- confirm
feature_type = "CP"

In [None]:
# get normalization scaler from negative control features (normalization population)
print("Getting normalization scaler...")
negative_control_data_path = pathlib.Path(
    "../1.idr_streams/extracted_features/negative_control_data/merged_features/"
)

# get normalization population
norm_pop_data = compile_mitocheck_batch_data(negative_control_data_path, feature_type)

# derive normalization scaler
# get normalization population feature data
# (the first value returned by split_data is not needed for fitting the scaler)
_, norm_pop_feature_data = split_data(norm_pop_data, feature_type)

# define normalization scaler to use
# NOTE: these must be plain values -- a trailing comma after True would turn
# spherize_center into the tuple (True,) instead of a bool
spherize_center = True
spherize_method = "ZCA-cor"
spherize_epsilon = 1e-6
normalization_scaler = Spherize(
    center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
)

# fit this scaler to feature data
normalization_scaler.fit(norm_pop_feature_data)

# save normalization scaler so the same fitted transform can be reloaded later
norm_scaler_save_path = pathlib.Path("scaler/normalization_scaler_spherized.joblib")
norm_scaler_save_path.parent.mkdir(parents=True, exist_ok=True)
dump(normalization_scaler, norm_scaler_save_path)

In [None]:
# make results dir if it does not already exist
results_dir = pathlib.Path("normalized_data/")
results_dir.mkdir(parents=True, exist_ok=True)

# normalize the data at the following paths
training_data_path = pathlib.Path(
    "../2.format_training_data/results/training_data.csv.gz"
)
# same negative control location as used when fitting the scaler;
# not read in the visible part of this cell
negative_control_data_path = pathlib.Path(
    "../1.idr_streams/extracted_features/negative_control_data/merged_features/"
)

# normalize training data
print("Normalizing training data...")
training_data = pd.read_csv(training_data_path, compression="gzip", index_col=0)

# apply the scaler fitted on the normalization population to the training data
# (pass the frame loaded above; the name `data` is never defined in this notebook)
normalized_data = get_normalized_mitocheck_data(training_data, normalization_scaler)

# save normalized training data
save_path = results_dir / "training_data_spherized.csv.gz"
normalized_data.to_csv(save_path, compression="gzip")