In [1]:
import time
from datetime import date
import pandas as pd
import os
import glob
import pycytominer
import sys
import CBE_utils as CBE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import gc

In [2]:
# Reload CBE_utils so that edits made to the module on disk are picked up
# without restarting the kernel. Relies on `CBE` having been imported above.
import importlib
importlib.reload(CBE)

<module 'CBE_utils' from '/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/notebooks_revision/CBE_utils.py'>

In [3]:
# Path configuration.
#
# Previously the production paths were assigned and then unconditionally
# overwritten by the test paths a few lines further down, which made it easy to
# run on the wrong dataset without noticing. The switch is now explicit; the
# default (True) keeps the values this cell produced before.
USE_TEST_DATA = True

PROJECT_ROOT = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/"

# Not overridden by the test configuration.
feature_dir = PROJECT_ROOT + "reduced_features/"

if USE_TEST_DATA:
    input_path = PROJECT_ROOT + "test_output/"
    annotation_dir = PROJECT_ROOT + "test_annotation/"
else:
    input_path = PROJECT_ROOT + "output/"
    annotation_dir = PROJECT_ROOT + "annotation/"

In [4]:
# Collect the names of all sub-directories directly inside `input_path`
# (plain files are ignored).
folders = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(input_path, entry)),
        os.listdir(input_path),
    )
)

# Correlation-based feature reduction

In [None]:
# Compute a new feature set for each site using correlation-based feature reduction

def reduced_features(site_name, corr_threshold=0.9, outlier_cutoff=100):
    """Compute and save the reduced feature table for one site.

    Loads every ``*_mad_robustize.csv`` plate file located one directory level
    below ``<input_path>/<site_name>``, pools them into a single table and runs
    ``CBE.feature_reduction`` on the pooled data. The reduced table is written
    to ``<input_path>/<site_name>/<site_name>_reduced_features.csv``.

    Parameters
    ----------
    site_name : str
        Name of the site sub-folder inside the module-level ``input_path``.
    corr_threshold : float, optional
        Passed to ``CBE.feature_reduction`` as ``corr_threshold`` (default 0.9).
    outlier_cutoff : float, optional
        Passed to ``CBE.feature_reduction`` as ``outlier_cutoff`` (default 100).

    Returns
    -------
    The result of ``CBE.get_feature_vector`` applied to the reduced table.

    Raises
    ------
    ValueError
        If no plate file could be loaded for the site.
    """
    site_specific_path = os.path.join(input_path, site_name)

    # Loads mad_robustize normalized data.
    # NOTE(review): this pattern is broader than the one used by
    # `apply_reduction` ("[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_..."); confirm that
    # both functions are meant to see the same set of plates.
    pattern = "[A-Z]*_R[1-4]_mad_robustize.csv"
    # The pattern contains no "**", so glob's `recursive` flag would have no
    # effect and is omitted.
    file_list = glob.glob(site_specific_path + os.sep + '*' + os.sep + pattern)

    plate_tables = []

    for file in file_list:

        plate_map_name = os.path.splitext(os.path.basename(file))[0]

        try:
            dataframe = pd.read_csv(file)
        except Exception as e:
            # Best effort: report the unreadable file and continue with the
            # rest. (The handler used to reference an undefined name and
            # raised NameError instead of printing this message.)
            print(f"Error reading file {file}: {e}")
            continue

        plate_tables.append(dataframe)
        print(f"INFO: {plate_map_name} has {dataframe.shape[0]} rows")

    if not plate_tables:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(
            f"No readable files matching '{pattern}' found below {site_specific_path}"
        )

    print(f"Feature reduction with correlation threshold {corr_threshold} and Outlier threshold {outlier_cutoff}")

    Data_aggregated = pd.concat(plate_tables, ignore_index=True)
    reduced_feature_dataframe = CBE.feature_reduction(Data_aggregated,
                                                      variance_freq_cut=0.1,
                                                      variance_unique_cut=0.1,
                                                      outlier_cutoff=outlier_cutoff,
                                                      corr_threshold=corr_threshold,
                                                      print_stats=True)

    filename_norm_reduced_features = site_name + "_reduced_features.csv"
    file_path_norm_reduced_features = os.path.join(site_specific_path, filename_norm_reduced_features)

    reduced_feature_dataframe.to_csv(file_path_norm_reduced_features, index=False)

    return CBE.get_feature_vector(reduced_feature_dataframe)

In [6]:
def apply_reduction(reduced_feature_set, site_name):
    """Restrict every plate file of a site to a given feature set.

    For each ``<plate>_R<n>_mad_robustize.csv`` file located one directory
    level below ``<input_path>/<site_name>``, all feature columns (as reported
    by ``CBE.get_feature_vector``) that are not in ``reduced_feature_set`` are
    dropped. The result is written next to the input file as
    ``<original name>_reduced-corr.csv``. Files that fail at any step are
    reported and skipped.

    Parameters
    ----------
    reduced_feature_set : iterable of str
        Names of the feature columns to keep.
    site_name : str
        Name of the site sub-folder inside the module-level ``input_path``.

    Returns
    -------
    dict
        Maps ``"<staining date>_<source>_<plate map name>"`` (taken from the
        first row of each table) to the reduced DataFrame. Earlier versions
        built this dict but returned ``None``.
    """
    # Set membership keeps the per-column check cheap.
    kept_features = set(reduced_feature_set)

    dataframes_normalized_reduced_dict = {}

    site_specific_path = os.path.join(input_path, site_name)

    pattern = "[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_mad_robustize.csv"
    # The pattern contains no "**", so glob's `recursive` flag would have no
    # effect and is omitted.
    file_list = glob.glob(site_specific_path + os.sep + '*' + os.sep + pattern)

    for file in file_list:

        plate_map_name = os.path.splitext(os.path.basename(file))[0]

        try:

            dataframe = pd.read_csv(file)

            print(f"INFO: {plate_map_name} has {dataframe.shape[0]} rows")

            features = CBE.get_feature_vector(dataframe)

            print(f"INFO: {plate_map_name} has {len(features)} features")

            features_to_remove = [item for item in features if item not in kept_features]

            dataframe_reduced_features = dataframe.drop(columns=features_to_remove)

            remaining_features = CBE.get_feature_vector(dataframe_reduced_features)

            print(f"INFO: {plate_map_name} has {len(remaining_features)} features after feature reduction")

            filename_norm_reduced_data = plate_map_name + "_reduced-corr.csv"
            file_path_norm_reduced_data = os.path.join(os.path.dirname(file), filename_norm_reduced_data)

            dataframe_reduced_features.to_csv(file_path_norm_reduced_data, index=False)

            print(f"INFO: Saved {filename_norm_reduced_data}")

            # Build the key from the first row. Positional access plus string
            # formatting works whether the metadata columns were parsed as
            # numbers or as text (the former `.astype(str)` call failed on
            # plain strings).
            first_row = dataframe_reduced_features.iloc[0]
            dict_key = (
                f"{first_row['Metadata_staining_date']}"
                f"_{first_row['Metadata_source']}"
                f"_{first_row['Metadata_plate_map_name']}"
            )

            dataframes_normalized_reduced_dict[dict_key] = dataframe_reduced_features

        except Exception as e:

            print(f"Error reading file {plate_map_name}: {e}")

    return dataframes_normalized_reduced_dict

In [7]:
# FMP: compute the site-wide reduced feature set (also written to
# FMP_reduced_features.csv), write a "_reduced-corr.csv" copy of every plate
# file restricted to that set, then display the number of entries in the set.
fmp_feature_set = reduced_features('FMP')
apply_reduction(fmp_feature_set, 'FMP')
len(fmp_feature_set)

INFO: C1102_R4_mad_robustize has 384 rows
INFO: C1096_R2_mad_robustize has 384 rows
INFO: C1108_R4_mad_robustize has 383 rows
INFO: C1120_R1_mad_robustize has 384 rows
INFO: C1126_R1_mad_robustize has 384 rows
INFO: C1109_R1_mad_robustize has 384 rows
INFO: C1097_R4_mad_robustize has 384 rows
INFO: C1119_R4_mad_robustize has 384 rows
INFO: C1084_R4_mad_robustize has 384 rows
INFO: C1084_R2_mad_robustize has 384 rows
INFO: C1103_R2_mad_robustize has 384 rows
INFO: C1120_R2_mad_robustize has 384 rows
INFO: C1112_R3_mad_robustize has 384 rows
INFO: C1130_R4_mad_robustize has 384 rows
INFO: C1128_R4_mad_robustize has 383 rows
INFO: C1100_R2_mad_robustize has 384 rows
INFO: C1092_R1_mad_robustize has 384 rows
INFO: C1086_R4_mad_robustize has 384 rows
INFO: C1093_R4_mad_robustize has 384 rows
INFO: C1125_R2_mad_robustize has 384 rows
INFO: C1089_R1_mad_robustize has 384 rows
INFO: C1102_R2_mad_robustize has 384 rows
INFO: C1116_R2_mad_robustize has 384 rows
INFO: C1087_R2_mad_robustize has 3

369

In [8]:
# IMTM: compute the site-wide reduced feature set (also written to
# IMTM_reduced_features.csv), write a "_reduced-corr.csv" copy of every plate
# file restricted to that set, then display the number of entries in the set.
imtm_feature_set = reduced_features('IMTM')
apply_reduction(imtm_feature_set, 'IMTM')
len(imtm_feature_set)

INFO: C1269_R2_mad_robustize has 384 rows
INFO: C1242_R2_mad_robustize has 384 rows
INFO: C1284_R4_mad_robustize has 384 rows
INFO: C1288_R1_mad_robustize has 384 rows
INFO: A1302_R2_mad_robustize has 384 rows
INFO: C1240_R4_mad_robustize has 384 rows
INFO: A1292_R2_mad_robustize has 384 rows
INFO: C1242_R3_mad_robustize has 384 rows
INFO: C1260_R1_mad_robustize has 384 rows
INFO: A1292_R3_mad_robustize has 384 rows
INFO: A1299_R2_mad_robustize has 384 rows
INFO: A1293_R3_mad_robustize has 384 rows
INFO: C1258_R2_mad_robustize has 384 rows
INFO: C1270_R3_mad_robustize has 384 rows
INFO: C1259_R1_mad_robustize has 384 rows
INFO: C1283_R2_mad_robustize has 384 rows
INFO: A1303_R1_mad_robustize has 384 rows
INFO: C1288_R3_mad_robustize has 384 rows
INFO: C1238_R3_mad_robustize has 384 rows
INFO: C1261_R1_mad_robustize has 384 rows
INFO: C1245_R4_mad_robustize has 384 rows
INFO: A1307_R1_mad_robustize has 384 rows
INFO: C1263_R1_mad_robustize has 384 rows
INFO: C1276_R1_mad_robustize has 3

421

In [9]:
# MEDINA: compute the site-wide reduced feature set (also written to
# MEDINA_reduced_features.csv), write a "_reduced-corr.csv" copy of every plate
# file restricted to that set, then display the number of entries in the set.
# NOTE(review): the saved output of this cell shows 32, far fewer than the
# other sites' cells (369-617) -- confirm this is expected.
medina_feature_set = reduced_features('MEDINA')
apply_reduction(medina_feature_set, 'MEDINA')
len(medina_feature_set)

INFO: C1019_R3_mad_robustize has 384 rows
INFO: C1048_R1_mad_robustize has 384 rows
INFO: C1037_R1_mad_robustize has 384 rows
INFO: C1077_R4_mad_robustize has 384 rows
INFO: C1040_R1_mad_robustize has 384 rows
INFO: C1049_R3_mad_robustize has 384 rows
INFO: C1014_R1_mad_robustize has 378 rows
INFO: C1014_R4_mad_robustize has 384 rows
INFO: C1078_R4_mad_robustize has 384 rows
INFO: C1060_R1_mad_robustize has 384 rows
INFO: C1057_R3_mad_robustize has 384 rows
INFO: C1056_R2_mad_robustize has 384 rows
INFO: C1027_R2_mad_robustize has 384 rows
INFO: C1012_R3_mad_robustize has 384 rows
INFO: C1028_R3_mad_robustize has 384 rows
INFO: C1077_R3_mad_robustize has 384 rows
INFO: C1062_R3_mad_robustize has 384 rows
INFO: C1041_R4_mad_robustize has 384 rows
INFO: C1040_R2_mad_robustize has 384 rows
INFO: C1042_R4_mad_robustize has 384 rows
INFO: C1052_R1_mad_robustize has 384 rows
INFO: C1079_R2_mad_robustize has 384 rows
INFO: C1059_R4_mad_robustize has 384 rows
INFO: C1052_R3_mad_robustize has 3

32

In [7]:
# USC: compute the site-wide reduced feature set (also written to
# USC_reduced_features.csv), write a "_reduced-corr.csv" copy of every plate
# file restricted to that set, then display the number of entries in the set.
usc_feature_set = reduced_features('USC')
apply_reduction(usc_feature_set, 'USC')
len(usc_feature_set)

INFO: C1198_R3_mad_robustize has 384 rows
INFO: C1216_R4_mad_robustize has 384 rows
INFO: C1226_R1_mad_robustize has 384 rows
INFO: C1193_R2_mad_robustize has 384 rows
INFO: C1206_R2_mad_robustize has 384 rows
INFO: C1192_R1_mad_robustize has 384 rows
INFO: C1169_R4_mad_robustize has 384 rows
INFO: C1167_R4_mad_robustize has 384 rows
INFO: C1184_R1_mad_robustize has 384 rows
INFO: C1181_R4_mad_robustize has 384 rows
INFO: C1213_R3_mad_robustize has 384 rows
INFO: C1166_R2_mad_robustize has 384 rows
INFO: C1202_R3_mad_robustize has 384 rows
INFO: C1162_R4_mad_robustize has 384 rows
INFO: C1159_R1_mad_robustize has 384 rows
INFO: C1193_R1_mad_robustize has 384 rows
INFO: C1220_R1_mad_robustize has 384 rows
INFO: C1175_R1_mad_robustize has 384 rows
INFO: C1193_R3_mad_robustize has 384 rows
INFO: C1218_R4_mad_robustize has 384 rows
INFO: C1160_R2_mad_robustize has 384 rows
INFO: C1160_R3_mad_robustize has 384 rows
INFO: C1219_R3_mad_robustize has 384 rows
INFO: C1203_R4_mad_robustize has 3

617