In [1]:
import time
from datetime import date
import pandas as pd
import os
import glob
import pycytominer
import sys
import CBE_utils as CBE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import gc

In [2]:
# Reload the already imported CBE_utils module (bound to the name CBE) so that
# the version currently on disk is the one used by the cells below.
import importlib
importlib.reload(CBE)

<module 'CBE_utils' from '/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/notebooks_revision/CBE_utils.py'>

In [3]:
# Root folder of the quality-control analysis; all data folders live below it.
project_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/"

# Explicit switch between the test subset and the full data set.
# True reproduces the configuration this notebook effectively used so far:
# the test paths were assigned last and silently replaced the production ones.
use_test_data = True

if use_test_data:
    input_path = project_dir + "test_output/"
    annotation_dir = project_dir + "test_annotation/"
else:
    input_path = project_dir + "output/"
    annotation_dir = project_dir + "annotation/"

# Only one location is defined for the reduced feature lists.
feature_dir = project_dir + "reduced_features/"

In [4]:
# Collect the site folders: every sub-directory of input_path counts as a site.
# For each site the normalized plate files are loaded further down and
# reduced to the feature list that belongs to that site.
# os.listdir makes no promise about ordering, so the names are sorted to keep
# the processing order (and the log output) reproducible between runs.
folders = sorted(
    entry for entry in os.listdir(input_path)
    if os.path.isdir(os.path.join(input_path, entry))
)

# Correlation-based feature reduction

In [5]:
# Compute a new feature set for each site using correlation-based feature reduction

def reduced_features(site_name, corr_threshold=0.9, outlier_cutoff=100):
    """Derive the reduced feature list for one site.

    Every ``*_mad_robustize.csv`` plate file located exactly one directory
    level below ``input_path/<site_name>`` is read, the plates are stacked
    into one frame and that frame is handed to ``CBE.feature_reduction``.

    Parameters
    ----------
    site_name : str
        Name of the site folder inside ``input_path`` (e.g. ``'FMP'``).
    corr_threshold : float, optional
        Passed to ``CBE.feature_reduction`` as ``corr_threshold``.
    outlier_cutoff : float, optional
        Passed to ``CBE.feature_reduction`` as ``outlier_cutoff``.

    Returns
    -------
    Whatever ``CBE.get_feature_vector`` returns for the reduced frame.

    Raises
    ------
    ValueError
        If not a single plate file could be loaded for the site.
    """
    site_specific_path = os.path.join(input_path, site_name)

    # loads mad_robustize normalized data
    pattern = "[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_mad_robustize.csv"
    # No `recursive=True`: it only matters for patterns containing '**'.
    file_list = glob.glob(os.path.join(site_specific_path, '*', pattern))

    plates = []

    for file in file_list:

        plate_map_name = os.path.splitext(os.path.basename(file))[0]

        try:
            dataframe = pd.read_csv(file)
        except Exception as e:
            # Best effort: one unreadable plate is reported and skipped so
            # that the remaining plates of the site can still be used.
            print(f"Error reading file {plate_map_name}: {e}")
            continue

        plates.append(dataframe)
        print(f"INFO: {plate_map_name} has {dataframe.shape[0]} rows")

    if not plates:
        # Fail with a message that names the site instead of letting
        # pd.concat complain about an empty list.
        raise ValueError(
            f"No plate files could be loaded for site '{site_name}' "
            f"(searched {site_specific_path})"
        )

    print(f"Feature reduction with correlation threshold {corr_threshold} and Outlier threshold {outlier_cutoff}")

    # ignore_index gives the stacked frame a fresh 0..n-1 index.
    Data_aggregated = pd.concat(plates, ignore_index=True)

    reduced_feature_dataframe = CBE.feature_reduction(Data_aggregated,
                                             variance_freq_cut=0.1,
                                             variance_unique_cut=0.1,
                                             outlier_cutoff=outlier_cutoff,
                                             corr_threshold=corr_threshold,
                                             print_stats=True)

    return CBE.get_feature_vector(reduced_feature_dataframe)

In [6]:
# Run the correlation-based reduction on the plates of the 'FMP' site folder
fmp_feature_set = reduced_features('FMP')


INFO: C1102_R4_mad_robustize has 384 rows
INFO: C1096_R2_mad_robustize has 384 rows
INFO: C1108_R4_mad_robustize has 383 rows
INFO: C1120_R1_mad_robustize has 384 rows
INFO: C1109_R1_mad_robustize has 384 rows
INFO: C1097_R4_mad_robustize has 384 rows
INFO: C1119_R4_mad_robustize has 384 rows
INFO: C1084_R4_mad_robustize has 384 rows
INFO: C1084_R2_mad_robustize has 384 rows
INFO: C1103_R2_mad_robustize has 384 rows
INFO: C1120_R2_mad_robustize has 384 rows
INFO: C1112_R3_mad_robustize has 384 rows
INFO: C1100_R2_mad_robustize has 384 rows
INFO: C1092_R1_mad_robustize has 384 rows
INFO: C1086_R4_mad_robustize has 384 rows
INFO: C1093_R4_mad_robustize has 384 rows
INFO: C1089_R1_mad_robustize has 384 rows
INFO: C1102_R2_mad_robustize has 384 rows
INFO: C1116_R2_mad_robustize has 384 rows
INFO: C1087_R2_mad_robustize has 384 rows
INFO: C1085_R2_mad_robustize has 384 rows
INFO: C1112_R2_mad_robustize has 383 rows
INFO: C1117_R1_mad_robustize has 384 rows
INFO: C1088_R2_mad_robustize has 3

In [7]:
# Number of entries in the feature set obtained for FMP
len(fmp_feature_set)

389

In [8]:
# Run the correlation-based reduction on the plates of the 'IMTM' site folder
# and display how many entries the resulting feature set has
imtm_feature_set = reduced_features('IMTM')
len(imtm_feature_set)

INFO: C1231_R1_mad_robustize has 384 rows
INFO: C1269_R2_mad_robustize has 384 rows
INFO: C1240_R4_mad_robustize has 384 rows
INFO: C1260_R1_mad_robustize has 384 rows
INFO: C1258_R2_mad_robustize has 384 rows
INFO: C1270_R3_mad_robustize has 384 rows
INFO: C1259_R1_mad_robustize has 384 rows
INFO: C1283_R2_mad_robustize has 384 rows
INFO: C1261_R1_mad_robustize has 384 rows
INFO: C1245_R4_mad_robustize has 384 rows
INFO: C1234_R3_mad_robustize has 384 rows
INFO: C1263_R1_mad_robustize has 384 rows
INFO: C1239_R1_mad_robustize has 384 rows
INFO: C1261_R2_mad_robustize has 384 rows
INFO: C1248_R1_mad_robustize has 384 rows
INFO: C1241_R1_mad_robustize has 384 rows
INFO: C1232_R3_mad_robustize has 384 rows
INFO: C1251_R1_mad_robustize has 384 rows
INFO: C1264_R4_mad_robustize has 384 rows
INFO: C1264_R3_mad_robustize has 384 rows
INFO: C1248_R2_mad_robustize has 384 rows
INFO: C1272_R3_mad_robustize has 384 rows
INFO: C1254_R4_mad_robustize has 384 rows
INFO: C1247_R4_mad_robustize has 3

617

In [9]:
# Run the correlation-based reduction on the plates of the 'MEDINA' site folder
# and display how many entries the resulting feature set has
medina_feature_set = reduced_features('MEDINA')
len(medina_feature_set)

INFO: C1019_R3_mad_robustize has 384 rows
INFO: C1048_R1_mad_robustize has 384 rows
INFO: C1037_R1_mad_robustize has 384 rows
INFO: C1035_R3_mad_robustize has 384 rows
INFO: C1040_R1_mad_robustize has 384 rows
INFO: C1049_R3_mad_robustize has 384 rows
INFO: C1014_R1_mad_robustize has 378 rows
INFO: C1014_R4_mad_robustize has 384 rows
INFO: C1011_R4_mad_robustize has 384 rows
INFO: C1057_R3_mad_robustize has 384 rows
INFO: C1027_R2_mad_robustize has 384 rows
INFO: C1012_R3_mad_robustize has 384 rows
INFO: C1028_R3_mad_robustize has 384 rows
INFO: C1020_R1_mad_robustize has 384 rows
INFO: C1041_R4_mad_robustize has 384 rows
INFO: C1040_R2_mad_robustize has 384 rows
INFO: C1042_R4_mad_robustize has 384 rows
INFO: C1052_R1_mad_robustize has 384 rows
INFO: C1028_R3_mad_robustize has 384 rows
INFO: C1052_R3_mad_robustize has 384 rows
INFO: C1060_R3_mad_robustize has 384 rows
INFO: C1016_R3_mad_robustize has 384 rows
INFO: C1042_R1_mad_robustize has 384 rows
INFO: C1053_R3_mad_robustize has 3

624

In [10]:
# Run the correlation-based reduction on the plates of the 'USC' site folder
# and display how many entries the resulting feature set has
usc_feature_set = reduced_features('USC')
len(usc_feature_set)

INFO: C1216_R4_mad_robustize has 384 rows
INFO: C1169_R4_mad_robustize has 384 rows
INFO: C1167_R4_mad_robustize has 384 rows
INFO: C1184_R1_mad_robustize has 384 rows
INFO: C1181_R4_mad_robustize has 384 rows
INFO: C1166_R2_mad_robustize has 384 rows
INFO: C1162_R4_mad_robustize has 384 rows
INFO: C1193_R1_mad_robustize has 384 rows
INFO: C1220_R1_mad_robustize has 384 rows
INFO: C1175_R1_mad_robustize has 384 rows
INFO: C1218_R4_mad_robustize has 384 rows
INFO: C1160_R2_mad_robustize has 384 rows
INFO: C1160_R3_mad_robustize has 384 rows
INFO: C1203_R4_mad_robustize has 384 rows
INFO: C1197_R4_mad_robustize has 384 rows
INFO: C1165_R2_mad_robustize has 384 rows
INFO: C1198_R1_mad_robustize has 384 rows
INFO: C1205_R1_mad_robustize has 384 rows
INFO: C1158_R1_mad_robustize has 384 rows
INFO: C1204_R3_mad_robustize has 384 rows
INFO: C1224_R3_mad_robustize has 384 rows
INFO: C1195_R4_mad_robustize has 384 rows
INFO: C1171_R2_mad_robustize has 384 rows
INFO: C1188_R1_mad_robustize has 3

666

In [11]:
# Apply each site's correlation-based feature set to every normalized plate of
# that site, write the reduced plate next to its input file and keep the
# reduced frames in a dictionary.
dataframes_normalized_reduced_dict = {}

# Lookup table instead of an if/elif chain: a folder that is not a known site
# is skipped explicitly rather than silently inheriting the feature set of the
# site processed before it.
site_feature_sets = {
    'FMP': set(fmp_feature_set),
    'IMTM': set(imtm_feature_set),
    'MEDINA': set(medina_feature_set),
    'USC': set(usc_feature_set),
}

# Columns whose first-row values are joined with "_" to form the dictionary key.
key_columns = ('Metadata_staining_date', 'Metadata_source', 'Metadata_plate_map_name')

for folder in folders:

    if folder not in site_feature_sets:
        print(f"WARNING: no feature set defined for folder {folder}, skipping")
        continue

    reduced_feature_set = site_feature_sets[folder]
    site_specific_path = os.path.join(input_path, folder)

    # loads mad_robustize normalized data
    pattern = "[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_mad_robustize.csv"
    # No `recursive=True`: it only matters for patterns containing '**'.
    file_list = glob.glob(os.path.join(site_specific_path, '*', pattern))

    for file in file_list:

        plate_map_name = os.path.splitext(os.path.basename(file))[0]

        try:

            dataframe = pd.read_csv(file)
            row_count = dataframe.shape[0]

            print(f"INFO: {plate_map_name} has {row_count} rows")

            features = CBE.get_feature_vector(dataframe)
            number_features = len(features)

            print(f"INFO: {plate_map_name} has {number_features} features")

            features_to_remove = [item for item in features if item not in reduced_feature_set]

            dataframe_reduced_features = dataframe.drop(columns=features_to_remove)

            # Deliberately not called `reduced_features`: that name belongs to
            # the function defined earlier in this notebook, and rebinding it
            # here would break any later call to that function.
            remaining_features = CBE.get_feature_vector(dataframe_reduced_features)
            number_reduced_features = len(remaining_features)

            print(f"INFO: {plate_map_name} has {number_reduced_features} features after feature reduction")

            filename_norm_reduced_data = plate_map_name + "_reduced-corr.csv"

            path_norm_reduced_data = os.path.dirname(file)
            file_path_norm_reduced_data = os.path.join(path_norm_reduced_data, filename_norm_reduced_data)

            dataframe_reduced_features.to_csv(file_path_norm_reduced_data, index=False)

            print(f"INFO: Saved {filename_norm_reduced_data}")

            # str() works for numpy scalars and plain Python values alike,
            # whereas `.astype(str)` exists only on numpy scalars.
            dict_key = "_".join(
                str(dataframe_reduced_features[column].iloc[0]) for column in key_columns
            )

            dataframes_normalized_reduced_dict[dict_key] = dataframe_reduced_features

        except Exception as e:
            # Best effort per plate: report the failure and continue. The
            # handler covers reading, reducing and saving, hence "processing".
            print(f"Error processing file {plate_map_name}: {e}")

INFO: C1231_R1_mad_robustize has 384 rows
INFO: C1231_R1_mad_robustize has 2977 features
INFO: C1231_R1_mad_robustize has 617 features after feature reduction
INFO: Saved C1231_R1_mad_robustize_reduced-corr.csv
INFO: C1269_R2_mad_robustize has 384 rows
INFO: C1269_R2_mad_robustize has 2977 features
INFO: C1269_R2_mad_robustize has 617 features after feature reduction
INFO: Saved C1269_R2_mad_robustize_reduced-corr.csv
INFO: C1240_R4_mad_robustize has 384 rows
INFO: C1240_R4_mad_robustize has 2977 features
INFO: C1240_R4_mad_robustize has 617 features after feature reduction
INFO: Saved C1240_R4_mad_robustize_reduced-corr.csv
INFO: C1260_R1_mad_robustize has 384 rows
INFO: C1260_R1_mad_robustize has 2977 features
INFO: C1260_R1_mad_robustize has 617 features after feature reduction
INFO: Saved C1260_R1_mad_robustize_reduced-corr.csv
INFO: C1258_R2_mad_robustize has 384 rows
INFO: C1258_R2_mad_robustize has 2977 features
INFO: C1258_R2_mad_robustize has 617 features after feature reducti

# Feature reduction based on bioactive features

In [None]:
def _read_feature_names(path):
    """Return the whitespace-separated entries of the text file at ``path``."""
    with open(path, 'r') as handle:
        return handle.read().split()


# Per-site reduced feature lists, each stored as a text file inside feature_dir
fmp_features_path = feature_dir + "fmp_HepG2_features.txt"
fmp_features = _read_feature_names(fmp_features_path)

imtm_features_path = feature_dir + "imtm_features.txt"
imtm_features = _read_feature_names(imtm_features_path)

medina_features_path = feature_dir + "medina_features.txt"
medina_features = _read_feature_names(medina_features_path)

usc_features_path = feature_dir + "usc_features.txt"
usc_features = _read_feature_names(usc_features_path)

In [None]:
# Apply each site's feature list (read from the text files in feature_dir) to
# every normalized plate of that site, write the reduced plate next to its
# input file and keep the reduced frames in a dictionary.
dataframes_normalized_reduced_dict = {}

# Lookup table instead of an if/elif chain: a folder that is not a known site
# is skipped explicitly rather than silently inheriting the feature set of the
# site processed before it.
site_feature_sets = {
    'FMP': set(fmp_features),
    'IMTM': set(imtm_features),
    'MEDINA': set(medina_features),
    'USC': set(usc_features),
}

# Columns whose first-row values are joined with "_" to form the dictionary key.
key_columns = ('Metadata_staining_date', 'Metadata_source', 'Metadata_plate_map_name')

for folder in folders:

    if folder not in site_feature_sets:
        print(f"WARNING: no feature set defined for folder {folder}, skipping")
        continue

    reduced_feature_set = site_feature_sets[folder]
    site_specific_path = os.path.join(input_path, folder)

    # loads mad_robustize normalized data
    pattern = "[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_mad_robustize.csv"
    # No `recursive=True`: it only matters for patterns containing '**'.
    file_list = glob.glob(os.path.join(site_specific_path, '*', pattern))

    for file in file_list:

        plate_map_name = os.path.splitext(os.path.basename(file))[0]

        try:

            dataframe = pd.read_csv(file)
            row_count = dataframe.shape[0]

            print(f"INFO: {plate_map_name} has {row_count} rows")

            features = CBE.get_feature_vector(dataframe)
            # Count the vector that was just extracted; the name used here
            # before (`features_list`) is not defined anywhere in this cell.
            number_features = len(features)

            print(f"INFO: {plate_map_name} has {number_features} features")

            features_to_remove = [item for item in features if item not in reduced_feature_set]

            dataframe_reduced_features = dataframe.drop(columns=features_to_remove)

            # Deliberately not called `reduced_features`: that name belongs to
            # the function defined earlier in this notebook, and rebinding it
            # here would break any later call to that function.
            remaining_features = CBE.get_feature_vector(dataframe_reduced_features)
            number_reduced_features = len(remaining_features)

            print(f"INFO: {plate_map_name} has {number_reduced_features} features after feature reduction")

            filename_norm_reduced_data = plate_map_name + "_reduced.csv"

            path_norm_reduced_data = os.path.dirname(file)
            file_path_norm_reduced_data = os.path.join(path_norm_reduced_data, filename_norm_reduced_data)

            dataframe_reduced_features.to_csv(file_path_norm_reduced_data, index=False)

            print(f"INFO: Saved {filename_norm_reduced_data}")

            # Reduced frames are collected under a key built from the first
            # row's staining date, source and plate map name.
            # str() works for numpy scalars and plain Python values alike,
            # whereas `.astype(str)` exists only on numpy scalars.
            dict_key = "_".join(
                str(dataframe_reduced_features[column].iloc[0]) for column in key_columns
            )

            dataframes_normalized_reduced_dict[dict_key] = dataframe_reduced_features

        except Exception as e:
            # Best effort per plate: report the failure and continue. The
            # handler covers reading, reducing and saving, hence "processing".
            print(f"Error processing file {plate_map_name}: {e}")