In [None]:
import pandas as pd
import os
import glob

import CBE_utils as CBE

In [None]:
# Re-execute the already imported CBE module so that edits made to
# CBE_utils on disk are picked up without restarting the kernel.
# CBE must have been imported (import cell above) before this cell runs.
import importlib
importlib.reload(CBE)

In [None]:
# Path configuration.
# NOTE: every name below is assigned more than once. A later assignment
# overwrites the earlier one, so only the LAST value of each name is in
# effect after this cell has run. Reorder or comment out blocks to switch
# between configurations.

# Configuration 1: processed ECBL dataset (overridden further down).
input_path = "/media/schmied.christopher/T7 Shield/Datasets/ECBL/processed/"
annotation_dir = "/home/schmied.christopher/FMP_Docs/Projects/eu_os_ecbl/annotation/"

feature_dir = "/home/schmied.christopher/FMP_Docs/Projects/eu_os_ecbl/reduced_features/"

# Configuration 2: test output / test annotation (also overridden below).
input_path = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_output/"

annotation_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_annotation/"

# Configuration 3: these are the final assignments and therefore the values
# actually used. input_path is read by reduced_features() and
# apply_reduction(); annotation_dir and feature_dir are not referenced by the
# cells visible in this notebook section.
input_path = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/tech_test_output/"

annotation_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/annotation/"
feature_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/reduced_features/"

In [None]:
# Workflow outline:
#   1. list the sub-folders of input_path
#   2. for each site, load the normalized files
#   3. reduce the features using the matching feature list
# Only directory entries are kept; plain files inside input_path are ignored.
folders = list(
    filter(
        lambda name: os.path.isdir(os.path.join(input_path, name)),
        os.listdir(input_path),
    )
)

# Correlation based feature reduction

In [None]:
# Compute a new feature set for each site using correlation-based feature reduction

def reduced_features(site_name,
                     variance_freq_cut=0.1,
                     variance_unique_cut=0.1,
                     outlier_cutoff=100,
                     corr_threshold=0.9):
    """Compute the correlation-reduced feature set for one site.

    Loads every plate file matching ``[A-Z]*_R[1-4]_mad_robustize.csv`` that
    sits exactly one directory level below ``<input_path>/<site_name>``,
    pools the rows of all plates, runs ``CBE.feature_reduction`` on the pooled
    table and writes the reduced table to
    ``<input_path>/<site_name>/<site_name>_reduced_features.csv``.

    Parameters
    ----------
    site_name : str
        Name of the site sub-folder inside the module-level ``input_path``.
    variance_freq_cut, variance_unique_cut, outlier_cutoff, corr_threshold
        Forwarded unchanged to ``CBE.feature_reduction``. The defaults are the
        values that used to be hard-coded here.

    Returns
    -------
    The value of ``CBE.get_feature_vector`` for the reduced table
    (presumably the retained feature column names -- confirm in CBE_utils).

    Raises
    ------
    ValueError
        If no plate file could be found or read for the site.
    """
    site_specific_path = os.path.join(input_path, site_name)

    # loads mad_robustize normalized data
    pattern = "[A-Z]*_R[1-4]_mad_robustize.csv"
    # glob yields matches in a file-system dependent order; sorting makes the
    # row order of the pooled table (and of the written CSV) reproducible.
    file_list = sorted(glob.glob(os.path.join(site_specific_path, '*', pattern)))

    plate_frames = []

    for file in file_list:

        plate_map_name = os.path.splitext(os.path.basename(file))[0]

        try:
            dataframe = pd.read_csv(file)
        except Exception as e:
            # Best effort: report the unreadable plate and keep going.
            print(f"Error reading file {file}: {e}")
            continue

        plate_frames.append(dataframe)
        print(f"INFO: {plate_map_name} has {dataframe.shape[0]} rows")

    if not plate_frames:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # fail early with a message that names the site and the pattern.
        raise ValueError(
            f"No readable plate files matching '{pattern}' found for site "
            f"'{site_name}' under {site_specific_path}"
        )

    print(f"Feature reduction with correlation threshold {corr_threshold} and Outlier threshold {outlier_cutoff}")

    data_aggregated = pd.concat(plate_frames, ignore_index=True)

    reduced_feature_dataframe = CBE.feature_reduction(data_aggregated,
                                                      variance_freq_cut=variance_freq_cut,
                                                      variance_unique_cut=variance_unique_cut,
                                                      outlier_cutoff=outlier_cutoff,
                                                      corr_threshold=corr_threshold,
                                                      print_stats=True)

    filename_norm_reduced_features = site_name + "_reduced_features.csv"
    file_path_norm_reduced_features = os.path.join(site_specific_path, filename_norm_reduced_features)

    reduced_feature_dataframe.to_csv(file_path_norm_reduced_features, index=False)

    return CBE.get_feature_vector(reduced_feature_dataframe)

In [None]:
def apply_reduction(reduced_feature_set, site_name):
    """Restrict every normalized plate file of a site to a reduced feature set.

    For each file matching
    ``[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_mad_robustize.csv`` exactly one
    directory level below ``<input_path>/<site_name>``, all feature columns
    (as reported by ``CBE.get_feature_vector``) that are not contained in
    ``reduced_feature_set`` are dropped. The reduced table is written next to
    the source file as ``<original name>_reduced-corr.csv``.

    Parameters
    ----------
    reduced_feature_set : container of feature names
        Features to keep; membership is tested with ``in``.
    site_name : str
        Name of the site sub-folder inside the module-level ``input_path``.

    Returns
    -------
    dict
        Maps ``"<Metadata_staining_date>_<Metadata_source>_<Metadata_plate_map_name>"``
        (taken from the first row of each plate) to the reduced DataFrame.
        Plates that failed are reported on stdout and left out. The dictionary
        is empty when no file matched.
    """
    dataframes_normalized_reduced_dict = {}

    site_specific_path = os.path.join(input_path, site_name)

    pattern = "[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_mad_robustize.csv"
    # Sorted so plates are processed (and inserted into the result) in a
    # reproducible order; glob itself gives no ordering guarantee.
    file_list = sorted(glob.glob(os.path.join(site_specific_path, '*', pattern)))

    for file in file_list:

        plate_map_name = os.path.splitext(os.path.basename(file))[0]

        try:

            dataframe = pd.read_csv(file)
            print(f"INFO: {plate_map_name} has {dataframe.shape[0]} rows")

            features = CBE.get_feature_vector(dataframe)
            print(f"INFO: {plate_map_name} has {len(features)} features")

            features_to_remove = [item for item in features if item not in reduced_feature_set]
            dataframe_reduced_features = dataframe.drop(columns=features_to_remove)

            # Named differently from the sibling function reduced_features()
            # to avoid shadowing it inside this scope.
            remaining_features = CBE.get_feature_vector(dataframe_reduced_features)
            print(f"INFO: {plate_map_name} has {len(remaining_features)} features after feature reduction")

            filename_norm_reduced_data = plate_map_name + "_reduced-corr.csv"
            file_path_norm_reduced_data = os.path.join(os.path.dirname(file), filename_norm_reduced_data)

            dataframe_reduced_features.to_csv(file_path_norm_reduced_data, index=False)

            print(f"INFO: Saved {filename_norm_reduced_data}")

            # The key is built from the first row, i.e. it assumes the three
            # metadata columns are constant within a plate file.
            dict_key = "_".join(
                str(dataframe_reduced_features[column].iloc[0])
                for column in ('Metadata_staining_date',
                               'Metadata_source',
                               'Metadata_plate_map_name')
            )

            dataframes_normalized_reduced_dict[dict_key] = dataframe_reduced_features

        except Exception as e:
            # Best effort per plate: any failure (reading, reducing, writing,
            # key construction) is reported and the remaining plates continue.
            print(f"Error reading file {plate_map_name}: {e}") 

    # Previously the dictionary was built and then discarded.
    return dataframes_normalized_reduced_dict

In [None]:
# FMP: derive the reduced feature set from the pooled FMP plates, write the
# reduced per-plate files, then display the size of the feature set as the
# cell output.
fmp_feature_set = reduced_features('FMP')
apply_reduction(fmp_feature_set, 'FMP')
len(fmp_feature_set)

In [None]:
# IMTM: derive the reduced feature set from the pooled IMTM plates, write the
# reduced per-plate files, then display the size of the feature set as the
# cell output.
imtm_feature_set = reduced_features('IMTM')
apply_reduction(imtm_feature_set, 'IMTM')
len(imtm_feature_set)

In [None]:
# MEDINA: same procedure as for the other sites (compute the reduced feature
# set, write the reduced per-plate files, display the feature-set size).
# Known issue: some MEDINA batches cause problems here; this is addressed in
# the 04b workbook.
medina_feature_set = reduced_features('MEDINA')
apply_reduction(medina_feature_set, 'MEDINA')
len(medina_feature_set)

In [None]:
# USC: derive the reduced feature set from the pooled USC plates, write the
# reduced per-plate files, then display the size of the feature set as the
# cell output.
usc_feature_set = reduced_features('USC')
apply_reduction(usc_feature_set, 'USC')
len(usc_feature_set)