In [None]:
import time
from datetime import date
import pandas as pd
import os
import glob
import pycytominer
import sys
import CBE_utils as CBE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import gc
from pathlib import Path

In [None]:
import importlib
# Re-import the CBE_utils module (imported above as CBE) so that edits made to
# it on disk are picked up without restarting the kernel.
importlib.reload(CBE)

In [None]:
# Locations of the per-site output folders and of the annotation files.
input_path = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/output/"

annotation_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/annotation/"

# Test
# NOTE: the two assignments below overwrite the paths set above, so as written
# this cell always points the notebook at the test data.
input_path = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_output/"

annotation_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_annotation/"

In [None]:
# Planned steps:
#   1. get the folder list
#   2. load the normalized files for each site
#   3. reduce the features by selecting the correct feature list
#
# Step 1: names of all directories located directly inside input_path.
folders = []
for entry_name in os.listdir(input_path):
    if os.path.isdir(os.path.join(input_path, entry_name)):
        folders.append(entry_name)

In [None]:
# TODO: Get only FMP Data
# TODO: Filter out problematic batches
# FMP Problematic plates: all replicates of plates C1084-C1093
# Also problematic: 240710, 240717, 240718


In [None]:
def extract_plate_info(path):
    """Parse batch date, plate name and replicate out of a plate file path.

    The path is searched for an identifier of the form
    ``YYYY-MM-DD_<plate>_R<digit>`` (e.g. ``2024-06-05_C1084_R1``).

    Parameters
    ----------
    path : str
        File path to inspect.

    Returns
    -------
    dict
        Keys ``path``, ``full_id``, ``batch``, ``plate`` and ``replicate``.
        ``replicate`` is formatted as ``R<digit>``. Every value except
        ``path`` is ``None`` when no identifier is found in the path.
    """
    info = {
        "path": path,
        "full_id": None,
        "batch": None,
        "plate": None,
        "replicate": None,
    }

    id_hit = re.search(r'(\d{4}-\d{2}-\d{2}_[A-Za-z0-9]+_R\d)', path)
    if id_hit is None:
        # Nothing recognisable in the path: report all fields as missing.
        return info

    identifier = id_hit.group(1)
    info["full_id"] = identifier

    # Batch is the date string at the start of the identifier (e.g., 2024-06-05)
    date_hit = re.search(r'(\d{4}-\d{2}-\d{2})', identifier)
    if date_hit is not None:
        info["batch"] = date_hit.group(1)

    # Plate name and replicate digit follow the date.
    plate_hit = re.search(r'_([A-Za-z0-9]+)_R(\d)', identifier)
    if plate_hit is not None:
        info["plate"] = plate_hit.group(1)
        info["replicate"] = f"R{plate_hit.group(2)}"

    return info

In [None]:
site_specific_path = os.path.join(input_path, 'FMP')

# loads mad_robustize normalized data
# The files are searched exactly one directory level below the site folder.
# (recursive=True was dropped: it only has an effect for patterns containing '**'.)
pattern = "[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_mad_robustize.csv"
file_list = glob.glob(os.path.join(site_specific_path, '*', pattern))

# Fail with a clear message instead of a KeyError on the missing 'plate'
# column, which is what an empty DataFrame would produce below.
if not file_list:
    raise FileNotFoundError(
        f"No files matching '{pattern}' found one level below '{site_specific_path}'"
    )

# One record (path, full_id, batch, plate, replicate) per file.
files = [extract_plate_info(path) for path in file_list]

file_dataframe = pd.DataFrame(files)

# Combined key such as 'C1084_R1', used for the replicate-level exclusion list.
file_dataframe['plate_replicate'] = file_dataframe['plate'] + '_' + file_dataframe['replicate']

In [None]:
# Exclusion lists for the FMP site.

# Batches (date strings) that are dropped completely.
remove_batch = ['2024-07-10', '2024-07-17', '2024-07-18']

# Plates for which every replicate is dropped.
remove_plate = [
    'C1084', 'C1085', 'C1086', 'C1087', 'C1088',
    'C1089', 'C1090', 'C1091', 'C1092', 'C1093',
]

# Individual plate replicates that are dropped.
remove_plate_replicate = ['C1119_R3', 'C1123_R2']
    

In [None]:
# Processing and rejection statistics for the site loaded above (FMP).
number_processed = file_dataframe.shape[0]
print(number_processed, ' out of 300 plates processed')
precent_processed = (number_processed / 300 ) * 100
print(int(precent_processed), '% processed')

# Apply the three exclusion lists one after the other:
# batches first, then whole plates, then single plate replicates.
keep_batch = ~file_dataframe["batch"].isin(remove_batch)
file_dataframe_filtered_1 = file_dataframe.loc[keep_batch]

keep_plate = ~file_dataframe_filtered_1["plate"].isin(remove_plate)
file_dataframe_filtered_2 = file_dataframe_filtered_1.loc[keep_plate]

keep_replicate = ~file_dataframe_filtered_2['plate_replicate'].isin(remove_plate_replicate)
file_dataframe_filtered_3 = file_dataframe_filtered_2.loc[keep_replicate]

# Everything that did not survive all three filters counts as rejected.
rejected_plates = number_processed - file_dataframe_filtered_3.shape[0]
print(rejected_plates, ' plates rejected')

percent_rejected = (rejected_plates / number_processed) * 100
print(int(percent_rejected), '% plates rejected')

# Rejected IMTM

In [None]:
site_specific_path = os.path.join(input_path, 'IMTM')

# loads mad_robustize normalized data
# The files are searched exactly one directory level below the site folder.
# (recursive=True was dropped: it only has an effect for patterns containing '**'.)
pattern = "[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_mad_robustize.csv"
file_list = glob.glob(os.path.join(site_specific_path, '*', pattern))

# Fail with a clear message instead of a KeyError on the missing 'plate'
# column, which is what an empty DataFrame would produce below.
if not file_list:
    raise FileNotFoundError(
        f"No files matching '{pattern}' found one level below '{site_specific_path}'"
    )

# One record (path, full_id, batch, plate, replicate) per file.
files = [extract_plate_info(path) for path in file_list]

file_dataframe = pd.DataFrame(files)

# Combined key such as 'C1252_R2', used for the replicate-level exclusion list.
file_dataframe['plate_replicate'] = file_dataframe['plate'] + '_' + file_dataframe['replicate']

In [None]:
# Exclusion lists for the IMTM site.

# Batches (date strings) that are dropped completely.
remove_batch = [
    '2024-05-30', '2024-06-03', '2024-06-05', '2024-06-06',
    '2024-06-07', '2024-06-10', '2024-06-11', '2024-06-17',
    '2024-06-19', '2024-06-20', '2024-06-21', '2024-06-24',
    '2024-06-25', '2024-06-27', '2024-07-01', '2024-11-05',
]

# No plate is dropped as a whole for this site.
remove_plate = []

# Individual plate replicates that are dropped.
remove_plate_replicate = ['C1252_R2']

In [None]:
# Processing and rejection statistics for the site loaded above (IMTM).
number_processed = file_dataframe.shape[0]
print(number_processed, ' out of 300 plates processed')
precent_processed = (number_processed / 300 ) * 100
print(int(precent_processed), '% processed')

# Apply the three exclusion lists one after the other:
# batches first, then whole plates, then single plate replicates.
keep_batch = ~file_dataframe["batch"].isin(remove_batch)
file_dataframe_filtered_1 = file_dataframe.loc[keep_batch]

keep_plate = ~file_dataframe_filtered_1["plate"].isin(remove_plate)
file_dataframe_filtered_2 = file_dataframe_filtered_1.loc[keep_plate]

keep_replicate = ~file_dataframe_filtered_2['plate_replicate'].isin(remove_plate_replicate)
file_dataframe_filtered_3 = file_dataframe_filtered_2.loc[keep_replicate]

rejected_plates = number_processed - file_dataframe_filtered_3.shape[0]
# NOTE(review): the raw count is corrected by subtracting the number of
# excluded batches and adding one; the reason is not documented in this
# notebook (the FMP cell applies no such correction) — confirm.
rejected_corr = rejected_plates - len(remove_batch) + 1
print(rejected_corr, ' plates rejected')

percent_rejected = rejected_corr / number_processed * 100
print(int(percent_rejected), '% plates rejected')

# Reject USC

In [None]:
site_specific_path = os.path.join(input_path, 'USC')

# loads mad_robustize normalized data
# The files are searched exactly one directory level below the site folder.
# (recursive=True was dropped: it only has an effect for patterns containing '**'.)
pattern = "[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_mad_robustize.csv"
file_list = glob.glob(os.path.join(site_specific_path, '*', pattern))

# Fail with a clear message instead of a KeyError on the missing 'plate'
# column, which is what an empty DataFrame would produce below.
if not file_list:
    raise FileNotFoundError(
        f"No files matching '{pattern}' found one level below '{site_specific_path}'"
    )

# One record (path, full_id, batch, plate, replicate) per file.
files = [extract_plate_info(path) for path in file_list]

file_dataframe = pd.DataFrame(files)

# Combined '<plate>_<replicate>' key, used for the replicate-level exclusion list.
file_dataframe['plate_replicate'] = file_dataframe['plate'] + '_' + file_dataframe['replicate']

In [None]:
# Exclusion lists for the USC site.

# Batches (date strings) that are dropped completely.
remove_batch = [
    '2024-06-14', '2024-06-15', '2024-06-17', '2024-06-19', '2024-06-21',
]

# No whole plates and no single replicates are dropped for this site.
remove_plate = []
remove_plate_replicate = []

In [None]:
# Processing and rejection statistics for the site loaded above (USC).
number_processed = file_dataframe.shape[0]
print(number_processed, ' out of 300 plates processed')
precent_processed = (number_processed / 300 ) * 100
print(int(precent_processed), '% processed')

# Apply the three exclusion lists one after the other:
# batches first, then whole plates, then single plate replicates.
keep_batch = ~file_dataframe["batch"].isin(remove_batch)
file_dataframe_filtered_1 = file_dataframe.loc[keep_batch]

keep_plate = ~file_dataframe_filtered_1["plate"].isin(remove_plate)
file_dataframe_filtered_2 = file_dataframe_filtered_1.loc[keep_plate]

keep_replicate = ~file_dataframe_filtered_2['plate_replicate'].isin(remove_plate_replicate)
file_dataframe_filtered_3 = file_dataframe_filtered_2.loc[keep_replicate]

rejected_plates = number_processed - file_dataframe_filtered_3.shape[0]
# NOTE(review): the raw count is corrected by subtracting the number of
# excluded batches and adding one; the reason is not documented in this
# notebook (the FMP cell applies no such correction) — confirm.
rejected_corr = rejected_plates - len(remove_batch) + 1
print(rejected_corr, ' plates rejected')

percent_rejected = rejected_corr / number_processed * 100
print(int(percent_rejected), '% plates rejected')

# Reject MEDINA

In [None]:
site_specific_path = os.path.join(input_path, 'MEDINA')

# loads mad_robustize normalized data
# The files are searched exactly one directory level below the site folder.
# (recursive=True was dropped: it only has an effect for patterns containing '**'.)
pattern = "[A-Z][0-9][0-9][0-9][0-9]_R[1-4]_mad_robustize.csv"
file_list = glob.glob(os.path.join(site_specific_path, '*', pattern))

# Fail with a clear message instead of a KeyError on the missing 'plate'
# column, which is what an empty DataFrame would produce below.
if not file_list:
    raise FileNotFoundError(
        f"No files matching '{pattern}' found one level below '{site_specific_path}'"
    )

# One record (path, full_id, batch, plate, replicate) per file.
files = [extract_plate_info(path) for path in file_list]

file_dataframe = pd.DataFrame(files)

# Combined key such as 'C1032_R1', used for the replicate-level exclusion list.
file_dataframe['plate_replicate'] = file_dataframe['plate'] + '_' + file_dataframe['replicate']

In [None]:
# Exclusion lists for the MEDINA site.

# Batches (date strings) that are dropped completely.
remove_batch = ['2024-07-19', '2024-09-13', '2024-09-26']

# No plate is dropped as a whole for this site.
remove_plate = []

# Individual plate replicates that are dropped.
remove_plate_replicate = ['C1032_R1']

In [None]:
# Processing and rejection statistics for the site loaded above (MEDINA).
number_processed = file_dataframe.shape[0]
print(number_processed, ' out of 300 plates processed')
precent_processed = (number_processed / 300 ) * 100
print(int(precent_processed), '% processed')

# Apply the three exclusion lists one after the other:
# batches first, then whole plates, then single plate replicates.
keep_batch = ~file_dataframe["batch"].isin(remove_batch)
file_dataframe_filtered_1 = file_dataframe.loc[keep_batch]

keep_plate = ~file_dataframe_filtered_1["plate"].isin(remove_plate)
file_dataframe_filtered_2 = file_dataframe_filtered_1.loc[keep_plate]

keep_replicate = ~file_dataframe_filtered_2['plate_replicate'].isin(remove_plate_replicate)
file_dataframe_filtered_3 = file_dataframe_filtered_2.loc[keep_replicate]

rejected_plates = number_processed - file_dataframe_filtered_3.shape[0]
# NOTE(review): the raw count is corrected by subtracting the number of
# excluded batches and adding one; the reason is not documented in this
# notebook (the FMP cell applies no such correction) — confirm.
rejected_corr = rejected_plates - len(remove_batch) + 1
print(rejected_corr, ' plates rejected')

percent_rejected = rejected_corr / number_processed * 100
print(int(percent_rejected), '% plates rejected')

# Correlation based feature reduction

In [None]:
# compute a new feature set for each site using correlation-based feature reduction

def reduced_features(file_dataframe_filtered):
    """Compute a reduced feature set from all plates listed in a file table.

    Every CSV in ``file_dataframe_filtered['path']`` is loaded, the plates are
    concatenated row-wise, and ``CBE.feature_reduction`` is run on the result.

    Parameters
    ----------
    file_dataframe_filtered : pandas.DataFrame
        Table with a ``path`` column holding the CSV files to load.

    Returns
    -------
    The result of ``CBE.get_feature_vector`` applied to the feature-reduced
    data.

    Raises
    ------
    ValueError
        If none of the listed files could be read.
    """
    plate_frames = []

    for file in file_dataframe_filtered['path']:

        plate_map_name = os.path.splitext(os.path.basename(file))[0]

        try:
            dataframe = pd.read_csv(file)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the
            # rest. (The original message referenced an undefined name, which
            # turned every read error into a NameError.)
            print(f"Error reading file {file}: {e}")
            continue

        plate_frames.append(dataframe)
        print(f"INFO: {plate_map_name} has {dataframe.shape[0]} rows")

    if not plate_frames:
        # pd.concat would also raise ValueError on an empty list, but with a
        # message that does not point at the actual cause.
        raise ValueError("No plate files could be read; cannot compute a reduced feature set")

    print("Feature reduction with correlation threshold 0.9 and Outlier threshold 100")

    # Stack all plates and renumber the rows from 0.
    Data_aggregated = pd.concat(plate_frames, ignore_index=True)
    reduced_feature_dataframe = CBE.feature_reduction(Data_aggregated,
                                             variance_freq_cut=0.1,
                                             variance_unique_cut=0.1,
                                             outlier_cutoff=100,
                                             corr_threshold = 0.9,
                                             print_stats = True)

    return CBE.get_feature_vector(reduced_feature_dataframe)

In [None]:
# Compute the reduced feature set from the plates in file_dataframe_filtered_2.
# NOTE(review): when the notebook is run top to bottom, file_dataframe_filtered_2
# was last assigned in the MEDINA statistics cell, not the FMP one, and it does
# not include the remove_plate_replicate filter (that is file_dataframe_filtered_3)
# — confirm that this is the intended input for the "fmp" feature set.
fmp_feature_set = reduced_features(file_dataframe_filtered_2)


In [None]:
# Number of entries in the reduced feature set.
len(fmp_feature_set)

In [None]:
# Take the first remaining file as an example for the path handling below.
# .iloc is positional: after filtering, the row labelled 0 may have been
# removed, in which case ['path'][0] raises a KeyError.
test_file = file_dataframe_filtered_2['path'].iloc[0]

In [None]:
# Directory that contains the example file.
path_norm_reduced_data = os.path.dirname(test_file)

In [None]:
# Name of the folder that directly contains the example file.
# Path(...).parts[0] is the filesystem root ('/') for an absolute path, and
# joining onto '/' later would discard the output directory, so the last
# path component is used instead.
batch_plate_folder = Path(path_norm_reduced_data).name
batch_plate_folder

In [None]:
# File name of the example file with its extension stripped.
test_file_name = os.path.basename(test_file)
plate_map_name = os.path.splitext(test_file_name)[0]
plate_map_name

In [None]:
# Restrict every plate to the reduced feature set, save it below
# <input_path>/FMP_Filtered/<source folder name>/ and keep it in a dict.
dataframes_normalized_reduced_dict = {}
file_list = file_dataframe_filtered_2['path']

# Output root; created once instead of on every iteration.
new_source_path = os.path.join(input_path, 'FMP_Filtered')
os.makedirs(new_source_path, exist_ok=True)

# The selected feature set is the same for every plate, so build it once.
reduced_feature_set = set(fmp_feature_set)

for file in file_list:

    plate_map_name = os.path.splitext(os.path.basename(file))[0]

    try:

        dataframe = pd.read_csv(file)
        row_count = dataframe.shape[0]

        print(f"INFO: {plate_map_name} has {row_count} rows")

        features = CBE.get_feature_vector(dataframe)
        number_features = len(features)

        print(f"INFO: {plate_map_name} has {number_features} features")

        # Drop every feature column that is not part of the reduced set.
        features_to_remove = [item for item in features if item not in reduced_feature_set]
        dataframe_reduced_features = dataframe.drop(columns=features_to_remove)

        # Deliberately not called `reduced_features`: that name belongs to the
        # function defined earlier and would be shadowed.
        remaining_features = CBE.get_feature_vector(dataframe_reduced_features)
        number_reduced_features = len(remaining_features)

        print(f"INFO: {plate_map_name} has {number_reduced_features} features after feature reduction")

        filename_norm_reduced_data = plate_map_name + "_reduced-corr.csv"

        # Mirror the name of the folder this file came from. It is derived
        # per file; reusing the value from the example cell would put every
        # plate into the same folder.
        batch_plate_folder = os.path.basename(os.path.dirname(file))
        new_plate_map_name_path = os.path.join(new_source_path, batch_plate_folder)
        os.makedirs(new_plate_map_name_path, exist_ok=True)

        file_path_norm_reduced_data = os.path.join(new_plate_map_name_path, filename_norm_reduced_data)

        dataframe_reduced_features.to_csv(file_path_norm_reduced_data, index=False)

        print(f"INFO: Saved {filename_norm_reduced_data}")

        # Key: <staining date>_<source>_<plate map name>, taken from the first
        # row. str() works for any scalar type, whereas .astype(str) only
        # exists on numpy scalars.
        dict_key = "_".join([
            str(dataframe_reduced_features['Metadata_staining_date'].iloc[0]),
            str(dataframe_reduced_features['Metadata_source'].iloc[0]),
            str(dataframe_reduced_features['Metadata_plate_map_name'].iloc[0]),
        ])

        dataframes_normalized_reduced_dict[dict_key] = dataframe_reduced_features

    except Exception as e:

        # Best effort: report the failing plate and continue with the next one.
        print(f"Error reading file {plate_map_name}: {e}")