In [None]:
import pandas as pd
import os
import glob
import CBE_utils as CBE
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reload the already imported CBE_utils module so that changes made to it
# on disk take effect without restarting the kernel.
import importlib
importlib.reload(CBE)

In [None]:
# Switch between the small test dataset and the full dataset.
# True reproduces the paths that were previously in effect (the test paths
# unconditionally overwrote the full-dataset paths).
USE_TEST_DATA = True

if USE_TEST_DATA:
    # For test
    input_path = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_output/"
    output_path = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_results/"

    annotation_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_annotation/"
else:
    # Full dataset
    input_path = "/media/schmied.christopher/T7 Shield/Datasets/ECBL/processed/"
    output_path = "/home/schmied.christopher/FMP_Docs/Projects/eu_os_ecbl_qc/results/"

    annotation_dir = "/home/schmied.christopher/FMP_Docs/Projects/eu_os_ecbl_qc/annotation/"

In [None]:
# Names of all sub-directories located directly below input_path.
# Plain files in input_path are ignored; each remaining folder is later
# compared against the known site names when the data is loaded.
folders = list(filter(
    lambda entry: os.path.isdir(os.path.join(input_path, entry)),
    os.listdir(input_path),
))

In [None]:
# File name pattern of the per-plate tables to load (normalized data
# according to the file name suffix)
pattern = "[A-Z]*_R[1-4]_mad_robustize_reduced-corr.csv"

FMP_Data = []
IMTM_Data = []
MEDINA_Data = []
USC_Data = []

# Maps a site folder name to the list that collects the tables of that site
site_buckets = {
    'FMP': FMP_Data,
    'IMTM': IMTM_Data,
    'MEDINA': MEDINA_Data,
    'USC': USC_Data,
}

# sorted(): os.listdir and glob return entries in arbitrary order; a fixed
# order keeps the row order of the aggregated tables reproducible
for folder in sorted(folders):

    if folder not in site_buckets:
        # Files of unknown folders used to be read and then silently dropped
        print(f"Skipping folder {folder}: not a known site")
        continue

    site_specific_path = os.path.join(input_path, folder)

    # load normalized data, expected layout: <site>/<sub-folder>/<file>
    # (recursive=True was dropped: it only affects patterns containing '**')
    file_list = sorted(glob.glob(os.path.join(site_specific_path, '*', pattern)))

    for file in file_list:

        filename = os.path.basename(file)

        try:
            Data_Temp = pd.read_csv(file)

        except Exception as e:
            # Best effort: report the unreadable file and go on with the rest
            print(f"Error reading file {filename}: {e}")
            continue

        row_count = Data_Temp.shape[0]
        print(f"File: {filename} has {row_count} rows")

        site_buckets[folder].append(Data_Temp)


In [None]:
### Stack all FMP tables row-wise; ignore_index gives a fresh 0..n-1 index
FMP_Data_aggregated = pd.concat(FMP_Data, ignore_index=True)
print("Aggregated Data has shape ", FMP_Data_aggregated.shape)

In [None]:
# Stack all IMTM tables row-wise; ignore_index gives a fresh 0..n-1 index
IMTM_Data_aggregated = pd.concat(IMTM_Data, ignore_index=True)
print("Aggregated Data has shape ", IMTM_Data_aggregated.shape)

In [None]:
# Stack all MEDINA tables row-wise; ignore_index gives a fresh 0..n-1 index
MEDINA_Data_aggregated = pd.concat(MEDINA_Data, ignore_index=True)
print("Aggregated Data has shape ", MEDINA_Data_aggregated.shape)

In [None]:
# Stack all USC tables row-wise; ignore_index gives a fresh 0..n-1 index
USC_Data_aggregated = pd.concat(USC_Data, ignore_index=True)
print("Aggregated Data has shape ", USC_Data_aggregated.shape)


# Comparison over plates

In [None]:
# TODO: Correlation of pos. & neg. controls over time
# TODO: Compute consensus per normalized plate for pos. controls
# TODO: Implement consensus of first plates of specific source
# TODO: Then with an external control from the bioactives
# TODO: Create reference point for pos. controls and compare plates to this

In [None]:
def interplate_qc(Data_aggregated, source):
    """Plot plate-to-plate correlations of the positive controls per batch.

    The rows of the positive controls (Nocodazole, Tetrandrine) are averaged
    into one mean feature profile per plate. For every staining date (batch)
    and control, the pairwise correlation between these plate profiles is
    drawn as a heatmap and saved as
    ``<output_path>/<source>/intra_batch/<batch>_<source>_<ctrl>_IntraBatchCorr.pdf``.

    Parameters
    ----------
    Data_aggregated : pandas.DataFrame
        Table containing the ``Metadata_*`` columns used for grouping below
        and the feature columns reported by ``CBE.get_feature_vector``.
    source : str
        Site name; used in the figure title, the output folder and file name.

    Returns
    -------
    None
        The figures are written to disk and closed.

    Raises
    ------
    OSError
        If the output directory cannot be created.
    """
    # Gets the feature columns; only these enter the correlation
    features_columns = CBE.get_feature_vector(Data_aggregated)
    feature_list = list(features_columns)

    ctrl_list = ('Nocodazole', 'Tetrandrine')

    # One mean profile is computed per unique combination of these columns
    group_columns = [
        'Metadata_imaging_date',
        'Metadata_staining_date',
        'Metadata_imaging_time',
        'Metadata_Plate',
        'Metadata_plate_map_name',
        'Metadata_plate_name',
        'Metadata_replicate_number',
        'Metadata_source',
        'Metadata_user',
        'Metadata_RoughID']

    # Compute the mean profiles for pos. controls put into new table
    source_pos_ctrl = Data_aggregated[Data_aggregated['Metadata_RoughID'].isin(ctrl_list)]

    # Here the mean is used as this makes most sense for the QC
    source_pos_ctrl_mean = (
        source_pos_ctrl
        .groupby(group_columns, dropna=False)[features_columns]
        .mean()
        .reset_index()
    )

    # Output folder is the same for all figures, so create it once up front
    figure_path = os.path.join(output_path, source, 'intra_batch')
    os.makedirs(figure_path, exist_ok=True)

    # groupby instead of unique() + '==' so that a missing (NaN) staining date
    # still selects its rows; sort=False keeps the order of first appearance
    batches = source_pos_ctrl_mean.groupby('Metadata_staining_date', sort=False, dropna=False)

    for batch, batch_profiles in batches:

        print(batch)

        for ctrl in ctrl_list:

            ctrl_profiles = batch_profiles.loc[batch_profiles['Metadata_RoughID'] == ctrl]

            if ctrl_profiles.empty:
                # An empty correlation matrix cannot be plotted
                print(f"No {ctrl} profiles in batch {batch}, skipping")
                continue

            # Select the feature columns explicitly. Previously the helper
            # columns 'index' and 'level_0' created by reset_index() were
            # left in the table and were correlated as if they were features.
            correlation_ctrl = ctrl_profiles.set_index('Metadata_plate_map_name')[feature_list]

            # Rows are plates, so transpose to correlate plate against plate
            correlation_ctrl_matrix = correlation_ctrl.T.corr()

            # Plot the correlation matrix using a heatmap
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(correlation_ctrl_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
            ax.set_title(source + " " + str(batch) + " " + ctrl + " correlation", loc='left', fontsize=16)
            ax.set_xlabel('Plate Map Name')
            ax.set_ylabel('Plate Map Name')

            figure_file = str(batch) + '_' + source + '_' + ctrl + '_IntraBatchCorr.pdf'
            fig.savefig(os.path.join(figure_path, figure_file), format='pdf', bbox_inches="tight")

            # One figure per batch and control: close it to free the memory
            plt.close(fig)

In [None]:
# Per-batch plate correlation heatmaps of the positive controls for the IMTM data
interplate_qc(IMTM_Data_aggregated, 'IMTM')

In [None]:
# Per-batch plate correlation heatmaps of the positive controls for the FMP data
interplate_qc(FMP_Data_aggregated, 'FMP')

In [None]:
# Per-batch plate correlation heatmaps of the positive controls for the USC data
interplate_qc(USC_Data_aggregated, 'USC')

In [None]:
# Per-batch plate correlation heatmaps of the positive controls for the MEDINA data
interplate_qc(MEDINA_Data_aggregated, 'MEDINA')

# Compute correlation matrix over different batches

In [None]:
def interbatch_qc(Data_aggregated, source):
    """Plot batch-to-batch correlations of the positive controls of one site.

    The rows of the positive controls (Nocodazole, Tetrandrine) are averaged
    into one mean feature profile per staining date (batch). For each control
    the pairwise correlation between these batch profiles is drawn twice,
    with and without cell annotations, and saved as
    ``<output_path>/<source>/<source>_<ctrl>_InterBatchCorr.pdf`` and
    ``<output_path>/<source>/<source>_<ctrl>_InterBatchCorr_AnnotFalse.pdf``.

    Parameters
    ----------
    Data_aggregated : pandas.DataFrame
        Table containing ``Metadata_staining_date``, ``Metadata_source``,
        ``Metadata_RoughID`` and the feature columns reported by
        ``CBE.get_feature_vector``.
    source : str
        Site name; used in the figure title, the output folder and file names.

    Returns
    -------
    None
        The figures are written to disk, shown and closed.

    Raises
    ------
    OSError
        If the output directory cannot be created.
    """
    features_columns = CBE.get_feature_vector(Data_aggregated)
    feature_list = list(features_columns)

    ctrl_list = ('Nocodazole', 'Tetrandrine')

    # (annotate cells?, file name suffix) of the two figures drawn per control
    plot_variants = (
        (True, '_InterBatchCorr.pdf'),
        (False, '_InterBatchCorr_AnnotFalse.pdf'),
    )

    # Compute the mean profiles for pos. controls put into new table
    source_pos_ctrl = Data_aggregated[Data_aggregated['Metadata_RoughID'].isin(ctrl_list)]

    # Here the mean is used as it makes most sense for a qc
    source_pos_ctrl_mean = (
        source_pos_ctrl
        .groupby(['Metadata_staining_date', 'Metadata_source', 'Metadata_RoughID'])[features_columns]
        .mean()
        .reset_index()
    )

    # Output folder is the same for all figures, so create it once up front
    figure_path = os.path.join(output_path, source)
    os.makedirs(figure_path, exist_ok=True)

    for ctrl in ctrl_list:

        ctrl_profiles = source_pos_ctrl_mean.loc[source_pos_ctrl_mean['Metadata_RoughID'] == ctrl]

        if ctrl_profiles.empty:
            # An empty correlation matrix cannot be plotted
            print(f"No {ctrl} profiles for {source}, skipping")
            continue

        # Select the feature columns explicitly. Previously the helper column
        # 'index' created by reset_index() was left in the table and was
        # correlated as if it were a feature.
        correlation_ctrl = ctrl_profiles.set_index('Metadata_staining_date')[feature_list]

        # Rows are batches, so transpose to correlate batch against batch
        correlation_ctrl_matrix = correlation_ctrl.T.corr()

        for annot, suffix in plot_variants:

            # Apply the small font scale only while drawing this figure;
            # sns.set(font_scale=0.4) changed the settings of all later plots
            with sns.plotting_context("notebook", font_scale=0.4), sns.axes_style("darkgrid"):

                # Plot the correlation matrix using a heatmap
                fig, ax = plt.subplots(figsize=(14, 12))
                sns.heatmap(correlation_ctrl_matrix, annot=annot, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
                ax.set_title(source + " " + ctrl + " inter-batch correlation", loc='left', fontsize=16)

                # The matrix is indexed by Metadata_staining_date
                ax.set_xlabel('Staining date')
                ax.set_ylabel('Staining date')

                fig.savefig(os.path.join(figure_path, source + '_' + ctrl + suffix), format='pdf', bbox_inches="tight")

                # Keep the figure visible in the notebook output
                plt.show()

            # Figures were never closed before and accumulated in memory
            plt.close(fig)



In [None]:
# Batch-to-batch correlation heatmaps of the positive controls for the FMP data
interbatch_qc(FMP_Data_aggregated, 'FMP')

In [None]:
# Batch-to-batch correlation heatmaps of the positive controls for the IMTM data
interbatch_qc(IMTM_Data_aggregated, 'IMTM')

In [None]:
# Batch-to-batch correlation heatmaps of the positive controls for the MEDINA data
interbatch_qc(MEDINA_Data_aggregated, 'MEDINA')

In [None]:
# Batch-to-batch correlation heatmaps of the positive controls for the USC data
interbatch_qc(USC_Data_aggregated, 'USC')