In [1]:
import time
from datetime import date
import pandas as pd
import os
import glob
import re

In [2]:
input_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/input/"
output_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/output/"

annotation_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/annotation/"

In [3]:
# TODO: Adjust we can get site name from folder 
partner_user_id = {
    "Christopher": "FMP",
    "Alzbeta": "IMTM",
    "Imna": "MEDINA",
    "Maria": "USC",
    "Maria2": "USC",
    "Anton": "USC",
    "Daniel": "USC"
}

In [4]:
def list_folders(directory):
    try:
        # List all entries in the specified directory
        entries = os.listdir(directory)
        
        # Filter out non-folder entries
        folders = [entry for entry in entries if os.path.isdir(os.path.join(directory, entry))]
        return folders
    
    except FileNotFoundError:
        
        return f"The directory '{directory}' does not exist."
    
    except PermissionError:
        
        return f"Permission denied to access the directory '{directory}'."

In [5]:
def process_profile_metadata(profiles_dataframe, timestamp_dataframe, datetime_dataframe, partner_user_id_dict, source):

    # Naming convention
    ## Metadata_plate_name: B1001
    ## Metadata_replicate_number: R1
    ## Metadata_Plate: B1001R1
    ## Metadata_plate_map_name: B1001_R1
    
    ## Metadata_plate_name
    ## Metadata_replicate_number
    profiles_dataframe['Metadata_plate_name'] = profiles_dataframe['Metadata_Plate'].str[:-2]
    profiles_dataframe['Metadata_replicate_number'] = profiles_dataframe['Metadata_Plate'].str[-2:]
    profiles_dataframe['Metadata_plate_map_name'] = profiles_dataframe['Metadata_plate_name'] + "_" + profiles_dataframe['Metadata_replicate_number']
    
    profiles_dataframe = profiles_dataframe.rename(columns={"Metadata_Well": "Metadata_Well_randomized"})
    ## Metadata_source 
    ## Metadata_instrument
    profiles_dataframe['Metadata_user'] = timestamp_dataframe['user'][0]
    
    # TODO: Adjust we can get site name from folder 
    # profiles_dataframe['Metadata_source'] = partner_user_id_dict[timestamp_dataframe['user'][0]]
    profiles_dataframe['Metadata_source'] = source
    
    profiles_dataframe['Metadata_instrument'] = timestamp_dataframe['instrument'][0]
    
    ## Metadata_imaging_date
    ## Metadata_imaging_time
    ## Metadata_imaging_timezone
    profiles_dataframe['Metadata_imaging_date'] = datetime_dataframe.date()
    profiles_dataframe['Metadata_imaging_time'] = datetime_dataframe.time()
    profiles_dataframe['Metadata_imaging_timezone'] = datetime_dataframe.strftime('%z')
    
    return profiles_dataframe

In [6]:
def pad_numbers(value):
    match = re.match(r"([A-Z])(\d+)", value)
    if match:
        letter = match.group(1)
        number = match.group(2).zfill(2)  # Pad number to 2 digits
        return f"{letter}{number}"
    return value

In [7]:
source_list = list_folders(input_dir)

In [8]:
for source in source_list:
    
    print(source)
    
    source_path = os.path.join(input_dir, source)
    
    plate_list = list_folders(source_path)
    
    for plate_name in plate_list:
        
        print(plate_name)
    
        folder_path = os.path.join(source_path, plate_name)

        print(folder_path)

        if os.path.exists(folder_path):

            # The new data contains another layer with the batch date
            batch_list = list_folders(folder_path)

            for batch_name in batch_list:

                print(batch_name)

                batch_path = os.path.join(folder_path, batch_name)

                # IMPORTANT: there can be multiple merged files. 
                # TODO: readout the most recent file. 
                profiles_name = plate_name + "*_CP_Profiles_Aggregated.csv"
                profiles_file_list = glob.glob(os.path.join(batch_path, profiles_name))
        
                if len(profiles_file_list) > 0:
            
                    # print(f"Info: profiles found {plate_name}")
            
                    # read profiles file
                    try:
                
                        profiles = pd.read_csv(profiles_file_list[0])
                
                        # Check row count of original file
                        row_count = profiles.shape[0]
                        print(f"Info: {plate_name} has {row_count} rows")
                
                
                    except Exception as e:
                
                        print(f"Error reading {profiles_file_list[0]}: {str(e)}")
            
                    # read timestamp file
                    timestamp_file_path = os.path.join(batch_path, "TimeStamp.csv")
            
                    try:
                
                        timestamp = pd.read_csv(timestamp_file_path)
                    
                    except Exception as e:
                
                        print(f"Error reading {timestamp_file_path}: {str(e)}")
            
                    # get datatime info from the timestamp file
                    datetime = pd.to_datetime(timestamp['date'][0])
                    processed_profiles = process_profile_metadata(profiles, timestamp, datetime, partner_user_id, source)

                    # load correct plate layout based on replication number
                    replicate_number = processed_profiles['Metadata_replicate_number'][0]

                    replicate_layout_name = None

                    if (replicate_number == 'R1'):
                
                        replicate_layout_name = 'Picklist_Replicate1.csv'
            
                    elif (replicate_number == 'R2'):
                
                        replicate_layout_name = 'Picklist_Replicate2.csv'
                
                    elif (replicate_number == 'R3'):
                
                        replicate_layout_name = 'Picklist_Replicate3.csv'
                
                    elif (replicate_number == 'R4'):
                
                        replicate_layout_name = 'Picklist_Replicate4.csv'
                
                    else: 
                
                        print(f"Error: {replicate_number} not valid")
                
                    plate_layout_path = os.path.join(annotation_dir, 'plate_layout', replicate_layout_name)
                
                    try:
                
                        plate_layout = pd.read_csv(plate_layout_path)
                
                        # Issue in the plate layout the well numbers are not padded
                        plate_layout['Destination Well'] = plate_layout['Destination Well'].apply(pad_numbers)
                        plate_layout['Source Well'] = plate_layout['Source Well'].apply(pad_numbers)
            
                    except Exception as e:
                
                        print(f"Error reading { plate_layout_path}: {str(e)}")

                    # rename the columns accordingly
                    processed_profiles['Metadata_picklist_name'] = replicate_layout_name

                    # Destination well are the randomized wells 
                    plate_layout = plate_layout.rename(columns={"Destination Well": "Metadata_Well_randomized"})

                    # Source well are the well IDs of the compound mother plate
                    plate_layout = plate_layout.rename(columns={"Source Well": "Metadata_Well"})
                    processed_profiles = pd.merge(processed_profiles, plate_layout, on='Metadata_Well_randomized')
            
                    # get info for pos. and neg. controls
                    controls_layout_path = os.path.join(annotation_dir, "pos_neg_ctrl.csv")
            
                    try:
                
                        controls_layout = pd.read_csv(controls_layout_path)
                
                        # Issue in the plate layout the well numbers are not padded
                        controls_layout['Metadata_Well'] = controls_layout['Metadata_Well'].apply(pad_numbers)
            
                    except Exception as e:
                
                        print(f"Error reading {controls_layout}: {str(e)}")
                
                    processed_profiles = pd.merge(processed_profiles, controls_layout, on='Metadata_Well')
            
                    # create directories
                    plate_map_name = processed_profiles['Metadata_plate_map_name'][0]
                    date_plate_mape_name = processed_profiles['Metadata_imaging_date'][0].strftime('%Y-%m-%d') + "_" + plate_map_name

                    output_plate_path = os.path.join(output_dir, source, date_plate_mape_name) 
                    os.makedirs(output_plate_path, exist_ok=True)
            
                    file_path = os.path.join(output_plate_path, plate_map_name)
                    filename = file_path + ".csv"
                    processed_profiles['Metadata_staining_date'] = batch_name
                    processed_profiles['Metadata_staining_date'] = processed_profiles['Metadata_staining_date'].astype(str)
                    processed_profiles.to_csv(filename, index=False)
            
                else: 
            
                    print(f"Error: {plate_name} no profiles")

IMTM
C1284R2
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/input/IMTM/C1284R2
241106
Info: C1284R2 has 384 rows
C1235R4
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/input/IMTM/C1235R4
240612
Info: C1235R4 has 384 rows
C1244R2
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/input/IMTM/C1244R2
240710
Info: C1244R2 has 384 rows
C1270R2
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/input/IMTM/C1270R2
240816
Info: C1270R2 has 384 rows
C1230R4
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/input/IMTM/C1230R4
240529
Info: C1230R4 has 384 rows
C1275R3
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/input/IMTM/C1275R3
240906
Info: C1275R3 has 384 rows
C1242R3
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/input/IMTM/C1242R3
250110
Info: C1242R3 has 384 rows
C1278R4
/home/s

In [9]:
processed_profiles.head()

Unnamed: 0,Metadata_Batch,Metadata_Plate,Metadata_Well_randomized,Metadata_Object_Count,Nuc_AreaShape_Area,Nuc_AreaShape_BoundingBoxArea,Nuc_AreaShape_BoundingBoxMaximum_X,Nuc_AreaShape_BoundingBoxMaximum_Y,Nuc_AreaShape_BoundingBoxMinimum_X,Nuc_AreaShape_BoundingBoxMinimum_Y,...,Metadata_user,Metadata_source,Metadata_instrument,Metadata_imaging_date,Metadata_imaging_time,Metadata_imaging_timezone,Metadata_picklist_name,Metadata_Well,Metadata_RoughID,Metadata_staining_date
0,240920,C1033R2,A01,3500,439.0,598.0,583.0,570.0,556.5,544.0,...,Thomas,MEDINA,Sonata,2024-09-24,17:27:32,200,Picklist_Replicate2.csv,A14,EOS_cpd,240920
1,240920,C1033R2,A02,2733,463.0,624.0,525.0,574.0,499.0,546.0,...,Thomas,MEDINA,Sonata,2024-09-24,17:27:32,200,Picklist_Replicate2.csv,E20,EOS_cpd,240920
2,240920,C1033R2,A03,3236,438.0,598.0,574.5,574.0,546.0,545.0,...,Thomas,MEDINA,Sonata,2024-09-24,17:27:32,200,Picklist_Replicate2.csv,P13,EOS_cpd,240920
3,240920,C1033R2,A04,4099,427.0,576.0,619.0,567.0,594.0,542.0,...,Thomas,MEDINA,Sonata,2024-09-24,17:27:32,200,Picklist_Replicate2.csv,I20,EOS_cpd,240920
4,240920,C1033R2,A05,3898,427.0,576.0,593.0,521.0,565.0,493.5,...,Thomas,MEDINA,Sonata,2024-09-24,17:27:32,200,Picklist_Replicate2.csv,P14,EOS_cpd,240920
