In [1]:
import time
from datetime import date
import pandas as pd
import os
import glob
import re
import numpy as np # type: ignore

In [2]:
# Root folder that is scanned for input data. The processing loop in this
# notebook walks it as <input_dir>/<source>/<plate>/<batch>/ and looks for
# files matching "*_CP_Profiles_Aggregated.csv" in each batch folder.
input_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_input/"
# Root folder for the results. The processing loop writes one .csv per plate
# below <output_dir>/<source>/ and creates missing folders itself.
output_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_output/"

# Folder with the plate layout files; the processing loop reads
# "<source>_Replicate<N>.csv" from here.
# NOTE(review): all three paths are absolute and machine specific - adjust
# them before running the notebook on another machine.
annotation_dir = "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_annotation/"

In [3]:
# Lookup table from a user name to the abbreviation of a partner site
# (several users can share one site).
# TODO: Adjust - the site name can be taken from the input folder name instead.
# NOTE(review): no active code in the visible cells looks anything up in this
# dict; confirm whether it is still needed.
partner_user_id = {
    "Christopher": "FMP",
    "Alzbeta": "IMTM",
    "Imna": "MEDINA",
    "Maria": "USC",
    "Maria2": "USC",
    "Anton": "USC",
    "Daniel": "USC"
}

In [4]:
def list_folders(directory):
    """Return the names of the immediate sub-folders of ``directory``.

    Parameters
    ----------
    directory : str
        Path of the folder to inspect.

    Returns
    -------
    list of str
        Folder names (not full paths) in the order reported by ``os.listdir``.
        An empty list is returned when ``directory`` does not exist or cannot
        be read; in that case a message describing the problem is printed.
    """
    try:
        # List all entries in the specified directory
        entries = os.listdir(directory)

    except FileNotFoundError:
        # Always hand back a list: callers iterate over the result, and a
        # returned error string would be walked character by character.
        print(f"The directory '{directory}' does not exist.")
        return []

    except PermissionError:
        print(f"Permission denied to access the directory '{directory}'.")
        return []

    # Filter out non-folder entries
    return [entry for entry in entries if os.path.isdir(os.path.join(directory, entry))]

In [5]:
def process_profile_metadata(profiles_dataframe, timestamp_dataframe, datetime_dataframe, partner_user_id_dict, source):
    """Return a copy of the profiles table with plate, source and imaging metadata added.

    The input frame is left untouched; all columns are added to the returned
    copy.

    Parameters
    ----------
    profiles_dataframe : pandas.DataFrame
        Must contain the columns ``Metadata_Plate`` and ``Metadata_Well``.
        The last two characters of ``Metadata_Plate`` are taken as the
        replicate number (e.g. ``B1001R1`` -> ``B1001`` / ``R1``).
    timestamp_dataframe : pandas.DataFrame
        Must contain the columns ``user`` and ``instrument``; only the first
        row is used.
    datetime_dataframe : datetime-like
        Despite its name this is used as a single object providing
        ``.date()``, ``.time()`` and ``.strftime('%z')``.
    partner_user_id_dict : dict
        Currently unused; kept so existing calls keep working.
    source : str
        Value written to ``Metadata_source``.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``Metadata_Well`` is renamed to
        ``Metadata_Well_randomized`` and the ``Metadata_*`` columns below are
        appended.
    """
    # rename() returns a new frame, so doing it first keeps every later
    # assignment away from the caller's dataframe.
    annotated = profiles_dataframe.rename(columns={"Metadata_Well": "Metadata_Well_randomized"})

    ## Metadata_plate_name
    ## Metadata_replicate_number
    plate_id = annotated['Metadata_Plate']
    annotated['Metadata_plate_name'] = plate_id.str[:-2]
    annotated['Metadata_replicate_number'] = plate_id.str[-2:]
    annotated['Metadata_plate_map_name'] = annotated['Metadata_plate_name'] + "_" + annotated['Metadata_replicate_number']

    ## Metadata_source
    ## Metadata_instrument
    # .iloc[0] reads the first row by position, independent of the index labels
    annotated['Metadata_user'] = timestamp_dataframe['user'].iloc[0]

    # TODO: Adjust we can get site name from folder
    # annotated['Metadata_source'] = partner_user_id_dict[timestamp_dataframe['user'].iloc[0]]
    annotated['Metadata_source'] = source

    annotated['Metadata_instrument'] = timestamp_dataframe['instrument'].iloc[0]

    ## Metadata_imaging_date
    ## Metadata_imaging_time
    ## Metadata_imaging_timezone
    annotated['Metadata_imaging_date'] = datetime_dataframe.date()
    annotated['Metadata_imaging_time'] = datetime_dataframe.time()
    annotated['Metadata_imaging_timezone'] = datetime_dataframe.strftime('%z')

    return annotated

In [6]:
def pad_numbers(value):
    """Zero-pad the number of a ``<letter><digits>`` label to at least two digits.

    ``value`` must start with one upper-case letter followed by digits to be
    rewritten (e.g. ``"A1"`` -> ``"A01"``); anything after the digits is
    dropped. A value that does not start that way is returned unchanged.
    """
    parsed = re.match(r"([A-Z])(\d+)", value)
    if parsed is None:
        return value

    row_letter, digits = parsed.groups()
    # zfill only pads; numbers with two or more digits stay as they are
    return row_letter + digits.zfill(2)

In [7]:
# Sub-folders of input_dir; the processing loop treats each one as a data source.
source_list = list_folders(input_dir)

In [8]:
def transform_id(id_value):
    """Collapse every identifier starting with ``'EOS'`` to ``'EOS_cpd'``.

    Any other identifier is returned unchanged. ``id_value`` must be a string.
    """
    return 'EOS_cpd' if id_value.startswith('EOS') else id_value

In [9]:
# Annotate the aggregated profiles of every source / plate / staining batch
# with the matching plate layout and save the result as
#   <output_dir>/<source>/<staining date>_<plate map name>/<plate map name>.csv
#
# Expected input layout:
#   <input_dir>/<source>/<plate>/<batch>/*_CP_Profiles_Aggregated.csv
#
# A batch that cannot be processed (no profiles file, unreadable file, unknown
# replicate suffix, empty merge) is reported and skipped, so that data left
# over from a previous iteration is never processed or saved by accident.

# Replicate suffix of the plate folder name -> suffix of the layout file name
# TODO: Add different source
replicate_layout_suffixes = {
    'R1': '_Replicate1.csv',
    'R2': '_Replicate2.csv',
    'R3': '_Replicate3.csv',
    'R4': '_Replicate4.csv',
}

for source in source_list:

    print(source)

    source_path = os.path.join(input_dir, source)

    plate_list = list_folders(source_path)

    for plate_name in plate_list:

        print(plate_name)

        folder_path = os.path.join(source_path, plate_name)

        print(folder_path)

        if not os.path.exists(folder_path):
            continue

        # The new data contains another layer with the batch date
        batch_list = list_folders(folder_path)

        for batch_name in batch_list:

            print(batch_name)

            batch_path = os.path.join(folder_path, batch_name)

            # IMPORTANT: there can be multiple merged files.
            # TODO: readout the most recent file.
            profiles_name = "*_CP_Profiles_Aggregated.csv"
            profiles_file_list = glob.glob(os.path.join(batch_path, profiles_name))

            if len(profiles_file_list) == 0:
                print(f"Error: {plate_name} no profiles")
                continue

            # read profiles file
            try:
                profiles = pd.read_csv(profiles_file_list[0])
            except Exception as e:
                print(f"Error reading {profiles_file_list[0]}: {str(e)}")
                # Without this, `profiles` would be undefined or still hold
                # the table of the previously processed plate.
                continue

            # Check row count of original file
            row_count = profiles.shape[0]
            print(f"Info: {plate_name} has {row_count} rows")

            # TODO: Replace replicate number
            replicate_number = plate_name[-2:]

            print('Replicate is ' + replicate_number)

            replicate_layout_name = None

            if replicate_number not in replicate_layout_suffixes:
                print(f"Error: {replicate_number} not valid")
                # No layout file can be derived for this plate
                continue

            replicate_layout_name = source + replicate_layout_suffixes[replicate_number]

            plate_layout_path = os.path.join(annotation_dir, replicate_layout_name)

            try:
                plate_layout = pd.read_csv(plate_layout_path)
            except Exception as e:
                print(f"Error reading {plate_layout_path}: {str(e)}")
                # Same reasoning as for the profiles file: never merge
                # against a layout left over from another plate.
                continue

            # Naming convention
            ## Metadata_plate_name: B1001
            ## Metadata_replicate_number: R1
            ## Metadata_Plate: B1001R1
            ## Metadata_plate_map_name: B1001_R1

            # rename the columns in plate layout accordingly
            plate_layout = (
                plate_layout
                .drop(columns=['Metadata_Concentration'])
                .rename(columns={
                    "Metadata_Partner": "Metadata_source",
                    "Metadata_Plate": "Metadata_plate_name",
                    "Metadata_Batch": "Metadata_replicate_number",
                })
            )
            plate_layout["Metadata_plate_map_name"] = plate_layout["Metadata_plate_name"] + "_" + plate_layout["Metadata_replicate_number"]

            profiles['Metadata_picklist_name'] = replicate_layout_name

            if source in ("FMP", "MEDINA", "USC"):
                plate_layout["Metadata_source"] = plate_layout["Metadata_source"].replace("Bioactives", source)

            # FMP has Metadata_plate_name instead of Metadata_plate_map_name in column Metadata_Plate
            if source == "FMP":
                merge_key = "Metadata_plate_name"
            else:
                merge_key = "Metadata_plate_map_name"

            # rename columns in profiles accordingly
            profiles = profiles.rename(columns={"Metadata_Plate": merge_key})

            # merge profiles and plate layout (inner join: wells without a
            # layout entry are dropped)
            processed_profiles = pd.merge(profiles, plate_layout, on=[merge_key, "Metadata_Well"])

            if processed_profiles.empty:
                print(f"Error: {plate_name} has no rows after merging with {replicate_layout_name}")
                continue

            if processed_profiles.shape[0] != row_count:
                print(f"Info: {plate_name} has {processed_profiles.shape[0]} rows after merging with the plate layout")

            # Adds staining date to metadata
            processed_profiles['Metadata_staining_date'] = str(batch_name)
            processed_profiles['Metadata_imaging_date'] = processed_profiles['Metadata_staining_date']

            # This is missing for the bioactives data
            processed_profiles['Metadata_imaging_time'] = np.nan
            processed_profiles['Metadata_imaging_timezone'] = np.nan
            processed_profiles['Metadata_instrument'] = np.nan
            processed_profiles['Metadata_user'] = np.nan

            # Turn EOS ID into RoughID
            processed_profiles['Metadata_RoughID'] = processed_profiles['Metadata_EOS'].apply(transform_id)

            processed_profiles['Metadata_Plate'] = processed_profiles['Metadata_plate_name'] + processed_profiles["Metadata_replicate_number"]

            # info for saving files (values of the first row)
            plate_map_name = processed_profiles['Metadata_plate_map_name'].iloc[0]
            date_plate_map_name = processed_profiles['Metadata_staining_date'].iloc[0] + "_" + plate_map_name

            print(plate_map_name)

            # Create folder and save .csv
            output_plate_path = os.path.join(output_dir, source, date_plate_map_name)
            os.makedirs(output_plate_path, exist_ok=True)
            file_path = os.path.join(output_plate_path, plate_map_name)
            filename = file_path + ".csv"
            processed_profiles.to_csv(filename, index=False)

IMTM
B1004R2
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_input/IMTM/B1004R2
230711
Info: B1004R2 has 380 rows
Replicate is R2
B1004_R2
B1002R4
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_input/IMTM/B1002R4
230709
Info: B1002R4 has 383 rows
Replicate is R4
B1002_R4
B1007R1
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_input/IMTM/B1007R1
230716
Info: B1007R1 has 380 rows
Replicate is R1
B1007_R1
B1002R3
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_input/IMTM/B1002R3
230709
Info: B1002R3 has 382 rows
Replicate is R3
B1002_R3
B1003R2
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_input/IMTM/B1003R2
230710
Info: B1003R2 has 383 rows
Replicate is R2
B1003_R2
B1003R3
/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis_revision/test_input/IMTM/B1003R3
230720
Info: B1003R3 has 382 rows
