In [255]:
import time
from datetime import date
import pandas as pd
import os
import glob

In [256]:
# Folder holding one sub-folder per plate with the raw inputs.
input_dir = (
    "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/"
    "QualityControl_analysis/input_files/"
)
# Folder that receives the annotated per-plate output.
output_dir = (
    "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/"
    "QualityControl_analysis/output_files/"
)

In [257]:
# Folder holding the annotation tables (plate layouts, control definitions).
annotation_dir = (
    "/home/schmiedc/FMP_Docs/Projects/ECBL_Project/"
    "QualityControl_analysis/annotation_files/"
)

In [258]:
# Lookup from the user name recorded with a plate to the partner label
# written into Metadata_source. Built from a partner -> users grouping so
# that users sharing a partner are listed together.
partner_user_id = {
    user: partner
    for partner, users in (
        ("FMP", ("Christopher",)),
        ("IMTM", ("Alzbeta",)),
        ("MEDINA", ("Imna",)),
        ("USC", ("Maria", "Maria2", "Anton", "Daniel")),
    )
    for user in users
}

In [259]:
def list_folders(directory):
    """Return the names of the sub-folders located directly inside ``directory``.

    Parameters
    ----------
    directory : str
        Path of the directory to scan.

    Returns
    -------
    list of str
        Names (not full paths) of every entry of ``directory`` that is itself
        a directory, in the order reported by ``os.listdir``. When the
        directory does not exist or cannot be read, a message is printed and
        an empty list is returned.
    """
    try:
        entries = os.listdir(directory)

    except FileNotFoundError:
        # Always hand back a list: callers iterate over the result, and
        # iterating over an error string would yield single characters.
        print(f"The directory '{directory}' does not exist.")
        return []

    except PermissionError:
        print(f"Permission denied to access the directory '{directory}'.")
        return []

    # Keep only the entries that are folders.
    return [entry for entry in entries if os.path.isdir(os.path.join(directory, entry))]

In [260]:
# Every sub-folder of the input directory is treated as one plate to process.
plate_list = list_folders(directory=input_dir)

In [261]:
def process_profile_metadata(profiles_dataframe, timestamp_dataframe, datetime_dataframe, partner_user_id_dict):
    """Return a copy of the profiles table with plate, user and imaging metadata added.

    The input ``profiles_dataframe`` is not modified.

    Parameters
    ----------
    profiles_dataframe : pandas.DataFrame
        Profiles table with a ``Metadata_Plate`` column (its last two
        characters are taken as the replicate number, the rest as the plate
        name) and a ``Metadata_Well`` column.
    timestamp_dataframe : pandas.DataFrame
        Table with ``user`` and ``instrument`` columns; only the first row
        is used.
    datetime_dataframe : datetime-like
        Despite the name, a single object providing ``date()``, ``time()``
        and ``strftime()`` (for example a ``pandas.Timestamp``).
    partner_user_id_dict : dict
        Maps the user name found in ``timestamp_dataframe`` to the value
        stored in ``Metadata_source``.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``Metadata_Well`` is renamed to
        ``Metadata_Well_randomized`` and the columns ``Metadata_plate_name``,
        ``Metadata_replicate_number``, ``Metadata_plate_map_name``,
        ``Metadata_user``, ``Metadata_source``, ``Metadata_instrument``,
        ``Metadata_imaging_date``, ``Metadata_imaging_time`` and
        ``Metadata_imaging_timezone`` are appended.

    Raises
    ------
    KeyError
        If the user is not a key of ``partner_user_id_dict``.
    """
    # rename() returns a new frame, so the columns added below never leak
    # into the DataFrame owned by the caller.
    annotated = profiles_dataframe.rename(columns={"Metadata_Well": "Metadata_Well_randomized"})

    ## Metadata_plate_name
    ## Metadata_replicate_number
    plate_id = annotated['Metadata_Plate']
    annotated['Metadata_plate_name'] = plate_id.str[:-2]
    annotated['Metadata_replicate_number'] = plate_id.str[-2:]
    annotated['Metadata_plate_map_name'] = annotated['Metadata_plate_name'] + "_" + annotated['Metadata_replicate_number']

    ## Metadata_source
    ## Metadata_instrument
    # Positional access: the first row is wanted whatever the index labels are.
    user = timestamp_dataframe['user'].iloc[0]
    if user not in partner_user_id_dict:
        raise KeyError(f"User '{user}' has no entry in the partner lookup")

    annotated['Metadata_user'] = user
    annotated['Metadata_source'] = partner_user_id_dict[user]
    annotated['Metadata_instrument'] = timestamp_dataframe['instrument'].iloc[0]

    ## Metadata_imaging_date
    ## Metadata_imaging_time
    ## Metadata_imaging_timezone
    annotated['Metadata_imaging_date'] = datetime_dataframe.date()
    annotated['Metadata_imaging_time'] = datetime_dataframe.time()
    # '%z' is the UTC offset (e.g. '+0200'); it is empty for naive datetimes.
    annotated['Metadata_imaging_timezone'] = datetime_dataframe.strftime('%z')

    return annotated

In [262]:
# Picklist (plate layout) file used for each replicate suffix.
replicate_picklists = {
    'R1': 'Picklist_Replicate1.csv',
    'R2': 'Picklist_Replicate2.csv',
    'R3': 'Picklist_Replicate3.csv',
    'R4': 'Picklist_Replicate4.csv',
}

# Annotate the aggregated profiles of every plate folder and write one CSV
# per plate. A plate whose inputs cannot be read is reported and skipped, so
# that data left over from a previously processed plate is never reused.
for plate_name in plate_list:

    folder_path = os.path.join(input_dir, plate_name)

    if not os.path.exists(folder_path):
        continue

    profiles_name = plate_name + "_HepG2_10uM_*_CP_Profiles_Aggregated.csv"
    profiles_file_list = glob.glob(os.path.join(folder_path, profiles_name))

    if len(profiles_file_list) == 0:
        print(f"Error: {plate_name} no profiles")
        continue

    print(f"Info: profiles found {plate_name}")

    # IMPORTANT: there can be multiple merged files.
    # The most recently modified one is read (glob order is arbitrary).
    # NOTE(review): assumes modification time reflects "most recent" - confirm.
    profiles_file_path = max(profiles_file_list, key=os.path.getmtime)

    # read profiles file
    try:
        profiles = pd.read_csv(profiles_file_path)
    except Exception as e:
        print(f"Error reading {profiles_file_path}: {str(e)}")
        continue

    # read timestamp file
    timestamp_file_path = os.path.join(folder_path, "TimeStamp.csv")

    try:
        timestamp = pd.read_csv(timestamp_file_path)
    except Exception as e:
        print(f"Error reading {timestamp_file_path}: {str(e)}")
        continue

    if profiles.empty or timestamp.empty:
        print(f"Error: {plate_name} has an empty profiles or timestamp file")
        continue

    # get datetime info from the first row of the timestamp file
    imaging_datetime = pd.to_datetime(timestamp['date'].iloc[0])
    profiles_with_metadata = process_profile_metadata(profiles, timestamp, imaging_datetime, partner_user_id)

    # load correct plate layout based on replicate number
    replicate_number = profiles_with_metadata['Metadata_replicate_number'].iloc[0]
    replicate_layout_name = replicate_picklists.get(replicate_number)

    if replicate_layout_name is None:
        print(f"Error: {replicate_number} not valid")
        continue

    plate_layout_path = os.path.join(annotation_dir, 'plate_layout', replicate_layout_name)

    try:
        plate_layout = pd.read_csv(plate_layout_path)
    except Exception as e:
        print(f"Error reading {plate_layout_path}: {str(e)}")
        continue

    # rename the layout columns so they line up with the profile columns
    profiles_with_metadata['Metadata_picklist_name'] = replicate_layout_name
    plate_layout = plate_layout.rename(columns={
        "Destination Well": "Metadata_Well_randomized",
        "Source Well": "Metadata_Well",
    })
    profiles_with_layout = pd.merge(profiles_with_metadata, plate_layout, on='Metadata_Well_randomized')

    # get info for pos. and neg. controls
    controls_layout_path = os.path.join(annotation_dir, "pos_neg_ctrl.csv")

    try:
        controls_layout = pd.read_csv(controls_layout_path)
    except Exception as e:
        print(f"Error reading {controls_layout_path}: {str(e)}")
        continue

    annotated_profiles = pd.merge(profiles_with_layout, controls_layout, on='Metadata_Well')

    if annotated_profiles.empty:
        print(f"Error: {plate_name} has no rows left after merging the annotations")
        continue

    # Only a fully annotated plate is published under this name.
    processed_profiles = annotated_profiles

    # create directories
    plate_map_name = processed_profiles['Metadata_plate_map_name'].iloc[0]
    imaging_date = processed_profiles['Metadata_imaging_date'].iloc[0]
    date_plate_map_name = imaging_date.strftime('%Y-%m-%d') + "_" + plate_map_name

    output_plate_path = os.path.join(output_dir, date_plate_map_name)
    os.makedirs(output_plate_path, exist_ok=True)

    file_path = os.path.join(output_plate_path, plate_map_name)
    filename = file_path + ".csv"
    processed_profiles.to_csv(filename, index=False)
        

Error: C1230R4 no profiles
Info: profiles found C1158R2
Info: profiles found C1087R1
Info: profiles found C1084R1
Info: profiles found C1160R3
Error: C1088R2 no profiles
Info: profiles found C1090R1
Error: C1158R1 no profiles
Error: C1230R3 no profiles
Error: C1087R2 no profiles
Info: profiles found C1092R1
Info: profiles found C1160R1
Info: profiles found C1158R4
Error: C1230R1 no profiles
Info: profiles found C1086R1
Info: profiles found C1093R1
Error: C1086R2 no profiles
Error: C1085R2 no profiles
Info: profiles found C1091R1
Error reading /home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis/input_files/C1091R1/TimeStamp.csv: [Errno 2] No such file or directory: '/home/schmiedc/FMP_Docs/Projects/ECBL_Project/QualityControl_analysis/input_files/C1091R1/TimeStamp.csv'
Info: profiles found C1160R2
Info: profiles found C1088R1
Info: profiles found C1157R1
Info: profiles found C1160R4
Info: profiles found C1159R2
Error: C1230R2 no profiles
Info: profiles found C1159R3
In

In [263]:
# Preview the first rows of whatever the processing loop last left in
# processed_profiles.
processed_profiles.head(n=5)

Unnamed: 0,Metadata_Batch,Metadata_Plate,Metadata_Well_randomized,Metadata_Object_Count,Nuc_AreaShape_Area,Nuc_AreaShape_BoundingBoxArea,Nuc_AreaShape_BoundingBoxMaximum_X,Nuc_AreaShape_BoundingBoxMaximum_Y,Nuc_AreaShape_BoundingBoxMinimum_X,Nuc_AreaShape_BoundingBoxMinimum_Y,...,Metadata_plate_map_name,Metadata_user,Metadata_source,Metadata_instrument,Metadata_imaging_date,Metadata_imaging_time,Metadata_imaging_timezone,Metadata_picklist_name,Metadata_Well,Metadata_RoughID
0,HepG2_10uM,C1085R1,A10,2498,496.0,675.0,608.5,577.0,580.5,549.0,...,C1085_R1,Christopher,FMP,Phenix,2024-06-05,14:59:15,200,Picklist_Replicate1.csv,J5,EOS_cpd
1,HepG2_10uM,C1085R1,A11,2658,501.0,681.0,551.5,522.0,522.0,492.5,...,C1085_R1,Christopher,FMP,Phenix,2024-06-05,14:59:15,200,Picklist_Replicate1.csv,L2,EOS_cpd
2,HepG2_10uM,C1085R1,A12,3087,484.0,672.0,546.0,553.0,519.0,526.0,...,C1085_R1,Christopher,FMP,Phenix,2024-06-05,14:59:15,200,Picklist_Replicate1.csv,C16,EOS_cpd
3,HepG2_10uM,C1085R1,A13,2868,499.0,675.0,569.5,543.5,538.5,515.5,...,C1085_R1,Christopher,FMP,Phenix,2024-06-05,14:59:15,200,Picklist_Replicate1.csv,F6,EOS_cpd
4,HepG2_10uM,C1085R1,A14,3124,497.0,682.0,553.0,535.5,524.5,508.0,...,C1085_R1,Christopher,FMP,Phenix,2024-06-05,14:59:15,200,Picklist_Replicate1.csv,O6,EOS_cpd
