In [None]:
import pandas as pd
import os
import glob

In [None]:
# Root folders; every path used by the notebook is derived from these two.
ecbl_data_root = "/media/schmied.christopher/T7 Shield/Datasets/ECBL"
qc_project_root = "/home/schmied.christopher/FMP_Docs/Projects/eu_os_ecbl_qc"

# Input data: raw tree and its processed counterpart (trailing slash kept).
raw_dir = f"{ecbl_data_root}/raw/"
processed_dir = f"{ecbl_data_root}/processed/"

# Project folders: where results are written and where annotations live.
results_path = f"{qc_project_root}/results/"
annotation_dir = f"{qc_project_root}/annotation/"

In [None]:
def list_folders(directory):
    """Return the names of the immediate sub-folders of ``directory``.

    Parameters
    ----------
    directory : str or os.PathLike
        Folder whose direct children are inspected.

    Returns
    -------
    list of str
        Sorted names (not full paths) of the entries that are directories.
        An empty list is returned when ``directory`` does not exist, is not a
        directory, or cannot be read; a message describing the problem is
        printed in that case.
    """
    # Always hand back a list: callers iterate over the result, and a returned
    # error string would be walked character by character as if each character
    # were a folder name.
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        print(f"The directory '{directory}' does not exist.")
        return []
    except NotADirectoryError:
        print(f"'{directory}' is not a directory.")
        return []
    except PermissionError:
        print(f"Permission denied to access the directory '{directory}'.")
        return []

    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs produce the same ordering.
    return sorted(
        entry
        for entry in entries
        if os.path.isdir(os.path.join(directory, entry))
    )

In [None]:
# Walk the raw data tree (source / plate / batch) and record, for every batch
# folder, whether it contains at least one file matching `raw_pattern`.

raw_pattern = "*_CP_Profiles_Aggregated.csv"

raw_source_list = list_folders(raw_dir)

# One dict per batch folder; turned into a DataFrame in a later cell.
raw_files = []

for raw_source in raw_source_list:

    print(raw_source)

    raw_source_path = os.path.join(raw_dir, raw_source)
    raw_plate_list = list_folders(raw_source_path)

    for raw_plate in raw_plate_list:

        print(raw_plate)

        raw_plate_path = os.path.join(raw_source_path, raw_plate)
        raw_batch_list = list_folders(raw_plate_path)

        for raw_batch in raw_batch_list:

            print(raw_batch)

            raw_batch_path = os.path.join(raw_plate_path, raw_batch)

            # Escape the folder part so glob metacharacters in folder names
            # ([, ], *, ?) are matched literally; only `raw_pattern` acts as
            # a wildcard. The search is limited to the batch folder itself
            # (the pattern contains no "**", so no recursion takes place).
            raw_file_exists = bool(
                glob.glob(os.path.join(glob.escape(raw_batch_path), raw_pattern))
            )

            print(raw_file_exists)

            raw_files.append({'source': raw_source,
                              'plate': raw_plate,
                              'batch': raw_batch,
                              'raw_file': raw_file_exists})

In [None]:
# Build the inventory table and split the plate folder name into its parts:
# the first five characters are the plate name, the remainder is the
# replicate number (e.g. "C1084R1" -> "C1084" and "R1").
raw_files_df = pd.DataFrame(raw_files)

plate_folder_names = raw_files_df["plate"]
raw_files_df["plate_name"] = plate_folder_names.str.slice(stop=5)
raw_files_df["replicate_number"] = plate_folder_names.str.slice(start=5)

# plate_map_name joins both parts with an underscore (e.g. "C1084_R1").
raw_files_df["plate_map_name"] = (
    raw_files_df["plate_name"] + "_" + raw_files_df["replicate_number"]
)

In [None]:
# Parse the batch folder name (YYMMDD) into a timestamp. The placeholder
# batch "000000" is blanked out first, and anything that cannot be parsed is
# coerced to NaT instead of raising.
batch_codes = raw_files_df["batch"].replace("000000", pd.NA)
raw_files_df["batch_date"] = pd.to_datetime(batch_codes, format="%y%m%d", errors="coerce")

# String version of the date (YYYY-MM-DD) for easier comparison with plate map
# batch dates; rows without a valid date get the placeholder "0000-00-00".
batch_date_iso = raw_files_df["batch_date"].dt.strftime("%Y-%m-%d")
raw_files_df["batch_date_str"] = batch_date_iso.fillna("0000-00-00")

In [None]:
# Move the "raw_file" flag to the last column: pop removes it from its current
# position and the assignment re-inserts it at the end.
raw_files_df["raw_file"] = raw_files_df.pop("raw_file")

In [None]:
# Display the raw-file inventory table as this cell's output.
raw_files_df

In [None]:
# For every raw batch, check whether its processed counterparts exist.
# The processed folder of a row is
#     <processed_dir>/<source>/<batch_date_str>_<plate_map_name>
# and four flags are recorded per row:
#     process_folder         - that folder exists
#     processed_file_exists  - it contains a file matching processed_pattern
#     norm_file_exists       - it contains a file matching norm_pattern
#     reduced_file_exists    - it contains a file matching reduced_pattern

# File name patterns (glob syntax); they do not depend on the row, so they are
# defined once, outside the loop.
processed_pattern = "[A-Z][A-Za-z0-9][A-Za-z0-9][0-9][0-9]_R[1-4].csv"
norm_pattern = "[A-Z][A-Za-z0-9][A-Za-z0-9][0-9][0-9]_R[1-4]_mad_robustize.csv"
reduced_pattern = "[A-Z][A-Za-z0-9][A-Za-z0-9][0-9][0-9]_R[1-4]_mad_robustize_reduced-corr.csv"

# Status column -> pattern that must match at least one file in the folder.
status_patterns = {
    "processed_file_exists": processed_pattern,
    "norm_file_exists": norm_pattern,
    "reduced_file_exists": reduced_pattern,
}

# Create the status columns up front as real boolean columns. Creating them
# implicitly through row-wise `.at` writes leaves them NaN-filled until every
# row has been written, and leaves them out entirely when the table has no
# rows.
for status_column in ["process_folder", *status_patterns]:
    raw_files_df[status_column] = False

for source in raw_files_df["source"].unique():

    print(source)

    processed_source_path = os.path.join(processed_dir, source)

    raw_files_df_source = raw_files_df[raw_files_df["source"] == source]

    for index, row in raw_files_df_source.iterrows():

        plate_folder = row["batch_date_str"] + "_" + row["plate_map_name"]

        print(plate_folder)

        plate_path = os.path.join(processed_source_path, plate_folder)

        raw_files_df.at[index, "process_folder"] = os.path.isdir(plate_path)

        # Escape the folder part so glob metacharacters in folder names are
        # matched literally; only the file name pattern acts as a wildcard.
        # A missing folder simply yields no matches, i.e. False.
        escaped_plate_path = glob.escape(plate_path)

        for status_column, file_pattern in status_patterns.items():

            raw_files_df.at[index, status_column] = bool(
                glob.glob(os.path.join(escaped_plate_path, file_pattern))
            )
        

In [None]:
# Display the table again, now including the processing-status columns filled in by the preceding loop.
raw_files_df

In [None]:
# Write the status table to <results_path>/Processing_Status.csv without the
# DataFrame index column.
status_csv_path = os.path.join(results_path, "Processing_Status.csv")
raw_files_df.to_csv(status_csv_path, index=False)