In [None]:
import pandas as pd
from spacr.plot import spacrGraph
import matplotlib.pyplot as plt
%matplotlib inline

# Per-plate compartment feature-importance tables to combine
srcs = [
    '/nas_mnt/carruthers/Einar/tsg101_screen/TSG101SCREEN_20240810_132824/plate1/results/feature_importance_compartment.csv',
    '/nas_mnt/carruthers/Einar/tsg101_screen/TSG101SCREEN_20240824_072829/plate2/results/feature_importance_compartment.csv',
    '/nas_mnt/carruthers/Einar/tsg101_screen/TSG101SCREEN_20240825_094106/plate3/results/feature_importance_compartment.csv',
    '/nas_mnt/carruthers/Einar/tsg101_screen/TSG101SCREEN_20240826_140251/plate4/results/feature_importance_compartment.csv',
]

# Read every CSV, then stack the rows of all plates into one frame
dfs = [pd.read_csv(path) for path in srcs]
df = pd.concat(dfs)
display(df)

# Plot settings are collected first so they can be inspected or tweaked in one place
graph_settings = dict(
    df=df,
    grouping_column='compartment',
    data_column='compartment_importance_sum',
    graph_type='jitter_bar',
    graph_name='compartment',
    summary_func='mean',
    colors=None,
    output_dir='/nas_mnt/carruthers/Einar/tsg101_screen/TSG101SCREEN_20240810_132824/plate1/results',
    save=True,
    y_lim=None,
    error_bar_type='std',
    representation='object',
    theme='pastel',
)
spacr_graph = spacrGraph(**graph_settings)

# Draw the plot, keep a handle on the figure, and render it inline
spacr_graph.create_plot()
fig = spacr_graph.get_figure()
plt.show()

In [15]:
import spacr
import pandas as pd
import os

import numpy as np

def _split_data(df, group_by, object_type):
    """
    Splits the input dataframe into numeric and non-numeric parts, groups them by the specified column,
    and returns the grouped dataframes with conditional aggregation.

    Parameters:
    df (pandas.DataFrame): The input dataframe.
    group_by (str): The column name to group the dataframes by.
    object_type (str): The column name to concatenate with 'prcf' to create a new column 'prcfo'.

    Returns:
    grouped_numeric (pandas.DataFrame): The grouped dataframe containing numeric columns with conditional aggregation.
    grouped_non_numeric (pandas.DataFrame): The grouped dataframe containing non-numeric columns.
    """
    
    # Ensure 'prcf' column exists by concatenating specific columns
    if 'prcf' not in df.columns:
        try:
            df['prcf'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str) + '_' + df['field'].astype(str)
        except Exception as e:
            print(e)    
    
    # Create the 'prcfo' column
    df['prcfo'] = df['prcf'] + '_' + df[object_type]
    df = df.set_index(group_by, inplace=False)

    # Split the DataFrame into numeric and non-numeric parts
    df_numeric = df.select_dtypes(include=np.number)
    df_non_numeric = df.select_dtypes(exclude=np.number)

    # Define keywords for columns to be summed instead of averaged
    sum_keywords = ['area', 'perimeter', 'convex_area', 'bbox_area', 'filled_area', 'major_axis_length', 'minor_axis_length', 'equivalent_diameter']

    # Create a dictionary for custom aggregation
    agg_dict = {}
    for column in df_numeric.columns:
        if any(keyword in column for keyword in sum_keywords):
            agg_dict[column] = 'sum'
        else:
            agg_dict[column] = 'mean'

    # Apply custom aggregation
    grouped_numeric = df_numeric.groupby(df_numeric.index).agg(agg_dict)
    grouped_non_numeric = df_non_numeric.groupby(df_non_numeric.index).first()

    return pd.DataFrame(grouped_numeric), pd.DataFrame(grouped_non_numeric)

def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_limit=10):
    from spacr.io import _read_db
    from spacr.utils import _split_data

    # Initialize an empty dictionary to store DataFrames by table name
    data_dict = {table: [] for table in tables}

    # Extract plate DataFrames
    for loc in locs:
        db_dfs = _read_db(loc, tables)
        for table, df in zip(tables, db_dfs):
            data_dict[table].append(df)

    # Concatenate rows across locations for each table
    for table, dfs in data_dict.items():
        if dfs:
            data_dict[table] = pd.concat(dfs, axis=0)
        if verbose:
            print(f"{table}: {len(data_dict[table])}")

    # Initialize merged DataFrame with 'cells' if available
    merged_df = pd.DataFrame()

    # Process each table
    if 'cell' in data_dict:
        cells = data_dict['cell'].copy()
        cells = cells.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
        cells = cells.assign(prcfo=lambda x: x['prcf'] + '_' + x['object_label'])
        cells_g_df, metadata = _split_data(cells, 'prcfo', 'object_label')
        merged_df = cells_g_df.copy()
        if verbose:
            print(f'cells: {len(cells)}, cells grouped: {len(cells_g_df)}')

    if 'cytoplasm' in data_dict:
        cytoplasms = data_dict['cytoplasm'].copy()
        cytoplasms = cytoplasms.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
        cytoplasms = cytoplasms.assign(prcfo=lambda x: x['prcf'] + '_' + x['object_label'])
        
        if not 'cell' in data_dict:
            merged_df, metadata = _split_data(cytoplasms, 'prcfo', 'object_label')
            
            if verbose:
                print(f'nucleus: {len(cytoplasms)}, cytoplasms grouped: {len(merged_df)}')
            
        else:
            cytoplasms_g_df, _ = _split_data(cytoplasms, 'prcfo', 'object_label')
            merged_df = merged_df.merge(cytoplasms_g_df, left_index=True, right_index=True)
            
            if verbose:
                print(f'cytoplasms: {len(cytoplasms)}, cytoplasms grouped: {len(cytoplasms_g_df)}')

    if 'nucleus' in data_dict:
        nucleus = data_dict['nucleus'].copy()
        nucleus = nucleus.dropna(subset=['cell_id'])
        nucleus = nucleus.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
        nucleus = nucleus.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
        nucleus = nucleus.assign(prcfo=lambda x: x['prcf'] + '_' + x['cell_id'])
        nucleus['nucleus_prcfo_count'] = nucleus.groupby('prcfo')['prcfo'].transform('count')
        if not nuclei_limit:
            nucleus = nucleus[nucleus['nucleus_prcfo_count'] == 1]
            
        if all(key not in data_dict for key in ['cell', 'cytoplasm']):
            merged_df, metadata = _split_data(nucleus, 'prcfo', 'cell_id')
            
            if verbose:
                print(f'nucleus: {len(nucleus)}, nucleus grouped: {len(merged_df)}')
            
        else:
            nucleus_g_df, _ = _split_data(nucleus, 'prcfo', 'cell_id')
            merged_df = merged_df.merge(nucleus_g_df, left_index=True, right_index=True)
            
            if verbose:
                print(f'nucleus: {len(nucleus)}, nucleus grouped: {len(nucleus_g_df)}')

    if 'pathogen' in data_dict:
        pathogens = data_dict['pathogen'].copy()
        pathogens = pathogens.dropna(subset=['cell_id'])
        pathogens = pathogens.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
        pathogens = pathogens.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
        pathogens = pathogens.assign(prcfo=lambda x: x['prcf'] + '_' + x['cell_id'])
        pathogens['pathogen_prcfo_count'] = pathogens.groupby('prcfo')['prcfo'].transform('count')

        if isinstance(pathogen_limit, bool) and not pathogen_limit:
            pathogens = pathogens[pathogens['pathogen_prcfo_count'] <= 1]
        elif isinstance(pathogen_limit, (float, int)):
            pathogens = pathogens[pathogens['pathogen_prcfo_count'] <= int(pathogen_limit)]

        if all(key not in data_dict for key in ['cell', 'cytoplasm', 'nucleus']):
            merged_df, metadata = _split_data(pathogens, 'prcfo', 'cell_id')
            
            if verbose:
                print(f'pathogens: {len(pathogens)}, pathogens grouped: {len(merged_df)}')
            
        else:
            pathogens_g_df, _ = _split_data(pathogens, 'prcfo', 'cell_id')
            merged_df = merged_df.merge(pathogens_g_df, left_index=True, right_index=True)
        
            if verbose:
                print(f'pathogens: {len(pathogens)}, pathogens grouped: {len(pathogens_g_df)}')
            
    if 'png_list' in data_dict:
        png_list = data_dict['png_list'].copy()
        png_list_g_df_numeric, png_list_g_df_non_numeric = _split_data(png_list, 'prcfo', 'cell_id')
        png_list_g_df_non_numeric.drop(columns=['plate','row_name','column_name','field','file_name','cell_id', 'prcf'], inplace=True)
        if verbose:
            print(f'png_list: {len(png_list)}, png_list grouped: {len(png_list_g_df_numeric)}')
        merged_df = merged_df.merge(png_list_g_df_numeric, left_index=True, right_index=True)
        merged_df = merged_df.merge(png_list_g_df_non_numeric, left_index=True, right_index=True)
        
    # Add prc (plate row column) and prcfo (plate row column field object) columns
    metadata = metadata.assign(prc=lambda x: x['plate'] + '_' + x['row_name'] + '_' + x['column_name'])
    cells_well = metadata.groupby('prc')['object_label'].nunique().reset_index(name='cells_per_well')
    metadata = metadata.merge(cells_well, on='prc')
    metadata = metadata.assign(prcfo=lambda x: x['plate'] + '_' + x['row_name'] + '_' + x['column_name'] + '_' + x['field'] + '_' + x['object_label'])
    metadata.set_index('prcfo', inplace=True)
    display(metadata)
    # Merge metadata with final merged DataFrame
    merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
    merged_df.drop(columns=['label_list_morphology', 'label_list_intensity'], errors='ignore', inplace=True)
    
    if verbose:
        print(f'Generated dataframe with: {len(merged_df.columns)} columns and {len(merged_df)} rows')
    
    # Prepare object DataFrames for output
    obj_df_ls = [data_dict[table] for table in ['cell', 'cytoplasm', 'nucleus', 'pathogen'] if table in data_dict]
    
    return merged_df, obj_df_ls

# Measurement database to read and the tables to pull from it
locs = [
    '/nas_mnt/carruthers/Einar/tsg101_screen/TSG101SCREEN_20240810_132824/plate1/measurements/measurements.db',
]
tables = ['nucleus', 'pathogen', 'cytoplasm', 'png_list']

# Limits of 1000 effectively keep every nucleus and pathogen
merged_df, obj_df_ls = _read_and_merge_data(
    locs,
    tables,
    verbose=True,
    nuclei_limit=1000,
    pathogen_limit=1000,
)
display(merged_df)

nucleus: 80889
pathogen: 111317
cytoplasm: 60816
png_list: 60816
nucleus: 60816, cytoplasms grouped: 60816
nucleus: 80889, nucleus grouped: 60816
pathogens: 111317, pathogens grouped: 60816
png_list: 60816, png_list grouped: 60816


Unnamed: 0_level_0,object_label,plate,row_name,column_name,field,prcf,file_name,path_name,label_list_morphology,label_list_intensity,prc,cells_per_well
prcfo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
pplate1_r10_c11_f10_o101,o101,pplate1,r10,c11,f10,pplate1_r10_c11_f10,plate1_J11_10,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,[101],"[101, 101, 101, 101, 101, 101, 101, 101, 101, ...",pplate1_r10_c11,92
pplate1_r10_c11_f10_o104,o104,pplate1,r10,c11,f10,pplate1_r10_c11_f10,plate1_J11_10,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,[104],"[104, 104, 104, 104, 104, 104, 104, 104, 104, ...",pplate1_r10_c11,92
pplate1_r10_c11_f10_o107,o107,pplate1,r10,c11,f10,pplate1_r10_c11_f10,plate1_J11_10,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,[107],"[107, 107, 107, 107, 107, 107, 107, 107, 107, ...",pplate1_r10_c11,92
pplate1_r10_c11_f10_o137,o137,pplate1,r10,c11,f10,pplate1_r10_c11_f10,plate1_J11_10,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,[137],"[137, 137, 137, 137, 137, 137, 137, 137, 137, ...",pplate1_r10_c11,92
pplate1_r10_c11_f10_o138,o138,pplate1,r10,c11,f10,pplate1_r10_c11_f10,plate1_J11_10,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,[138],"[138, 138, 138, 138, 138, 138, 138, 138, 138, ...",pplate1_r10_c11,92
...,...,...,...,...,...,...,...,...,...,...,...,...
pplate1_r9_c9_f9_o4,o4,pplate1,r9,c9,f9,pplate1_r9_c9_f9,plate1_I09_9,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,[4],"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]",pplate1_r9_c9,129
pplate1_r9_c9_f9_o73,o73,pplate1,r9,c9,f9,pplate1_r9_c9_f9,plate1_I09_9,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,[73],"[73, 73, 73, 73, 73, 73, 73, 73, 73, 73]",pplate1_r9_c9,129
pplate1_r9_c9_f9_o75,o75,pplate1,r9,c9,f9,pplate1_r9_c9_f9,plate1_I09_9,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,[75],"[75, 75, 75, 75, 75, 75, 75, 75, 75, 75]",pplate1_r9_c9,129
pplate1_r9_c9_f9_o79,o79,pplate1,r9,c9,f9,pplate1_r9_c9_f9,plate1_I09_9,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,[79],"[79, 79, 79, 79, 79, 79, 79, 79, 79, 79]",pplate1_r9_c9,129


Generated dataframe with: 624 columns and 60816 rows


Unnamed: 0_level_0,object_label,plate,row_name,column_name,field,prcf,file_name,path_name,prc,cells_per_well,...,pathogen_channel_1_channel_3_M1_correlation_95,pathogen_channel_1_channel_3_M2_correlation_95,pathogen_channel_2_channel_3_M1_correlation_15,pathogen_channel_2_channel_3_M2_correlation_15,pathogen_channel_2_channel_3_M1_correlation_85,pathogen_channel_2_channel_3_M2_correlation_85,pathogen_channel_2_channel_3_M1_correlation_95,pathogen_channel_2_channel_3_M2_correlation_95,pathogen_prcfo_count,png_path
prcfo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pplate1_r10_c11_f10_o101,o101,pplate1,r10,c11,f10,pplate1_r10_c11_f10,plate1_J11_10,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,pplate1_r10_c11,92,...,0.000000,0.000000,0.700896,0.721041,0.000000,0.000000,0.000000,0.000000,1.0,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...
pplate1_r10_c11_f10_o104,o104,pplate1,r10,c11,f10,pplate1_r10_c11_f10,plate1_J11_10,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,pplate1_r10_c11,92,...,0.002224,0.001375,0.781562,0.751003,0.025694,0.020279,0.000470,0.000349,1.0,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...
pplate1_r10_c11_f10_o107,o107,pplate1,r10,c11,f10,pplate1_r10_c11_f10,plate1_J11_10,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,pplate1_r10_c11,92,...,0.004971,0.003943,0.824009,0.766166,0.029448,0.020089,0.009793,0.006330,1.0,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...
pplate1_r10_c11_f10_o137,o137,pplate1,r10,c11,f10,pplate1_r10_c11_f10,plate1_J11_10,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,pplate1_r10_c11,92,...,0.000000,0.000000,0.756574,0.729944,0.027023,0.020488,0.000000,0.000000,1.0,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...
pplate1_r10_c11_f10_o138,o138,pplate1,r10,c11,f10,pplate1_r10_c11_f10,plate1_J11_10,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,pplate1_r10_c11,92,...,0.000000,0.000000,0.826501,0.770436,0.073517,0.049407,0.012787,0.008092,1.0,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pplate1_r9_c9_f9_o4,o4,pplate1,r9,c9,f9,pplate1_r9_c9_f9,plate1_I09_9,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,pplate1_r9_c9,129,...,0.000662,0.000584,0.833220,0.781898,0.039290,0.022536,0.003286,0.001328,2.0,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...
pplate1_r9_c9_f9_o73,o73,pplate1,r9,c9,f9,pplate1_r9_c9_f9,plate1_I09_9,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,pplate1_r9_c9,129,...,0.000000,0.000000,0.746353,0.777684,0.027406,0.025505,0.001731,0.000656,1.0,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...
pplate1_r9_c9_f9_o75,o75,pplate1,r9,c9,f9,pplate1_r9_c9_f9,plate1_I09_9,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,pplate1_r9_c9,129,...,0.000313,0.000329,0.735061,0.736596,0.028893,0.022633,0.004980,0.003748,2.0,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...
pplate1_r9_c9_f9_o79,o79,pplate1,r9,c9,f9,pplate1_r9_c9_f9,plate1_I09_9,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...,pplate1_r9_c9,129,...,0.000000,0.000000,0.740277,0.732288,0.001253,0.000906,0.000000,0.000000,1.0,/nas_mnt/carruthers/Einar/tsg101_screen/TSG101...


In [None]:
def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=False, pathogen_limit=False):
    
    from spacr.io import _read_db
    from spacr.utils import _split_data
    
    #Extract plate DataFrames
    all_dfs = []
    for loc in locs:
        db_dfs = _read_db(loc, tables)
        all_dfs.append(db_dfs)
    
    #Extract Tables from DataFrames and concatinate rows
    for i, dfs in enumerate(all_dfs):
        if 'cell' in tables:
            cell = dfs[0]
            if verbose:
                print(f'plate: {i+1} cells:{len(cell)}')
        # see pathogens logic, copy logic to other tables #here
        if 'nucleus' in tables:
            nucleus = dfs[1]
            if verbose:
                print(f'plate: {i+1} nucleus:{len(nucleus)} ')

        if 'pathogen' in tables:
            if len(tables) == 1:
                pathogen = dfs[0]
                print(len(pathogen))
            else:
                pathogen = dfs[2]
            if verbose:
                print(f'plate: {i+1} pathogens:{len(pathogen)}')
        
        if 'cytoplasm' in tables:
            if not 'pathogen' in tables:
                cytoplasm = dfs[2]
            else:
                cytoplasm = dfs[3]
            if verbose:
                print(f'plate: {i+1} cytoplasms: {len(cytoplasm)}')

        if i > 0:
            if 'cell' in tables:
                cells = pd.concat([cells, cell], axis = 0)
            if 'nucleus' in tables:
                nucleus = pd.concat([nucleus, nucleus], axis = 0)
            if 'pathogen' in tables:
                pathogens = pd.concat([pathogens, pathogen], axis = 0)
            if 'cytoplasm' in tables:
                cytoplasms = pd.concat([cytoplasms, cytoplasm], axis = 0)
        else:
            if 'cell' in tables:
                cells = cell.copy()
            if 'nucleus' in tables:
                nucleus = nucleus.copy()
            if 'pathogen' in tables:
                pathogens = pathogen.copy()
            if 'cytoplasm' in tables:
                cytoplasms = cytoplasm.copy()
    
    #Add an o in front of all object and cell lables to convert them to strings
    if 'cell' in tables:
        cells = cells.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
        cells = cells.assign(prcfo = lambda x: x['prcf'] + '_' + x['object_label'])
        cells_g_df, metadata = _split_data(cells, 'prcfo', 'object_label')
        merged_df = cells_g_df.copy()
        if verbose:
            print(f'cells: {len(cells)}')
            print(f'cells grouped: {len(cells_g_df)}')

    if 'cytoplasm' in tables:
        cytoplasms = cytoplasms.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
        cytoplasms = cytoplasms.assign(prcfo = lambda x: x['prcf'] + '_' + x['object_label'])
        cytoplasms_g_df, _ = _split_data(cytoplasms, 'prcfo', 'object_label')
        merged_df = cells_g_df.merge(cytoplasms_g_df, left_index=True, right_index=True)
        if verbose:
            print(f'cytoplasms: {len(cytoplasms)}')
            print(f'cytoplasms grouped: {len(cytoplasms_g_df)}')

    if 'nucleus' in tables:
        if not 'cell' in tables:
            cells_g_df = pd.DataFrame()
        nucleus = nucleus.dropna(subset=['cell_id'])
        nucleus = nucleus.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
        nucleus = nucleus.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
        nucleus = nucleus.assign(prcfo = lambda x: x['prcf'] + '_' + x['cell_id'])
        nucleus['nucleus_prcfo_count'] = nucleus.groupby('prcfo')['prcfo'].transform('count')
        if nuclei_limit == False:
            nucleus = nucleus[nucleus['nucleus_prcfo_count']==1]
        nucleus_g_df, _ = _split_data(nucleus, 'prcfo', 'cell_id')
        if verbose:
            print(f'nucleus: {len(nucleus)}')
            print(f'nucleus grouped: {len(nucleus_g_df)}')
        if 'cytoplasm' in tables:
            merged_df = merged_df.merge(nucleus_g_df, left_index=True, right_index=True)
        else:
            merged_df = cells_g_df.merge(nucleus_g_df, left_index=True, right_index=True)

    if 'pathogen' in tables:
        if not 'cell' in tables:
            cells_g_df = pd.DataFrame()
            merged_df = []
        try:
            pathogens = pathogens.dropna(subset=['cell_id'])

        except:
            pathogens['cell_id'] = pathogens['object_label']
            pathogens = pathogens.dropna(subset=['cell_id'])

        pathogens = pathogens.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
        pathogens = pathogens.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
        pathogens = pathogens.assign(prcfo = lambda x: x['prcf'] + '_' + x['cell_id'])
        pathogens['pathogen_prcfo_count'] = pathogens.groupby('prcfo')['prcfo'].transform('count')
        
        if isinstance(pathogen_limit, bool):
            if pathogen_limit == False:
                pathogens = pathogens[pathogens['pathogen_prcfo_count']<=1]
                print(f"after multiinfected Bool: {len(pathogens)}")
        if isinstance(pathogen_limit, float):
            pathogen_limit = int(pathogen_limit)
        if isinstance(pathogen_limit, int):
            pathogens = pathogens[pathogens['pathogen_prcfo_count']<=pathogen_limit]
            print(f"afer multiinfected Float: {len(pathogens)}")
        if not 'cell' in tables:
            pathogens_g_df, metadata = _split_data(pathogens, 'prcfo', 'cell_id')
        else:
            pathogens_g_df, _ = _split_data(pathogens, 'prcfo', 'cell_id')
        
        if verbose:
            print(f'pathogens: {len(pathogens)}')
            print(f'pathogens grouped: {len(pathogens_g_df)}')
        
        if len(merged_df) == 0:
            merged_df = pathogens_g_df
        else:
            merged_df = merged_df.merge(pathogens_g_df, left_index=True, right_index=True)
        
    #Add prc column (plate row column)
    metadata = metadata.assign(prc = lambda x: x['plate'] + '_' + x['row_name'] + '_' +x['column_name'])

    #Count cells per well
    cells_well = pd.DataFrame(metadata.groupby('prc')['object_label'].nunique())

    cells_well.reset_index(inplace=True)
    cells_well.rename(columns={'object_label': 'cells_per_well'}, inplace=True)
    metadata = pd.merge(metadata, cells_well, on='prc', how='inner', suffixes=('', '_drop_col'))
    object_label_cols = [col for col in metadata.columns if '_drop_col' in col]
    metadata.drop(columns=object_label_cols, inplace=True)

    #Add prcfo column (plate row column field object)
    metadata = metadata.assign(prcfo = lambda x: x['plate'] + '_' + x['row_name'] + '_' +x['column_name']+ '_' +x['field']+ '_' +x['object_label'])
    metadata.set_index('prcfo', inplace=True)

    merged_df = metadata.merge(merged_df, left_index=True, right_index=True)
    
    merged_df = merged_df.dropna(axis=1)
    if verbose:
        print(f'Generated dataframe with: {len(merged_df.columns)} columns and {len(merged_df)} rows')
    
    obj_df_ls = []
    if 'cell' in tables:
        obj_df_ls.append(cells)
    if 'cytoplasm' in tables:
        obj_df_ls.append(cytoplasms)
    if 'nucleus' in tables:
        obj_df_ls.append(nucleus)
    if 'pathogen' in tables:
        obj_df_ls.append(pathogens)
        
    return merged_df, obj_df_ls 

# Measurement database to read and the object tables to pull from it
locs = [
    '/nas_mnt/carruthers/Einar/tsg101_screen/TSG101SCREEN_20240810_132824/plate1/measurements/measurements.db',
]
tables = ['cell', 'nucleus', 'pathogen', 'cytoplasm']

merged_df, obj_df_ls = _read_and_merge_data(
    locs,
    tables,
    verbose=True,
    nuclei_limit=1000,
    pathogen_limit=1000,
)
display(merged_df)
