In [42]:
# Package Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import mplcursors
from scipy.optimize import curve_fit
import scipy.stats
# Qt popups for matplots instead of inline plots in jupyter notebook
%matplotlib qt

In [43]:
def read_and_combine_csv_files(root_directory, file_suffix, include_source_file=True):
    """Recursively read matching CSV files and stack them into one DataFrame.

    Every file below ``root_directory`` whose name ends with ``file_suffix`` is
    read with ``pd.read_csv``. Files that load as an empty DataFrame are skipped
    with a printed warning; files that raise while being read are skipped with a
    printed error message.

    Parameters
    ----------
    root_directory : str
        Directory where the recursive search starts.
    file_suffix : str
        Trailing part of the file name to match, e.g. ``'_cell_df.csv'``.
    include_source_file : bool, default True
        When True, add a ``source_file`` column holding the file's path
        relative to ``root_directory`` with the trailing ``file_suffix`` removed.

    Returns
    -------
    pandas.DataFrame
        Concatenation (with a fresh integer index) of every file that was read,
        or an empty DataFrame when nothing was read.
    """
    dataframes = []

    # Walk through all directories and subdirectories starting from the root
    for dirpath, dirnames, filenames in os.walk(root_directory):
        # os.walk yields entries in arbitrary, OS-dependent order. Sorting the
        # sub-directories in place (which also steers the walk) and the file
        # names makes the row order of the result reproducible.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.endswith(file_suffix):
                continue

            file_path = os.path.join(dirpath, filename)
            try:
                df = pd.read_csv(file_path)
                if df.empty:
                    print(f"Warning: '{file_path}' is empty and was skipped.")
                    continue

                if include_source_file:
                    # Generate relative path from the root directory
                    relative_path = os.path.relpath(file_path, root_directory)
                    # Drop only the trailing suffix. str.replace would also strip
                    # occurrences of the suffix inside directory or file names.
                    # (len arithmetic instead of [:-n] so an empty suffix is a no-op.)
                    source_file_name = relative_path[:len(relative_path) - len(file_suffix)]
                    df['source_file'] = source_file_name

                dataframes.append(df)
            except Exception as e:
                # Best effort: report the unreadable file and keep going
                print(f"Error reading '{file_path}': {e}")

    # Combine all dataframes into one, if any were successfully added
    if dataframes:
        return pd.concat(dataframes, ignore_index=True)
    return pd.DataFrame()  # Nothing was read successfully


### Choose the directory for the files; provide separate directories for the cell and puncta DataFrames if needed

In [44]:
# Root folder that is searched recursively for the CSV files
dir_str = "/Users/christian.neureuter/Library/CloudStorage/Box-Box/Banerjee_Lab_Shared_Folder/Meet/Test"

# Collect every '*_cell_df.csv' and every '*_puncta_df.csv' below dir_str into one
# table each; every row records the file it came from in a 'source_file' column
combined_cell_df = read_and_combine_csv_files(
    root_directory=dir_str,
    file_suffix='_cell_df.csv',
    include_source_file=True,
)
combined_puncta_df = read_and_combine_csv_files(
    root_directory=dir_str,
    file_suffix='_puncta_df.csv',
    include_source_file=True,
)

In [45]:
# Order the cells by descending intensity_mean
# (use 'intensity_total' as the sort column instead if needed)
combined_cell_df = combined_cell_df.sort_values('intensity_mean', ascending=False)

### Set intensity range

In [46]:
# Keep only the cells whose intensity_mean falls inside a chosen range
# Bounds of the accepted intensity_mean range (both ends are included)
intensity_min = 0.001
intensity_max = 0.005

# Series.between is inclusive on both ends by default: min <= value <= max
in_intensity_range = combined_cell_df['intensity_mean'].between(intensity_min, intensity_max)
trunc_cell_df = combined_cell_df[in_intensity_range]

In [47]:
# Display the cells that passed the intensity_mean range filter
trunc_cell_df

Unnamed: 0.1,Unnamed: 0,label,area,intensity_mean,axis_major_length,axis_minor_length,eccentricity,perimeter,intensity_std_dev,intensity_median,intensity_total,cell_micron_area,image_resolution_um_per_px_sq,cell_snr,gaussian_snr_estimate,contrast,dissimilarity,homogeneity,ASM,energy,correlation,32_bit_entropy,8_bit_entropy,8_bit_entropy_img_avg,img_kurtosis,standardized_sixth_moment,kurtosis_z_score,p_val,lbp_mean,lbp_std,lbp_entropy,puncta_micron_area_mean,puncta_micron_area_std,puncta_ellipticity_mean,puncta_intensity_total,puncta_intensity_dist_mean,number_of_puncta,cell_xor_puncta_int_mean,cell_xor_puncta_int_std,cell_xor_puncta_int_total,cell_xor_puncta_area,snr_test,partition_test,partition_test_total_int,spark_score,puncta_classifier,source_file
0,0,1,31255.0,0.002255,244.040987,165.360799,0.735436,693.411255,0.00082,0.002447,70.489851,74.517727,0.002384,57.065662,2905.980494,1653.922813,26.76353,0.232634,0.04256,0.203159,0.80428,11.497075,6.099109,5.893421,-0.696131,7.589833,-40.251642,0.0,122.089844,251.444941,6.428752,0.072888,0.019017,0.049575,0.752423,0.003528,7.0,0.002247,0.000815,69.737428,7760.25,4.330108,1.570244,0.010789,0.010674,1.0,I-1-FUS-fl-gfp
7,2,3,30927.0,0.001653,264.08406,150.857177,0.820778,702.038672,0.000539,0.001756,51.11027,73.735714,0.002384,6.311709,1170.470398,1051.355234,20.463556,0.2749,0.058823,0.239824,0.812145,11.02721,5.624411,5.668546,-0.080183,12.680135,-2.9542,0.003134812,120.808594,239.166976,6.565474,0.086129,0.015889,0.10876,0.780213,0.002665,8.0,0.001643,0.000531,50.330057,7659.5,5.019919,1.622187,0.015502,0.015265,1.0,I-3-FUS-fl-gfp
5,0,1,48396.0,0.001399,288.445654,215.858909,0.663302,851.092496,0.000461,0.001473,67.711663,115.385056,0.002384,5.343559,990.93248,736.72204,17.943465,0.229116,0.035909,0.18666,0.798648,12.299413,5.762486,5.559008,0.236421,15.129559,9.19335,3.807932e-20,189.046875,382.792522,7.051144,0.083447,0.026204,0.111171,0.551077,0.002576,6.0,0.001394,0.000454,67.160586,12046.5,5.677472,1.848327,0.008205,0.008139,1.0,I-3-FUS-fl-gfp
2,0,1,22296.0,0.001328,226.383752,126.497235,0.82932,600.07316,0.000365,0.001329,29.60954,53.157806,0.002384,2.239001,1293.033938,31.924564,3.794541,0.340543,0.022471,0.146249,0.670178,12.004574,3.905986,3.180641,-0.006111,12.982833,-0.157603,0.8747697,87.09375,177.54075,6.503085,0.0,0.0,0.0,0.0,0.0,0.0,0.001328,0.000365,29.60954,5574.0,0.0,0.0,0.0,0.0,0.0,I-2-FUS-fl-gfp
1,1,2,29197.0,0.001011,236.14396,158.857548,0.739902,668.440692,0.000359,0.001097,29.526927,69.611073,0.002384,25.588675,1303.063644,485.687128,14.765486,0.218133,0.02694,0.1591,0.687934,11.866226,5.414978,5.125856,-0.595218,8.507867,-30.355454,2.128775e-202,114.050781,227.62617,6.58159,0.0,0.0,0.0,0.0,0.0,0.0,0.001011,0.000359,29.526927,7299.25,0.0,0.0,0.0,0.0,0.0,I-1-FUS-fl-gfp


In [48]:
# Create a dictionary to map (source_file, label) to intensity_mean.
# Zipping the columns avoids DataFrame.iterrows, which builds a Series per row;
# when a key occurs more than once the last row wins, exactly as before.
intensity_map = dict(zip(
    zip(trunc_cell_df['source_file'], trunc_cell_df['label']),
    trunc_cell_df['intensity_mean'],
))

# Define a function to get the intensity_mean from the map
def get_intensity_mean(row):
    """Return the mapped cell intensity_mean for one puncta row.

    ``row`` must provide 'source_file' and 'cell label'. Returns None when that
    (source_file, cell label) pair is not a key of ``intensity_map``.
    Retained for single-row lookups; the bulk lookup below no longer calls it.
    """
    return intensity_map.get((row['source_file'], row['cell label']), None)

# Add the new column to combined_puncta_df: one dictionary lookup per punctum,
# keyed by the punctum's (source_file, cell label). Puncta whose cell is not in
# trunc_cell_df get a missing value. A column-wise pass replaces
# DataFrame.apply(axis=1) and also works when combined_puncta_df has no rows.
puncta_cell_keys = zip(combined_puncta_df['source_file'], combined_puncta_df['cell label'])
combined_puncta_df['cell_intensity_mean'] = [intensity_map.get(key) for key in puncta_cell_keys]

# Filter the rows to match the truncated cell_df
trunc_puncta_df = combined_puncta_df[combined_puncta_df['cell_intensity_mean'].notnull()]

In [49]:
# Display the puncta rows whose parent cell is in trunc_cell_df
trunc_puncta_df

Unnamed: 0.1,Unnamed: 0,label,area,intensity_mean,axis_major_length,axis_minor_length,eccentricity,perimeter,ellipticity,circularity,micron area,cell label,source_file,cell_intensity_mean
0,0,1,38.0,0.00349,7.384194,6.624056,0.441911,20.142136,0.102941,0.367904,0.090599,1,I-3-FUS-fl-gfp,0.001399
1,1,2,47.0,0.002352,8.236911,7.361046,0.448733,22.970563,0.106334,0.0,0.112057,1,I-3-FUS-fl-gfp,0.001399
2,2,3,46.0,0.002751,8.44419,6.928203,0.57169,22.142136,0.17953,0.380818,0.109673,1,I-3-FUS-fl-gfp,0.001399
3,3,4,18.0,0.002238,5.033223,4.618802,0.39736,13.313708,0.082337,1.0,0.042915,1,I-3-FUS-fl-gfp,0.001399
4,4,5,31.0,0.002309,6.425755,6.236411,0.240966,18.142136,0.029466,0.409728,0.07391,1,I-3-FUS-fl-gfp,0.001399
5,5,6,30.0,0.002317,6.831301,5.694442,0.552397,18.142136,0.166419,0.166161,0.071526,1,I-3-FUS-fl-gfp,0.001399
6,6,1,34.0,0.002185,6.995797,6.207485,0.461161,18.727922,0.112684,0.779909,0.081062,3,I-3-FUS-fl-gfp,0.001653
7,7,2,41.0,0.002706,7.771928,6.705444,0.505584,20.727922,0.137223,0.673902,0.097752,3,I-3-FUS-fl-gfp,0.001653
8,8,3,35.0,0.00238,7.21103,6.283459,0.49063,19.556349,0.128632,0.399625,0.083447,3,I-3-FUS-fl-gfp,0.001653
9,9,4,24.0,0.002461,5.887841,5.330729,0.424604,16.142136,0.094621,0.441076,0.05722,3,I-3-FUS-fl-gfp,0.001653


### Save the new combined, truncated puncta_df to the same location puncta_df was loaded from, e.g. dir_str
Provide the construct name in the quotes before trunc_puncta_df.csv, e.g. 'FUS_2X_PLD_trunc_puncta_df.csv'

In [50]:
# Write the truncated puncta_df as a CSV into dir_str (the folder the puncta
# files were loaded from); index=False leaves the DataFrame index out of the file
trunc_puncta_csv_path = os.path.join(dir_str, 'trunc_puncta_df.csv')
trunc_puncta_df.to_csv(trunc_puncta_csv_path, index=False)

In [None]:
# deprecated code
# select rows from puncta_df where source_file and cell label match a row in trunc_cell_df
# The (source_file, label) keys of trunc_cell_df are collected into a set once, so
# each puncta row costs one membership test instead of a full scan of trunc_cell_df.
selected_cell_keys = set(zip(trunc_cell_df['source_file'], trunc_cell_df['label']))
puncta_cell_keys = zip(combined_puncta_df['source_file'], combined_puncta_df['cell label'])
# Explicit bool dtype keeps the mask a valid row selector even when there are no rows
is_matched_punctum = np.array([key in selected_cell_keys for key in puncta_cell_keys], dtype=bool)
matched_puncta_df = combined_puncta_df[is_matched_punctum]
