In [None]:
# imports

from glob import glob
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# function declarations

# input: path to folder containing data (data_fpath) and the generic file extension of interest (file_extension)
# output: list of data files from specified input directory
# reference: https://docs.python.org/3/library/glob.html
def find_data_files(data_fpath, file_extension):
    """Return the sorted list of paths matching `data_fpath + file_extension`.

    The two arguments are joined by plain string concatenation (no path
    separator is inserted), and the result is used as a glob pattern.
    An empty list is returned when nothing matches.
    """
    search_pattern = data_fpath + file_extension
    return sorted(glob(search_pattern))

# input: list of csv files (csv_files) and the file path extension that specifies its id with (.+?) (id_search_ext)
# output: dictionary mapping a .csv file path identifier to a dataframe containing its contents
# reference: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html 
def create_dataframe_dict(csv_files, id_search_ext):
    """Read each CSV file into a dataframe keyed by an id parsed from its path.

    Parameters
    ----------
    csv_files : iterable of str
        Paths of the .csv files to read.
    id_search_ext : str
        Regular expression searched for in each path; the text captured by
        its first group becomes the dictionary key for that file.

    Returns
    -------
    dict
        Maps each extracted identifier to the dataframe read from that file.
        If two paths yield the same identifier, the later one wins.

    Raises
    ------
    ValueError
        If `id_search_ext` does not match one of the paths.
    """
    id_pattern = re.compile(id_search_ext)
    df_dict = {}
    for file in csv_files:
        # check the path before reading so a bad pattern fails fast and clearly
        # (re.search returns None on no match, which has no `.group`)
        id_match = id_pattern.search(file)
        if id_match is None:
            raise ValueError(
                f'could not extract an identifier from {file!r} '
                f'using the pattern {id_search_ext!r}'
            )
        df_dict[id_match.group(1)] = pd.read_csv(file)
    return df_dict

# input: dictionary mapping an identifier to its dataframe (df_dict), list of reference column names of interest (col_ref_list), and list of new corresponding column names if different (col_name_list) 
# output: dataframe containing columns of interest, concatenated across all dataframes in input dictionary
# reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html, https://pandas.pydata.org/docs/reference/api/pandas.concat.html
def extract_df_subset(df_dict, col_ref_list, col_name_list=None):
    """Stack a renamed subset of columns from every dataframe in `df_dict`.

    Parameters
    ----------
    df_dict : dict
        Maps an identifier to a dataframe; every dataframe must contain all
        columns named in `col_ref_list`.
    col_ref_list : list of str
        Names of the columns to pull out of each dataframe.
    col_name_list : list of str, optional
        Names given to the extracted columns in the result, in the same order
        as `col_ref_list`. Defaults to `col_ref_list` (no renaming).

    Returns
    -------
    pandas.DataFrame
        The selected columns of all dataframes concatenated row-wise in the
        iteration order of `df_dict`. Each dataframe keeps its own row index,
        so index labels may repeat. An empty `df_dict` yields an empty
        dataframe that still carries the `col_name_list` columns.
    """
    if col_name_list is None:
        col_name_list = col_ref_list

    subsets = []
    for df in df_dict.values():
        # selecting with a list returns a new dataframe, so renaming its
        # columns leaves the dataframe stored in `df_dict` untouched
        df_subset = df[col_ref_list]
        df_subset.columns = col_name_list
        subsets.append(df_subset)

    # pd.concat rejects an empty list, so keep the empty-input result explicit
    if not subsets:
        return pd.DataFrame(columns=col_name_list)

    # concatenate once: avoids quadratic re-copying and avoids mixing in an
    # empty object-dtype frame that could alter the resulting column dtypes
    return pd.concat(subsets)

In [None]:
# global variable declaration (user input parameters)

# path to folder containing data gathered in a single experiment
experiment_folder_fpath = '/Users/sarahfisher/Documents/project/data/CSV_FB_20230404'

# generic file extension for .csv data files (replace unique identifiers with `*`)
csv_file_ext = '/Time_points_dfs_*.csv'

# searchable file extension for .csv data file identifiers (replace `*` with `(.+?)`)
# the `.` before `csv` is escaped so it matches only a literal dot; unescaped it
# matches any character and can cut an identifier short
id_search_ext = r'/Time_points_dfs_(.+?)\.csv'

# list of column names of features of interest on .csv files
feature_ref_list = ['mean_F_C2','mean_F_C3','area']

# list of renamed features of interest for clarity (same order as `feature_ref_list`)
feature_name_list = ['Mean GFP','Mean RFP','Cell Area']

# reference name of column describing the timestep from .csv data files
timestep_ref = 'Time_interval'

# renamed column describing the timestep
# (previously assigned to `num_cells_name` by mistake and then overwritten below)
timestep_name = 'Timestep'

# reference (0-indexed) index of the column describing the timestep from .csv data files
timestep_index = 13

# reference name of column describing the number of cells from .csv data files
num_cells_ref = 'Number_cells'

# renamed column describing the number of cells
num_cells_name = 'Number of Cells'

# reference (0-indexed) index of the column describing the number of cells from .csv data files
num_cells_index = 12

# list of matplotlib binning methods used to determine optimal bin edges
methods = ['auto', 'fd', 'doane', 'scott', 'stone', 'rice', 'sturges', 'sqrt']

# subset of full list of matplotlib binning methods of interest
selected_methods = ['sqrt', 'scott', 'sturges']

In [None]:
# `main()` function declaration

# intended use in coordination with `calculation.ipynb` and `plots.ipynb`
# requires: global variable declaration (user input parameters) specified above
# output: returns None, assigns new global variables: `well_fpath_list`, `experiment_dict`, `well_id_list`, `timestep_list`, `feature_df`
def main():
    """Load the experiment's .csv files and publish the results as globals.

    Reads the user parameters `experiment_folder_fpath`, `csv_file_ext`,
    `id_search_ext`, `timestep_ref`, `feature_ref_list` and
    `feature_name_list`, then assigns the globals `well_fpath_list`,
    `experiment_dict`, `well_id_list`, `timestep_list` and `feature_df`,
    printing each one as debug output.

    Returns None.

    Raises
    ------
    FileNotFoundError
        If no file matches `experiment_folder_fpath + csv_file_ext`.
    """

    # list of filepaths of .csv files, each containing data from a single well of the experiment
    global well_fpath_list
    well_fpath_list = find_data_files(experiment_folder_fpath, csv_file_ext)
    # debug
    print('well_fpath_list:', type(well_fpath_list), '\n', well_fpath_list)

    # fail with a clear message here rather than with an IndexError further down
    if not well_fpath_list:
        raise FileNotFoundError(
            f'no data files match {experiment_folder_fpath + csv_file_ext!r}'
        )

    # dictionary mapping each well number in the experiment to a dataframe containing its .csv file data contents
    global experiment_dict
    experiment_dict = create_dataframe_dict(well_fpath_list, id_search_ext)
    # debug
    print('experiment_dict:', type(experiment_dict), '\n', experiment_dict)

    # list of all well ids
    global well_id_list
    well_id_list = list(experiment_dict)
    # debug
    print('well_id_list:', type(well_id_list), '\n', well_id_list)

    # list of all distinct timesteps, taken from the first well only;
    # sorted because iterating a set gives no guaranteed order
    global timestep_list
    timestep_list = sorted(set(experiment_dict[well_id_list[0]][timestep_ref]))
    # debug
    print('timestep_list:', type(timestep_list), '\n', timestep_list)

    # dataframe containing a specified subset of columns, contents concatenated across all wells of the experiment
    global feature_df
    feature_df = extract_df_subset(experiment_dict, feature_ref_list, feature_name_list)
    # debug
    print('feature_df:', type(feature_df), '\n', feature_df)

    return

In [None]:
# call to `main()` function

# runs the loading steps in `main()`, which assigns its results as global variables and prints each one
main()