In [1]:
# notebook import

from ipynb.fs.full.data_processing import *

In [2]:
# user input parameters

# when True, the final cell of this notebook calls `main()`; when False nothing is computed
run = True

# when True, `main()` prints the type and full contents of the resulting `emd_dict`
debug = True

# when True, `main()` appends the computed EMD values to per-feature, per-well .csv files
write = True

In [3]:
# function declaration          
        
# input: timestep (timestep) and reference name of a feature of interest (feature_ref)
# output: dictionary mapping a well's id number to its list of values of a given feature at an individual timestep
# note: reads the globals `well_id_list`, `experiment_dict`, and `timestep_ref` instead of taking them as arguments
def get_timestep_population_dict(timestep, feature_ref):
    # every well in `well_id_list` starts with its own empty list, so a listed well
    # that is missing from `experiment_dict` still appears in the result
    values_by_well = {well_id: [] for well_id in well_id_list}
    for well_id, frame in experiment_dict.items():
        # boolean mask selecting the rows recorded at the requested timestep
        at_timestep = frame[timestep_ref] == timestep
        values_by_well[well_id] = list(frame.loc[at_timestep][feature_ref])
    return values_by_well

# input: optional random generator (rng) used for the subsampling; when omitted, NumPy's global random state is used
# output: nested dictionary mapping feature name -> timestep -> array of feature values pooled across wells
def get_joint_pop_dict(rng=None):
    """Build the pooled ("joint") population for every feature at every timestep.

    For each timestep, every well contributes `subset_size` values drawn by the
    sampler's `choice` (NumPy's default is sampling with replacement), where
    `subset_size` is `ncells_avg_dict[t] // len(well_id_list)`. The per-well
    draws are concatenated into one array.

    Reads the globals `feature_name_list`, `feature_ref_list`, `timestep_list`,
    `ncells_avg_dict`, and `well_id_list`.

    Parameters
    ----------
    rng : object with a `choice(a, size)` method, optional
        For example `np.random.default_rng(seed)` or `np.random.RandomState(seed)`.
        Pass a seeded generator to make the subsampling reproducible. The default
        (None) draws from NumPy's global random state, as the function always did.

    Returns
    -------
    dict
        `{feature_name: {timestep: array of sampled values}}`. A timestep at
        which no well has any values maps to an empty array.
    """
    sampler = np.random if rng is None else rng
    n_wells = len(well_id_list)

    joint_pop_dict = {}
    for name, ref in zip(feature_name_list, feature_ref_list):
        joint_pop_dict[name] = {}
        for t in timestep_list:
            subset_size = int(ncells_avg_dict[t] // n_wells)
            population_dict = get_timestep_population_dict(t, ref)
            # `choice` raises ValueError on an empty population when the sample size is
            # positive, so wells with no values at this timestep contribute nothing
            samples = [
                sampler.choice(values, subset_size)
                for values in population_dict.values()
                if len(values) > 0
            ]
            # np.concatenate rejects an empty sequence; fall back to an empty array
            joint_pop_dict[name][t] = np.concatenate(samples) if samples else np.array([])
    return joint_pop_dict
  

def emd_fn(old_vals, new_vals):
    """Return the earth mover's distance between two collections of values.

    Thin wrapper around `wasserstein_distance`, which this notebook obtains via
    the `data_processing` star import (presumably `scipy.stats.wasserstein_distance`
    -- confirm).
    """
    distance = wasserstein_distance(old_vals, new_vals)
    return distance

In [4]:
# `main()` function declaration

# intended use in coordination with `earth_movers_distance_plot.ipynb`
# requires: import of `data_processing.ipynb` and user input parameters specified above
# output: None (assigns new global variables `joint_pop_dict`, `emd_dict` and writes to output .csv files)

def main():
    """Compute each well's earth mover's distance (EMD) from the joint population.

    Assigns two module-level globals:
      * `joint_pop_dict` -- the result of `get_joint_pop_dict()`
      * `emd_dict`       -- `{feature_name: {well_id: [emd per timestep, ...]}}`

    When the user parameter `write` is True, a header row `['emd']` followed by
    one row per timestep is passed to `append_row_csv` for a file named
    `<feature>_<well>_emd_list.csv` under `results_fpath + emd_ext`.
    When `debug` is True, the final `emd_dict` is printed.

    NOTE(review): `append_row_csv` presumably appends to an existing file, so
    re-running with `write = True` may accumulate duplicate headers and rows --
    confirm against its definition in `data_processing`.

    Returns None.
    """
    global joint_pop_dict, emd_dict

    joint_pop_dict = get_joint_pop_dict()
    emd_dict = {
        feature_name: {well_id: [] for well_id in well_id_list}
        for feature_name in feature_name_list
    }

    for name, ref in zip(feature_name_list, feature_ref_list):
        for well in well_id_list:
            well_df = experiment_dict[well]
            # list that collects this well's EMD values, one per timestep
            emd_series = emd_dict[name][well]

            if write:
                # one file per (feature, well); the first appended row is the column header
                csv_fpath = results_fpath + emd_ext + f'{name}_{well}_emd_list.csv'
                append_row_csv(csv_fpath, ['emd'])

            for t in timestep_list:
                at_timestep = well_df[timestep_ref] == t
                well_values = list(well_df.loc[at_timestep][ref])

                emd = emd_fn(well_values, joint_pop_dict[name][t])
                emd_series.append(emd)

                if write:
                    append_row_csv(csv_fpath, [emd])

    if debug:
        print('emd_dict:', type(emd_dict), '\n', emd_dict)
        

In [5]:
# call to `main()` function; skipped entirely when the user parameter `run` is False
if run:
    main()

emd_dict: <class 'dict'> 
 {'gfp': {'01': [126.55424317933243, 54.635039599170504, 48.88952954219439, 33.50753482584888, 31.013389583743802, 51.529646373338196, 44.63617382421698, 39.3567244977013, 28.85544460257883, 28.70394144110541, 50.55016864244536, 47.862052871338314, 78.55565754000996, 63.385296789743194, 41.41122639759775, 45.65990795099215, 33.79274110405533, 34.38072859089556, 49.223162870597825, 19.1577057708774, 23.809644502510128, 20.466901244151966, 32.757594237922135, 31.04725241138719, 33.75853231814246, 37.1102192500348, 30.243287975013498, 23.39028206048511, 34.00714228978501, 27.895642935264554, 23.577046506190484, 30.11837871909742, 29.415181052129082, 23.569833406542525, 24.423637904865505, 23.22490649139421, 39.37096756860828, 38.62107995479005, 46.15299634816039, 62.54531647529011, 63.81532654254738, 57.79458000285182, 62.15280214503982, 62.68355325940017, 64.40787208461131, 58.027413571575195, 59.31839157941931, 64.71685541547399, 71.91654654541162, 63.557105472