In [1]:
# notebook import

from ipynb.fs.full.data_processing import *

In [2]:
# user input parameters

# boolean parameter to dictate whether code is run
# (gates the call to `main()` in the final cell; when False that cell does nothing)
run = False

# boolean parameter to dictate use of print statements
# (when True, `main()` prints the type and contents of `mi_dict` once it has finished)
debug = False

# boolean parameter to dictate whether results are written to .csv files
# (when True, `main()` appends a header row and then one row per computed MI value)
write = False

# list of timesteps for which to calculate mutual information for all subsets
# (`main()` passes this list through `get_timesteps_unscaled` before using the values to select rows)
timesteps = [1000, 2000, 3000, 4000, 5000]

In [3]:
def get_timestep_population_dict(timestep, feature_ref):
    """Collect, per well, the values of one feature at a single timestep.

    input: the timestep to select (`timestep`) and the column name of the
        feature of interest (`feature_ref`)
    output: dictionary mapping each well id to the list of `feature_ref`
        values found in the rows of that well whose `timestep_ref` column
        equals `timestep`

    Relies on the notebook-level names `well_id_list`, `experiment_dict` and
    `timestep_ref`, none of which are defined in this notebook
    (presumably provided by the `data_processing` import - TODO confirm).
    """
    # every id in `well_id_list` starts with its own empty list, so a well that
    # has no entry in `experiment_dict` still appears in the result
    values_by_well = {well_id: [] for well_id in well_id_list}

    for well_id in experiment_dict:
        frame = experiment_dict[well_id]
        # boolean row mask: rows recorded at the requested timestep
        at_timestep = frame[timestep_ref] == timestep
        rows_at_timestep = frame.loc[at_timestep]
        values_by_well[well_id] = list(rows_at_timestep[feature_ref])

    return values_by_well

def shannon_entropy_fn(population, bins):
    """Shannon entropy of a population after binning it into a histogram.

    input: list of measurements (`population`) and a bin configuration
        (`bins`), forwarded unchanged to `np.histogram` (so either a number
        of bins or a sequence of bin edges)
    output: the value returned by `entropy` for the occupied bins

    `entropy` is called with a single argument and is relied upon to
    normalise the counts to probabilities, as `scipy.stats.entropy` does.
    NOTE(review): `entropy` is not defined in this notebook - confirm that it
    is the SciPy function.
    """
    # Use raw bin counts rather than `density=True`. A density is
    # count / (N * bin_width), so with unequal bin widths the renormalised
    # densities are not the bin probabilities and the entropy is skewed toward
    # narrow bins. Counts normalise to the true bin probabilities; for
    # equal-width bins both forms give the same result.
    counts, _ = np.histogram(population, bins=bins)

    # empty bins contribute nothing to the entropy; drop them before the call
    occupied = counts[counts > 0]
    return entropy(occupied)

def calculate_mi(population_dict, subset_size, bins):
    """Estimate the mutual information across populations at one timestep.

    input: dictionary of populations (`population_dict`), number of values
        randomly drawn from each population to build the joint sample
        (`subset_size`), and bin configuration (`bins`)
    output: entropy of the pooled random sample minus the mean entropy of the
        individual (full) populations, or None if the calculation raised an
        error (a message is printed in that case)

    The draw uses `np.random.choice`, so the result is random and varies
    between calls unless the NumPy global seed is fixed by the caller.
    """
    try:
        # pool `subset_size` randomly chosen values from every population
        joint_pop = np.concatenate(
            [np.random.choice(population_dict[pop], subset_size) for pop in population_dict]
        )
        joint_shannon_entropy = shannon_entropy_fn(joint_pop, bins)

        # per-population entropies are computed on the full populations,
        # not on the random subsets drawn above
        shannon_entropy_list = [
            shannon_entropy_fn(population_dict[pop], bins) for pop in population_dict
        ]
        mean_entropy = sum(shannon_entropy_list) / len(shannon_entropy_list)

        return joint_shannon_entropy - mean_entropy
    except Exception:
        # Best effort: an empty population or empty dictionary yields None
        # rather than aborting the caller's loop. `Exception` (not a bare
        # `except`) lets KeyboardInterrupt / SystemExit propagate, so a
        # long-running notebook cell can still be interrupted.
        print("Error calculating MI.")

        return None

In [4]:
# `main()` function declaration

# intended use in coordination with `bin_validation_plot.ipynb`
# requires: import of `data_processing.ipynb` and user input parameters specified above
# output: None (assigns new global variable `mi_dict` and writes to specified output .csv file)

def main():
    """Compute MI for every (feature, timestep, subset size) combination.

    Takes no arguments and returns None. Results are published through the
    notebook-level globals assigned here:
      - `unscaled`: result of `get_timesteps_unscaled(timesteps)`
      - `bin_edges_dict`: result of `create_bin_edges_dict()`, indexed by feature name
      - `mi_dict`: maps each (feature name, timestep) pair to its list of MI
        values, one per subset size from 1 up to `int(ncells_avg_dict[t])`

    When `write` is true, a header row and one row per MI value are appended
    to the `validation_mi.csv` file via `append_row_csv`. When `debug` is
    true, the finished `mi_dict` is printed.
    """
    global unscaled, bin_edges_dict, mi_dict

    # list of timesteps relative to input data
    unscaled = get_timesteps_unscaled(timesteps)

    # dictionary mapping each feature to its bin edge configuration
    bin_edges_dict = create_bin_edges_dict()

    # start from an empty result on every call
    mi_dict = {}

    # header row of the output file
    if write:
        csv_fpath = results_fpath + bin_validation_ext + 'validation_mi.csv'
        append_row_csv(csv_fpath, ['feature', 'timestep', 'subset_size', 'mi'])

    for name, ref in zip(feature_name_list, feature_ref_list):
        feature_bins = bin_edges_dict[name]

        for t in unscaled:
            # per-well populations of this feature at this timestep
            populations = get_timestep_population_dict(t, ref)
            key = (name, t)
            largest_subset = int(ncells_avg_dict[t])

            for subset_size in range(1, largest_subset + 1):
                mi = calculate_mi(populations, subset_size, feature_bins)

                # the first subset size (re)creates the list, later ones extend it
                if subset_size > 1:
                    mi_dict[key].append(mi)
                else:
                    mi_dict[key] = [mi]

                # one output row per MI value
                if write:
                    append_row_csv(csv_fpath, [f'{name}', f'{t}', f'{subset_size}', f'{mi}'])

    if debug:
        print('mi_dict:', type(mi_dict), '\n', mi_dict)

In [5]:
# call to `main()` function

# `main()` only executes when the user input parameter `run` is True; with `run = False` this cell does nothing.
# NOTE(review): the saved cell output starts with 'mi_dict:', which `main()` prints only when `debug` is True,
# so it appears to come from an earlier run with `run` and `debug` enabled - re-run or clear the output before sharing.
if run:
    main()

mi_dict: <class 'dict'> 
 {('gfp', 76): [-0.7335666588893877, -0.36033356166480557, -0.252653146703397, -0.0006908147082569371, 0.2545483836577791, 0.2574022369942979, 0.11856426673544407, 0.305400088196051, 0.24889623425012974, 0.255285572875354, 0.4739663315052063, 0.40039038939569993, 0.352843304557231, 0.4234941996563637, 0.4430610904757346, 0.4589652911706783, 0.42828085956269213, 0.4695578157914415, 0.415118772300723, 0.35367376392968497, 0.4474943764157344, 0.49721844993585584, 0.460408711488939, 0.5553487102767254, 0.3647684398490947, 0.5238707807617207, 0.5353189090396824, 0.510019158882772, 0.5468531336595741, 0.5761052303859642, 0.4836009468233655, 0.5383348914000181, 0.5541351690534868, 0.5106974371591453, 0.5084663877363402, 0.5231857925526366, 0.5349122304340606, 0.561721413708363, 0.5390668408143351, 0.5602448192808538, 0.5490990185826039, 0.5364285795870902, 0.5202866130238473, 0.5689834303623424, 0.5378645550818404, 0.5321981554691906, 0.5385499695637819, 0.56814265010