In [None]:
# notebook import 

from ipynb.fs.full.data_processing import *

In [None]:
# user input parameters

# set to True to also write every figure to disk (figures are always displayed)
save = False

# display name used in figure legends for each binning method name
proper_method = {
    'fd': 'Fd',
    'doane': 'Doane',
    'scott': 'Scott',
    'stone': 'Stone',
    'rice': 'Rice',
    'sturges': 'Sturges',
    'sqrt': 'Sqrt',
}

# color palette for the vertical lines marking each binning method's suggested number of bins
colors = [
    'lightcoral',
    'lightsalmon',
    'greenyellow',
    'darkturquoise',
    'lightskyblue',
    'slateblue',
    'orchid',
]

In [None]:
# function declaration

# input: a number of bins (n_bins) and a series of measurements (series) corresponding to a feature of interest (feature)
# output: None (plots the series of measurements as a density-normalized histogram with the specified number of bins)
# note: `plt`, `proper_feature`, `results_fpath` and `bin_selection_ext` are not defined in this notebook;
#       presumably they are provided by the `data_processing` star import -- TODO confirm
# note: when `save` is True the figure is also written under `distribution_plots/`;
#       this function does not create that directory, so it is assumed to exist already
def plot_distribution(n_bins, series, feature):
    plt.hist(series, bins=n_bins, density=True)

    # the bin count shown in the title and file name is the `n_bins` argument
    plt.title(f'{proper_feature[feature]} distribution with {n_bins} bins')
    plt.xlabel(f'{proper_feature[feature]}')
    plt.ylabel('Density')
    # no legend here: the histogram is the only artist and it carries no label,
    # so a legend call would have nothing to show and only trigger a matplotlib warning

    if save:
        plt.savefig(results_fpath+bin_selection_ext+f'/distribution_plots/{feature}_distribution_{n_bins}_bins.png', dpi=300,  bbox_inches='tight')

    plt.show()
    # this function is called once per bin count, so release the figure explicitly
    # to avoid accumulating open figures on backends where `show()` does not close them
    plt.close()

# input: list of binning methods (method_name_list) and the number of bins they suggest (method_bin_list), a set of number of bins (bin_range) and their corresponding entropies (entropies), and the optimal number of bins (optimal_n_bins) for a feature of interest (feature)
# output: None (plots entropy as a function of number of bins, marking the optimal number of bins and each method's suggestion with vertical lines)
# note: the input lists are left unmodified
# note: `plt`, `proper_feature`, `results_fpath` and `bin_selection_ext` are not defined in this notebook;
#       presumably they are provided by the `data_processing` star import -- TODO confirm
# note: when `save` is True the figure is also written under `bin_entropies_plots/`;
#       this function does not create that directory, so it is assumed to exist already
def plot_bin_entropies(method_name_list, method_bin_list, bin_range, entropies, optimal_n_bins, feature):
    # pair every suggested bin count with its method and order the pairs by bin count (ties by name);
    # sorting a new list keeps each name attached to its bin count without reordering the caller's lists
    method_pairs = sorted(zip(method_bin_list, method_name_list))

    plt.plot(bin_range, entropies, color='black', label='Entropy')

    plt.axvline(optimal_n_bins, color='red', linestyle='--', label=f'Knee point ({optimal_n_bins})')
    for i, (n_bins, name) in enumerate(method_pairs):
        # wrap around the palette so no method is silently dropped when there are more methods than colors
        plt.axvline(n_bins, color=colors[i % len(colors)], linestyle='--', label=f'{proper_method[name]} ({n_bins})')

    plt.title(f'{proper_feature[feature]} entropy as a function of number of bins')
    plt.xlabel('Number of bins')
    plt.ylabel('Entropy')
    # place the legend outside the axes, to the right, so it does not cover the curve
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    if save:
        plt.savefig(results_fpath+bin_selection_ext+f'/bin_entropies_plots/{feature}_bin_entropies.png', dpi=300,  bbox_inches='tight')

    plt.show()
    # release the figure explicitly for backends where `show()` does not close it
    plt.close()

In [None]:
# `main()` function declaration

# intended use in coordination with `bin_selection.ipynb`
# requires: import of `data_processing.ipynb` and user input parameters specified above
# output: None (reads data files and produces plots of distribution and bin entropies)
# note: `find_data_files`, `create_dataframe_dict`, `feature_df`, `pd`, `results_fpath` and `bin_selection_ext`
#       are not defined in this notebook; presumably they come from the `data_processing` star import -- TODO confirm

def main():

    # find data files
    bin_entropies_files = find_data_files(results_fpath+bin_selection_ext, '*_bin_entropies.csv')
    bin_entropies_methods_files = find_data_files(results_fpath+bin_selection_ext, '*_bin_entropies_methods.csv')
    optimal_bins_files = find_data_files(results_fpath+bin_selection_ext, '*_optimal_bins.csv')

    # create dictionaries keyed by feature
    # (the values are passed to `pd.read_csv` below, so they are presumably file paths -- TODO confirm)
    bin_entropies_dict = create_dataframe_dict(bin_entropies_files, f'{results_fpath}{bin_selection_ext}(.+?)_bin_entropies.csv')
    bin_entropies_methods_dict = create_dataframe_dict(bin_entropies_methods_files, f'{results_fpath}{bin_selection_ext}(.+?)_bin_entropies_methods.csv')
    optimal_bins_dict = create_dataframe_dict(optimal_bins_files, f'{results_fpath}{bin_selection_ext}(.+?)_optimal_bins.csv')

    for feature in bin_entropies_dict:

        # series of measurements for given feature
        feature_series = feature_df[feature]

        # read each .csv file once, then pull the needed columns from the resulting dataframes
        bin_entropies_df = pd.read_csv(bin_entropies_dict[feature])
        methods_df = pd.read_csv(bin_entropies_methods_dict[feature])
        optimal_bins_df = pd.read_csv(optimal_bins_dict[feature])

        bin_range = list(bin_entropies_df['n_bins'])
        entropies = list(bin_entropies_df['entropy'])
        method_name_list = list(methods_df['method'])
        method_bin_list = list(methods_df['n_bins'])
        # only the first row of the optimal-bins file is used
        optimal_n_bins = int(optimal_bins_df['n_bins'].iloc[0])

        # plot: one distribution figure per candidate number of bins, then one entropy figure for the feature
        for n_bins in bin_range:
            plot_distribution(n_bins, feature_series, feature)
        plot_bin_entropies(method_name_list, method_bin_list, bin_range, entropies, optimal_n_bins, feature)

In [None]:
# call to `main()` function
# (reads the bin-selection result files and displays one distribution figure per candidate
#  number of bins plus one entropy figure per feature; figures are also saved when `save` is True)

main()