In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import pickle
import seaborn as sns
import skimage
import sys

# Make the in-repo `ampis` package importable: its source directory is located
# relative to the current working directory and appended to sys.path.
ampis_root = pathlib.Path('../../../src/')
# Fail early (before the ampis import below) if the relative path does not
# resolve, e.g. because the notebook is run from a different directory.
assert ampis_root.is_dir()
sys.path.append(str(ampis_root))
from ampis.analyze import ordinal_hist_distance
from scipy.stats import ks_2samp

In [None]:
# Load the pickled validation instance sets; the result is indexed by the keys
# 'gt_val' and 'pred_val' further down.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load pickles produced by a trusted source.
with open('powder_particle_cval_validation_instance_sets.pickle', 'rb') as f:
    particles = pickle.load(f)



In [None]:
# Ground-truth and predicted validation collections from the loaded pickle.
gt = particles['gt_val']
pred = particles['pred_val']

# Both collections are nested two levels deep; gather the `rprops` attribute
# of every inner item into one flat list per collection, preserving order.
gt_rprops = [item.rprops for group in gt for item in group]
pred_rprops = [item.rprops for group in pred for item in group]

# Stack the per-item tables into one table for ground truth and one for
# predictions (assumes each `rprops` is a pandas object -- TODO confirm).
df_gt = pd.concat(gt_rprops)
df_pred = pd.concat(pred_rprops)

In [None]:
# Display the number of rows in the ground-truth and predicted tables.
len(df_gt), len(df_pred)

In [None]:
def _shared_bin_edges(xdata, ydata, bin_width_k=1):
    """Return evenly spaced histogram bin edges that cover both samples.

    The range spans the default ``np.histogram`` edges of both samples. The
    baseline resolution is the narrower of the two default bin widths, and
    the resulting number of edges is multiplied by ``bin_width_k``.
    """
    _, xbins = np.histogram(xdata)
    _, ybins = np.histogram(ydata)

    bin_min = min(xbins.min(), ybins.min())
    bin_max = max(xbins.max(), ybins.max())
    minbinsize = min(xbins[1] - xbins[0], ybins[1] - ybins[0])

    n_edges = int(((bin_max - bin_min) / minbinsize + 1) * bin_width_k)
    # At least two edges are needed to define a single bin; a very small
    # bin_width_k would otherwise yield an unusable edge array.
    n_edges = max(n_edges, 2)

    return np.linspace(bin_min, bin_max, num=n_edges)


def _counts_on_common_support(xdata, ydata):
    """Count occurrences of each distinct value of both samples on one axis.

    Returns ``(support, counts_x, counts_y)`` where ``support`` is the sorted
    union of distinct values and the count arrays are aligned with it
    (zero where a value does not occur in that sample).
    """
    xunique, xcounts = np.unique(xdata, return_counts=True)
    yunique, ycounts = np.unique(ydata, return_counts=True)

    support = np.unique(np.concatenate((xunique, yunique)))
    counts_x = np.zeros(len(support), dtype=int)
    counts_y = np.zeros(len(support), dtype=int)

    # `support` is sorted and contains every value of xunique / yunique, so
    # searchsorted returns the exact position of each value without building
    # an (n_unique x n_support) comparison matrix.
    counts_x[np.searchsorted(support, xunique)] = xcounts
    counts_y[np.searchsorted(support, yunique)] = ycounts

    return support, counts_x, counts_y


def quickplot(key, x=df_gt, y=df_pred, log=False, bin_width_k=1):
    """Compare the distribution of one column between two tables.

    Draws a two-panel figure: overlaid histograms on shared bins (left) and
    empirical cumulative distributions annotated with the two-sample
    Kolmogorov-Smirnov statistic and p-value (right). Afterwards prints the
    value of ``ordinal_hist_distance`` for the two histograms and the bin size.

    Parameters
    ----------
    key : str
        Column to compare; looked up as ``x[key]`` and ``y[key]``.
    x, y : table-like
        Sources of the 'gt' and 'pred' samples. ``x[key]`` / ``y[key]`` must
        provide ``to_numpy``. The defaults are the ``df_gt`` / ``df_pred``
        objects that exist when this ``def`` is executed; re-run this cell
        after rebuilding those tables, otherwise the old ones are still used.
    log : bool
        If True, compare log10 of the values instead of the raw values.
    bin_width_k : float
        Multiplier on the *number* of bin edges: larger values give more,
        narrower bins (despite the parameter name).

    Raises
    ------
    ValueError
        If ``log`` is True and either sample contains a non-positive value.
    AssertionError
        If the generated bins are not (nearly) uniform in width.
    """
    # Builtin float replaces the np.float alias, which was removed from NumPy.
    xdata = x[key].to_numpy(dtype=float)
    ydata = y[key].to_numpy(dtype=float)

    colors = sns.color_palette('bright', 2)

    if log:
        # log10 of zero/negative values gives -inf/nan, which previously only
        # surfaced later as an obscure "range is not finite" ValueError.
        if (xdata <= 0).any() or (ydata <= 0).any():
            raise ValueError(
                "log=True requires strictly positive values in column {!r}".format(key))
        xdata = np.log10(xdata)
        ydata = np.log10(ydata)

    new_bins = _shared_bin_edges(xdata, ydata, bin_width_k)

    gt_hist, _ = np.histogram(xdata, bins=new_bins)
    pred_hist, _ = np.histogram(ydata, bins=new_bins)

    fig, ax = plt.subplots(1, 2, figsize=(8, 5), dpi=300)
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases;
    # migrate to sns.histplot once the minimum seaborn version is confirmed.
    sns.distplot(xdata, bins=new_bins, color=colors[0], label='gt', ax=ax[0])
    sns.distplot(ydata, bins=new_bins, color=colors[1], label='pred', ax=ax[0])

    xlabel = key if not log else 'log ' + key
    ax[0].set_xlabel(xlabel)
    ax[0].set_ylabel("relative frequency")
    ax[0].legend(fontsize=14)

    ### ks test
    # Empirical CDFs of both samples evaluated on the union of their values.
    all_unique, dist_x, dist_y = _counts_on_common_support(xdata, ydata)

    ax[1].plot(all_unique, dist_x.cumsum() / dist_x.sum(), color=colors[0], label='gt')
    ax[1].plot(all_unique, dist_y.cumsum() / dist_y.sum(), color=colors[1], label='pred')
    ax[1].set_xlabel(xlabel)
    ax[1].set_ylabel("cumulative fraction")
    ax[1].legend()

    # D is the KS statistic; the p-value is the probability, *assuming* both
    # samples come from the same distribution, of a D at least this large.
    # It is not the probability that the distributions are the same.
    # TODO double check this, p values seem really small for very similar
    # looking distributions. NOTE(review): plausibly a sample-size effect (the
    # test gets more sensitive as samples grow) and/or many tied values --
    # confirm against the ks_2samp documentation.
    Dvalue, pvalue = ks_2samp(xdata, ydata)

    title_str = 'D: {:.3f}\np(same dist):{}'.format(Dvalue, pvalue)

    ax[1].set_title(title_str)

    plt.show()

    print('Normalized distance between histograms: {:.3f}'.format(ordinal_hist_distance(gt_hist, pred_hist)))
    binsizes = new_bins[1:] - new_bins[:-1]
    # Sanity check that the bins are uniform, so a single bin size is meaningful.
    assert (binsizes.max() - binsizes.min()) / binsizes.min() < 0.001
    print('Bin size: {:.3f}'.format(binsizes[0]))
    
    
    
    
    
    
    

In [None]:
# Compare the 'area' column of gt vs. pred on a linear scale;
# bin_width_k=3 triples the number of histogram bin edges (finer bins).
quickplot('area', bin_width_k = 3)

In [None]:
# Same comparison for 'area' after a log10 transform, with twice the
# baseline number of bin edges.
quickplot('area', log=True, bin_width_k = 2)

In [None]:
# Compare the 'solidity' column of gt vs. pred on a linear scale,
# with three times the baseline number of bin edges.
quickplot('solidity', bin_width_k = 3)

In [None]:
# Compare the 'major_axis_length' column of gt vs. pred on a linear scale,
# with three times the baseline number of bin edges.
quickplot('major_axis_length', bin_width_k = 3)

In [None]:
# Same comparison for 'major_axis_length' after a log10 transform, with
# 1.5 times the baseline number of bin edges.
quickplot('major_axis_length', log=True, bin_width_k=1.5)