# Statistical analysis comparing PEA test set (MAXMAG clipped) and flight data

This compares the 2022-Dec ASVT data set (with MAXMAG clipped) to flight data since
2019-July-01 (the approximate start time of MAXMAG clipping in flight products).

Summary: There is reasonable agreement for all box sizes.

See the 2023x01x04 Aspect TWiki notes for the 6 notebooks, one for each of the halfwidths
60, 80, 100, 120, 140, and 160 arcsec.

Reference page:
https://occweb.cfa.harvard.edu/twiki/bin/view/Aspect/PeaAcqModelCalDec2022Testing

In [None]:
import numpy as np
from pathlib import Path
import os
from collections import Counter
import itertools

from scipy.stats import binom
from astropy.table import Table
import matplotlib.pyplot as plt
from matplotlib import patches
import tables
from cxotime import CxoTime
import agasc

from utils_asvt import flatten_pea_test_data, read_twiki_csv

%matplotlib inline

In [None]:
HALFWIDTH = 60  # Do analysis for one search box size

In [None]:
SKA = Path(os.environ['SKA'])

In [None]:
# The PEA test results CSV is addressed as "<TWiki topic>/<file name>" and read
# with the `read_twiki_csv` helper from utils_asvt.
# NOTE(review): presumably this fetches the attachment from the TWiki reference
# page given at the top of the notebook -- confirm in utils_asvt.
topic = 'PeaAcqModelCalDec2022Testing/'
name = 'pea_analysis_2022_336_AcqProbModel_calibration_test_results.csv'
dat8 = read_twiki_csv(topic + name)

In [None]:
dat = flatten_pea_test_data(dat8)

In [None]:
', '.join(sorted(dat.colnames))

In [None]:
Counter(dat['ccd_temp'])

In [None]:
# plt.hist(datf['star_mag'], bins=20)
Counter(dat['star_mag'])

In [None]:
asvt = dat[dat['search_box_hw'] == HALFWIDTH]

In [None]:
set(asvt['star_mag'])

In [None]:
set(asvt['ccd_temp'])

In [None]:
set(asvt['search_box_hw'])

In [None]:
acq_file = SKA / 'data' / 'acq_stats' / 'acq_stats.h5'

# Mapping of the column name used in this notebook -> the column name in the
# acquisition statistics HDF5 table.
names = {
    'tstart': 'guide_tstart',
    'obsid': 'obsid',
    'obc_id': 'acqid',
    'halfwidth': 'halfw',
    'mag_aca': 'mag_aca',
    'mag_obs': 'mag_obs',
    'known_bad': 'known_bad',
    'color': 'color1',
    'img_func': 'img_func',
    'ion_rad': 'ion_rad',
    'sat_pix': 'sat_pix',
    'agasc_id': 'agasc_id',
    't_ccd': 'ccd_temp',
    'slot': 'slot',
}

with tables.open_file(acq_file, 'r') as h5:
    cols = h5.root.data.cols
    # Slice every requested column with [:] while the file is still open so
    # the values are read out before the file is closed.
    acq_data = [getattr(cols, h5_name)[:] for h5_name in names.values()]

# Flight acquisition table using the notebook-side column names
acqs = Table(acq_data, names=list(names))


In [None]:

# Flight sample selection:
#  - acquisitions after 2019-07-01 only,
#  - no color == 1.5 stars,
#  - no rows with the ion_rad or sat_pix flags set, since those flags don't get
#    generated in ASVT data,
#  - only the search box halfwidth being analyzed (HALFWIDTH).
ok0 = acqs['tstart'] > CxoTime('2019-07-01').secs
ok1 = ~np.isclose(acqs['color'], 1.5)
ok = ok0 & ok1
for flag_name in ('ion_rad', 'sat_pix'):
    ok = ok & ~acqs[flag_name]
ok = ok & (acqs['halfwidth'] == HALFWIDTH)

flt = acqs[ok]
# Duplicate the CCD temperature under the column name used by the ASVT table
flt['ccd_temp'] = flt['t_ccd']

In [None]:
len(flt)

In [None]:
# Look up the flight stars in the AGASC and attach the catalog magnitude (as
# `star_mag`, the same column name used in the ASVT table) and MAG_CATID.
# NOTE(review): assumes agasc.get_stars returns one row per input AGASC ID in
# the same order as `flt` -- confirm against the agasc package documentation.
stars = agasc.get_stars(flt['agasc_id'])
flt['star_mag'] = stars['MAG_ACA']
flt['mag_catid'] = stars['MAG_CATID']

In [None]:
np.count_nonzero(flt['mag_catid'] == 100)

In [None]:
bad_stars = agasc.get_supplement_table('bad')
len(bad_stars)

In [None]:
# Flag flight rows whose AGASC ID appears in the 'bad' supplement table, report
# how many rows were flagged, and drop them from the flight sample.
bad = np.isin(flt['agasc_id'], bad_stars['agasc_id'])
print(np.count_nonzero(bad))
flt = flt[~bad]

In [None]:
def get_vals_and_bins(vals):
    """Return the sorted unique values of ``vals`` and bin edges around them.

    The interior bin edges are the midpoints between adjacent unique values.
    The outermost edges are 0.5 below the smallest value and 0.5 above the
    largest value, so there is exactly one bin per unique value.

    :param vals: iterable of numeric values (must not be empty)
    :returns: tuple ``(unique_vals, bin_edges)`` of numpy arrays, where
        ``bin_edges`` has one more element than ``unique_vals``
    """
    uniq = np.array(sorted(set(vals)))
    midpoints = (uniq[1:] + uniq[:-1]) / 2
    lower_edge = [uniq[0] - 0.5]
    upper_edge = [uniq[-1] + 0.5]
    edges = np.concatenate([lower_edge, midpoints, upper_edge])
    return uniq, edges

In [None]:
t_ccd_vals, t_ccd_bins = get_vals_and_bins(asvt['ccd_temp'])
mag_vals, mag_bins = get_vals_and_bins(asvt['star_mag'])

In [None]:
print(t_ccd_bins)
print(t_ccd_vals)

In [None]:
# Template array of integer zeros with one row per magnitude value and one
# column per CCD temperature value. The per-bin count arrays are independent
# copies of it.
zeros = np.zeros(shape=(len(mag_vals), len(t_ccd_vals)), dtype=int)
n_samp_asvt = zeros.copy()  # number of ASVT samples per (mag, T_ccd) bin
n_succ_asvt = zeros.copy()  # number of ASVT search successes per bin

In [None]:
# Count the ASVT samples and search successes in every (mag, T_ccd) bin.
#
# Each bin is the half-open interval [lower edge, upper edge) taken from
# consecutive entries of `mag_bins` / `t_ccd_bins`. Those edges were built
# around the distinct ASVT values, so each distinct value has its own bin.
for ii, (mag0, mag1) in enumerate(zip(mag_bins[:-1], mag_bins[1:])):
    # The magnitude selection does not depend on the T_ccd bin
    ok0 = (asvt['star_mag'] >= mag0) & (asvt['star_mag'] < mag1)
    for jj, (t_ccd0, t_ccd1) in enumerate(zip(t_ccd_bins[:-1], t_ccd_bins[1:])):
        ok1 = (asvt['ccd_temp'] >= t_ccd0) & (asvt['ccd_temp'] < t_ccd1)
        ok = ok0 & ok1
        n_samp_asvt[ii, jj] = np.count_nonzero(ok)
        n_succ_asvt[ii, jj] = np.count_nonzero(asvt['search_success'][ok])


In [None]:
# # Aggregate binned number of samples and successes for Flight data

# # Bin halfwidths (narrow since ASVT data are all at the same mag, T_ccd)
# for ii, mag0, mag1 in zip(itertools.count(), mag_bins[:-1], mag_bins[1:]):
#     for jj, t_ccd0, t_ccd1 in zip(itertools.count(), t_ccd_bins[:-1], t_ccd_bins[1:]):
#         ok0 = (flt['star_mag'] >= mag0) & (flt['star_mag'] < mag1)
#         ok1 = (flt['ccd_temp'] >= t_ccd0) & (flt['ccd_temp'] < t_ccd1)
#         ok = ok0 & ok1
#         n_samp_flt[ii, jj] = np.count_nonzero(ok)
#         n_succ_flt[ii, jj] = np.count_nonzero(flt['search_success'][ok])


In [None]:
# Aggregate binned number of samples and successes for flight data.
#
# In the case of a repeated observation of the same star, only the first
# occurrence within each (mag, T_ccd) bin is used if IGNORE_REPEATS is True.
# This reduces bias in the case of a star that is actually much fainter or
# brighter than the catalog mag.
#
# A row counts as a success when its `obc_id` value is truthy.

n_samp_flt = zeros.copy()
n_succ_flt = zeros.copy()
IGNORE_REPEATS = False

for ii, (mag0, mag1) in enumerate(zip(mag_bins[:-1], mag_bins[1:])):
    # The magnitude selection does not depend on the T_ccd bin
    ok0 = (flt['star_mag'] >= mag0) & (flt['star_mag'] < mag1)
    for jj, (t_ccd0, t_ccd1) in enumerate(zip(t_ccd_bins[:-1], t_ccd_bins[1:])):
        ok1 = (flt['ccd_temp'] >= t_ccd0) & (flt['ccd_temp'] < t_ccd1)
        ok = ok0 & ok1

        # Success flag of every flight row that falls in this bin
        success = np.asarray(flt['obc_id'][ok], dtype=bool)
        if IGNORE_REPEATS:
            # `return_index=True` gives the index of the first occurrence of
            # each distinct AGASC ID, i.e. the first observation of each star
            # in this bin. The order of the kept rows does not matter because
            # they are only counted.
            _, first = np.unique(np.asarray(flt['agasc_id'][ok]), return_index=True)
            success = success[first]

        n_samp_flt[ii, jj] = len(success)
        n_succ_flt[ii, jj] = np.count_nonzero(success)

In [None]:
def as_table(arr, fmt=None):
    """Turn one of the per-bin summary arrays into a readable table.

    The output has a ``mag`` column holding the string form of each entry of
    the notebook-level ``mag_vals``, followed by one column per entry of the
    notebook-level ``t_ccd_vals`` (named with the temperature formatted to one
    decimal place) holding the corresponding column of ``arr``.

    :param arr: 2-d array indexed as ``[mag index, T_ccd index]``
    :param fmt: optional format string applied to every temperature column
    :returns: astropy ``Table``
    """
    out = Table()
    out['mag'] = [str(mag) for mag in mag_vals]
    for col_index, t_ccd in enumerate(t_ccd_vals):
        col_name = f"{t_ccd:.1f}"
        out[col_name] = arr[:, col_index]
        if fmt:
            out[col_name].info.format = fmt
    return out
            

In [None]:
as_table(n_samp_flt)

In [None]:
as_table(n_succ_flt)

In [None]:
# Percentage of failures for flight
as_table((1- n_succ_flt / n_samp_flt) * 100, fmt='.1f')

In [None]:
as_table(n_samp_asvt)

In [None]:
as_table(n_succ_asvt)

In [None]:
as_table((1- n_succ_asvt / n_samp_asvt) * 100, fmt='.1f')

In [None]:
def calc_diff_pmf(p, pmf1, pmf2):
    """Compute the cumulative distribution of the difference ``p1 - p2``.

    ``pmf1`` and ``pmf2`` are weights evaluated on the same grid ``p``. Each
    is normalized to unit sum here, so un-normalized inputs are accepted.
    Every pair of grid points ``(p1, p2)`` contributes the product of its two
    normalized weights to the output bin for the difference ``p1 - p2``.

    :param p: 1-d array of grid values, assumed to be uniformly spaced
    :param pmf1: weights of the first quantity at each value of ``p``
    :param pmf2: weights of the second quantity at each value of ``p``
    :returns: tuple ``(x, cdf)`` where ``x`` is the grid of differences, with
        ``2 * int(1 / step) + 1`` points centered on zero and the same spacing
        as ``p``, and ``cdf`` is the cumulative sum of the difference weights
        on that grid
    """
    step = p[1] - p[0]  # grid spacing (uniform grid assumed)
    weights1 = pmf1 / np.sum(pmf1)
    weights2 = pmf2 / np.sum(pmf2)

    # Index of the zero-difference bin in the output
    center = int(1 / step)
    n_bins = 2 * center + 1
    diff_grid = (np.arange(n_bins) - center) * step
    diff_pmf = np.zeros(n_bins)

    for i_first, p_first in enumerate(p):
        # Output bin for the difference between this p1 and every p2 value
        bin_index = np.round((p_first - p) / step).astype(int) + center
        diff_pmf[bin_index] += weights1[i_first] * weights2

    return diff_grid, np.cumsum(diff_pmf)

In [None]:
def plot_diff_pmf(k1, n1, k2, n2, title='', l1='', l2=''):
    """Plot two binomial curves versus ``p`` and the CDF of their difference.

    The left panel shows ``binom.pmf(k, n, p)`` as a function of ``p`` for
    each of the two (k, n) pairs. The right panel shows the cumulative
    distribution of the difference computed by ``calc_diff_pmf``, with a
    shaded rectangle spanning the points where that CDF reaches 0.1 and 0.9.

    :param k1: number of successes for the first data set
    :param n1: number of samples for the first data set
    :param k2: number of successes for the second data set
    :param n2: number of samples for the second data set
    :param title: optional title for the left panel
    :param l1: text appended to the legend label of the first curve
    :param l2: text appended to the legend label of the second curve
    """
    step = 0.001
    p = np.arange(0.0 + step / 2, 1.0, step)  # bin centers covering (0, 1)
    pmf1 = binom.pmf(k1, n1, p)
    pmf2 = binom.pmf(k2, n2, p)
    diff_grid, cdf = calc_diff_pmf(p, pmf1, pmf2)

    plt.figure(figsize=(10, 3.5))

    # Left panel: the two curves as a function of p
    plt.subplot(1, 2, 1)
    plt.plot(p, pmf1, label=f'k={k1} n={n1} {l1}')
    plt.plot(p, pmf2, label=f'k={k2} n={n2} {l2}')
    plt.grid()
    if title:
        plt.title(title)
    plt.xlabel('p')
    plt.legend(loc='best')

    # Right panel: CDF of the difference, with the 10% - 90% range shaded
    ax = plt.subplot(1, 2, 2)
    plt.plot(diff_grid, cdf)
    plt.grid()
    plt.title('CDF of difference')
    i10, i90 = np.searchsorted(cdf, [0.1, 0.9])
    p10 = diff_grid[i10]
    p90 = diff_grid[i90]
    patch = patches.Rectangle((p10, 0.1), p90 - p10, 0.8, fc='r', alpha=0.2, ec='k')
    ax.add_patch(patch)
    plt.xlim(-0.5, 0.5)

    

## Binomial distribution

http://mathworld.wolfram.com/BinomialDistribution.html

![Binomial PMF](http://mathworld.wolfram.com/images/equations/BinomialDistribution/Inline8.gif)

In [None]:
plot_diff_pmf(30, 100, 50, 100)

In [None]:
plot_diff_pmf(3, 10, 5, 10)

## Compute CDF of difference between flight and PEA test set data

Do this only for the faintest three bins in mag: 10-10.25 10.25-10.5 10.5-11

Note: the code below selects bins by requiring at least 30 flight samples in the bin.

In [None]:
print('Success Probability Distributions')
ni, nj = n_samp_flt.shape

# Differences at which the CDF of (flight - PEA) reaches 0.1 and 0.9 in each
# bin. Bins that are skipped below keep the initial value of 0.
dp10 = np.zeros(shape=(ni, nj))
dp90 = np.zeros(shape=(ni, nj))

# Grid of p values (bin centers) used for the dp10 / dp90 calculation
dp = 0.01
p = np.arange(0.0 + dp/2, 1.0, dp)

for ii, jj in np.ndindex(ni, nj):
    # Skip bins with fewer than 30 flight samples
    if n_samp_flt[ii, jj] < 30:
        continue

    k1, n1 = n_succ_flt[ii, jj], n_samp_flt[ii, jj]
    k2, n2 = n_succ_asvt[ii, jj], n_samp_asvt[ii, jj]

    title = f'mag={mag_vals[ii]} T_ccd={t_ccd_vals[jj]}'
    plot_diff_pmf(k1, n1, k2, n2, title, 'flight', 'PEA')

    pmf1 = binom.pmf(k1, n1, p)
    pmf2 = binom.pmf(k2, n2, p)
    # NOTE: from here on `dp` is the grid of differences returned by
    # calc_diff_pmf, not the scalar step defined above. `p` has already been
    # computed, so the rebinding does not affect later iterations.
    dp, cdf = calc_diff_pmf(p, pmf1, pmf2)

    i10, i90 = np.searchsorted(cdf, [0.1, 0.9])
    dp10[ii, jj], dp90[ii, jj] = dp[i10], dp[i90]
