In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from astropy.io import fits

In [None]:
# reset default plotting values for the whole notebook:
# figure size, font family, axes label/title/frame styling, tick label size
plt.rc('figure', figsize=(10, 5))
plt.rc('font', family='sans-serif')
plt.rc(
    'axes',
    labelsize=14,
    labelweight='bold',
    titlesize=16,
    titleweight='bold',
    linewidth=2,
)
# same label size for the tick labels on both axes
plt.rc(('xtick', 'ytick'), labelsize=14)

# $\sigma$-Clipping
## removing outliers from your data

![]()

### Prof. Robert Quimby
&copy; 2019 Robert Quimby

## In this tutorial you will...

- Identify outliers from a sample population
- Compare sample mean and STD with and without the outliers
- Employ iterative techniques to identify outliers
- Check if "outliers" are actually consistent with the expected distribution

## Let's use a real bias image for our sample

In [None]:
# load a bias frame
# The path is relative, so the notebook must be run from the directory that
# contains `media/`. The result is reused as `bias` by the cells below
# (presumably a 2-D pixel array, since it is later passed to plt.imshow).
bias = fits.getdata('media/bias.fits')

In [None]:
# plot a histogram of the pixel values
# Exercise: replace each ???? with the lower / upper count limit of the histogram.
# NOTE(review): `bmax-bmin+1` is passed as the number of bins, so the limits
# presumably need to be integers -- confirm when filling in the blanks.
bmin = ????
bmax = ????
# flatten() turns the image into a 1-D sample; the bin count gives roughly one
# bin per count value between bmin and bmax.
# NOTE(review): `width=10` is forwarded to the drawn bars, so they will
# presumably appear wider than the bins themselves -- confirm this is intended.
plt.hist(bias.flatten(), range=(bmin, bmax), bins=bmax-bmin+1, width=10);
plt.xlabel('Counts'), plt.ylabel('Number'), plt.title('Count Distribution (Bias Frame)');
# logarithmic y axis (presumably so sparsely populated bins stay visible)
plt.yscale('log')

## Checking for Outliers

In [None]:
# Exercise: replace each ???? below.
# `med` and `std` are also used by the following cell to set the image display
# range (vmin=med-std, vmax=med+5*std).
med = ????
std = ????

# locate outliers
# `w` must provide .sum() (printed as the outlier count) and is used in the
# following cell to index arrays shaped like `bias` -- presumably a boolean mask.
w = ????
print("found {} outliers".format(w.sum()))

In [None]:
# display the bias frame and mark every flagged pixel with a red dot
plt.figure(figsize=(10, 10))
# gray-scale stretch runs from one `std` below `med` to five `std` above it
plt.imshow(bias, cmap='gray', origin='lower', vmin=med - std, vmax=med + 5 * std)
# j holds the row index and i the column index of each pixel;
# indexing with `w` keeps only the coordinates of the flagged pixels
j, i = np.indices(bias.shape)
plt.plot(i[w], j[w], color='r', marker='o', linestyle='None');

## $\sigma$-clipping

* one (often dangerous!) way to get rid of outliers in a distribution

In [None]:
# sample mean and sample STD (ddof=1) over every pixel, outliers included
# NOTE: this rebinds `std`, which the image-display cell above also reads;
# re-running that cell afterwards will use this new value.
mean = bias.mean()
std = np.std(bias, ddof=1)
print(mean, std, bias.size)

In [None]:
# identify the (non)outliers
# Exercise: replace ???? with a selection of the pixels to keep
# (presumably based on `mean` and `std` from the previous cell).
w = ????

# now remove outliers and re-calculate mean, STD
# Exercise: replace ???? with the pixels of `bias` that survive the cut.
sample = ????
# NOTE(review): if `sample` is a NumPy array, sample.std() uses the default
# ddof=0, whereas the previous cell used ddof=1 -- confirm which is intended.
print(sample.mean(), sample.std(), sample.size)

## Iterative $\sigma$-clipping

* can be very (dangerous) useful

In [None]:
def sigclip(sample, nsigma=5.0, niter=100):
    """Iteratively sigma-clip `sample` (exercise template; ???? blanks remain).

    Each call measures the median and the sample standard deviation (ddof=1)
    of `sample`, flags the acceptable points in `isok`, and either stops or
    recurses.

    Parameters
    ----------
    sample : array
        Values to clip. Must work with np.median / np.std and expose `.size`
        (presumably a NumPy array).
    nsigma : float, optional
        Clipping threshold; presumably the number of standard deviations from
        the median used in the `isok` blank -- to be filled in.
    niter : int, optional
        Remaining iteration budget; clipping stops once it is <= 0.

    Returns
    -------
    `sample` itself when the stopping condition is met; otherwise whatever the
    recursive call (left blank) returns.
    """
    # check for outliers
    med = np.median(sample)
    std = np.std(sample, ddof=1)
    # Exercise: `isok` must provide .sum(), which is compared with sample.size
    # below -- presumably a boolean mask of the points to keep.
    isok = ????

    # are we done sigma clipping?
    # stop when the iteration budget is used up or when nothing was rejected
    if (niter <= 0) or (isok.sum() == sample.size):
        return sample

    # recursive call
    # Exercise: presumably recurse on the accepted points with a reduced
    # `niter`, so that the budget check above can terminate the recursion.
    return ????

In [None]:
# test it out
# NOTE: this rebinds `sample`, replacing the one-pass result from the earlier cell.
sample = sigclip(bias)
# Exercise: replace ???? with the number of removed points
# (presumably the difference between bias.size and sample.size).
print("removing {} outliers:".format(????))
print("  bias mean is {:.4f}".format(sample.mean()))
# NOTE(review): sample.std() is called without ddof, unlike the ddof=1 used
# inside sigclip -- confirm which is intended.
print("  bias std is {:.4f}".format(sample.std()))

## $\sigma$-clipping is a common task

In [None]:
# Exercise: replace ???? with the name of the sigma-clipping routine
# provided by astropy.stats.
from astropy.stats import ????

# Exercise: replace ???? with a call to that routine on `bias`; the result
# must provide .mean() and .std() for the prints below.
sample = ????
print("bias mean is {:.4f}".format(sample.mean()))
print("bias std is {:.4f}".format(sample.std()))

## Should I clip at $3\sigma$, $4\sigma$, or $5\sigma$?

- Think about your data
- How many outliers would you expect by random chance?

## Consider the cumulative distribution of the parent population

In [None]:
# 100 evenly spaced points from -5 to 5 (presumably in units of sigma)
x = np.linspace(-5, 5, 100)
# Exercise: replace ???? with the cumulative distribution of the parent
# population evaluated at x (scipy.stats is imported at the top of the
# notebook, presumably for this purpose).
plt.plot(x, ????)
plt.grid()

In [None]:
# how many bias pixels do we expect to be n-sigma larger than the mean?
# Exercise: replace each ???? below.
#   nsigma -- the clipping threshold under consideration
#   ndata  -- the number of sample points (presumably the number of bias pixels)
#   expect -- the expected number of points beyond nsigma; it is formatted
#             with {:.2f}, so it must be a number
nsigma = ????
ndata = ????
expect = ????
print("expect {:.2f} sample points {}-sigma from the mean".format(expect, nsigma))