# Assessing Models

In [None]:
from datascience import *
%matplotlib inline
path_data = '../../../assets/data/'
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
import warnings
# NumPy 1.25 introduced numpy.exceptions and NumPy 2.0 removed the top-level
# np.VisibleDeprecationWarning alias, so look the class up wherever it exists.
try:
    from numpy.exceptions import VisibleDeprecationWarning
except ImportError:  # NumPy < 1.25 has no numpy.exceptions module
    VisibleDeprecationWarning = np.VisibleDeprecationWarning
warnings.simplefilter(action='ignore', category=VisibleDeprecationWarning)

## Simulating Jury Panels Picked at Random ##

In [None]:
# Number of individuals drawn in each simulated sample.
sample_size = 100
# Model proportions of the two categories. The cells below read only the
# first category (index 0) of each simulated sample.
eligible = [0.26, 0.74]

In [None]:
# One simulated sample: the proportion of the draws that fell in the first
# category of `eligible`.
sample_proportions(sample_size, eligible).item(0)

In [None]:
def simulate_one_count():
    """Simulate one random sample and return the count in the first category.

    Draws ``sample_size`` individuals according to the proportions in
    ``eligible`` and converts the first category's sample proportion back
    into a count.

    Returns:
        float: A whole-number count between 0 and ``sample_size``.
    """
    proportion = sample_proportions(sample_size, eligible).item(0)
    # proportion * sample_size can miss the integer by a rounding error
    # (0.29 * 100 == 28.999999999999996), so snap to the nearest whole number.
    return float(round(sample_size * proportion))

In [None]:
# Repeat the simulation 10,000 times. Collecting the results in a list and
# converting once avoids re-copying the whole array on every np.append call.
counts = np.array([simulate_one_count() for _ in range(10000)], dtype=float)

In [None]:
# Histogram of the simulated counts. The bin edges sit on half-integers
# (5.5, 6.5, ..., 46.5) so each whole-number count from 6 to 46 is centered
# in its own bar.
Table().with_column(
    'Count in a Random Sample', counts
).hist(bins = np.arange(5.5, 46.6, 1))

# Plotting details; ignore this code
plots.ylim(-0.002, 0.09)
# Red dot on the horizontal axis at a count of 8 -- presumably the observed
# count being compared with the simulation; confirm against the course text.
plots.scatter(8, 0, color='red', s=30);

## Multiple Categories ##

In [None]:
# One row per ethnicity: its share of the eligible population ('Eligible')
# and its share of the jury panels ('Panels'). Each numeric column sums to 1.
jury = (
    Table()
    .with_column(
        'Ethnicity',
        make_array('Asian/PI', 'Black/AA', 'Caucasian', 'Hispanic', 'Other'),
    )
    .with_column('Eligible', make_array(0.15, 0.18, 0.54, 0.12, 0.01))
    .with_column('Panels', make_array(0.26, 0.08, 0.54, 0.08, 0.04))
)

jury

In [None]:
# Horizontal bar chart with one group of bars per ethnicity, comparing the
# 'Eligible' and 'Panels' proportions.
jury.barh('Ethnicity')

In [None]:
# Array of the 'Eligible' proportions, in the row order of `jury`; this is the
# distribution that the random samples below are drawn from.
eligible_population = jury.column('Eligible')

In [None]:
# Category proportions in one random sample of 1453 draws from the eligible
# distribution (1453 is presumably the total number of panelists -- confirm).
sample_proportions(1453, eligible_population)

In [None]:
# Draw one random sample and attach its distribution to the table as a third
# numeric column, next to 'Eligible' and 'Panels'.
sample_distribution = sample_proportions(1453, eligible_population)
panels_and_sample = jury.with_column('Random Sample', sample_distribution)
panels_and_sample

In [None]:
# Same bar chart as before, now with the 'Random Sample' distribution drawn
# alongside 'Eligible' and 'Panels'.
panels_and_sample.barh('Ethnicity')

## Distance Between Two Distributions ##

In [None]:
# Redraw the 'Eligible' vs 'Panels' comparison whose gap is quantified below.
jury.barh('Ethnicity')

In [None]:
# Signed gap per ethnicity: panel proportion minus eligible proportion.
# Both columns sum to 1, so these differences sum to 0.
jury_with_diffs = jury.with_column(
    'Difference', jury.column('Panels') - jury.column('Eligible')
)
jury_with_diffs

In [None]:
# Drop the signs so that positive and negative gaps no longer cancel out.
jury_with_diffs = jury_with_diffs.with_column(
    'Absolute Difference', np.abs(jury_with_diffs.column('Difference'))
)

jury_with_diffs

In [None]:
# Half the sum of the absolute differences. Because the signed differences
# sum to 0, the positive gaps and the negative gaps have the same total;
# halving counts that total once.
jury_with_diffs.column('Absolute Difference').sum() / 2

In [None]:
def total_variation_distance(distribution_1, distribution_2):
    """Return the total variation distance between two distributions.

    The distance is half the sum of the absolute differences between
    corresponding proportions.

    Args:
        distribution_1: Array-like of proportions (NumPy array, list or
            tuple), one entry per category.
        distribution_2: Array-like of proportions over the same categories,
            in the same order.

    Returns:
        The total variation distance; 0 when the two distributions are
        identical.
    """
    # np.asarray lets plain lists and tuples be subtracted element-wise.
    gaps = np.asarray(distribution_1) - np.asarray(distribution_2)
    # Summing along axis 0 gives a scalar for 1-D distributions (and one
    # distance per column if 2-D stacks of distributions are passed).
    return np.sum(np.abs(gaps), axis=0) / 2

In [None]:
# Distance between the panel distribution and the eligible distribution.
# For the table above this is about 0.14 (half of 0.11 + 0.10 + 0 + 0.04 + 0.03).
panels = jury.column('Panels')
total_variation_distance(panels, eligible_population)

In [None]:
# For comparison: the distance between one random sample of 1453 draws and
# the eligible distribution it was drawn from.
sample_distribution = sample_proportions(1453, eligible_population)
total_variation_distance(sample_distribution, eligible_population)

## Simulating TVD Under the Model of Random Selection ##

In [None]:
def one_simulated_tvd(panel_size=1453):
    """Simulate one random panel and return its TVD from the eligible population.

    Args:
        panel_size: Number of people drawn at random for the simulated panel.
            Defaults to 1453, the sample size used throughout this notebook.

    Returns:
        The total variation distance between the simulated sample's
        distribution and ``eligible_population``.
    """
    simulated = sample_proportions(panel_size, eligible_population)
    return total_variation_distance(simulated, eligible_population)

In [None]:
num_simulations = 10000
# Build the array in one step; growing it with np.append would copy every
# earlier result again on each iteration.
tvds = np.array([one_simulated_tvd() for _ in range(num_simulations)])

### Prediction and Reality ###

In [None]:
# Histogram of the simulated TVDs, using bins of width 0.005.
Table().with_column('TVD', tvds).hist(bins=np.arange(0, 0.2, 0.005))

# Plotting parameters; you can ignore this code
plots.title('Prediction Assuming Random Selection')
plots.xlim(0, 0.15)
plots.ylim(-5, 50)
# Red dot at 0.14 on the horizontal axis: the TVD between the 'Panels' and
# 'Eligible' columns of `jury`, for comparison with the simulated values.
plots.scatter(0.14, 0, color='red', s=30);