# Decisions and Uncertainty

In [None]:
from datascience import *
%matplotlib inline
path_data = '../../../assets/data/'
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
import warnings
# NumPy 2.0 removed the top-level np.VisibleDeprecationWarning alias; the class
# lives in np.exceptions from NumPy 1.25 on. Fall back to the top-level name
# for older releases that have no np.exceptions module.
warnings.simplefilter(
    action='ignore',
    category=getattr(np, 'exceptions', np).VisibleDeprecationWarning,
)

# Lecture 18

## Comparing Categorical Distributions ##

In [None]:
# One row per ethnicity. 'Eligible' and 'Panels' are each a distribution over
# the five categories (the entries of each column add up to 1).
jury = (
    Table()
    .with_column('Ethnicity', make_array('Asian/PI', 'Black/AA', 'Caucasian', 'Hispanic', 'Other'))
    .with_column('Eligible', make_array(0.15, 0.18, 0.54, 0.12, 0.01))
    .with_column('Panels', make_array(0.26, 0.08, 0.54, 0.08, 0.04))
)

jury

In [None]:
# Bar chart of the 'Eligible' and 'Panels' distributions, grouped by ethnicity.
jury.barh('Ethnicity')

In [None]:
# The 'Eligible' proportions serve as the population distribution that the
# simulations below sample from.
eligible_population = jury.column('Eligible')

In [None]:
# One random sample of size 1453 from the eligible distribution, reported as
# proportions per category; the output differs on every run.
# NOTE(review): 1453 is presumably the number of panelists -- confirm.
sample_proportions(1453, eligible_population)

In [None]:
# Draw a fresh random sample and place its proportions next to the
# 'Eligible' and 'Panels' columns for comparison.
sample_distribution = sample_proportions(1453, eligible_population)
panels_and_sample = jury.with_columns(
    'Random Sample', sample_distribution
)
panels_and_sample

In [None]:
# Same bar chart as before, now with the random sample as a third set of bars.
panels_and_sample.barh('Ethnicity')

## Distance Between Two Distributions ##

In [None]:
# Redraw the two distributions whose distance is quantified in this section.
jury.barh('Ethnicity')

In [None]:
# Signed gap per category: proportion on the panels minus proportion eligible.
jury_with_diffs = jury.with_columns(
    'Difference',
    np.subtract(jury.column('Panels'), jury.column('Eligible')),
)
jury_with_diffs

In [None]:
# Add the size of each gap with its sign dropped.
jury_with_diffs = jury_with_diffs.with_columns(
    'Absolute Difference',
    np.absolute(jury_with_diffs.column('Difference')),
)

jury_with_diffs

In [None]:
# Half the sum of the absolute differences: the same quantity that
# total_variation_distance computes below.
jury_with_diffs.column('Absolute Difference').sum() / 2

In [None]:
def total_variation_distance(distribution_1, distribution_2):
    """Return the total variation distance between two distributions.

    The distance is half the sum of the absolute differences between
    corresponding entries.

    distribution_1, distribution_2: array-likes (NumPy arrays, lists, tuples)
        of numbers, with one entry per category and the categories in the
        same order in both.

    Raises ValueError if the two inputs do not have the same shape; without
    this check NumPy would broadcast them and return a meaningless number.
    """
    first = np.asarray(distribution_1, dtype=float)
    second = np.asarray(distribution_2, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            'distributions must have the same shape, got '
            + str(first.shape) + ' and ' + str(second.shape)
        )
    # Vectorized sum instead of the builtin sum(), which iterates in Python.
    return np.abs(first - second).sum() / 2

In [None]:
# Observed statistic: distance between the panels' distribution and the
# eligible population's distribution.
panels = jury.column('Panels')
total_variation_distance(panels, eligible_population)

In [None]:
# For comparison: the distance between one random sample and the population
# it was drawn from. The value changes each time this cell runs.
sample_distribution = sample_proportions(1453, eligible_population)
total_variation_distance(sample_distribution, eligible_population)

## Simulating TVD Under the Model of Random Selection ##

In [None]:
def one_simulated_tvd(sample_size=1453):
    """Simulate one TVD between a random sample and the eligible population.

    Draws a sample of `sample_size` from the distribution stored in
    `eligible_population` and returns the total variation distance between
    the sample's proportions and that distribution.

    sample_size: number of draws in the simulated sample. Defaults to 1453,
        the sample size used by the other cells of this notebook.
    """
    simulated_distribution = sample_proportions(sample_size, eligible_population)
    return total_variation_distance(simulated_distribution, eligible_population)

In [None]:
# Try a single simulated value; it varies from run to run.
one_simulated_tvd()

In [None]:
# Repeat the simulation num_simulations times and collect the TVDs.
# The array is built in one step: np.append inside a loop copies the whole
# array on every iteration, which is quadratic in the number of simulations.
num_simulations = 10000
tvds = np.array([one_simulated_tvd() for _ in range(num_simulations)])

### Prediction and Reality ###

In [None]:
Table().with_column('TVD', tvds).hist(bins=np.arange(0, 0.2, 0.005))

# Plotting parameters; you can ignore this code
plots.title('Prediction Assuming Random Selection')
plots.xlim(0, 0.15)
plots.ylim(-5, 50)
# Red dot: the TVD observed between the panels and the eligible population
# (0.14), computed from the table so the marker cannot drift from the data.
observed_tvd = total_variation_distance(jury.column('Panels'), jury.column('Eligible'))
plots.scatter(observed_tvd, 0, color='red', s=30);

## Assessing Mendel's Model ##

In [None]:
# Model for [purple, white]:
# Draws at random with replacement from a distribution that is 75% purple and 25% white

mendel_proportions = [0.75, 0.25]

In [None]:
# Mendel's data: observed_count out of sample_size, also expressed as a percent.

sample_size, observed_count = 929, 705
observed_percent = 100 * observed_count / sample_size

In [None]:
# Display the observed percent computed above.
observed_percent 

## Test Statistic ##

In [None]:
# Observed test statistic: how far the observed percent is from the 75%
# stated by the model, regardless of direction.
observed_distance = abs(observed_percent - 75)
observed_distance

In [None]:
# Sampling under the null hypothesis: one random sample of sample_size draws
# from the model's [purple, white] proportions, returned as proportions.
sample_proportions(sample_size, mendel_proportions)

## Predicting the Test Statistic Under the Null ##

In [None]:
# One simulated value of the test statistic, generated under the null hypothesis
def one_simulated_distance():
    """Return |simulated percent - 75| for one sample drawn from the model.

    Draws `sample_size` times from `mendel_proportions`, converts the
    proportion in the first category to a percent, and returns its absolute
    distance from 75.
    """
    simulated_proportions = sample_proportions(sample_size, mendel_proportions)
    first_category_percent = simulated_proportions.item(0) * 100
    return abs(first_category_percent - 75)

In [None]:
# Try a single simulated distance; it varies from run to run.
one_simulated_distance()

In [None]:
# Reuses num_simulations as set for the TVD simulation earlier in the notebook.
# (The bare `num_simulations` expression that used to open this cell had no
# effect: it was not the cell's last expression, so it was never displayed.)
# The array is built in one step because np.append in a loop re-copies the
# array on every iteration.
distances = np.array([one_simulated_distance() for _ in range(num_simulations)])

In [None]:
# Wrap the simulated distances in a one-column table so they can be plotted.
mendel_simulations = Table().with_column(
    'Distance between Sample % and 75%', distances
)

In [None]:
# Empirical distribution of the test statistic under the null hypothesis.
mendel_simulations.hist()
plots.title('Prediction Made by the Null Hypothesis');

In [None]:
# Same histogram, with the observed distance marked as a red dot on the
# horizontal axis. The y-range is widened slightly below 0 so the dot is visible.
mendel_simulations.hist()
plots.title('Prediction Made by the Null Hypothesis')
plots.ylim(-0.02, 0.5)
plots.scatter(observed_distance, 0, s=60, color='red');

In [None]:
# Histogram with left_end set to the observed distance; the size of the
# shaded region is computed in the next cell. zorder=3 draws the dot on top.
mendel_simulations.hist(left_end=observed_distance)
plots.title('Prediction Made by the Null Hypothesis')
plots.ylim(-0.02, 0.5)
plots.scatter(observed_distance, 0, zorder=3, s=60, color='red');

In [None]:
# Shaded area (as a proportion): the fraction of simulated distances that are
# at least as large as the observed one. Dividing by len(distances) ties the
# denominator to the array actually counted, even if num_simulations has
# been changed since the simulation ran.

np.count_nonzero(distances >= observed_distance) / len(distances)

## How Far is Too Far? ##
Suppose you decide to use the 5% cutoff.

In [None]:
# Fraction of simulated distances at or beyond 2.85, the value the next cell
# uses as the 5% cutoff. Dividing by len(distances) ties the denominator to
# the array actually counted.
np.count_nonzero(distances >= 2.85) / len(distances)

In [None]:
# Histogram with left_end set to the cutoff chosen for the 5% level.
cutoff_5_percent = 2.85
mendel_simulations.hist(left_end=cutoff_5_percent)
plots.title('Prediction Made by the Null Hypothesis')
plots.ylim(-0.02, 0.5);