# Lecture 13: Testing Hypotheses

## 6.1: Module 6 Notebook 1 ##

In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Assessing Models

## Swain vs. Alabama ##

In [None]:
# Model distribution for the Swain vs. Alabama example, as two proportions
# that sum to 1. The first entry (0.26) is the one the later cells extract
# with .item(0) and plot as 'Number of Black Men on Panel of 100'.
population_proportions = make_array(.26, .74)
population_proportions

In [None]:
# Draw one random sample of size 100 from the model distribution; the result
# is an array with the sampled proportion in each category.
sample_proportions(100, population_proportions)

In [None]:
def panel_proportion():
    """Simulate one panel of 100 and return the first category's proportion.

    The panel is drawn at random from ``population_proportions``; only the
    proportion for the first category is returned, as a plain Python float.
    """
    simulated_panel = sample_proportions(100, population_proportions)
    return simulated_panel.item(0)

In [None]:
# Try it once: each call returns a new random proportion from one simulated panel.
panel_proportion()

In [None]:
# Simulate 10,000 panels of 100 under the model. Each simulated proportion is
# multiplied by 100 to turn it into a count out of 100.
#
# The draws are collected in a list and converted to an array once:
# np.append returns a fresh copy of the whole array on every call, so
# appending inside the loop does quadratic work for no benefit.
panels = np.array([panel_proportion() * 100 for _ in range(10000)])

In [None]:
# Empirical distribution of the simulated counts.
# np.arange(5.5, 40.) yields bin edges 5.5, 6.5, ..., 39.5, i.e. unit-width
# bins centered on the whole-number counts 6 through 39.
Table().with_column('Number of Black Men on Panel of 100', panels).hist(bins=np.arange(5.5,40.))

In [None]:
# Redraw the histogram and mark the observed value of the test statistic
# (8, shown as a red dot) so it can be compared with the simulated counts.
# The y-coordinate is slightly negative, presumably so the dot sits on the
# horizontal axis instead of covering the bars.
Table().with_column('Number of Black Men on Panel of 100', panels).hist(bins=np.arange(5.5,40.))
plots.scatter(8, -0.0001, color='red', s=30);

## Mendel and Pea Flowers ##

### Test statistic 1: proportion of purple flowers

In [None]:
# Mendel had 929 plants, of which 705 had purple flowers.
# observed_purples is the observed proportion (not percent) of purple flowers.
observed_purples = 705 / 929
observed_purples

In [None]:
# Mendel's model: 75% purple-flowering plants and 25% white-flowering plants.
predicted_proportions = make_array(.75, .25)
# One random sample of 929 plants drawn under the model, as proportions
# in the same order as predicted_proportions.
sample_proportions(929, predicted_proportions)

In [None]:
# Helper that simulates one sample under Mendel's model and reports the
# purple-flowering share as a percentage.
def purple_flowers():
    """Return the percent of purple flowers in one simulated sample of 929.

    The sample is drawn from ``predicted_proportions``; the first entry of
    the result (the purple proportion) is converted to a percentage.
    """
    simulated_sample = sample_proportions(929, predicted_proportions)
    purple_proportion = simulated_sample.item(0)
    return purple_proportion * 100

In [None]:
# Try it once: each call returns a new random percent from one simulated sample.
purple_flowers()

In [None]:
# Simulate the percent of purple-flowering plants in a sample of 929,
# repeated 10,000 times under Mendel's model.
#
# The draws are collected in a list and converted to an array once:
# np.append returns a fresh copy of the whole array on every call, so
# appending inside the loop does quadratic work for no benefit.
purples = np.array([purple_flowers() for _ in range(10000)])

In [None]:
# Visualize the empirical distribution of the simulated percents
# (default histogram bins).
Table().with_column('Percent of purple flowers in sample of 929', purples).hist()

In [None]:
# Add the observed percent to the histogram.
# First compute it: observed_purples is a proportion, so scale it by 100
# to put it on the same percent scale as the simulated values.
observed_percent = observed_purples * 100
Table().with_column('Percent of purple flowers in sample of 929', purples).hist()
# Red dot marks the observed percent; the y-coordinate is slightly negative,
# presumably so the dot sits on the horizontal axis.
plots.scatter(observed_percent, -0.0001, color='red', s=30);

### Test statistic 2: distance between the sample percent and 75%

In [None]:
# Our goal is to decide whether or not Mendel's model is good.
# We need to simulate a statistic that will help us make this decision.

# If the model is good, the percent of purple-flowering plants in the sample
# should be close to 75%.
# If the model is not good, the percent of purple-flowering plants will be
# far from 75%. It may be higher or lower; the direction doesn't matter.

# So our test statistic is the distance between the sample percent and 75%
# (the value predicted by Mendel's model).

# Remember: the model proportions are 0.75 and 0.25 for purple and white
# flowers respectively.
# Note that this is ONE simulated value of the statistic under the model,
# not the observed value; it changes every time the cell is run.
test_statistic = abs(100 * sample_proportions(929, [0.75, 0.25]).item(0) - 75)
test_statistic

In [None]:
# Simulate the new statistic 10,000 times: for each repetition, draw a sample
# of 929 plants under the model and record the distance between the sample
# percent of purple flowers and 75%.
model_proportions = [0.75, 0.25]
repetitions = 10000

# The draws are collected in a list and converted to an array once:
# np.append returns a fresh copy of the whole array on every call, so
# appending inside the loop does quadratic work for no benefit.
distances = np.array([
    abs(100 * sample_proportions(929, model_proportions).item(0) - 75)
    for _ in range(repetitions)
])

In [None]:
# Visualize the empirical distribution of the statistic (distance from 75%)
# across the simulated samples.
Table().with_column(
    'Distance between Sample % and 75%', distances
).hist()

In [None]:
# Notice that we could have avoided the last three steps by reusing the
# percents already simulated in `purples`: the discrepancy between each
# simulated sample and the model is just its distance from 75%
# (i.e., if the model is true, the sample value should be close to 75%).
discrepancies = abs(purples - 75)
discrepancies
#print(len(discrepancies))

In [None]:
# This histogram shows the same distribution as the one above, up to
# simulation randomness: `purples` and `distances` come from separate runs
# of 10,000 samples, so the bars will not match exactly.
# The difference is that this one uses discrepancies computed from `purples`,
# which is a shorter process because the percents of purple-flowering plants
# had already been simulated.
Table().with_column('Discrepancy in sample of 929 if the model is true', abs(purples- 75)).hist()

In [None]:
# Finally, compute the observed value of our statistic: the distance between
# the observed percent of purple flowers and the 75% predicted by the model.
observed_distance = abs(observed_purples * 100 - 75)
observed_distance

In [None]:
# Add the observed statistic to the histogram of simulated distances
# (red dot) so it can be compared with what the model predicts.
Table().with_column(
    'Distance between Sample % and 75%', distances
).hist()
plots.scatter(observed_distance, -0.0001, color='red', s=30);

In [None]:
# what do we conclude? Is Mendel's model a good model?

# Assessing models with multiple categories

## Alameda County Jury Panels ##

In [None]:
# Alameda County jury data, one row per ethnicity category.
#   'Eligible': proportion of each category in the eligible population
#               (the distribution the model samples from).
#   'Panels':   proportion of each category on the observed jury panels.
# Each proportion column sums to 1.
jury = Table().with_columns(
    'Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'),
    'Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01),
    'Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04)
)

jury

In [None]:
# Horizontal bar chart comparing the Eligible and Panels proportions
# for each ethnicity.
jury.barh('Ethnicity')

In [None]:
# Under the model, this is the true distribution of people
# from which the jurors are randomly sampled.
# The values are the same as the 'Eligible' column of the jury table,
# in the same category order.
model = make_array(0.15, 0.18, 0.12, 0.54, 0.01)

In [None]:
# Simulate a random draw of 1423 jurors from this distribution.
# The result is the proportion of the 1423 draws that fell in each category.
simulated = sample_proportions(1423, model)
simulated

In [None]:
# Add the simulated proportions to the table as a new column so they can be
# compared row by row with the Eligible and Panels columns.
jury_with_simulated = jury.with_column('Simulated', simulated)
jury_with_simulated

In [None]:
# The actual observed distribution (Panels) looks quite different
# from the simulation -- try running this cell several times to confirm!
# Each run draws a new random sample of 1423 and rebuilds the table.
simulated = sample_proportions(1423, model)
jury_with_simulated = jury.with_column('Simulated', simulated)
jury_with_simulated

In [None]:
# Bar chart of Eligible, Panels and Simulated proportions for each ethnicity.
jury_with_simulated.barh('Ethnicity')

## Distance Between Distributions

In [None]:
# In the Mendel example, the statistic was the distance between the percent
# of purple flowers in a sample and its expected value under the model
# (75% purple, 25% white).
#
# In this case there are 5 categories, so we need to understand how each
# category's observed proportion (Panels) differs from its expected
# proportion according to the model (Eligible).

diffs = jury.column('Panels') - jury.column('Eligible')
jury_with_difference = jury.with_column('Difference', diffs)
jury_with_difference

In [None]:
# How can we use the obtained difference as a statistic?

## Total Variation Distance

In [None]:
def tvd(dist1, dist2):
    """Return the total variation distance between two distributions.

    The distance is half the sum of the absolute differences between
    corresponding entries of the two distributions.

    Args:
        dist1: Array-like of proportions (NumPy array, list, or tuple).
        dist2: Array-like of proportions, in the same category order and
            of the same length as ``dist1``.

    Returns:
        The total variation distance as a NumPy float.
    """
    # Convert up front so plain lists and tuples work too; elementwise
    # subtraction is only defined for arrays.
    first = np.asarray(dist1, dtype=float)
    second = np.asarray(dist2, dtype=float)
    return np.abs(first - second).sum() / 2

In [None]:
# The TVD of our observed data (Panels) from their expected values,
# assuming the model is true (Eligible).
obsvd_tvd = tvd(jury.column('Panels'), jury.column('Eligible'))
obsvd_tvd

In [None]:
# The TVD of one model simulation (a random sample of 1423) from its
# expected values; the result changes every time the cell is run.
tvd(sample_proportions(1423, model), jury.column('Eligible'))

In [None]:
def simulated_tvd():
    """Return the TVD between one simulated panel of 1423 and the model.

    A sample of 1423 is drawn at random from ``model`` and its category
    proportions are compared with ``model`` itself using ``tvd``.
    """
    simulated_distribution = sample_proportions(1423, model)
    return tvd(simulated_distribution, model)

# Simulate the TVD statistic under the model num_simulations times.
num_simulations = 10000

# The draws are collected in a list and converted to an array once:
# np.append returns a fresh copy of the whole array on every call, so
# appending inside the loop does quadratic work for no benefit.
tvds = np.array([simulated_tvd() for _ in range(num_simulations)])

In [None]:
# Histogram of the TVDs simulated under the model.
title = 'Simulated TVDs (if model is true)'
# Bin edges 0, 0.005, ..., 0.045.
bins = np.arange(0, .05, .005)

Table().with_column(title, tvds).hist(bins = bins)
# The observed TVD is printed rather than drawn on the plot: computed from
# the Panels and Eligible columns it is about 0.14, which lies outside the
# range covered by these bins.
print('Observed TVD: ' + str(obsvd_tvd))

In [None]:
# what does this tell us about the Jury selection model?

In [None]:
# Our model of random jury selection is not supported by the data: compare the
# observed TVD printed above with the histogram of TVDs simulated under the model.