# Hypothesis Testing Examples

In [None]:
from datascience import *
%matplotlib inline
path_data = '../../../assets/data/'
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=np.VisibleDeprecationWarning)

# Lecture 21

## Randomized Controlled Experiment ##

In [None]:
# Read bta.csv into a table and display every row
rct = Table.read_table('bta.csv')
rct.show()

In [None]:
# Count the rows for each combination: one row per Group, one column per Result value
rct.pivot('Result', 'Group')

In [None]:
# Apply np.average to the values of every other column within each Group
rct.group('Group', np.average)

In [None]:
# Read observed_outcomes.csv into a table and display every row
observed_outcomes = Table.read_table('observed_outcomes.csv')
observed_outcomes.show()

## Test Statistic ##

potential treatment proportion - potential control proportion

In [None]:
# Per-group averages, read from the second column of the grouped table
# NOTE(review): column(1) is positional — presumably the Result averages;
# confirm rct has no other column ahead of Result
group_proportions = rct.group('Group', np.average).column(1)
group_proportions

In [None]:
# Second group's average minus the first group's
# (presumably treatment minus control, given the order group() lists them — confirm)
group_proportions.item(1) - group_proportions.item(0)

## Simulating the Statistic Under the Null ##

In [None]:
# Group labels in random order: sampling without replacement, with no
# sample size given, returns the table's rows shuffled
shuffled_labels = rct.sample(with_replacement=False).column('Group')

In [None]:
# New table with the shuffled labels added as an extra column beside the original Group column
original_and_shuffled = rct.with_column('Shuffled Label', shuffled_labels)
original_and_shuffled

In [None]:
# Average Result within each original Group
original_and_shuffled.select('Result', 'Group').group(
    'Group', np.average)

In [None]:
# Average Result within each group defined by the shuffled labels
original_and_shuffled.select('Result', 'Shuffled Label').group(
    'Shuffled Label', np.average)

In [None]:
def difference_of_proportions(table, group_label):
    """Takes: a table that has a 'Result' column,
    label of the column that says which group each row belongs to
    Returns: average of 'Result' in the second group minus the average
    in the first group, in the order that grouping lists the groups"""

    # keep the two relevant columns, average Result per group,
    # then pull out the array of group averages
    group_means = (
        table
        .select('Result', group_label)
        .group(group_label, np.average)
        .column('Result average')
    )

    return group_means.item(1) - group_means.item(0)

In [None]:
# Statistic computed on the data with its original Group labels
observed_diff = difference_of_proportions(rct, 'Group')
observed_diff

In [None]:
def one_simulated_difference_of_proportions():
    """Returns: the difference of proportions of 1's between the two groups
    of rct after its Group labels have been randomly shuffled"""

    # Group labels of rct in a random order
    permuted_groups = rct.sample(with_replacement=False).column('Group')

    # Result column paired with the permuted labels
    results_only = rct.select('Result')
    relabeled = results_only.with_column('Shuffled Label', permuted_groups)

    return difference_of_proportions(relabeled, 'Shuffled Label')

In [None]:
# One simulated value; the labels are reshuffled on every call, so re-running can give a different result
one_simulated_difference_of_proportions()

In [None]:
# 20000 simulated differences under shuffled labels.
# The values are collected in a list and converted to an array once:
# np.append inside a loop copies every previously stored value on each
# iteration, which makes the loop quadratic in the number of simulations.
simulated_diffs = np.array(
    [one_simulated_difference_of_proportions() for _ in np.arange(20000)],
    dtype=float)

In [None]:
# Histogram of the simulated differences; left_end shades the bars
# from observed_diff out to the right edge of the histogram
col_name = 'Difference between Treatment and Control'
Table().with_column(col_name, simulated_diffs).hist(
    bins=np.arange(-0.6, 0.81, 0.15), left_end=observed_diff)
# y-axis starts slightly below 0, where the red dot for the observed value is drawn
plots.ylim(-0.1, 2)
plots.scatter(observed_diff, 0, color='red', s=40, zorder=3);

In [None]:
# p-value: proportion of simulated differences at least as large as the observed one.
# np.count_nonzero counts the True entries in one vectorized call (the built-in
# sum walks the array element by element) and matches the other p-value cells.
np.count_nonzero(simulated_diffs >= observed_diff) / len(simulated_diffs)

## Discussion Question 1 ##

In [None]:
# One row per student; the table's second column is renamed to 'Score'
scores = Table.read_table('scores_by_section.csv').relabeled(1, 'Score')
scores

In [None]:
# max
max(scores.column('Score'))

In [None]:
# class average
np.average(scores.column('Score'))

In [None]:
# Number of rows in each Section, displayed in full
section_counts = scores.group('Section')
section_counts.show()

In [None]:
# np.average applied to the other column(s) within each Section
section_averages = scores.group('Section', np.average)
section_averages

In [None]:
# section_data should have 3 columns:
# Section, count, and Score average
section_data = section_counts.join('Section', section_averages, 'Section')
section_data.show()

Null Hypothesis:


Alternative Hypothesis:


Test Statistic:


For the p-value, we will start at ... and look to the ...

In [None]:
# NOTE(review): hard-coded value — presumably the Score average of the section
# under test, copied from section_data; confirm the value and the section
observed_average = 13.6667

In [None]:
def one_simulated_average():
    """Returns: average Score of 27 rows of scores drawn at random
    without replacement (27 is presumably the size of the section
    under test -- confirm)"""
    drawn_scores = scores.sample(27, with_replacement=False).column('Score')
    return np.average(drawn_scores)

In [None]:
num_simulations = 50000
# Collect the simulated averages in a list and convert to an array once:
# np.append inside a loop copies every previously stored value on each
# iteration, which makes the loop quadratic in num_simulations.
averages = np.array(
    [one_simulated_average() for _ in np.arange(num_simulations)],
    dtype=float)

In [None]:
# Histogram of the simulated averages; right_end shades the bars
# from the left edge of the histogram up to observed_average
Table().with_column('Simulated Section Average', averages).hist(right_end=observed_average)
# y-axis starts slightly below 0, where the red dot for the observed value is drawn
plots.ylim(-0.01, 0.4)
plots.scatter(observed_average, 0, color='red', s=40, zorder=3)
plots.title('Prediction Under the Null');

In [None]:
# p-value: proportion of simulated averages at or below the observed average.
# Divide by the length of the array itself: num_simulations is rebound to a
# different value by later cells, so re-running this cell after them would
# otherwise use the wrong denominator.
np.count_nonzero(averages <= observed_average) / len(averages)

Conclusion: 

## Discussion Question 2 ##

Null Hypothesis:


Alternative Hypothesis:


Test Statistic:


For the p-value, start at ... and look to the ...

In [None]:
def simulate_one_count():
    """Returns: number of 'H' outcomes among 200 draws made at random
    from the two outcomes 'H' and 'T'"""
    tosses = np.random.choice(['H', 'T'], 200)
    is_heads = tosses == 'H'
    return np.count_nonzero(is_heads)

In [None]:
num_simulations = 10000
# Collect the simulated counts in a list and convert to an array once:
# np.append inside a loop copies every previously stored value on each
# iteration. dtype=float keeps the array type the appended version had.
counts = np.array(
    [simulate_one_count() for _ in np.arange(num_simulations)],
    dtype=float)

In [None]:
# Histogram of the simulated head counts; right_end shades the bars
# from the left edge of the histogram up to 91
# NOTE(review): 91 is presumably the observed number of heads — confirm
Table().with_column('Number of Heads', counts).hist(right_end=91)
# y-axis starts slightly below 0, where the red dot at 91 is drawn
plots.ylim(-0.001, 0.055)
plots.scatter(91, 0, color='red', s=40, zorder=3)
plots.title('Prediction Under the Null');

In [None]:
# p-value: proportion of simulated counts at or below 91.
# The denominator is taken from the array itself rather than a hard-coded
# 10000, so it stays correct if the number of simulations changes.
np.count_nonzero(counts <= 91) / len(counts)

Conclusion: 

## Discussion Question 3 ##

Null Hypothesis:


Alternative Hypothesis:


Test Statistic:


For the p-value, start at ... and look to the ...

In [None]:
# Proportions for three categories, used as the distribution to sample from and compare against
# NOTE(review): presumably the two 18-pocket colors and the 2 green pockets
# of a 38-pocket roulette wheel — confirm the category order matches obs_counts
wheel = make_array(18/38, 18/38, 2/38)

In [None]:
# Observed counts for the three categories, compared element-wise with wheel
obs_counts = make_array(166, 186, 28)
# Turn counts into proportions using their own total (380) rather than a
# hard-coded constant, so the proportions always sum to 1
obs_distribution = obs_counts / sum(obs_counts)
# Total variation distance: half the sum of the absolute differences
obs_tvd = sum(abs(obs_distribution - wheel))/2
obs_tvd

In [None]:
def simulate_one_tvd():
    """Returns: half the sum of absolute differences between wheel and
    the proportions from one call to sample_proportions(380, wheel)"""
    simulated_dist = sample_proportions(380, wheel)
    total_abs_diff = sum(abs(simulated_dist - wheel))
    return total_abs_diff/2

In [None]:
num_simulations = 10000
# Run exactly num_simulations simulations (the loop used a separate literal
# 10000, which could drift from the value defined above).
# The values are collected in a list and converted to an array once, because
# np.append inside a loop copies every previously stored value on each iteration.
tvds = np.array(
    [simulate_one_tvd() for _ in np.arange(num_simulations)],
    dtype=float)

In [None]:
# Histogram of the simulated TVDs; left_end shades the bars
# from obs_tvd out to the right edge of the histogram
Table().with_column('TVD', tvds).hist(left_end=obs_tvd)
# y-axis starts slightly below 0, where the red dot for the observed value is drawn
plots.ylim(-0.5, 32)
plots.scatter(obs_tvd, 0, color='red', s=40, zorder=3)
plots.title('Prediction Under the Null');

In [None]:
# p-value: proportion of simulated TVDs at least as large as the observed TVD.
# Divide by the length of the array itself so the denominator always matches
# the simulations actually stored, whatever num_simulations currently holds.
np.count_nonzero(tvds >= obs_tvd) / len(tvds)

Conclusion: 