# Hypothesis Testing

In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
import warnings

# NumPy 2.0 removed the top-level `np.VisibleDeprecationWarning` alias; the
# class now lives in `np.exceptions` (available since NumPy 1.25). Fall back to
# the old location so the notebook also runs on older NumPy releases.
try:
    visible_deprecation_category = np.exceptions.VisibleDeprecationWarning
except AttributeError:
    visible_deprecation_category = np.VisibleDeprecationWarning
warnings.simplefilter(action='ignore', category=visible_deprecation_category)

# Lecture 19

## Assessing Mendel's Model ##

In [None]:
# Null model for flower color, listed in the order [purple, white]:
# each plant is drawn at random, with replacement, from these proportions.
mendel_proportions = [0.75, 0.25]

# Mendel's data: total number of plants, and the count that is compared
# against the model's first (75%) category.
sample_size, observed_count = 929, 705
observed_percent = 100 * observed_count / sample_size

## Observed Value of the Test Statistic ##

In [None]:
# Observed value of the test statistic: the distance between the observed
# percent and the 75 percent of the model's first category.
observed_distance = abs(observed_percent - 75)

## Predicting the Test Statistic Under the Null ##

In [None]:
def one_simulated_distance():
    """Simulate one sample under the model and return its distance from 75.

    Draws a sample of size `sample_size` according to `mendel_proportions`
    and returns the absolute difference between the simulated percent in the
    first category and 75.
    """
    simulated_proportions = sample_proportions(sample_size, mendel_proportions)
    first_category_percent = 100 * simulated_proportions.item(0)
    return abs(first_category_percent - 75)

In [None]:
num_simulations = 10000

# Collect every simulated statistic first and build the array once.
# Calling np.append inside the loop copies the whole array on each iteration,
# which makes the simulation quadratic in num_simulations.
distances = np.array(
    [one_simulated_distance() for _ in range(num_simulations)], dtype=float
)

In [None]:
mendel_simulations = Table().with_columns('Distance between Sample % and 75%', distances)

In [None]:
# Empirical distribution of the simulated distances
mendel_simulations.hist()
# y-axis starts slightly below 0, presumably so the dot drawn at height 0 is fully visible
plots.ylim(-0.02, 0.5)
plots.title('Prediction Made by the Null Hypothesis')
# Red dot on the horizontal axis marks the observed value of the statistic
plots.scatter(observed_distance, 0, color='red', s=60);

In [None]:
# Same histogram, with left_end set to the observed distance; the next cell
# computes the resulting shaded area as the proportion of simulated distances
# that are >= observed_distance
mendel_simulations.hist(left_end=observed_distance)
plots.ylim(-0.02, 0.5)
plots.title('Prediction Made by the Null Hypothesis')
# zorder=3 presumably keeps the red dot drawn on top of the shaded bars
plots.scatter(observed_distance, 0, color='red', s=60, zorder=3);

In [None]:
# Shaded area (as a proportion):
# the fraction of simulated distances that are at least as large as the
# observed distance

np.count_nonzero(distances >= observed_distance) / num_simulations

## How Far is Too Far?
Suppose you decide to use the 5% cutoff.

In [None]:
# Proportion of simulated distances at or beyond 2.85, the value that the
# next cell names cutoff_5_percent
np.count_nonzero(distances >= 2.85) / num_simulations

In [None]:
# Distance used as the 5% cutoff for the test statistic
cutoff_5_percent = 2.85
# Histogram of the simulated distances with left_end set to the cutoff
mendel_simulations.hist(left_end=cutoff_5_percent)
plots.ylim(-0.02, 0.5)
plots.title('Prediction Made by the Null Hypothesis');

## Comparing Two Samples ##

In [None]:
births = Table.read_table('baby.csv')

In [None]:
births

In [None]:
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')

In [None]:
smoking_and_birthweight.group('Maternal Smoker')

In [None]:
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')

# Test Statistic

[Question] What values of our statistic are in favor of the alternative: positive or negative?

In [None]:
means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table

In [None]:
# Array of the group means, in the order the groups appear in means_table
means = means_table.column('Birth Weight average')
# Second group's mean minus the first group's mean
# (presumably smokers minus non-smokers, if the labels are listed False before
# True; verify against the means_table display above)
observed_difference = means.item(1) - means.item(0)
observed_difference

In [None]:
def difference_of_means(table, group_label, value_label='Birth Weight'):
    """Return the difference between the mean values of two groups.

    Takes:
      table: a table containing a numerical column and a column of group labels
      group_label: label of the column that indicates which group each row
        belongs to
      value_label: label of the numerical column to average
        (defaults to 'Birth Weight', the column analyzed in this notebook)

    Returns: mean of the second group minus mean of the first group, with the
    groups taken in the order in which `table.group` lists them. Assumes the
    grouping column has exactly two distinct values; with fewer than two,
    `.item(1)` raises IndexError.
    """

    # table with the two relevant columns
    reduced = table.select(value_label, group_label)

    # table containing group means
    means_table = reduced.group(group_label, np.average)

    # array of group means; the aggregated column is labeled
    # '<value label> average' because the collector is np.average
    means = means_table.column(value_label + ' average')

    return means.item(1) - means.item(0)

In [None]:
difference_of_means(births, 'Maternal Smoker')

# Random Permutation (Shuffling)

In [None]:
# Small four-row example table used below to demonstrate sampling and shuffling
staff = Table().with_columns(
    'Names', make_array('Jim', 'Pam', 'Dwight', 'Michael'),
    'Ages', make_array(29, 28, 34, 41)
)

In [None]:
staff.sample()

In [None]:
staff.sample(with_replacement = False)

In [None]:
# Sample the rows without replacement and take the 'Names' column of the
# result, then display the original table with those names attached as a new
# 'Shuffled' column
shuffled_names = staff.sample(with_replacement = False).column('Names')
staff.with_column('Shuffled', shuffled_names)

# Simulation Under Null Hypothesis

In [None]:
smoking_and_birthweight

In [None]:
# Array of 'Maternal Smoker' labels taken from the rows after sampling them
# without replacement (the shuffling step of the simulation)
shuffled_labels = smoking_and_birthweight.sample(with_replacement=False
                                                ).column('Maternal Smoker')

In [None]:
# Original table plus a 'Shuffled Label' column, so each row carries both its
# original label and a shuffled one
original_and_shuffled = smoking_and_birthweight.with_column(
    'Shuffled Label', shuffled_labels
)

In [None]:
original_and_shuffled

In [None]:
difference_of_means(original_and_shuffled, 'Shuffled Label')

In [None]:
difference_of_means(original_and_shuffled, 'Maternal Smoker')

# Permutation Test

In [None]:
def one_simulated_difference():
    """Returns: Difference of group means of birth weight after the
    'Maternal Smoker' labels of births have been shuffled among the rows"""

    # sample the rows of births without replacement, then keep only the
    # label column: an array of shuffled labels
    permuted_rows = births.sample(with_replacement=False)
    permuted_labels = permuted_rows.column('Maternal Smoker')

    # pair the birth weights, in their original order, with the shuffled labels
    weights_only = births.select('Birth Weight')
    relabeled = weights_only.with_column('Shuffled Label', permuted_labels)

    return difference_of_means(relabeled, 'Shuffled Label')

In [None]:
one_simulated_difference()

In [None]:
# Run 1000 shuffles and build the array of simulated differences in one step.
# np.append inside a loop re-copies the whole array on every iteration.
differences = np.array(
    [one_simulated_difference() for _ in range(1000)], dtype=float
)

In [None]:
# Empirical distribution of the simulated differences between group means
Table().with_column('Difference Between Group Means', differences).hist()
# Print the observed statistic so it can be compared with the histogram
print('Observed Difference:', observed_difference)
plots.title('Prediction Under the Null Hypothesis');