# Interpreting Confidence

In [None]:
from datascience import *
%matplotlib inline
path_data = '../../../assets/data/'
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
import warnings
# NumPy 2.0 removed np.VisibleDeprecationWarning from the top-level namespace;
# the class now lives in numpy.exceptions (available since NumPy 1.25).
# Fall back to the old attribute so this cell runs on older NumPy as well.
try:
    from numpy.exceptions import VisibleDeprecationWarning
except ImportError:
    VisibleDeprecationWarning = np.VisibleDeprecationWarning
# Silence NumPy's visible deprecation warnings for the rest of the notebook.
warnings.simplefilter(action='ignore', category=VisibleDeprecationWarning)

# Lecture 24 #

## Bootstrap 

In [None]:
# Keep only the rows whose 'Salary' is above a minimum threshold.
# NOTE(review): 15 * 20 * 50 presumably means $15/hour * 20 hours/week * 50 weeks
# (roughly a half-time job at minimum wage) -- confirm against the lecture.
min_salary = 15 * 20 * 50
sf = (
    Table.read_table('san_francisco_2019.csv')
    .where('Salary', are.above(min_salary))
)

In [None]:
# Number of rows left in sf after the salary filter
sf.num_rows

In [None]:
# Bin edges 0, 25000, ..., 725000 (np.arange excludes the stop value 726000);
# sf_bins is reused below so the sample histogram is drawn on the same bins.
sf_bins = np.arange(0, 726000, 25000)
sf.hist('Total Compensation', bins=sf_bins)

In [None]:
# Parameter: Median total compensation in the population
# (computed as the 50th percentile of the 'Total Compensation' column of sf;
# it is known here only because we have the whole population table)
pop_median = percentile(50, sf.column('Total Compensation'))
pop_median

## Bootstrap Estimates of the Parameter (Pretend it is Unknown) 

In [None]:
# One random sample: 400 rows drawn from sf without replacement.
# Every bootstrap resample below is drawn from this table, not from sf.
our_sample = sf.sample(400, with_replacement=False)

In [None]:
# Histogram of the sample's total compensation, drawn on the same bins (sf_bins)
# as the population histogram
our_sample.hist('Total Compensation', bins=sf_bins)

### Carrying Out the Bootstrap 

Sample randomly
 - from the original sample
 - with replacement
 - as many times as there are rows in the original sample

In [None]:
def one_bootstrap_median():
    """Return the median 'Total Compensation' of one bootstrap resample.

    Called with no arguments, tbl.sample() draws at random with
    replacement, as many times as tbl has rows -- which is exactly
    one bootstrap resample of our_sample.
    """
    bootstrap_comp = our_sample.sample().column('Total Compensation')
    return percentile(50, bootstrap_comp)

In [None]:
# One bootstrap estimate of the median; each call draws a fresh random resample
one_bootstrap_median()

In [None]:
# Generate the medians of 3000 bootstrap samples.
# The estimates are collected in a list and converted to an array once:
# calling np.append inside a loop copies the whole array on every iteration.
num_repetitions = 3000
bstrap_medians = np.array(
    [one_bootstrap_median() for _ in np.arange(num_repetitions)],
    dtype=float,  # float array, as produced by appending to an empty make_array()
)

## Percentile Method: Middle 95% of the Bootstrap Estimates 

In [None]:
# Endpoints of the middle 95% of the bootstrap medians:
# 2.5% of the estimates are cut off in each tail.
left, right = (percentile(p, bstrap_medians) for p in (2.5, 97.5))

make_array(left, right)

In [None]:
# Histogram of the bootstrap medians, with bin edges 120000, 122500, ..., 157500
resampled_medians = Table().with_column('Bootstrap Sample Median', bstrap_medians)
median_bins=np.arange(120000, 160000, 2500)
resampled_medians.hist(bins = median_bins)

# Plotting parameters; you can ignore this code
# Overlay: a yellow segment from left to right (the interval endpoints) and a
# green dot at pop_median, both at height 0 on the horizontal axis.
# NOTE: left and right are reassigned by the later cell that computes the
# interval for the mean, so run this cell before that one.
parameter_green = '#32CD32'
plots.ylim(-0.000005, 0.00014)
plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=3, zorder=1)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2);

## Confidence Interval for Unknown Population Mean

In [None]:
# Random sample of mother-newborn pairs, read from baby.csv
# (relative path: the file must be in the notebook's working directory)
births = Table.read_table('baby.csv')

In [None]:
# Distribution of the 'Maternal Age' column in the sample
births.hist('Maternal Age')

In [None]:
# Average age of mothers in the sample
# (the statistic the bootstrap below resamples to estimate the population average)
np.average(births.column('Maternal Age'))

### Question
What is the average age of the mothers in the population?

**Plan for estimating this parameter:**

...

In [None]:
def one_bootstrap_mean():
    """Return the mean 'Maternal Age' of one bootstrap resample of births.

    births.sample() with no arguments draws at random with replacement,
    as many times as births has rows.
    """
    bootstrap_ages = births.sample().column('Maternal Age')
    return np.average(bootstrap_ages)

In [None]:
# Generate means from 3000 bootstrap samples.
# The estimates are collected in a list and converted to an array once:
# calling np.append inside a loop copies the whole array on every iteration.
num_repetitions = 3000
bstrap_means = np.array(
    [one_bootstrap_mean() for _ in np.arange(num_repetitions)],
    dtype=float,  # float array, as produced by appending to an empty make_array()
)

### Bootstrap Percentile Method for Confidence Interval

The interval of estimates is the "middle 95%" of the bootstrap estimates.

This is called a *95% confidence interval* for the mean age in the population.

In [None]:
# Endpoints of the 95% confidence interval: the 2.5th and 97.5th
# percentiles of the bootstrap means (2.5% cut off in each tail).
left, right = (percentile(p, bstrap_means) for p in (2.5, 97.5))

make_array(left, right)

In [None]:
# Histogram of the bootstrap means (15 bins), with the interval from left to
# right drawn as a thick yellow segment at height 0 on the horizontal axis
resampled_means = Table().with_columns(
    'Bootstrap Sample Mean', bstrap_means
)
resampled_means.hist(bins=15)
plots.plot([left, right], [0, 0], color='yellow', lw=8);

## Using the Confidence Interval for Testing Hypotheses

**Null:** The average age of the mothers in the population is 25 years; the sample average differs from 25 only due to chance.

**Alternative:** The average age of the mothers in the population is not 25 years.

Suppose you use the 5% cutoff for the p-value.

Based on the confidence interval, which hypothesis would you pick?