# Lecture 15 #

## 7.1: Module 7 Notebook 1 ##

In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

## Percentiles

In [None]:
# Manually compute the 55th percentile of a small example array.
# x holds six unsorted values; the following cells sort it and index into it.
x = make_array(43, 20, 51, 7, 28, 34)

In [None]:
# Step 1. Sort the data.
# Work on a copy so that x itself keeps its original order.
x_sorted = x.copy()
x_sorted.sort()
x_sorted

In [None]:
# Step 2. Figure out where the 55th percentile sits in the sorted data.
# The p-th percentile is the smallest value that is at least as large as p% of
# the data, so its 1-based rank is ceil(p/100 * n); subtracting 1 gives the
# 0-based array position. Truncating with int() only agrees with this rule
# when p/100 * n is not a whole number (for the 50th percentile of 6 values it
# would point one slot too high).
index = 55 / 100 * len(x)
print("index = ", index)
print(int(np.ceil(index)) - 1)

In [None]:
# Step 3: obtain the value at the computed position.
# `index` is p/100 * n; the percentile's 0-based position is ceil(index) - 1.
# (int(index) gives the same position only when index is not a whole number.)
x_sorted[int(np.ceil(index)) - 1]

In [None]:
# OR: 1 line of code.
# percentile(p, array) comes from the datascience package; it is expected to
# return the same value as the manual sort-and-index steps above.
percentile(55, x)

### Percentiles: in-class discussion question

In [None]:
# Five unsorted values used for the discussion question below.
s = make_array(1, 7, 3, 9, 5)
s

In [None]:
# Sort s so the percentile positions are easy to read off.
# (The array defined for this question is named `s`; `arr` is not defined.)
sorted_s = np.sort(s)
sorted_s

In [None]:
# Claim 1: the 10th percentile of s is 0.
# NOTE(review): percentile is expected to return an element of the array, and
# 0 is not in s, so this should display False -- confirm by running.
percentile(10, s) == 0

In [None]:
# Show the actual 10th percentile of s for comparison with claim 1.
percentile(10, s)

In [None]:
# Claim 2: the 39th and 40th percentiles of s are the same value.
# NOTE(review): 39% and 40% of 5 items are 1.95 and 2.0, which both round up
# to rank 2, so this should display True -- confirm by running.
percentile(39, s) == percentile(40, s)

In [None]:
# Show the 39th percentile of s.
percentile(39, s)

In [None]:
# Show the 40th percentile of s.
percentile(40, s)

In [None]:
# Claim 3: the 40th and 41st percentiles of s are the same value.
# NOTE(review): 41% of 5 items is 2.05, which rounds up to rank 3 instead of
# rank 2, so this should display False -- confirm by running.
percentile(40, s) == percentile(41, s)

In [None]:
# Show the 41st percentile of s.
percentile(41, s)

In [None]:
# Claim 4: the 50th percentile (the median) of s is 5.
# NOTE(review): 5 is the middle value of the sorted array, so this should
# display True -- confirm by running.
percentile(50, s) == 5

## Estimation

### Estimating Median - Sample Median

In [None]:
# Load the compensation data from a CSV file into a datascience Table.
# NOTE(review): the file name suggests San Francisco data for 2015 -- the
# cells below rely on it having a 'Total Compensation' column.
sf = Table.read_table('san_francisco_2015.csv')
sf

In [None]:
# Who is making the most money?
# Order the rows from largest to smallest total compensation and show the top 5.
highest_paid_first = sf.sort('Total Compensation', descending=True)
highest_paid_first.show(5)

In [None]:
# Who is making the least money?
# Order the rows from smallest to largest total compensation and show the top 5.
lowest_paid_first = sf.sort('Total Compensation', descending=False)
lowest_paid_first.show(5)

In [None]:
# Restrict the population to people earning more than a minimum salary.
# The threshold assumes $10 per hour, a 20-hour work week, and 52 weeks a year.
# Note: are.above is a strict comparison, so a row exactly at the threshold is dropped.
hourly_wage, hours_per_week, weeks_per_year = 10, 20, 52
min_salary = hourly_wage * hours_per_week * weeks_per_year
sf = sf.where('Total Compensation', are.above(min_salary))

In [None]:
# Population median (i.e., the parameter we will later try to estimate).
# The median is the 50th percentile, so percentile() does the work.
population_compensation = sf.column('Total Compensation')
pop_median = percentile(50, population_compensation)
pop_median

In [None]:
# Draw ONE random sample of 300 rows from the population.
# Sampling is without replacement, so no row is drawn more than once.
sample_size = 300
our_sample = sf.sample(sample_size, with_replacement=False)
our_sample.show(5)

In [None]:
# Sample median: the 50th percentile of the compensation values in our_sample.
sample_compensation = our_sample.column('Total Compensation')
percentile(50, sample_compensation)

In [None]:
# Visualize the population data, i.e. the "true" distribution.
# sf_bins: bin edges from 0 up to (but not including) 700000 in steps of 25000;
# the same edges are reused by later histograms so the plots are comparable.
sf_bins = np.arange(0, 700000, 25000)
sf.hist('Total Compensation', bins=sf_bins)
# The title is set after hist() so it applies to the plot just drawn.
plots.title('Population Distribution');

In [None]:
# Visualize the sample data (the empirical distribution) with the same bin
# edges as the population histogram.
our_sample.hist('Total Compensation', bins=sf_bins)
# The title is set after hist() so it applies to the plot just drawn.
plots.title('Sample Distribution');

In [None]:
# Observe that the sample's distribution looks pretty similar to the population's. Why?

### Variability of the Estimate

In [None]:
# Helper: draw one random sample from the population table and report its median.
def generate_sample_median(samp_size):
    """Return the median 'Total Compensation' of one random sample from sf.

    samp_size is the number of rows to draw. Rows are drawn without
    replacement, and a fresh sample is drawn on every call.
    """
    compensation = sf.sample(samp_size, with_replacement=False).column('Total Compensation')
    return percentile(50, compensation)

In [None]:
# Test the function with a sample of size 300.
# Note: generate_sample_median draws a NEW sample from sf on each call, so
# sample_median is the median of that fresh draw, not the median of our_sample.
sample_median = generate_sample_median(300)
sample_median

In [None]:
# How far is the estimate (sample median) from the parameter (population median)?
# A positive error means the sample median overestimates the population median.
error = sample_median - pop_median
error

### Quantifying Uncertainty

In [None]:
# Simulation: repeatedly sample from the population and record each sample's median.
# Ends with 1000 medians, each from an independent sample of size 300.
sample_medians = make_array()

num_simulations = 1000
for _ in np.arange(num_simulations):
    sample_medians = np.append(sample_medians, generate_sample_median(300))

In [None]:
# Visualize the 1000 sample medians computed above.
# med_bins: bin edges from 90000 through 125000 in steps of 2500.
med_bins = np.arange(90000, 125001, 2500)
Table().with_column(
    'Sample Medians', sample_medians
).hist(bins = med_bins)

# Mark the population median (the parameter) with a red dot on the x-axis.
plots.scatter(pop_median, 0, color="red");

In [None]:
# Compute and visualize the errors in our simulation: for each simulated
# sample, the difference between its median and the population median.
# err_bins: bin edges from -15000 through 12500 in steps of 2500.
err_bins = np.arange(-15000, 12501, 2500)
Table().with_column(
    'Errors', sample_medians - pop_median
).hist(bins = err_bins)

# An error of 0 means the estimate hit the parameter exactly; mark it in red.
plots.scatter(0, 0, color="red");

## Bootstrap

In [None]:
# Take a bootstrap (re)sample of size 300, WITH replacement, from our_sample
boot_sample = our_sample.sample(300, with_replacement=True)
boot_sample.hist('Total Compensation', bins=sf_bins)
plots.title('Bootstrap sample');

# Median of the sample we are resampling from. The global `sample_median`
# comes from a different random draw (generate_sample_median), so reporting it
# as "Our Sample Median" would describe the wrong sample.
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

print("Population Median = ", pop_median)
print("Our Sample Median = ", our_sample_median)
print("Bootstrap Sample Median = ",
      percentile(50, boot_sample.column('Total Compensation')))

In [None]:
# Helper: one bootstrap step -- resample from our_sample and report the median.
def one_bootstrap_median():
    """Return the median 'Total Compensation' of one resample of our_sample.

    Relies on the defaults of Table.sample(); NOTE(review): these are expected
    to be "same number of rows, with replacement" -- confirm against the
    datascience documentation.
    """
    resampled_compensation = our_sample.sample().column('Total Compensation')
    return percentile(50, resampled_compensation)

In [None]:
# Repeat the bootstrap step many times, collecting one median per resample.
bootstrap_medians = make_array()
num_resamples = 1000
for _ in np.arange(num_resamples):
    bootstrap_medians = np.append(bootstrap_medians, one_bootstrap_median())

In [None]:
# finally let's visualize the bootstrap medians
Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# Median of the sample the bootstrap resamples from. The global
# `sample_median` belongs to a different random draw, so it is not used here.
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

# let's also plot the "true" median, in red, and the sample median, in blue
# NOTE(review): the tiny negative y value presumably just places the dots on
# the x-axis -- confirm.
plots.scatter(pop_median, -0.00000001, color="red");
plots.scatter(our_sample_median, -0.00000001, color="blue");

## Confidence Intervals

In [None]:
# Make an interval based on the middle 95% of the bootstrap medians:
# cut off 2.5% in each tail, keeping the values between the 2.5th and 97.5th percentiles.

left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# Median of the sample the bootstrap resamples from. The global
# `sample_median` belongs to a different random draw, so it is not used here.
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

# Gold segment = the interval; red = population median; blue = our sample median.
# The dots get a higher zorder so they are drawn on top of the interval line.
plots.plot([left, right], [0, 0], color="gold", lw=3, zorder=1);
plots.scatter(pop_median, 0, color="red", zorder=2);
plots.scatter(our_sample_median, 0, color="blue", zorder=2);