# Lecture 19: Why the Mean Matters

## 8.3: Module 8, Notebook 3

In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

## Standard Units

In [None]:
exams = Table.read_table('exams_fa18.csv')
exams.show(5)

In [None]:
exams.hist(overlay=False, bins=20)

In [None]:
def standard_units(x):
    """Express every value of array x in standard units.

    Each value has the average of x subtracted from it and is then divided
    by the standard deviation of x (np.std, which by default is the
    population SD).

    x: a NumPy array of numbers.

    Returns an array of the same length as x.
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

In [None]:
# Re-express each exam's scores in standard units and attach the results
# to the exams table as two new columns.
midterm_su = standard_units(exams.column('Midterm'))
exams = exams.with_column('Midterm in Standard Units', midterm_su)

final_su = standard_units(exams.column('Final'))
exams = exams.with_column('Final in Standard Units', final_su)

# Display the first 10 rows, now including the standard-unit columns.
exams.show(10)

In [None]:
exams.hist('Midterm in Standard Units', bins=20)

In [None]:
exams.hist('Final in Standard Units', bins=20)

## Central Limit Theorem ##

In [None]:
# united = Table.read_table('united_summer2015.csv')
united = Table.read_table('united.csv')
# Bin edges used for the 'Delay' histogram: -20, -10, ..., 290
# (width 10; the stop value 300 is excluded by np.arange).
united_bins = np.arange(-20, 300, 10)
united

In [None]:
united.hist('Delay', bins=united_bins)

In [None]:
# Mean and SD of every value in the 'Delay' column.
# np.std with its default arguments computes the population SD (ddof=0).
delays = united.column('Delay')
delay_mean = np.mean(delays)
delay_sd = np.std(delays)
delay_mean, delay_sd

In [None]:
# 50th percentile (the median) of the delays, for comparison with delay_mean.
percentile(50, delays)

In [None]:
def one_sample_mean(sample_size):
    """Return the mean delay of one random sample of flights.

    sample_size: number of rows to draw from the `united` table with
        Table.sample (its defaults are used, so the draw is with replacement).

    Returns the mean of the sampled rows' 'Delay' values.
    """
    sampled_delays = united.sample(sample_size).column('Delay')
    return np.mean(sampled_delays)

In [None]:
one_sample_mean(100)

In [None]:
def ten_thousand_sample_means(sample_size):
    """Simulate 10,000 sample means.

    Calls one_sample_mean(sample_size) 10,000 times and collects the results.

    sample_size: passed unchanged to one_sample_mean on every repetition.

    Returns a NumPy float array of length 10,000 holding the means in the
    order they were generated.
    """
    repetitions = 10000
    # Fill a preallocated array instead of growing one with np.append, which
    # copies the entire array on every iteration.
    means = np.empty(repetitions)
    for i in range(repetitions):
        means[i] = one_sample_mean(sample_size)
    return means

In [None]:
sample_means_100 = ten_thousand_sample_means(100)

In [None]:
sample_means_100

In [None]:
len(sample_means_100)

In [None]:
Table().with_column('Mean of 100 flight delays', sample_means_100).hist(bins=20)

print('Population Average:', delay_mean)

In [None]:
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column('Mean of 400 flight delays', sample_means_400).hist(bins=20)
print('Population Average:', delay_mean)

In [None]:
sample_means_900 = ten_thousand_sample_means(900)
Table().with_column('Mean of 900 flight delays', sample_means_900).hist(bins=20)
print('Population Average:', delay_mean)

## Distribution of the Sample Average

In [None]:
means_tbl = Table().with_columns(
    '400', sample_means_400,
    '900', sample_means_900,
)

In [None]:
# Histogram of the columns of means_tbl on shared bins of width 0.5
# with edges 5, 5.5, ..., 30.5.
means_tbl.hist(bins = np.arange(5, 31, 0.5))
# The trailing semicolon keeps the notebook from echoing the title object.
plots.title('Distribution of Sample Average');

In [None]:
united.num_rows

In [None]:
# How many possible sample means are there?
# At most num_rows ** 400: that is the number of ordered samples of size 400
# that can be drawn with replacement from the table (different samples may
# share the same mean, so this is an upper bound).
united.num_rows ** 400

## Variability of the Sample Average

In [None]:
# Recompute the mean and SD of the 'Delay' column; these globals are read by
# plot_and_summarize_sample_means when it prints its summary.
delay_mean = np.mean(united.column('Delay'))
delay_sd = np.std(united.column('Delay'))

In [None]:
"""Empirical distribution of random sample means"""

def plot_and_summarize_sample_means(sample_size):
    """Simulate sample means for one sample size, then summarize and plot them.

    sample_size: forwarded to ten_thousand_sample_means.

    Prints the sample size, the population mean and SD (read from the globals
    delay_mean and delay_sd), and the average and SD of the simulated means,
    then draws a 20-bin histogram of the simulated means. Returns nothing.
    """
    simulated_means = ten_thousand_sample_means(sample_size)

    # Text summary: center and spread of the simulation beside the population's.
    print("Sample size: ", sample_size)
    print("Population mean:", delay_mean)
    print("Average of sample means: ", np.mean(simulated_means))
    print("Population SD:", delay_sd)
    print("SD of sample means:", np.std(simulated_means))

    # Histogram of the simulated means, titled with the sample size.
    histogram_source = Table().with_column('Sample Means', simulated_means)
    histogram_source.hist(bins=20)
    plots.xlabel('Sample Means')
    title_text = 'Sample Size ' + str(sample_size)
    plots.title(title_text)

In [None]:
plot_and_summarize_sample_means(100)

In [None]:
# Ratio of the population SD to the SD of the sample means for sample size 100.
# Both numbers are hard-coded, presumably copied from the output printed by the
# previous cell; update them if the data or simulation changes.
# The ratio is about 10, i.e. sqrt(100).
39.48 / 3.932

In [None]:
plot_and_summarize_sample_means(400)

In [None]:
# Ratio of the population SD to the SD of the sample means for sample size 400.
# Both numbers are hard-coded, presumably copied from the output printed by the
# previous cell; update them if the data or simulation changes.
# The ratio is about 20, i.e. sqrt(400).
39.48 / 1.973

In [None]:
plot_and_summarize_sample_means(625)

In [None]:
# Ratio of the population SD to the SD of the sample means for sample size 625.
# Both numbers are hard-coded, presumably copied from the output printed by the
# previous cell; update them if the data or simulation changes.
# The ratio is about 25, i.e. sqrt(625).
39.48 / 1.577