In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action='ignore',category=np.VisibleDeprecationWarning)

## Part 1: Flight data: different sampling regimes ##

In [None]:
# Load the flight records and prepend a 0-based 'Row' index column.
flights_raw = Table.read_table('united.csv')
row_numbers = np.arange(flights_raw.num_rows)
united = flights_raw.with_column('Row', row_numbers).move_to_start('Row')
united

In [None]:
# A deterministic sample: keep only rows whose 'Destination' equals 'HNL'.
destination_code = 'HNL'
united.where('Destination', destination_code)

In [None]:
# Another deterministic sample: every 100th row, beginning at row 42.
every_hundredth_from_42 = np.arange(42, united.num_rows, 100)
united.take(every_hundredth_from_42)

In [None]:
# A third deterministic sample: three explicitly listed row indices.
handpicked_rows = make_array(15, 30, 35)
united.take(handpicked_rows)

In [None]:
# A random sample: unpack what this code does!
# (Pick a random starting index from 0..99, then keep every 100th row after it.)
candidate_starts = np.arange(100)
start = np.random.choice(candidate_starts)
sampled_rows = np.arange(start, united.num_rows, 100)
systematic_sample = united.take(sampled_rows)
systematic_sample

## Part 2: Empirical distribution for rolling a die ##

In [None]:
# Model a die as a one-column table holding the faces 1 through 6.
faces = np.arange(1, 7)
die = Table().with_columns('Face', faces)
die

In [None]:
# Draw a random sample of 10 rows from the die table and display it.
ten_rolls = die.sample(10)
ten_rolls

In [None]:
# Draw a histogram of the die table, leaving the bins at hist()'s defaults
# (no `bins` argument is passed here).
die.hist()

In [None]:
# The default bins are not very pretty, so choose them explicitly:
# edges at 0.5, 1.5, ..., 6.5 put each integer face in the middle of its own bin.
roll_bins = np.arange(start=0.5, stop=6.6, step=1)
die.hist(bins=roll_bins)

In [None]:
# Introduce randomness: histogram of a random sample of 10 rolls,
# drawn with the same face-centered bins.
small_sample_rolls = die.sample(10)
small_sample_rolls.hist(bins=roll_bins)

In [None]:
# A bigger sample: histogram of 100 random rolls.
medium_sample_rolls = die.sample(100)
medium_sample_rolls.hist(bins=roll_bins)

In [None]:
# An even bigger sample: histogram of 10,000 random rolls.
large_sample_rolls = die.sample(10000)
large_sample_rolls.hist(bins=roll_bins)

## Part 3: Back to the flight data ##

In [None]:
# Histogram of 'Delay' for every flight: bin edges every 10 from -20 to 200.
delay_bins = np.arange(-20, 201, 10)
united.hist('Delay', bins=delay_bins, unit='minute')

In [None]:
# Same 'Delay' histogram, restricted to the systematic sample.
delay_bins = np.arange(-20, 201, 10)
systematic_sample.hist('Delay', bins=delay_bins, unit='minute')

In [None]:
# A different kind of random sample: draw 500 flights at random and plot
# their 'Delay' values with the same bin edges (every 10 from -20 to 200).
sample_size = 500
delay_bins = np.arange(-20, 201, 10)
random_flights = united.sample(sample_size)
random_flights.hist('Delay', bins=delay_bins, unit='minute')

In [None]:
# Introduce a STATISTIC: compare the average 'Delay' over the full table
# with the average 'Delay' of one random sample of `sample_size` rows.
all_delays = united.column('Delay')
delay = np.average(all_delays)

sample = united.sample(sample_size)
sampled_delays = sample.column('Delay')
sample_delay = np.average(sampled_delays)

for label, value in (('Average delay', delay), ('Sample average', sample_delay)):
    print(label, value)

In [None]:
# Let's generate a bunch of sample averages instead:

def random_sample_average(sample_size=500):
    """Return the average 'Delay' of one random sample of rows from `united`.

    Parameters
    ----------
    sample_size : int, optional
        Number of rows to draw for the sample. Defaults to 500, the value
        this function previously hard-coded, so existing no-argument calls
        behave exactly as before.

    Returns
    -------
    The result of `np.average` over the sampled 'Delay' column.
    """
    return np.average(united.sample(sample_size).column('Delay'))

# Simulate 1000 sample averages. The values are gathered in a list and
# converted to an array once: calling np.append inside a loop copies the
# whole array on every iteration. Using a comprehension also keeps the
# throwaway loop variable out of the notebook's global namespace.
num_repetitions = 1000
sample_averages = np.array(
    [random_sample_average() for _ in range(num_repetitions)]
)

In [None]:
# Wrap the simulated averages in a one-column table and display it.
column_label = 'Sample average'
sample_average_table = Table().with_columns(column_label, sample_averages)
sample_average_table

In [None]:
# Distribution of the simulated sample averages, with a large red dot
# drawn at x = `delay` for comparison.
sample_average_table.hist()
marker_height = 0.10  # y-coordinate at which the dot is drawn
plots.scatter(x=delay, y=marker_height, s=300, color='red')