## Lecture 11 & 12 - Simulation and Distributions ##

## 5.2: Module 5 Notebook 2 ##

In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Random Sampling ##

In [None]:
# Load the table stored in united.csv, then add a 'Row' column holding each
# row's index (0, 1, 2, ...) and move it to the front so rows can be
# referred to by number.
united = Table.read_table('united.csv')
united = (
    united
    .with_column('Row', np.arange(united.num_rows))
    .move_to_start('Row')
)
united

In [None]:
# Deterministic selection: only the rows whose 'Destination' is 'JFK'.
united.where('Destination', 'JFK') 

In [None]:
# Deterministic selection: every 1000th row, starting from row 0.
united.take(np.arange(0, united.num_rows, 1000))

In [None]:
# Deterministic selection: three hand-picked rows, by index.
united.take(make_array(34, 6321, 10040))

In [None]:
# Systematic sample: choose a random starting row among the first 1000,
# then keep every 1000th row from that point to the end of the table.
start = np.random.choice(np.arange(1000))
systematic_rows = np.arange(start, united.num_rows, 1000)
systematic_sample = united.take(systematic_rows)
systematic_sample.show()

## Distributions ##

In [None]:
# A one-column table with one row per face of a six-sided die (1 through 6).
die = Table().with_column('Face', np.arange(1, 7))
die

In [None]:
# Simulate 10 rolls: draw 10 rows at random from `die`
# (Table.sample draws with replacement by default).
die.sample(10)

In [None]:
# Histogram of the six faces using the default bins.
die.hist()

In [None]:
# Bin edges at the half-integers 0.5, 1.5, ..., 6.5, so that each face
# 1..6 sits in the middle of its own unit-wide bin.
roll_bins = np.arange(1, 8) - 0.5

In [None]:
# Histogram of the six faces with one unit-wide bin per face.
die.hist(bins=roll_bins)

In [None]:
# Empirical distribution of 10 simulated rolls.
die.sample(10).hist(bins=roll_bins)

In [None]:
# Empirical distribution of 1000 simulated rolls.
die.sample(1000).hist(bins=roll_bins)

In [None]:
# Empirical distribution of 100000 simulated rolls.
die.sample(100000).hist(bins=roll_bins)

## Large Random Samples ##

In [None]:
# Display the full table again; it is treated as the population below.
united 

In [None]:
# (Population) Probability Distribution
# Histogram of the 'Delay' value of every row in the table, using
# 5-wide bins whose edges run from -20 to 200.
united_bins = np.arange(-20, 201, 5)
united.hist('Delay', bins=united_bins)

In [None]:
# Smallest delay in the table. np.min reduces the array in one vectorized
# call (the built-in min walks it element by element) and matches the
# np.average / np.median calls used for the other summaries.
np.min(united.column('Delay'))

In [None]:
# Largest delay in the table. np.max reduces the array in one vectorized
# call (the built-in max walks it element by element) and matches the
# np.average / np.median calls used for the other summaries.
np.max(united.column('Delay'))

In [None]:
# Mean delay over every row of the table.
np.average(united.column('Delay'))

In [None]:
# (Sample) Empirical Distribution
# Delays in a random sample of 10 rows, drawn with the same bins as the
# population histogram so the two pictures can be compared.
united.sample(10).hist('Delay', bins = united_bins)

In [None]:
# (Sample) Empirical Distribution
# Delays in a random sample of 1000 rows, drawn with the same bins as the
# population histogram so the two pictures can be compared.
united.sample(1000).hist('Delay', bins = united_bins)

## Simulating Statistics ##

In [None]:
# (Population) Parameter
# Median delay computed over every row of the table.
np.median(united.column('Delay'))

In [None]:
# (Sample) Statistic
# Median delay in one random sample of 10 rows; it varies from run to run.
np.median(united.sample(10).column('Delay'))

In [None]:
# (Sample) Statistic
# Median delay in one random sample of 100 rows; it varies from run to run.
np.median(united.sample(100).column('Delay'))

In [None]:
def sample_median(size):
    """Return the median 'Delay' of one random sample of `size` rows.

    The sample is drawn from the global `united` table with
    `Table.sample`, so every call produces a fresh random value.
    """
    sampled_delays = united.sample(size).column('Delay')
    return np.median(sampled_delays)

In [None]:
# One simulated value of the statistic: median delay of a sample of 10 rows.
sample_median(10)

In [None]:
# Simulate the statistic 1000 times: each repetition draws a fresh sample
# of 10 rows and records its median delay.
#
# The values are gathered in a list and turned into an array once at the
# end. np.append copies the whole array on every call, so growing an array
# inside the loop does work proportional to the square of the repetitions.
# dtype=float keeps the result a float array even if no values are drawn.
sample_medians = np.array(
    [sample_median(10) for _ in range(1000)],
    dtype=float,
)

In [None]:
# Empirical distribution of the simulated medians currently held in
# `sample_medians`, using unit-wide bins with edges from -10 to 30.
Table().with_column('Sample medians', sample_medians).hist(bins = np.arange(-10,31))

In [None]:
# Simulate the statistic 1000 times: each repetition draws a fresh sample
# of 1000 rows and records its median delay.
#
# The values are gathered in a list and turned into an array once at the
# end. np.append copies the whole array on every call, so growing an array
# inside the loop does work proportional to the square of the repetitions.
# dtype=float keeps the result a float array even if no values are drawn.
sample_medians = np.array(
    [sample_median(1000) for _ in range(1000)],
    dtype=float,
)

In [None]:
# Empirical distribution of the simulated medians currently held in
# `sample_medians`, on the same bins (edges from -10 to 30) as the
# size-10 histogram so the spreads can be compared directly.
Table().with_column(
    'Sample medians', sample_medians).hist(bins = np.arange(-10,31))

### Empirical Distributions Overlaid

In [None]:
# Simulate the sample median for three sample sizes side by side.
# Each repetition draws one sample of size 10, one of size 100 and one of
# size 1000 (in that order) and records the median delay of each.
num_simulations = 2000

# Gather the values in plain lists and convert to arrays once at the end:
# np.append copies the whole array on every call, so growing three arrays
# inside the loop does work proportional to the square of num_simulations.
medians_10, medians_100, medians_1000 = [], [], []

for repetition in range(num_simulations):
    medians_10.append(sample_median(10))
    medians_100.append(sample_median(100))
    medians_1000.append(sample_median(1000))

# dtype=float keeps these float arrays even when num_simulations is 0.
sample_medians_10 = np.array(medians_10, dtype=float)
sample_medians_100 = np.array(medians_100, dtype=float)
sample_medians_1000 = np.array(medians_1000, dtype=float)

In [None]:
# Collect the three simulations into one table, one column per sample size.
# NOTE: this rebinds `sample_medians`, which earlier cells used for an array.
sample_medians = Table().with_columns(
    'Size 10', sample_medians_10,
    'Size 100', sample_medians_100,
    'Size 1000', sample_medians_1000,
)

In [None]:
# Histogram of all three columns on shared unit-wide bins (edges -5 to 29),
# so the distributions for the three sample sizes can be compared.
sample_medians.hist(bins = np.arange(-5, 30))

## Assessing Models

### Ex. 1: Swain vs. Alabama

In [None]:
# Model proportions for the two categories of the eligible population.
# The statistic below reads item(0), so the first entry (0.26) is
# presumably the proportion of black men and 0.74 everyone else.
eligible_population = make_array(0.26, 0.74)
eligible_population

In [None]:
# One simulated sample of 100 people under the model: the proportion of the
# sample that fell in each of the two categories.
sample_proportions(100, eligible_population)

In [None]:
# statistic: number of black men among random sample of 100 men from eligible population
# sample_proportions returns proportions, so the first one is multiplied by
# the sample size (100) to turn it into a count.

100 * sample_proportions(100, eligible_population).item(0)

In [None]:
# Simulation

counts = make_array()

for i in np.arange(10000):
    new_count = 100 * sample_proportions(100, eligible_population).item(0)
    counts = np.append(counts, new_count)

In [None]:
# Display the simulated counts.
counts

In [None]:
# Visualization
# Empirical distribution of the simulated counts, with the observed count
# marked as a red dot on the horizontal axis.
# NOTE: the bin edges run from 9.5 to 44.5, so the observed count of 8
# lies to the left of every bin.
observed_count = 8

simulated_counts = Table().with_column('Random Sample Count', counts)
simulated_counts.hist(bins=np.arange(9.5, 45, 1))

plots.scatter(observed_count, 0, s=30, color='red');

### Ex. 2: Mendel and Pea Flowers

In [None]:
# Model proportions: 75% in the first category and 25% in the second.
# The statistic below treats the first category as purple-flowering plants.
model = make_array(0.75, 0.25)

In [None]:
# One simulated sample of 929 plants under the model: the proportion of the
# sample that fell in each of the two categories.
sample_proportions(929, model)

In [None]:
# statistic: distance between sample percent (of purple plants) and 75
# The first sample proportion is scaled to a percent; abs() makes the
# distance the same whether the sample is above or below 75%.

abs(100 * sample_proportions(929, model).item(0) - 75)

In [None]:
# Simulation

distances = make_array()

for i in np.arange(10000):
    new_distance = abs(100 * sample_proportions(929, model).item(0) - 75)
    distances = np.append(distances, new_distance)

In [None]:
# Empirical distribution of the simulated distances, using the default bins.
Table().with_column('Distance from 75%', distances).hist()

In [None]:
# Observed value of the statistic: 705 out of 929 plants (presumably the
# purple-flowering ones in the real data), as a percent, and its distance
# from the 75% the model predicts.
observed_percent = 100 * (705 / 929)
observed_distance = abs(observed_percent - 75)
observed_distance

In [None]:
# Empirical distribution of the simulated distances, with the observed
# distance marked as a red dot on the horizontal axis. The trailing
# semicolon keeps the notebook from echoing the scatter call's return value.
Table().with_column('Distance from 75%', distances).hist()
plots.scatter(observed_distance, 0, color='red', s=30);