## Statistics

In [115]:
%matplotlib inline
import numpy as np
import pandas as pd
import random
np.random.seed(29)

#### How likely is it that you roll doubles when rolling two dice?

In [269]:
outcomes = [1, 2, 3, 4, 5, 6]

rolls = np.random.choice(outcomes, size = (10000, 2))

rolls

array([[1, 6],
       [6, 3],
       [6, 2],
       ...,
       [6, 6],
       [3, 2],
       [6, 4]])

In [273]:
rolls_df = pd.DataFrame(rolls)
rolls_df

Unnamed: 0,0,1
0,1,6
1,6,3
2,6,2
3,3,1
4,5,2
...,...,...
9995,1,1
9996,2,3
9997,6,6
9998,3,2


In [276]:
(rolls_df[0] == rolls_df[1]).mean()

0.1646

#### If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?

In [284]:
flips = np.random.choice(['h', 't'], size = (10000, 8))
flips

array([['h', 'h', 'h', ..., 't', 'h', 'h'],
       ['t', 'h', 'h', ..., 't', 'h', 't'],
       ['t', 't', 'h', ..., 't', 't', 't'],
       ...,
       ['t', 'h', 'h', ..., 't', 't', 't'],
       ['h', 'h', 't', ..., 't', 'h', 'h'],
       ['h', 'h', 't', ..., 'h', 'h', 't']], dtype='<U1')

In [285]:
count_of_heads = np.char.count(flips, 'h')
count_of_heads

array([[1, 1, 1, ..., 0, 1, 1],
       [0, 1, 1, ..., 0, 1, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 1, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 1, 1],
       [1, 1, 0, ..., 1, 1, 0]])

In [286]:
sums_by_trial = count_of_heads.sum(axis = 1)
sums_by_trial

array([6, 4, 1, ..., 3, 4, 4])

In [287]:
three_heads = (sums_by_trial == 3)
three_heads

array([False, False, False, ...,  True, False, False])

In [288]:
flip_rate = three_heads.astype(int).mean()
flip_rate

0.218

In [289]:
more_than_three_heads = (sums_by_trial > 3)
more_than_three_heads

array([ True,  True, False, ..., False,  True,  True])

In [290]:
flip_rate_two = more_than_three_heads.astype(int).mean()
flip_rate_two

0.6369

#### There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alumni to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

In [306]:
# set probability

p_webdev = 0.75
p_ds = 0.25
billboard_options = ['WD', 'DS']

billboards = np.random.choice(billboard_options, size = (10000, 2), p = [0.75, 0.25])
billboards

array([['WD', 'WD'],
       ['WD', 'WD'],
       ['WD', 'DS'],
       ...,
       ['DS', 'WD'],
       ['WD', 'WD'],
       ['WD', 'DS']], dtype='<U2')

In [307]:
billboards == 'DS'

array([[False, False],
       [False, False],
       [False,  True],
       ...,
       [ True, False],
       [False, False],
       [False,  True]])

In [309]:
success = ((billboards == 'DS').sum(axis = 1) == 2)

In [310]:
success.mean()

0.0612

- My way

In [296]:
cohorts = np.random.choice(['data', 'wd_one', 'wd_two', 'wd_three'], size = (10000, 2))

In [297]:
count_of_data = np.char.count(cohorts, 'data')
count_of_data

array([[0, 1],
       [0, 0],
       [0, 0],
       ...,
       [0, 1],
       [0, 1],
       [1, 0]])

In [298]:
sums_by_trial = count_of_data.sum(axis = 1)
sums_by_trial

array([1, 0, 0, ..., 1, 1, 1])

In [299]:
both_data = (sums_by_trial == 2)
both_data

array([False, False, False, ..., False, False, False])

In [300]:
both_data_rate = both_data.astype(int).mean()
both_data_rate

0.0655

#### Codeup students buy, on average, 3 poptart packages with a standard deviation of 1.5 a day from the snack vending machine. If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon? 
- (Remember, if you have mean and standard deviation, use the np.random.normal) You'll need to make a judgement call on how to handle some of your values

In [313]:
mean = 3
sd = 1.5 

# we will look at sales over 5 days
n_days = 5

In [314]:
poptart_sales = np.random.normal(mean, sd, size = (10000, n_days))
poptart_sales

array([[ 3.63147783,  1.91177869,  0.66372705,  3.97986281,  5.204228  ],
       [ 3.15013711,  4.19480259,  1.15373771,  3.69387132,  6.55805423],
       [ 4.02683806,  4.89790863,  3.64034655,  4.73573211, -0.05019664],
       ...,
       [ 3.30451472,  2.77085016,  1.10301542,  2.82293027,  3.66724451],
       [ 0.4130108 ,  1.15469602,  0.82614324,  3.71624682,  0.0473768 ],
       [ 6.05495774,  3.81845894,  2.80830275,  6.73197467,  3.26089884]])

In [315]:
# Account for negative sales as that is not realistic for this scenario, can have negative sales

poptart_sales[poptart_sales < 0] = 0

In [323]:
poptart_weekly_sales = poptart_sales.sum(axis = 1)
poptart_weekly_sales

array([15.39107439, 18.75060297, 17.30082535, ..., 13.66855509,
        6.15747367, 22.67459295])

In [324]:
(poptart_weekly_sales < 17).mean()

0.7235

#### Compare Heights

- Men have an average height of 178 cm and standard deviation of 8cm.
- Women have a mean of 170, sd = 6cm.
- Since you have means and standard deviations, you can use np.random.normal to generate observations.
- If a man and woman are chosen at random, what is the likelihood the woman is taller than the man?

In [93]:
m_mu, m_sigma = 178, 8 # mean and standard deviation

men_height = np.random.normal(m_mu, m_sigma, 10_000)
men_height

array([180.07598716, 188.06210761, 184.2413496 , ..., 179.30487776,
       176.67426759, 174.81791076])

In [94]:
f_mu, f_sigma = 170, 6 # mean and standard deviation
fem_height = np.random.normal(f_mu, f_sigma, 10_000)
fem_height

array([171.85365457, 161.73768498, 168.55782948, ..., 170.32338005,
       177.97754167, 173.04849163])

In [328]:
fem_height > men_height

array([False, False, False, ..., False,  True, False])

In [329]:
taller_fem_rate = taller_fem.astype(int).mean()
taller_fem_rate

0.2198

#### When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?

In [334]:
# 1 in 250 or 1/250 = .004
# 249 in 250 od 249/250 = .0996

success = .996
fail = .004
n_students = 50 

installs = np.random.choice(['s', 'f'], size = (10000, n_students), p = [0.996, 0.004])
installs

array([['s', 's', 's', ..., 's', 's', 's'],
       ['s', 's', 's', ..., 's', 's', 's'],
       ['s', 's', 's', ..., 's', 's', 's'],
       ...,
       ['s', 's', 's', ..., 's', 's', 's'],
       ['s', 's', 's', ..., 's', 's', 's'],
       ['s', 's', 's', ..., 's', 's', 's']], dtype='<U1')

In [339]:
((installs == 's').sum(axis = 1) == 50).mean()

0.8164

In [341]:
success = .996
fail = .004
n_students = 100 

installs = np.random.choice(['s', 'f'], size = (10000, n_students), p = [0.996, 0.004])
installs

array([['s', 's', 's', ..., 's', 's', 's'],
       ['s', 's', 's', ..., 's', 's', 's'],
       ['s', 's', 's', ..., 's', 's', 's'],
       ...,
       ['s', 's', 's', ..., 's', 's', 's'],
       ['s', 's', 's', ..., 's', 's', 's'],
       ['s', 's', 's', ..., 's', 's', 's']], dtype='<U1')

In [342]:
((installs == 's').sum(axis = 1) == 100).mean()

0.6696

#### What is the probability that we observe an installation issue within the first 150 students that download anaconda?

In [343]:
installs = np.random.choice(['s', 'f'], size = (10000, 150), p = [0.996, 0.004])

In [345]:
bools = installs == 's'

In [347]:
n_successful_installs = bools.sum(axis = 1)
n_successful_installs

array([150, 149, 150, ..., 150, 149, 149])

In [351]:
((bools.sum(axis=1)) < 150).mean()

0.4547

#### How likely is it that 450 students all download anaconda without an issue?

In [356]:
installs = np.random.choice(['s', 'f'], size = (10000, 450), p = [0.996, 0.004])

In [357]:
bools = installs == 's'

In [358]:
n_successful_installs = bools.sum(axis = 1)
n_successful_installs

array([449, 446, 449, ..., 445, 449, 446])

In [359]:
((bools.sum(axis = 1)) == 450).mean()

0.1637

#### There's a 70% chance on any given day that there will be at least one food truck at Travis Park. However, you haven't seen a food truck there in 3 days. How unlikely is this?

In [369]:
# assign outcomes:

# 0 = no truck
# 1 = truck present

# probailities per the question:
# 70% chance its there
# 30% chance its not

trucks = np.random.choice([0, 1], size= (10000, 3), p = [0.3, 0.7])
trucks

array([[1, 0, 1],
       [1, 1, 1],
       [0, 1, 1],
       ...,
       [1, 0, 1],
       [0, 1, 1],
       [1, 0, 1]])

In [370]:
# Probability that no truck shows up

(trucks.sum(axis = 1) == 0).mean()

0.0252

In [371]:
# Probability that the truck shows up at least once

(trucks.sum(axis = 1) >= 1).mean()

0.9748

#### How likely is it that a food truck will show up sometime this week?

In [372]:
weekly_trucks = np.random.choice([0, 1], size= (10000, 8), p = [0.3, 0.7])

In [373]:
(weekly_trucks.sum(axis = 1) > 0).mean()

0.9999

#### If 23 people are in the same room, what are the odds that two of them share a birthday? What if it's 20 people? 40?

In [374]:
birthdates = range(0, 366)

n_students = 23

In [377]:
birthday = np.random.choice(birthdates, size = (10000, 23))
birthday

array([[116, 154, 204, ...,   2, 144,   1],
       [309, 304, 151, ..., 117, 142, 124],
       [230, 314, 241, ...,  40, 245, 328],
       ...,
       [363, 222, 127, ...,   4, 157, 237],
       [330, 145, 126, ..., 317, 353, 288],
       [115, 295, 285, ..., 108, 289, 311]])

In [379]:
birthdays_df = pd.DataFrame(birthday)
birthdays_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,116,154,204,322,175,311,148,359,357,217,...,12,90,146,156,59,93,14,2,144,1
1,309,304,151,310,126,256,195,78,78,102,...,290,0,20,110,345,59,238,117,142,124
2,230,314,241,228,316,133,199,344,331,22,...,283,187,364,33,347,307,127,40,245,328
3,53,304,126,5,323,240,167,208,252,215,...,178,135,173,293,323,3,260,288,222,348
4,282,221,186,164,27,332,301,208,26,340,...,288,151,57,117,80,325,339,298,362,135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,280,178,14,65,112,188,283,51,294,189,...,113,359,318,76,209,288,127,28,272,203
9996,351,29,207,31,365,86,285,175,3,68,...,43,286,114,113,154,221,245,44,319,315
9997,363,222,127,337,252,334,176,53,92,265,...,281,216,157,291,276,346,86,4,157,237
9998,330,145,126,50,221,0,18,145,268,193,...,344,75,75,279,62,90,50,317,353,288


In [382]:
#check that the unique value count corresponds to someone sharing a birthday
(birthdays_df.nunique(axis = 1) < 23).mean()

0.5047

In [385]:
#40 students
birthday = np.random.choice(birthdates, size = (10000, 40))
birthdays_df = pd.DataFrame(birthday)

In [386]:
(birthdays_df.nunique(axis = 1) < 40).mean()

0.8926

In [387]:
#20 students
birthday = np.random.choice(birthdates, size = (10000, 20))
birthdays_df = pd.DataFrame(birthday)

In [388]:
(birthdays_df.nunique(axis = 1) < 20).mean()

0.4081