In [1]:
import numpy as np
import pandas as pd

How likely is it that you roll doubles when rolling two dice?

In [2]:
n_trials = 100_000 # rows
n_dice = 2 # columns

rolls_array = np.random.choice([1, 2, 3, 4, 5, 6], n_trials * n_dice)\
        .reshape(n_trials, n_dice)

rolls_array

array([[4, 6],
       [5, 3],
       [5, 1],
       ...,
       [6, 6],
       [2, 6],
       [5, 1]])

In [3]:
(rolls_array[:,0] == rolls_array[:,1]).mean()

0.16696

In [4]:
# pandas solution

rolls_df = pd.DataFrame(rolls_array)
rolls_df['die1'] = np.random.choice([1, 2, 3, 4, 5, 6], size = 100_000)
rolls_df['die2'] = np.random.choice([1, 2, 3, 4, 5, 6], size = 100_000)
rolls_df

Unnamed: 0,0,1,die1,die2
0,4,6,6,2
1,5,3,2,2
2,5,1,6,1
3,1,3,4,6
4,2,4,5,5
...,...,...,...,...
99995,5,3,6,3
99996,6,2,4,1
99997,6,6,3,3
99998,2,6,3,4


In [5]:
rolls_df['is_pair'] = (rolls_df.die1 == rolls_df.die2)
rolls_df.is_pair.mean()

0.16751

In [6]:
# numpy solution

a = np.random.choice([1, 2, 3, 4, 5, 6], size = 100_000)
b = np.random.choice([1, 2, 3, 4, 5, 6], size = 100_000)
(a == b).mean()

0.16563

If you flip 8 coins, what is the probability of getting exactly 3 heads? 

What is the probability of getting more than 3 heads?

In [7]:
n_trials = nrows = 100_000
n_coins = ncols = 8

# 1 is heads, 0 is tails (encoded)
coin_flips = np.random.choice([1, 0], n_trials * n_coins)\
.reshape(nrows, ncols)
coin_flips

array([[1, 0, 1, ..., 1, 0, 1],
       [1, 1, 1, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 1, 0, ..., 1, 1, 1],
       [1, 1, 0, ..., 1, 0, 1],
       [1, 0, 1, ..., 0, 0, 1]])

In [18]:
#axis 1 is sum by row
num_of_heads = coin_flips.sum(axis = 1)
num_of_heads

array([4, 5, 2, ..., 5, 5, 5])

In [10]:
(num_of_heads == 3).mean()

0.2181

In [11]:
(num_of_heads > 3).mean()

0.63614

There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. 
Assuming that Codeup randomly selects an alumni to put on a billboard...

what are the odds that the two billboards I drive past both have data science students on them?

In [100]:
n_trials = 100_000
billboards = 2
# web_dev = 3 (75%)
# data_sci = 1 (25%)

cohort_odds = np.random.choice(['webdev', 'datasci'], 
              size=(n_trials, billboards), 
              p=[.75, .25])
cohort_odds

array([['webdev', 'webdev'],
       ['webdev', 'datasci'],
       ['datasci', 'datasci'],
       ...,
       ['webdev', 'datasci'],
       ['webdev', 'webdev'],
       ['webdev', 'datasci']], dtype='<U7')

In [101]:
cohort_df = pd.DataFrame(cohort_odds)
cohort_df.columns = ['first_billboard', 'second_billboard']
cohort_df

Unnamed: 0,first_billboard,second_billboard
0,webdev,webdev
1,webdev,datasci
2,datasci,datasci
3,webdev,webdev
4,webdev,datasci
...,...,...
99995,webdev,webdev
99996,webdev,webdev
99997,webdev,datasci
99998,webdev,webdev


In [102]:
cohort_df['both_ds'] = \
(cohort_df.first_billboard == 'datasci') & (cohort_df.second_billboard == 'datasci')
cohort_df

Unnamed: 0,first_billboard,second_billboard,both_ds
0,webdev,webdev,False
1,webdev,datasci,False
2,datasci,datasci,True
3,webdev,webdev,False
4,webdev,datasci,False
...,...,...,...
99995,webdev,webdev,False
99996,webdev,webdev,False
99997,webdev,datasci,False
99998,webdev,webdev,False


In [105]:
cohort_df['both_ds'].mean()

0.06318

Codeup students buy, on average, 3 poptart packages with a standard deviation of 1.5 a day from the snack vending machine. 

If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon? 

(Remember, if you have mean and standard deviation, use the np.random.normal) You'll need to make a judgement call on how to handle some of your values

In [99]:
mean = 3 # packages
st_dev = 1.5 # packages
n_days = 5 # days
restocked = 17 # packages
mean_week = mean * n_days

In [70]:
n_weeks = 10000
n_packages_per_day = np.round(np.random.normal(mean, st_dev, size=(n_weeks, n_days)))

In [71]:
n_left_per_week = restocked - np.sum(n_packages_per_day[:, :4], axis=1)

In [72]:
n_left_mask = n_left_per_week > 0
prob_left = np.sum(n_left_mask) / n_weeks
prob_left

0.9313

Compare Heights

Men have an average height of 178 cm and standard deviation of 8cm.

Women have a mean of 170, sd = 6cm.

Since you have means and standard deviations, you can use np.random.normal to generate observations.

If a man and woman are chosen at random, what is the likelihood the woman is taller than the man?

In [75]:
men_mean = 178
men_std_dev = 8
women_mean = 170
women_std_dev = 6
n_choices = 10000

n_men = np.round(np.random.normal(men_mean, men_std_dev, size=(n_choices)))
n_women = np.round(np.random.normal(women_mean, women_std_dev, size=(n_choices)))

(n_women > n_men).mean()

0.2066

When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. 

What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?

In [113]:
fail = 1 / 250
success = 249/250
chance = [success, fail]

n_students = 50
n_trials = 10000
n_students_100 = 100
n_students_150 = 150
n_students_450 = 450

In [85]:
download = np.random.choice([1,0], n_students * n_trials, p=chance)\
           .reshape(n_trials, n_students)
download

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 0, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [87]:
(download.sum(axis = 1) == 50).mean()

0.8225

In [92]:
download_100 = np.random.choice([1,0], n_students_100 * n_trials, p=chance)\
.reshape(n_trials, n_students_100)
download_100

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [98]:
(download_100.sum(axis=1) == 100).mean()

0.6684

What is the probability that we observe an installation issue within the first 150 students that download anaconda?


How likely is it that 450 students all download anaconda without an issue?

In [111]:
download_450 = np.random.choice([1,0], n_students_450 * n_trials, p=chance)\
.reshape(n_trials, n_students_450)
download_450

array([[1, 1, 0, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [112]:
(download_450.sum(axis=1) == 450).mean()

0.1691

In [116]:
download_150 = np.random.choice([1,0], n_students_150 * n_trials, p=chance)\
.reshape(n_trials, n_students_150)
download_150

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [120]:
(1 - (download_150.sum(axis = 1) == 150).mean()).round(3)

0.451

There's a 70% chance on any given day that there will be at least one food truck at Travis Park. 

However, you haven't seen a food truck there in 3 days. How unlikely is this?

In [135]:
n_trials = 10000
shows = .70
no_shows = .30
truck_chance = [shows, no_shows]
n_days = 7

truck_arrives = np.random.choice(['truck', 'none'], n_trials * n_days, p=truck_chance)\
.reshape(n_trials, n_days)
truck_arrives

array([['truck', 'truck', 'none', ..., 'truck', 'truck', 'truck'],
       ['truck', 'truck', 'truck', ..., 'truck', 'none', 'truck'],
       ['truck', 'none', 'truck', ..., 'truck', 'none', 'truck'],
       ...,
       ['none', 'truck', 'truck', ..., 'none', 'truck', 'truck'],
       ['truck', 'truck', 'none', ..., 'truck', 'none', 'truck'],
       ['truck', 'none', 'none', ..., 'truck', 'none', 'truck']],
      dtype='<U5')

In [140]:
((truck_arrives[:,0] == 'none') & (truck_arrives[:,1] == 'none')\
& (truck_arrives[:,2] == 'none')).mean() * 100

2.74

In [148]:
trucks = np.random.choice([1,0], size=(100_000, 3), p=truck_chance)
df = pd.DataFrame(trucks)
df.columns = ['day_1', 'day_2', 'day_3']
df.head()

Unnamed: 0,day_1,day_2,day_3
0,1,0,1
1,1,1,0
2,1,1,1
3,0,1,1
4,0,1,1


In [149]:
df['appear'] = df.day_1 + df.day_2 + df.day_3
df.head()

Unnamed: 0,day_1,day_2,day_3,appear
0,1,0,1,2
1,1,1,0,2
2,1,1,1,3
3,0,1,1,2
4,0,1,1,2


In [153]:
(df['appear'] == 0).mean() * 100

2.742

In [None]:
# # How likely is it that a food truck will show up sometime this week?
# trucks = np.random.choice([1,0], size=(100_000, 7), p=[.7,.3])
# df = pd.DataFrame(trucks)

# df['appear'] = df.sum(axis=1)
# (df.appear > 0).mean()

How likely is it that a food truck will show up sometime this week?

In [156]:
trucks = np.random.choice([1, 0], size=(100_000, 7), p=truck_chance)
df = pd.DataFrame(trucks)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1,0,0,1,1,1,1
1,0,0,1,1,0,1,1
2,0,1,1,1,1,0,1
3,1,1,1,1,1,1,0
4,1,1,1,1,1,0,0


In [158]:
df['appear'] = df.sum(axis = 1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,appear
0,1,0,0,1,1,1,1,10
1,0,0,1,1,0,1,1,8
2,0,1,1,1,1,0,1,10
3,1,1,1,1,1,1,0,12
4,1,1,1,1,1,0,0,10


In [159]:
(df['appear'] > 0).mean()

0.99974

If 23 people are in the same room, what are the odds that two of them share a birthday? 

What if it's 20 people? 40?

In [176]:
n_trials = 100000 # rows
n_people = 23 # columns

birthday = np.random.choice(range(1,366), size=(n_trials, n_people))
df_birthday = pd.DataFrame(birthday)
df_birthday.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,70,325,335,46,66,349,185,59,46,273,...,345,212,364,301,7,88,280,222,123,9
1,286,66,123,65,208,214,305,286,325,96,...,87,296,140,294,294,38,337,39,27,278
2,258,262,258,226,139,140,119,171,250,189,...,177,32,228,22,317,84,228,108,180,115
3,127,70,93,227,171,315,214,184,228,240,...,143,195,274,268,357,70,134,129,70,3
4,249,114,1,107,227,225,354,235,338,257,...,121,122,277,231,110,345,67,208,43,89


In [183]:
df_birthday['unique'] = df.nunique(axis=1)
df_birthday.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,unique,n_unique
0,70,325,335,46,66,349,185,59,46,273,...,364,301,7,88,280,222,123,9,4,4
1,286,66,123,65,208,214,305,286,325,96,...,140,294,294,38,337,39,27,278,4,4
2,258,262,258,226,139,140,119,171,250,189,...,228,22,317,84,228,108,180,115,4,4
3,127,70,93,227,171,315,214,184,228,240,...,274,268,357,70,134,129,70,3,4,4
4,249,114,1,107,227,225,354,235,338,257,...,277,231,110,345,67,208,43,89,4,4


In [180]:
(df.unique != 23).mean()

1.0

In [191]:
df.drop(columns = ['n_unique'])

KeyError: "['n_unique'] not found in axis"

In [192]:
df_birthday

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,unique,n_unique
0,70,325,335,46,66,349,185,59,46,273,...,364,301,7,88,280,222,123,9,4,4
1,286,66,123,65,208,214,305,286,325,96,...,140,294,294,38,337,39,27,278,4,4
2,258,262,258,226,139,140,119,171,250,189,...,228,22,317,84,228,108,180,115,4,4
3,127,70,93,227,171,315,214,184,228,240,...,274,268,357,70,134,129,70,3,4,4
4,249,114,1,107,227,225,354,235,338,257,...,277,231,110,345,67,208,43,89,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,126,364,1,289,264,216,241,28,364,7,...,294,213,264,141,298,334,113,110,4,4
99996,111,340,303,318,320,247,172,129,48,253,...,355,342,250,1,354,116,11,331,4,4
99997,5,303,295,16,321,281,320,109,249,278,...,122,85,80,338,261,335,169,240,4,4
99998,297,154,285,243,79,166,93,130,68,305,...,106,218,135,114,307,147,88,191,3,3
