# Simulation Exercises

#### 1. How likely is it that you roll doubles when rolling two dice?

In [184]:
import numpy as np
np.random.seed(1)
# generate values for first die
die1 = np.random.choice([1, 2, 3, 4, 5, 6], 100_000)
# generate values for second die
die2 = np.random.choice([1, 2, 3, 4, 5, 6], 100_000)
# define what doubles means
doubles = die1 == die2
# get average rate of doubles
doubles.mean()

0.16789

#### 2. If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?

In [185]:
# establish number of trials
n_trials = nrows = 100_000
# establish 8 coins are being flipped per trial
n_coins = ncols = 8
np.random.seed(2)
# generate values for coin flips with 0=tails and 1=heads
flips = np.random.choice([1,0],n_trials*n_coins).reshape(nrows, ncols)
flips

array([[1, 0, 0, ..., 0, 1, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 1, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 1]])

In [186]:
import pandas as pd
# convert array to dataframe
flips = pd.DataFrame(flips)
# add column to count how many times coins land on heads per trial
flips['heads_count'] = flips.sum(axis=1)
flips.head()

Unnamed: 0,0,1,2,3,4,5,6,7,heads_count
0,1,0,0,1,1,0,1,0,4
1,1,0,1,0,0,0,0,0,2
2,0,0,1,1,1,1,0,0,4
3,0,1,1,1,0,0,0,1,4
4,1,0,1,1,0,0,0,1,4


In [187]:
# average rate of getting heads exactly 3 times
print(f'The probability of getting exactly 3 heads is {(flips.heads_count == 3).mean()}.')
# average rate of getting heads more than 3 times
print(f'The probability of getting more than 3 heads is {(flips.heads_count > 3).mean()}.')

The probability of getting exactly 3 heads is 0.216.
The probability of getting more than 3 heads is 0.63964.


#### 3. There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alumni to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

In [188]:
# 100,000 trials
n_trials = nrows = 100_000
# 2 billboards per trial
n_billboards = ncols = 2
np.random.seed(3)
# generate data for driving past billboards, specifying each outcome's probability
billboards = np.random.choice(['webdev', 'datasci'], p=[.75, .25], size = (nrows, ncols))
# convert array to dataframe
billboards = pd.DataFrame(billboards, columns = ['bill1', 'bill2'])
billboards.head()

Unnamed: 0,bill1,bill2
0,webdev,webdev
1,webdev,webdev
2,datasci,datasci
3,webdev,webdev
4,webdev,webdev


In [189]:
# create function to count instances of 'datasci'
def ds_count(row):
    return row.bill1.count('datasci') + row.bill2.count('datasci')

In [190]:
# add column to dataframe applying ds_count function to dataframe rows
# (counts instances of 'datasci' in each row)
billboards['ds_signs'] = billboards.apply(ds_count, axis=1)
billboards.head()

Unnamed: 0,bill1,bill2,ds_signs
0,webdev,webdev,0
1,webdev,webdev,0
2,datasci,datasci,2
3,webdev,webdev,0
4,webdev,webdev,0


In [191]:
# average rate of seeing two billboards with data science alumni
print(f'The probability of seeing data science alumni on both billboards is {(billboards.ds_signs == 2).mean()}.')

The probability of seeing data science alumni on both billboards is 0.06232.


#### 4. Codeup students buy, on average, 3 poptart packages with a standard deviation of 1.5 a day from the snack vending machine. If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon? (Remember, if you have mean and standard deviation, use the np.random.normal)

In [195]:
np.random.seed(4)
# generate random observations for 5 days (M-F) and 100,000 trials based on given mean and sd
# convert results to int data type to represent whole packages of poptarts
poptarts = np.random.normal(3, 1.5, [100_000, 5]).astype(int)
# convert array to dataframe
poptarts = pd.DataFrame(poptarts, columns = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri'])
poptarts

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
0,3,3,1,4,2
1,0,2,3,3,1
2,3,2,3,3,1
3,3,2,5,4,3
4,1,3,3,1,6
...,...,...,...,...,...
99995,3,2,2,3,4
99996,3,2,4,2,-1
99997,1,3,3,4,1
99998,1,3,0,1,2


In [196]:
# sum across to get total poptarts purchased throughout the week
poptarts['total_eaten'] = poptarts.sum(axis=1)
poptarts

Unnamed: 0,Mon,Tue,Wed,Thu,Fri,total_eaten
0,3,3,1,4,2,13
1,0,2,3,3,1,9
2,3,2,3,3,1,12
3,3,2,5,4,3,17
4,1,3,3,1,6,14
...,...,...,...,...,...,...
99995,3,2,2,3,4,14
99996,3,2,4,2,-1,10
99997,1,3,3,4,1,12
99998,1,3,0,1,2,7


In [198]:
# average rate of having at least one poptart package left at the end of the week
print(f'The probability that there will be poptarts available for purchase on Friday afternoon is {(poptarts.total_eaten < 17).mean()}.')

The probability that there will be poptarts available for purchase on Friday afternoon is 0.88024.


#### 5. Compare Heights

- Men have an average height of 178 cm and standard deviation of 8cm.
- Women have a mean of 170, sd = 6cm.
- Since you have means and standard deviations, you can use np.random.normal to generate observations.
- If a man and woman are chosen at random, P(woman taller than man)?

In [201]:
np.random.seed(5)
# generate observations for female and male heights
mens_height = np.random.normal(178, 8, 100_000)
womens_height = np.random.normal(170, 6, 100_000)

df = pd.DataFrame({'mens_height': mens_height, 'womens_height': womens_height})
df

Unnamed: 0,mens_height,womens_height
0,181.529820,161.970612
1,175.353039,168.602590
2,197.446169,163.224571
3,175.983263,169.299606
4,178.876879,170.336759
...,...,...
99995,171.484304,168.584183
99996,179.611520,175.456661
99997,183.734250,178.803664
99998,189.385711,171.562260


In [202]:
df['taller_woman'] = df.womens_height > df.mens_height
df

Unnamed: 0,mens_height,womens_height,taller_woman
0,181.529820,161.970612,False
1,175.353039,168.602590,False
2,197.446169,163.224571,False
3,175.983263,169.299606,False
4,178.876879,170.336759,False
...,...,...,...
99995,171.484304,168.584183,False
99996,179.611520,175.456661,False
99997,183.734250,178.803664,False
99998,189.385711,171.562260,False


In [210]:
print(f'The probability of a randomly chosen woman being taller than a randomly selected man is {df.taller_woman.mean()}.')

The probability of a randomly chosen woman being taller than a randomly selected man is 0.21093.


#### 6. When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?

In [207]:
np.random.seed(6)
# generate data for 100,000 trials of 50 students' anaconda installations given the probability of corruption
# 0 is a successful installation, 1 is a failed installation
installs = np.random.choice([0,1], p=[(249/250),(1/250)], size=[100_000, 50])
# convert array to dataframe
installs = pd.DataFrame(installs)
installs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [208]:
# add column to count failed installations
installs['failed'] = installs.sum(axis=1)
installs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,failed
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [209]:
print(f'The probability of no failed installations for 50 students is {(installs.failed == 0).mean()}.')

The probability of no failed installations for 50 students is 0.81885.


In [206]:
installs_100 = np.random.choice([0,1], p=[(249/250),(1/250)], size=[100_000, 100])
installs_100 = pd.DataFrame(installs_100)
installs_100['failed'] = installs_100.sum(axis=1)
print(f'The probability of no failed installations for 100 students is {(installs_100.failed == 0).mean()}.')

The probability of no failed installations for 100 students is 0.66846.


In [None]:
# practice defining a function to do the above (but with less code)

#### What is the probability that we observe an installation issue within the first 150 students that download anaconda?

In [205]:
installs_150 = np.random.choice([0,1], p=[(249/250),(1/250)], size=[100_000, 150])
installs_150 = pd.DataFrame(installs_150)
installs_150['failed'] = installs_150.sum(axis=1)
print(f'The probability of an issue occurring within the first 150 student downloads is {(installs_150.failed > 0).mean()}.')

The probability of an issue occurring within the first 150 student downloads is 0.45107.


#### How likely is it that 450 students all download anaconda without an issue?

In [204]:
installs_450 = np.random.choice([0,1], p=[(249/250),(1/250)], size=[100_000, 450])
installs_450 = pd.DataFrame(installs_450)
installs_450['failed'] = installs_450.sum(axis=1)
print(f'The probability of no issues occurring for all 450 students is {(installs_450.failed == 0).mean()}.')

The probability of no issues occurring for all 450 students is 0.16595.


#### 7. There's a 70% chance on any given day that there will be at least one food truck at Travis Park. However, you haven't seen a food truck there in 3 days. How unlikely is this?

In [127]:
np.random.seed(7)
# generate data for food trucks at Travis Park for three days, given the specified probability
foodtrucks = np.random.choice([0,1], p=[.3, .7], size=[10_000, 3])
# convert array to dataframe
foodtrucks = pd.DataFrame(foodtrucks, columns = ['Sun','Mon', 'Tue'])
foodtrucks.head()
# 1 means there is at least one food truck at Travis Park
# 0 means there are no food trucks at Travis Park

Unnamed: 0,Sun,Mon,Tue
0,0,1,1
1,1,1,1
2,1,0,0
3,1,1,1
4,1,0,0


In [128]:
# add column to count how many days there was at least one food truck at Travis Park
foodtrucks['three_day_total'] = foodtrucks.sum(axis=1)
foodtrucks

Unnamed: 0,Sun,Mon,Tue,three_day_total
0,0,1,1,2
1,1,1,1,3
2,1,0,0,1
3,1,1,1,3
4,1,0,0,1
...,...,...,...,...
9995,1,1,1,3
9996,0,0,1,1
9997,0,1,1,2
9998,1,1,1,3


In [129]:
# average rate of no food trucks being at Travis Park for 3 days
print(f'The probability of seeing no food trucks at Travis Park in 3 days is {(foodtrucks.three_day_total == 0).mean()}.')

The probability of seeing no food trucks at Travis Park in 3 days is 0.0271.


#### How likely is it that a food truck will show up sometime this week?

In [130]:
# isolate values from first part of week and assign to new variable
foodtrucks_wkpt1 = foodtrucks.drop(columns = 'three_day_total')
# generate values for second part of week
foodtrucks_wkpt2 = np.random.choice([0,1], p=[.3, .7], size=[10_000, 4])
# convert array to dataframe, assign new column names
foodtrucks_wkpt2 = pd.DataFrame(foodtrucks_wkpt2, columns = ['Wed', 'Thu', 'Fri', 'Sat'])
# concatenate half-week dataframes to have values for entire week in single dataframe
foodtrucks_wk = pd.concat([foodtrucks_wkpt1, foodtrucks_wkpt2], axis=1)
# display full week's values
foodtrucks_wk

Unnamed: 0,Sun,Mon,Tue,Wed,Thu,Fri,Sat
0,0,1,1,1,0,0,1
1,1,1,1,0,0,0,1
2,1,0,0,0,1,0,1
3,1,1,1,1,1,1,1
4,1,0,0,1,0,1,1
...,...,...,...,...,...,...,...
9995,1,1,1,0,1,1,1
9996,0,0,1,1,1,0,1
9997,0,1,1,1,0,0,1
9998,1,1,1,1,0,0,1


In [147]:
# create column to show how many food trucks were at Travis Park during the entire week
foodtrucks_wk['total_trucks'] = foodtrucks_wk.sum(axis=1)
foodtrucks_wk.head()

Unnamed: 0,Sun,Mon,Tue,Wed,Thu,Fri,Sat,total_trucks
0,0,1,1,1,0,0,1,4
1,1,1,1,0,0,0,1,4
2,1,0,0,0,1,0,1,3
3,1,1,1,1,1,1,1,7
4,1,0,0,1,0,1,1,4


In [148]:
# isolate only those observations where we did not see a food truck for the first three days of the week
this_week = foodtrucks_wk[(foodtrucks_wk.Sun == 0)&(foodtrucks_wk.Mon == 0)&(foodtrucks_wk.Tue == 0)]
this_week

Unnamed: 0,Sun,Mon,Tue,Wed,Thu,Fri,Sat,total_trucks
47,0,0,0,0,1,1,1,3
56,0,0,0,0,0,1,0,1
96,0,0,0,1,0,0,0,1
97,0,0,0,0,0,1,0,1
159,0,0,0,1,1,1,1,4
...,...,...,...,...,...,...,...,...
9689,0,0,0,1,1,0,1,3
9698,0,0,0,0,1,1,1,3
9736,0,0,0,1,1,1,1,4
9816,0,0,0,1,0,0,1,2


In [149]:
# average rate of there being at least one food truck at Travis Park
# during the four days following a three day period of no food trucks 
# (because that's what happened earlier this week)
print(f'The probability that a food truck will show up sometime this week is {(this_week.total_trucks > 0).mean()}.')

The probability that a food truck will show up sometime this week is 0.992619926199262.


#### 8. If 23 people are in the same room, what are the odds that two of them share a birthday? What if it's 20 people? 40?