In [1]:
# pyplot for plotting
import matplotlib.pyplot as plt
# numpy for vectorized array operations
import numpy as np
# pandas for proper tabular manipulation
import pandas as pd
# scipy stats for our subversions
from scipy import stats

## Q1) How likely is it that you roll doubles when rolling two dice?



In [2]:
n_trials = nrows = 10_000
n_dice = ncols = 2

rolls = np.random.choice([1, 2, 3, 4, 5, 6], n_trials * n_dice)
rolls


array([5, 6, 5, ..., 1, 5, 3])

## Q2) If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?




by theoretically,
### Combination Formula:
    -The total number of outcomes is 2^8(=256) because each coin can either be heads or tails, and there are 8 coins.

    -The number of ways to get exactly 3 heads is given by 
     C(8,3), which is the number of combinations of 8 items taken 3 at a time.

    -The probability is then C(8,3).
    
### For more than 3 heads:
    -You'd sum up the probabilities of getting 4, 5, 6, 7, and 8 heads.

    -For each case, use the combination formula to find the number of ways to get that specific number of heads.

    -The probability is then the sum of these individual probabilities.

Here are the steps:

### Exactly 3 heads:
    - C(8,3)=8!/(3!(8−3)!=8×7×6/3×2×1=56

 
    - Probability = 56/256 =7/32

 
### More than 3 heads:

    - P(X>3)=P(X=4)+P(X=5)+P(X=6)+P(X=7)+P(X=8)
            =1/2^8[c(8,4)+c(8,5)+c(8,6)+c(8,7)+c(8,8)]



This provides an empirical estimation based on counting the outcomes that meet the specified conditions. Keep in mind that this is an approximation and becomes more accurate with a larger number of trials or simulations.

### 1. Represent the data

In [3]:
# lets create our simulation:
# define our outcomes:
outcomes = ['H','T']
# number of trials will be our columns
n_trials = 8
# picking a number for our simulations.
# the higher the better, but too high may cause the computer
# to run a bit longer than we would like
# it will give the no. of rows
n_simulations = 10_000

### 2. create a matrix of random numbers

In [4]:
eight_flips = np.random.choice(outcomes,(n_simulations,n_trials))

In [5]:
eight_flips[:5,:]

array([['T', 'T', 'T', 'T', 'T', 'T', 'H', 'H'],
       ['H', 'H', 'H', 'T', 'T', 'H', 'T', 'H'],
       ['T', 'H', 'T', 'T', 'H', 'H', 'H', 'T'],
       ['H', 'H', 'T', 'H', 'H', 'H', 'H', 'T'],
       ['T', 'T', 'H', 'T', 'H', 'T', 'T', 'H']], dtype='<U1')

### 3. Apply an aggregate row-wise to produce the results of each simulation


In [6]:
(eight_flips == 'H')

array([[False, False, False, ..., False,  True,  True],
       [ True,  True,  True, ...,  True, False,  True],
       [False,  True, False, ...,  True,  True, False],
       ...,
       [ True,  True,  True, ..., False, False,  True],
       [False,  True,  True, ...,  True,  True,  True],
       [False,  True,  True, ..., False, False,  True]])

In [7]:
(eight_flips == 'H').sum(axis=1)

array([2, 5, 4, ..., 4, 6, 3])

### 4. Aggregate the resulting data to get our experimental probability



In [8]:
(eight_flips == 'H').sum(axis=1)

array([2, 5, 4, ..., 4, 6, 3])

In [9]:
## exactly 3 heads

(eight_flips == 'H').sum(axis=1)==3

array([False, False, False, ..., False, False,  True])

In [10]:
((eight_flips == 'H').sum(axis=1)==3).sum()

2183

In [11]:
((eight_flips == 'H').sum(axis=1)==3).sum()/n_simulations

0.2183

In [12]:
# getting more than 3 heads
(eight_flips == 'H').sum(axis=1)>=3


array([False,  True,  True, ...,  True,  True,  True])

In [13]:
((eight_flips == 'H').sum(axis=1)>=3).sum()


8561

In [14]:
((eight_flips == 'H').sum(axis=1)>=3).sum()/n_simulations


0.8561

## Q3) There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alumni to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

Theoretically, 

If there are approximately 3 web development cohorts for every 1 data science cohort, and Codeup randomly selects an alumni to put on a billboard.
Then,


The probability of seeing a data science student on one billboard can be represented as 

            P(data science)=1/4 


The probability of seeing a web development student on one billboard is 

            P(web development)=3/4

Since these events are independent (assuming one billboard doesn't affect the other), you can multiply the probabilities to find the probability of both events happening (seeing data science students on both billboards):

        P(data science on both)=P(Data Science)×P(Data Science)

        P(data science on both)=1/4 × 1/4 = 1/16=0.0625

So, the odds that the two billboards you drive past both have data science students on them are 1/16.

In [15]:
billboard =['Web Dev', 'Web Dev', 'Web Dev', 'Data Science']
n_trials=2
n_simulations=10000

In [16]:
codeup=np.random.choice(billboard,(n_simulations,n_trials))
codeup[:4,:4]

array([['Web Dev', 'Data Science'],
       ['Web Dev', 'Web Dev'],
       ['Web Dev', 'Web Dev'],
       ['Web Dev', 'Web Dev']], dtype='<U12')

In [17]:
mask= (codeup=='Data Science')
mask

array([[False,  True],
       [False, False],
       [False, False],
       ...,
       [False, False],
       [False, False],
       [False, False]])

In [18]:
mask.sum(axis=1)==2

array([False, False, False, ..., False, False, False])

In [19]:
(mask.sum(axis=1)==2).mean()

0.06

## Q4) Codeup students buy, on average, 3 poptart packages with a standard deviation of 1.5 a day from the snack vending machine. If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon?

### (Remember, if you have mean and standard deviation, use the np.random.normal) You'll need to make a judgement call on how to handle some of your values

In [20]:
# Parameters
mean_poptarts_per_day = 3
std_dev_poptarts_per_day = 1.5
days_until_friday = 4  # Monday to thursday


In [21]:
# Simulate the number of poptarts bought each day
poptarts_bought_each_day = np.random.normal(mean_poptarts_per_day, std_dev_poptarts_per_day, days_until_friday)


In [22]:
poptarts_bought_each_day

array([1.86654951, 3.92588562, 3.08340355, 5.64719892])

In [23]:
# Calculate the total poptarts remaining after each day
poptarts_remaining = 17 - np.cumsum(poptarts_bought_each_day)
poptarts_remaining

array([15.13345049, 11.20756487,  8.12416133,  2.47696241])

In [24]:
# Check if there are poptarts left on Friday afternoon
poptarts_available_on_friday = poptarts_remaining[-1] > 0
poptarts_available_on_friday

True

## Q5) Compare Heights

### - Men have an average height of 178 cm and standard deviation of 8cm.

### - Women have a mean of 170, sd = 6cm.

### - Since you have means and standard deviations, you can use np.random.normal to generate observations.

### - If a man and woman are chosen at random, what is the likelihood the woman is taller than the man?

In [25]:
# Set up parameters
mean_height_men = 178
std_dev_men = 8

mean_height_women = 170
std_dev_women = 6

# Number of simulations
num_simulations = 100000

# Generate random heights for men and women
heights_men = np.random.normal(mean_height_men, std_dev_men, num_simulations)
heights_women = np.random.normal(mean_height_women, std_dev_women, num_simulations)

# Calculate the likelihood that a woman is taller than a man
likelihood_taller_woman = np.mean(heights_women > heights_men) * 100

# Print the result
print(f"The likelihood that a randomly chosen woman is taller than a randomly chosen man is approximately: {likelihood_taller_woman:.2f}%")


The likelihood that a randomly chosen woman is taller than a randomly chosen man is approximately: 21.12%


## Q6) When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. 

## What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?

## What is the probability that we observe an installation issue within the first 150 students that download anaconda?

## How likely is it that 450 students all download anaconda without an issue?

#### What are the odds that after having 50 students download anaconda, no one has an installation issue

In [26]:
# 1.represent the data

p_issue=1/250
ntrials=50
n_simulations=1500

In [27]:
# 2. set up matrix of random numbers
stu_no_iss=np.random.random((n_simulations,ntrials))


In [28]:
stu_no_iss[:4,:4]

array([[0.2825686 , 0.222324  , 0.53829697, 0.25867299],
       [0.81313404, 0.33094674, 0.84536886, 0.55732163],
       [0.38976173, 0.38091799, 0.86504248, 0.11660362],
       [0.12167473, 0.41716046, 0.51451993, 0.47256059]])

In [29]:
# 3.apply an aggregate row-wise to produce the results of each simulation
iss=(stu_no_iss<p_issue)
iss

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

Now that we have issue as either true or false, we can take the sum of each row to find the total number of issue. When we sum an array of boolean values, numpy will treat True as 1 and False as 0.

Now we have the results of our simulation, an array where each number in the array represents how many issue.

In [30]:
# 4. aggregate the resulting data to get our experimental probability
iss.sum(axis=1)==0

array([ True,  True,  True, ...,  True,  True, False])

In [31]:
(iss.sum(axis=1)==0).mean()

0.8313333333333334

#### What are the odds that after having 100 students download anaconda, no one has an installation issue

In [32]:
n_trials=100
n_simulations=1500


In [33]:
down_ana=np.random.random((n_simulations,n_trials))
down_ana[:4,:4]

array([[0.84795468, 0.30171824, 0.54866199, 0.88516728],
       [0.20082623, 0.91759781, 0.9538919 , 0.25491873],
       [0.9595788 , 0.04853665, 0.51263341, 0.40730519],
       [0.59079451, 0.05506815, 0.34791007, 0.30749601]])

In [34]:
down_ana<p_issue

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [35]:
(down_ana<p_issue).sum(axis=1)

array([0, 1, 0, ..., 1, 0, 1])

In [36]:
install=(down_ana<p_issue).sum(axis=1)==0
install

array([ True, False,  True, ..., False,  True, False])

In [37]:
install.mean()

0.6813333333333333

#### What are the odds that after having 150 students download anaconda, no one has an installation issue

In [38]:
n_trials=150
n_simulations=1500

In [39]:
down_ana=np.random.random((n_simulations,n_trials))

In [40]:
down_ana[:4,:4]

array([[0.47614358, 0.56611204, 0.65518969, 0.97081282],
       [0.86171492, 0.64256331, 0.32986573, 0.40625722],
       [0.1645114 , 0.31724234, 0.22584254, 0.2647499 ],
       [0.51229916, 0.0336853 , 0.82233783, 0.35533336]])

In [41]:
#boolean mask
down_ana<p_issue

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       [False, False, False, ..., False, False, False]])

In [42]:
(down_ana<p_issue).sum(axis=1)

array([1, 0, 0, ..., 1, 2, 3])

In [43]:
install=(down_ana<p_issue).sum(axis=1)==0
install

array([False,  True,  True, ..., False, False, False])

In [44]:
install.mean()

0.5713333333333334

#### What are the odds that after having 450 students download anaconda, no one has an installation issue.

In [45]:
n_trials=450
n_simulations=1500

In [46]:
down_ana=np.random.random((n_simulations,n_trials))
down_ana[:4,:4]

array([[0.57649952, 0.00676252, 0.09085719, 0.09320964],
       [0.30522832, 0.83845126, 0.14044983, 0.19586591],
       [0.96336166, 0.31505129, 0.51807312, 0.03552731],
       [0.63248127, 0.3276618 , 0.90012469, 0.1239072 ]])

In [47]:
down_ana<p_issue

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [48]:
install=(down_ana<p_issue).sum(axis=1)==0

In [49]:
install.mean()

0.16266666666666665

## Q7) There's a 70% chance on any given day that there will be at least one food truck at Travis Park. 

## However, you haven't seen a food truck there in 3 days. How unlikely is this?

## How likely is it that a food truck will show up sometime this week?



In [50]:
# Parameters
prob_food_truck_daily = 0.7
prob_nofood_truck_daily=1-0.7
outcome=['food_truck','no_food_truck']
n_trials= 3
n_simulations = 100000

In [51]:
truck=np.random.choice(outcome,(n_simulations,n_trials),p=[0.7,0.3])
truck

array([['food_truck', 'no_food_truck', 'no_food_truck'],
       ['no_food_truck', 'food_truck', 'food_truck'],
       ['food_truck', 'food_truck', 'no_food_truck'],
       ...,
       ['food_truck', 'food_truck', 'food_truck'],
       ['no_food_truck', 'food_truck', 'food_truck'],
       ['food_truck', 'no_food_truck', 'food_truck']], dtype='<U13')

In [52]:
mask=(truck=='food_truck')
mask

array([[ True, False, False],
       [False,  True,  True],
       [ True,  True, False],
       ...,
       [ True,  True,  True],
       [False,  True,  True],
       [ True, False,  True]])

In [53]:
mask.sum(axis=1)>0

array([ True,  True,  True, ...,  True,  True,  True])

In [54]:
prob=(mask.sum(axis=1)>0).sum()
prob

97360

In [55]:
(mask.sum(axis=1)>0).mean()

0.9736

## How likely is it that a food truck will show up sometime this week?


In [57]:
n_trials=7
n_simulations=10000

In [61]:
tr_we=np.random.choice(outcome,(n_simulations,n_trials),p=[0.7,0.3])
tr_we

array([['food_truck', 'no_food_truck', 'food_truck', ...,
        'no_food_truck', 'food_truck', 'food_truck'],
       ['food_truck', 'no_food_truck', 'food_truck', ...,
        'no_food_truck', 'no_food_truck', 'food_truck'],
       ['food_truck', 'food_truck', 'food_truck', ..., 'food_truck',
        'food_truck', 'food_truck'],
       ...,
       ['food_truck', 'food_truck', 'food_truck', ..., 'no_food_truck',
        'food_truck', 'food_truck'],
       ['food_truck', 'food_truck', 'no_food_truck', ...,
        'no_food_truck', 'no_food_truck', 'food_truck'],
       ['food_truck', 'food_truck', 'no_food_truck', ..., 'food_truck',
        'food_truck', 'no_food_truck']], dtype='<U13')

In [62]:
mask=(tr_we=='food_truck')
mask

array([[ True, False,  True, ..., False,  True,  True],
       [ True, False,  True, ..., False, False,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ..., False,  True,  True],
       [ True,  True, False, ..., False, False,  True],
       [ True,  True, False, ...,  True,  True, False]])

In [63]:
mask.sum(axis=1)

array([4, 3, 7, ..., 5, 4, 5])

In [64]:
mask.sum(axis=1)>0

array([ True,  True,  True, ...,  True,  True,  True])

In [66]:
(mask.sum(axis=1)>0).mean()

0.9997

## Q8) If 23 people are in the same room, 

## what are the odds that two of them share a birthday? 

## What if it's 20 people? 40?



In [70]:
n_simulation = 10_000
days = list(range(1,366))

n_trials= people = 23

data = np.random.choice(days,(n_simulations,n_trials))
data



array([[142,  58, 365, ..., 216,  86, 204],
       [147, 212,  76, ..., 243, 189, 181],
       [267, 331,  67, ..., 279, 275, 192],
       ...,
       [ 39, 334, 362, ..., 109, 183, 271],
       [251, 303, 331, ...,  63,   3, 170],
       [ 63, 269, 195, ...,  69, 231, 335]])

In [71]:
df_data = pd.DataFrame(data)
df_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,142,58,365,227,196,76,195,308,34,34,...,119,120,221,279,360,135,180,216,86,204
1,147,212,76,148,15,154,99,191,77,192,...,87,248,363,224,19,303,212,243,189,181
2,267,331,67,181,39,100,82,332,311,219,...,162,211,170,7,53,84,98,279,275,192
3,140,204,15,204,219,102,138,293,180,87,...,188,278,137,218,167,161,273,211,87,74
4,167,146,282,145,338,110,344,108,134,312,...,285,327,125,347,343,143,326,102,270,102


In [72]:
df_data.nunique(axis=1)

0       22
1       21
2       23
3       21
4       22
        ..
9995    21
9996    23
9997    23
9998    23
9999    22
Length: 10000, dtype: int64

In [78]:
df_data.nunique(axis=1)!= n_trials

0        True
1        True
2       False
3        True
4        True
        ...  
9995     True
9996    False
9997    False
9998    False
9999     True
Length: 10000, dtype: bool

In [79]:
(df_data.nunique(axis=1)!= n_trials).mean()

0.5093

## what if it's 20 people

In [80]:
n_trials=20

In [81]:
data = np.random.choice(days,(n_simulations,n_trials))
data

array([[ 33, 296, 268, ..., 247,  58, 242],
       [154,  75, 138, ..., 154, 328, 216],
       [170, 129, 116, ...,   6, 262, 146],
       ...,
       [312, 319, 170, ..., 199, 108,  40],
       [235, 181,  13, ..., 334,  24, 284],
       [ 51, 151, 203, ..., 139, 289, 221]])

In [82]:
df=pd.DataFrame(data)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,33,296,268,80,164,362,264,191,312,273,199,266,71,29,58,151,170,247,58,242
1,154,75,138,262,161,108,278,312,253,163,271,28,99,318,229,156,223,154,328,216
2,170,129,116,93,42,172,114,141,365,198,267,168,140,239,60,144,257,6,262,146
3,272,64,205,150,329,336,272,105,352,317,189,68,216,180,81,101,93,257,304,200
4,110,247,238,312,285,68,73,271,105,127,128,321,139,355,67,290,79,317,344,151


In [83]:
df.nunique()

0     365
1     365
2     365
3     365
4     365
5     365
6     365
7     365
8     365
9     365
10    365
11    365
12    365
13    365
14    365
15    365
16    365
17    365
18    365
19    365
dtype: int64

In [85]:
df.nunique(axis=1)

0       19
1       19
2       20
3       19
4       20
        ..
9995    19
9996    19
9997    20
9998    20
9999    20
Length: 10000, dtype: int64

In [86]:
df.nunique(axis=1) !=n_trials

0        True
1        True
2       False
3        True
4       False
        ...  
9995     True
9996     True
9997    False
9998    False
9999    False
Length: 10000, dtype: bool

In [88]:
(df.nunique(axis=1) !=n_trials).mean()

0.4106

## if it's 40 people?

In [89]:
n_trials=40

In [90]:
data=np.random.choice(days,(n_simulations,n_trials))
data

array([[249, 322, 167, ..., 209, 248,  84],
       [281, 259, 333, ..., 116, 163, 295],
       [108, 238, 120, ...,  75, 360, 259],
       ...,
       [ 98, 211,  45, ..., 197, 198, 152],
       [262, 117,  83, ...,   9,  84, 245],
       [237, 103, 152, ...,  88,  24,  13]])

In [91]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,249,322,167,241,202,335,197,352,50,212,...,16,269,185,109,186,293,121,209,248,84
1,281,259,333,124,120,179,351,289,8,22,...,214,319,177,341,225,227,315,116,163,295
2,108,238,120,37,130,334,289,300,349,237,...,57,234,151,37,240,85,193,75,360,259
3,24,293,339,182,51,5,328,93,67,197,...,69,231,220,300,29,128,311,223,32,7
4,114,268,194,175,273,344,108,239,16,348,...,181,239,251,356,4,185,245,5,314,160


In [93]:
df.nunique(axis=1) 

0       39
1       37
2       36
3       40
4       37
        ..
9995    39
9996    39
9997    39
9998    37
9999    36
Length: 10000, dtype: int64

In [94]:
df.nunique(axis=1) != 40

0        True
1        True
2        True
3       False
4        True
        ...  
9995     True
9996     True
9997     True
9998     True
9999     True
Length: 10000, dtype: bool

In [95]:
(df.nunique(axis=1) != 40).mean()

0.8911