In [1]:
import numpy as np
import pandas as pd

In [2]:
# If you flip 8 coins,
# what's the probability of getting more than 3 heads?

In [3]:
n_trials = 10_000
n_coins = 8

# One row per trial, one column per coin; 0 and 1 are drawn with equal
# probability, and 1 is later counted as "heads".
flips = np.random.choice([0, 1], size=(n_trials, n_coins))

# Derive the labels from n_coins so that changing the number of coins does not
# break the frame with a length-mismatch error.
# NOTE: the "die" prefix is kept for compatibility with existing output, even
# though each column represents a coin.
flips = pd.DataFrame(flips, columns=[f"die{i}" for i in range(1, n_coins + 1)])
flips

Unnamed: 0,die1,die2,die3,die4,die5,die6,die7,die8
0,0,0,1,1,1,0,1,1
1,0,1,1,1,0,1,0,0
2,0,1,0,0,1,0,1,1
3,0,0,0,1,1,1,1,1
4,1,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...
9995,1,0,1,1,0,1,0,0
9996,1,1,1,0,1,0,1,0
9997,0,1,1,1,1,1,0,1
9998,1,0,1,0,1,1,0,0


In [4]:
# Heads are encoded as 1, so the row-wise sum is the number of heads per trial.
# Any existing n_heads column is dropped first: without this, re-running the
# cell would add the previous count into the new sum and inflate every value.
flips["n_heads"] = flips.drop(columns="n_heads", errors="ignore").sum(axis=1)
flips.head()

Unnamed: 0,die1,die2,die3,die4,die5,die6,die7,die8,n_heads
0,0,0,1,1,1,0,1,1,5
1,0,1,1,1,0,1,0,0,4
2,0,1,0,0,1,0,1,1,4
3,0,0,0,1,1,1,1,1,5
4,1,0,1,0,0,1,0,1,4


In [5]:
# Share of simulated trials with strictly more than 3 heads (i.e. 4 or more).
flips["n_heads"].gt(3).mean()

0.6371

## Flipping Heads or Tails
- If we flip 8 coins at once, what's the chance of getting exactly 3 heads?


In [6]:
# Same experiment with string labels: "h" for heads, "t" for tails.
# Draw every flip in one flat array, then fold it into one row per trial.
coin_faces = np.random.choice(["h", "t"], n_coins * n_trials)
flips = pd.DataFrame(coin_faces.reshape(n_trials, n_coins))
flips.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7
0,h,h,h,t,t,t,h,h
1,h,t,h,h,t,h,t,t


In [7]:
# Count the h
def count_heads(row):
    """Return how many entries of ``row`` are equal to the string "h".

    ``row`` can be any iterable of values (for example one DataFrame row
    passed in by ``DataFrame.apply(..., axis=1)``). The result is a plain
    Python ``int``; an empty iterable gives 0.
    """
    return sum(1 for face in row if face == "h")

In [8]:
%%timeit
# Baseline: row-wise apply, which calls the pure-Python count_heads once per row.
# Timing noted on an earlier run:
# 60.6 ms ± 3.98 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
flips.apply(count_heads, axis=1)

61.8 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
%%timeit
# Vectorized head count: compare every cell to "h", then sum the booleans
# across each row. This yields the number of heads per trial only; to get
# p(flipping exactly 3 heads), compare the counts to 3 and take the mean.
# Timing noted on an earlier run:
# 1.25 ms ± 64.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
(flips == "h").sum(axis=1)

4.19 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Dealing with Strings
There are approximately 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alum to put on each billboard, what is the probability that the two billboards I drive past both have data science students on them?

In [10]:
n_rows = 100_000
n_cols = 2

# 3 web-dev cohorts for every data-science cohort -> P("Web") = 0.75, P("Data") = 0.25.
# Draw all billboard picks in one flat array, then fold into one row per drive
# with one column per billboard.
cohort_draws = np.random.choice(["Web", "Data"], size=n_rows * n_cols, p=[.75, .25])
billboards = pd.DataFrame(
    cohort_draws.reshape(n_rows, n_cols),
    columns=["billboard1", "billboard2"],
)
billboards.head()

Unnamed: 0,billboard1,billboard2
0,Data,Web
1,Web,Data
2,Web,Web
3,Web,Data
4,Web,Data


In [14]:
%%timeit
# Column-by-column check: a row is True only if billboard1 AND billboard2 are
# BOTH "Data"; the mean of those booleans is the simulated probability.
((billboards.billboard1 == "Data") & (billboards.billboard2 == "Data")).mean()

13.8 ms ± 852 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%%timeit
# Whole-frame alternative: (billboards == "Data") gives a boolean frame,
# .sum(axis=1) counts the "Data" billboards in each row, and `== 2` keeps
# only the rows where both billboards are "Data" before taking the mean.
((billboards == "Data").sum(axis=1) == 2).mean()

15 ms ± 448 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### How to Check if We Roll Doubles

In [25]:
# Simulate n_rows throws of n_cols six-sided dice (uniform over faces 1-6);
# each row is one throw and columns 0 and 1 hold the individual dice.
die_faces = [1, 2, 3, 4, 5, 6]
roll_values = np.random.choice(die_faces, n_rows*n_cols)
rolls = pd.DataFrame(roll_values.reshape(n_rows, n_cols))
rolls.head()

Unnamed: 0,0,1
0,1,3
1,1,6
2,1,4
3,4,1
4,5,5


In [26]:
# Fraction of throws in which both dice show the same face (doubles).
rolls[0].eq(rolls[1]).mean()

0.16975