In [1]:
# Probability is the theoretical study of measuring how certain we are that an event will happen.
# It is a foundational discipline for statistics, hypothesis testing, and machine learning.

#  Probability is about quantifying predictions of events yet to happen,
# whereas likelihood is measuring the frequency of events that already occurred.
# In statistics and machine learning, 
# we often use likelihood (the past) in the form of data to predict probability (the future).


In [2]:
# Probability is a purely theoretical measure of how likely an event is to happen, and it does not require data.
# Statistics, on the other hand, cannot exist without data: it uses data to discover probability
# and provides tools to describe data.

In [3]:
# When we have two separate probabilities for two separate events
# and want to find the probability that both events will occur together,
# this is known as a joint probability.

In [4]:
# When we deal with “OR” operations with probabilities, this is known as a union probability.

In [5]:
# A probability topic that people easily get confused by is the concept of conditional probability,
# which is the probability of an event A occurring given that event B has occurred.


In [6]:
# Using Bayes' Theorem in Python:
#   P(cancer | coffee drinker)
#       = P(coffee drinker | cancer) * P(cancer) / P(coffee drinker)

p_coffee_drinker = 0.65               # P(coffee drinker)
p_cancer = 0.005                      # P(cancer)
p_coffee_drinker_given_cancer = 0.85  # P(coffee drinker | cancer)

# Flip the known conditional around to obtain the one we actually want.
p_cancer_given_coffee_drinker = (
    p_coffee_drinker_given_cancer * p_cancer
) / p_coffee_drinker

print(p_cancer_given_coffee_drinker)

0.006538461538461539


In [7]:
# The binomial distribution measures how likely k successes are out of n trials, given a success probability p.

In [8]:
# We use SciPy’s binom.pmf() function (PMF stands for “probability mass function”)
# to print all 11 probabilities for our binomial distribution, from 0 to 10 successes.

from scipy.stats import binom

n = 10   # number of trials
p = 0.9  # probability of success on each individual trial

# One output line per possible success count, 0 through n inclusive.
for k in range(n + 1):
    print(f"{k} - {binom.pmf(k, n, p)}")

0 - 9.99999999999996e-11
1 - 8.999999999999996e-09
2 - 3.644999999999996e-07
3 - 8.748000000000003e-06
4 - 0.0001377809999999999
5 - 0.0014880347999999988
6 - 0.011160260999999996
7 - 0.05739562800000001
8 - 0.19371024449999993
9 - 0.38742048900000037
10 - 0.34867844010000004


In [9]:
#  The beta distribution allows us to see the likelihood of different underlying probabilities
#     for an event to occur given alpha successes and beta failures.

from scipy.stats import beta

# alpha = number of successes, beta = number of failures
a, b = 8, 2

# The CDF evaluated at 0.90 is the area under the beta curve up to 0.90,
# i.e. the probability that the underlying success rate is 90% or less.
p = beta.cdf(0.90, a, b)

print(p)

0.7748409780000001


In [12]:
# Creating a sample space of coin flips: the set of every possible outcome
sample_space = {'Heads', 'Tails'}

# Computing the probability of heads.
# Each outcome is equally likely, so one outcome out of len(sample_space).
probability_heads = 1 / len(sample_space)

print(f'Probability of choosing heads is {probability_heads}')

Probability of choosing heads is 0.5


In [13]:
# Let’s define two event conditions:
#     one where the coin lands on either heads or tails,
#     and another where the coin lands on neither heads nor tails.

def is_heads_or_tails(outcome):
    """Return True when the outcome is one of the two coin faces."""
    return outcome in {'Heads', 'Tails'}


def is_neither(outcome):
    """Return True when the outcome is not a coin face.

    This is the logical complement of ``is_heads_or_tails``.
    """
    return not is_heads_or_tails(outcome)

In [14]:
# Let’s define event conditions for the two basic events,
# in which the coin satisfies exactly one of our two potential outcomes.
def is_heads(outcome):
    """Return True only when the outcome equals 'Heads'."""
    return outcome == 'Heads'


def is_tails(outcome):
    """Return True only when the outcome equals 'Tails'."""
    return outcome == 'Tails'

In [15]:
#  Defining an event-detection function
def get_matching_event(event_condition, sample_space):
    """Collect the outcomes of a sample space that satisfy an event condition.

    Args:
        event_condition: Callable that takes a single outcome and returns a
            truthy value when the outcome belongs to the event.
        sample_space: Iterable of outcomes. Iterating a dict yields its keys,
            so a weighted dictionary works as well as a plain set.

    Returns:
        A set containing every outcome for which ``event_condition`` is truthy.
    """
    return {outcome for outcome in sample_space if event_condition(outcome)}

In [16]:
# Detecting event using event conditions

event_conditions = [
    is_heads_or_tails,
    is_heads,
    is_tails,
    is_neither,
]

for event_condition in event_conditions:
    # Show which condition is being applied, then the outcomes it selects.
    print(f"Event Condition: {event_condition.__name__}")
    event = get_matching_event(event_condition, sample_space)
    print(f'Event: {event}\n')

Event Condition: is_heads_or_tails
Event: {'Tails', 'Heads'}

Event Condition: is_heads
Event: {'Heads'}

Event Condition: is_tails
Event: {'Tails'}

Event Condition: is_neither
Event: set()



In [18]:
#  Computing event probabilities


def compute_probability(event_condition, generic_sample_space):
    """Compute an event's probability over equally likely outcomes.

    Args:
        event_condition: Callable that takes one outcome and returns a truthy
            value when the outcome belongs to the event.
        generic_sample_space: Sized collection of outcomes, each counted once.

    Returns:
        The number of matching outcomes divided by the total number of
        outcomes in the sample space.
    """
    matching_outcomes = get_matching_event(event_condition, generic_sample_space)
    favourable = len(matching_outcomes)
    total = len(generic_sample_space)
    return favourable / total
 
for event_condition in event_conditions:
    # Label each probability with the name of the condition that produced it.
    name = event_condition.__name__
    prob = compute_probability(event_condition, sample_space)
    print(f"Probability of event arising from '{name}' is {prob}")

Probability of event arising from 'is_heads_or_tails' is 1.0
Probability of event arising from 'is_heads' is 0.5
Probability of event arising from 'is_tails' is 0.5
Probability of event arising from 'is_neither' is 0.0


In [19]:
# We computed probabilities for an unbiased coin.
# What would happen if that coin was biased?
# Suppose, for instance, that a coin is four times more likely to land on heads relative to tails.
# How do we compute the likelihoods of outcomes that are not weighted in an equal manner?
# Well, we can construct a weighted sample space represented by a Python dictionary.
# Each outcome is treated as a key whose value maps to the associated weight.



In [20]:
# Representing a weighted sample space
# Each key is an outcome; its value is the relative weight of that outcome
# (heads is weighted four times as heavily as tails).
weighted_sample_space = dict(Heads=4, Tails=1)

In [21]:
# Checking the weighted sample space size

# For a weighted sample space, the "size" is the total weight of all outcomes.
sample_space_size = sum(
    weighted_sample_space[outcome] for outcome in weighted_sample_space
)
assert sample_space_size == 5

In [22]:
# Checking the weighted event size

# Every outcome satisfies is_heads_or_tails, so the event's total weight
# must equal the total weight of the whole sample space.
event = get_matching_event(is_heads_or_tails, weighted_sample_space)
event_size = sum(weighted_sample_space[side] for side in event)
assert event_size == 5

In [23]:
# Our generalized definitions of sample space size and event size permit us to create a compute_event_probability function.
# The function takes as input a generic_sample_space variable that can be either a weighted dictionary or an unweighted set.

def compute_event_probability(event_condition, generic_sample_space):
    """Compute an event's probability over a weighted or unweighted sample space.

    Args:
        event_condition: Callable that takes one outcome and returns a truthy
            value when the outcome belongs to the event.
        generic_sample_space: Either an unweighted ``set`` / ``frozenset`` of
            equally likely outcomes, or a dictionary mapping each outcome to
            its weight (for example ``{'Heads': 4, 'Tails': 1}``).

    Returns:
        The size of the event divided by the size of the sample space. For an
        unweighted space sizes are outcome counts; for a weighted space they
        are sums of weights.

    Raises:
        ZeroDivisionError: If the sample space is empty (or its weights sum
            to zero).
    """
    event = get_matching_event(event_condition, generic_sample_space)

    # isinstance (instead of an exact type() comparison) also accepts
    # frozensets and set subclasses. Those have no per-outcome weights to
    # index, so they must not fall through to the weighted branch below.
    if isinstance(generic_sample_space, (set, frozenset)):
        return len(event) / len(generic_sample_space)

    # Weighted case: sum the weights of the matching outcomes and normalise
    # by the total weight of the sample space.
    event_size = sum(generic_sample_space[outcome] for outcome in event)
    return event_size / sum(generic_sample_space.values())

In [24]:
# We can now output all the event probabilities for the biased coin
# without needing to redefine our four event condition functions.

for event_condition in event_conditions:
    # Same four conditions as before, now evaluated against the biased coin.
    name = event_condition.__name__
    prob = compute_event_probability(event_condition, weighted_sample_space)
    print(f"Probability of event arising from '{name}' is {prob}")

Probability of event arising from 'is_heads_or_tails' is 1.0
Probability of event arising from 'is_heads' is 0.8
Probability of event arising from 'is_tails' is 0.2
Probability of event arising from 'is_neither' is 0.0


In [25]:
# Suppose a family has four children. What is the probability that exactly two of the children are boys?

# Computing the sample space of children

possible_children = ['Boy', 'Girl']

# One loop per birth; each outcome is the ordered tuple of the four children.
sample_space = set()
for first in possible_children:
    for second in possible_children:
        for third in possible_children:
            for fourth in possible_children:
                sample_space.add((first, second, third, fourth))

In [26]:
# We ran four nested for loops to explore the sequence of four births.
# This is not an efficient use of code.
# We can more easily generate our sample space using Python’s built-in itertools.product function,
# which returns all pairwise combinations of all elements across all input lists.
# Next, we input four instances of the possible_children list into itertools.product.
# The product function then iterates over all four instances of the list, computing all the combinations of list elements.
# The final output equals our sample space.

from itertools import product
# Unpack four references to the same list as four positional arguments.
all_combinations = product(*([possible_children] * 4))
# Note: product returns an iterator, and the set() call below consumes it.
assert set(all_combinations) == sample_space

In [27]:
# More efficient still: pass repeat=4 instead of supplying the same list four times.

# repeat=4 tells product to combine the list with itself four times.
sample_space_efficient = set(
    product(possible_children, repeat=4)
)
assert sample_space_efficient == sample_space

In [28]:
# Computing the probability of two boys

def has_two_boys(outcome):
    """Return True when exactly two entries of the outcome equal 'Boy'."""
    boy_count = sum(1 for child in outcome if child == 'Boy')
    return boy_count == 2
# sample_space is an unweighted set here, so every birth order counts equally.
prob = compute_event_probability(
    has_two_boys,
    sample_space,
)
print(f"Probability of 2 boys is {prob}")

Probability of 2 boys is 0.375
