# Probability

In [57]:
import wget
import os
# Download the GSS CSV once; later runs reuse the cached local copy.
url_csv = "https://github.com/AllenDowney/BiteSizeBayes/raw/master/gss_bayes.csv"
csv_path = "files/gss_bayes.csv"
if not os.path.exists(csv_path):
    # A fresh checkout may not contain the target folder yet; create it so the
    # downloaded file has somewhere to be written.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    wget.download(url_csv, csv_path)

In [58]:
import pandas as pd
# Load the GSS extract; the first CSV column (caseid) becomes the row index.
gss = pd.read_csv(
    'files/gss_bayes.csv',
    index_col=0,
)
# Preview the first rows to check the columns that were loaded.
gss.head()

Unnamed: 0_level_0,year,age,sex,polviews,partyid,indus10
caseid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1974,21.0,1,4.0,2.0,4970.0
2,1974,41.0,1,5.0,0.0,9160.0
5,1974,58.0,2,6.0,1.0,2670.0
6,1974,30.0,1,5.0,4.0,6870.0
7,1974,48.0,1,5.0,4.0,7860.0


In [59]:
# Boolean Series (one True/False per respondent): True where the industry
# code `indus10` equals 6870, the code this notebook treats as "banker".
banker = gss['indus10'].eq(6870)
banker

caseid
1       False
2       False
5       False
6        True
7       False
        ...  
2863    False
2864    False
2865    False
2866    False
2867    False
Name: indus10, Length: 49290, dtype: bool

In [60]:
# Summing a boolean Series counts True as 1 and False as 0, so this is the
# number of respondents flagged as bankers.
banker.sum()

728

In [61]:
# The mean of a boolean Series is the fraction of True values:
# banker.sum() / banker.size = 728 / 49290, i.e. the probability that a
# randomly chosen respondent is a banker. (banker.size == len(banker).)
banker.mean()

0.014769730168391155

In [62]:
# Probability that a respondent is female (`sex` equal to 2).
gss['sex'].eq(2).mean()

0.5378575776019476

In [63]:
# Probability that a respondent is liberal (`polviews` at most 3).
gss["polviews"].le(3).mean()

0.27374721038750255

In [64]:
# Probability that a respondent is a Democrat (`partyid` at most 1).
gss["partyid"].le(1).mean()

0.3662609048488537

In [65]:
# probability function
def prob(A):
    """Return the probability of proposition ``A``.

    ``A`` is expected to be a boolean pandas Series (one True/False value
    per respondent). Its mean is the fraction of True values, which is
    the probability that the proposition holds for a random respondent.
    """
    fraction_true = A.mean()
    return fraction_true

## Conjunction (AND)

In [66]:
# Build one boolean Series per proposition.
banker = gss['indus10'].eq(6870)
democrat = gss["partyid"].le(1)

# Element-wise AND is True only for respondents who are both bankers and
# Democrats; its mean is P(banker and democrat), same as prob(banker & democrat).
banker_and_democrat = banker & democrat
banker_and_democrat.mean()

0.004686548995739501

## Conditional Probability (A|B)

In [67]:
# Calculate: P(democrat/liberal), the probability that a respondent is a
# Democrat given that they are liberal.

liberal = gss["polviews"] <= 3
democrat = gss["partyid"] <= 1

# Indexing `democrat` with the boolean Series `liberal` keeps only the rows
# where `liberal` is True: the mask inside the brackets is the condition
# (the "given"), and the Series being indexed is the proposition.
# The reverse quantity, P(liberal/democrat), would be liberal[democrat].mean().
ans = democrat[liberal]

# Sizes for reference: liberal.size is 49290 (all respondents),
# liberal.sum() is 13493 (liberals only), and ans.size == liberal.sum().

print("P(democrat/liberal):\t",ans.mean())

P(liberal/democrat):	 0.5206403320240125


In [68]:
# Calculate: P(female/banker), the fraction of bankers who are female.

female = gss['sex'].eq(2)
banker = gss['indus10'].eq(6870)

# Restrict `female` to the rows where `banker` is True (the condition),
# then average the remaining True/False values.
ans = female[banker]
p_female_given_banker = ans.mean()

print("P(female/banker):\t", p_female_given_banker)

P(female/banker):	 0.7706043956043956


In [69]:
def conditional(proposition, given):
    """Return the conditional probability of ``proposition`` given ``given``.

    Both arguments are expected to be boolean Series over the same
    respondents. ``given`` is used as a mask to keep only the rows where
    the condition holds, and ``prob`` is applied to what remains.
    """
    selected = proposition[given]
    return prob(selected)

# examples: the same three propositions, conditioned in different directions
ans_1 = conditional(liberal & female, given=banker)
ans_2 = conditional(banker, given=liberal & female)
ans_3 = conditional(female, given=liberal & democrat)

# Print each label next to its computed value, in order.
for label, value in (
    ("Ej1. P((liberal & female)/banker):\t", ans_1),
    ("Ej2. P(banker/(liberal & female)):\t", ans_2),
    ("Ej3. P(female/(liberal & democrat)):\t", ans_3),
):
    print(label, value)

Ej1. P((liberal & female)/banker):	 0.17307692307692307
Ej2. P(banker/(liberal & female)):	 0.01723194748358862
Ej3. P(female/(liberal & democrat)):	 0.576085409252669


## Bayes's Theorem
**Theorem 1** gives us a way to compute a conditional probability using a conjunction:

$$P(A|B) = \frac{P(A~\mathrm{and}~B)}{P(B)}$$

**Theorem 2** gives us a way to compute a conjunction using a conditional probability:

$$P(A~\mathrm{and}~B) = P(B) P(A|B)$$

**Theorem 3**, also known as Bayes's Theorem, gives us a way to get from $P(A|B)$ to $P(B|A)$, or the other way around:

$$P(A|B) = \frac{P(A) P(B|A)}{P(B)}$$

## Exercises

**Exercise 1** Compute the following probabilities:
* The probability that Linda is a female banker,
* The probability that Linda is a liberal female banker, and
* The probability that Linda is a liberal female banker and a Democrat.

In [70]:
# Build each conjunction from the previous one, adding one condition at a time.
female_banker = female & banker
liberal_female_banker = female_banker & liberal
liberal_female_banker_democrat = liberal_female_banker & democrat

# The mean of each boolean Series is the probability of that conjunction.
for proposition in (
    female_banker,
    liberal_female_banker,
    liberal_female_banker_democrat,
):
    print(proposition.mean())

0.011381618989653074
0.002556299452221546
0.0012375735443294787


**Exercise 2**
* What is the probability that a respondent is liberal, given that they are a Democrat?
* What is the probability that a respondent is a Democrat, given that they are liberal?

In [71]:
# P(liberal | democrat): among Democrats, the fraction who are liberal.
print(conditional(liberal, given=democrat))
# P(democrat | liberal): among liberals, the fraction who are Democrats.
print(conditional(democrat, given=liberal))

0.3891320002215698
0.5206403320240125


**Exercise 3**
* What is the probability that a randomly chosen respondent is a young liberal?
* What is the probability that a young person is liberal?
* What fraction of respondents are old conservatives?
* What fraction of conservatives are old?

_Note:_ 
```python
young = gss["age"] < 30
old = gss["age"] >= 65
conservative = gss["polviews"] >= 5
```

In [72]:
# Propositions defined in the exercise statement.
young = gss["age"].lt(30)
old = gss["age"].ge(65)
conservative = gss["polviews"].ge(5)

# solution
# 1. P(young and liberal): conjunction over all respondents.
print(prob(liberal & young))
# 2. P(liberal | young): restrict to young respondents first.
print(conditional(liberal, given=young))
# 3. P(old and conservative): conjunction over all respondents.
print(prob(conservative & old))
# 4. P(old | conservative): restrict to conservatives first.
print(conditional(old, given=conservative))

0.06579427875836884
0.338517745302714
0.06701156421180766
0.19597721609113564
