<a href="https://colab.research.google.com/github/somilasthana/MachineLearningSkills/blob/master/Basic_Binomial_Prior_Posterior.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import pymc3 as pm
from random import choices
from scipy import stats
import altair as alt

In [0]:

# Binomial setup shared by the cells below:
#   n - number of trials, p - success probability, k - number of successes.
n, p, k = 9, 0.5, 6

In [0]:
# Draw `n` independent values, each from a binomial with `k` trials and
# success probability `p`.
# NOTE(review): np.random.binomial takes (trials, p, size), so k is the trial
# count here and n is only the number of draws - confirm k and n are not swapped.
np.random.binomial(n=k, p=p, size=n)

array([3, 3, 1, 3, 2, 4, 3, 3, 3])

In [0]:
# Probability of exactly k successes in n trials with success probability p,
# rounded to two decimals for display.
pmf_at_k = stats.binom.pmf(k, n, p)
round(pmf_at_k, 2)

0.16

In [0]:
# Probability of k successes as the number of trials goes from 6 to 16.
trial_counts = range(6, 17)
[round(stats.binom.pmf(k, trials, p), 2) for trials in trial_counts]

[0.02, 0.05, 0.11, 0.16, 0.21, 0.23, 0.23, 0.21, 0.18, 0.15, 0.12]

In [0]:
# Binomial pmf for every outcome 0..9 with n trials and success probability p,
# each value rounded to two decimals.
outcomes = range(10)
distexample = [round(stats.binom.pmf(successes, n, p), 2) for successes in outcomes]
distexample

[0.0, 0.02, 0.07, 0.16, 0.25, 0.25, 0.16, 0.07, 0.02, 0.0]

In [0]:
# Sanity check: total of the (rounded) pmf values computed above.
np.asarray(distexample).sum()

1.0

In [0]:
# Grid of 101 evenly spaced candidate values for p, from 0 to 1 inclusive.
p_grid = np.linspace(start=0, stop=1, num=101)
p_grid

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [0]:

# Grid approximation of the posterior for p given k successes in n trials.
# The prior is sized from p_grid so it always matches the grid resolution.
prob_p = np.ones_like(p_grid)                # flat prior: equal weight at every grid point
prob_data = stats.binom.pmf(k, n, p=p_grid)  # likelihood of the data at each grid point
posterior = prob_data * prob_p               # unnormalised posterior = likelihood * prior
posterior = posterior / posterior.sum()      # normalise so the grid values sum to 1

In [0]:
# Pair every posterior value with its grid point and plot the posterior curve.
# The x values are taken directly from p_grid, so the chart stays correct for
# any grid resolution. Columns of `aux`: index, prob, p.
aux = (
    pd.DataFrame({'prob': posterior})
    .reset_index()
    .assign(p=p_grid)
)

alt.Chart(aux).mark_line().encode(
    x=alt.X('p', title='p'),
    y=alt.Y('prob', title='density'),
)

In [0]:
# With the flat prior, the posterior probability is highest for values of the
# binomial parameter p between 0.6 and 0.7 (the data give k/n = 6/9 ≈ 0.67).

In [0]:
# Draw 5000 values of p from the grid, weighted by the posterior, and store them
# with the draw number ('index') next to the sampled value ('prob').
draws = np.random.choice(p_grid, size=5000, p=posterior)
samples = pd.DataFrame({'prob': draws}).reset_index()

In [0]:
# Peek at the last ten draws.
samples.tail(10)

Unnamed: 0,index,prob
4990,4990,0.56
4991,4991,0.9
4992,4992,0.71
4993,4993,0.47
4994,4994,0.82
4995,4995,0.8
4996,4996,0.62
4997,4997,0.45
4998,4998,0.5
4999,4999,0.76


In [0]:
# Left: the sampled p values in draw order. Right: binned counts of the same samples.
posterior_axis_title = 'parameter p of the posterior'

plot_1 = alt.Chart(samples).mark_point().encode(
    x=alt.X('index', title='samples'),
    y=alt.Y('prob', title=posterior_axis_title),
)

plot_2 = alt.Chart(samples).mark_area(opacity=0.3, interpolate='step').encode(
    alt.X(
        'prob:Q',
        bin=alt.Bin(maxbins=200),
        scale=alt.Scale(domain=(0, 1)),
        title=posterior_axis_title,
    ),
    alt.Y('count()', stack=None, title='Number of records'),
)

alt.hconcat(plot_1, plot_2)

In [0]:
"""
Suppose the globe tossing data had turned out to be 8 water in 
15 tosses. Construct the posterior distribution, using grid 
approximation. Use the same flat prior as before.
"""

In [0]:
# Data for this exercise: k = 8 "water" results out of n = 15 tosses.
n, k = 15, 8

In [0]:
# Grid approximation of the posterior for the new data, with the same flat prior.
p_grid = np.linspace(0, 1, 101)
prob_p = np.ones_like(p_grid)                # prior: flat, sized to match the grid
prob_data = stats.binom.pmf(k, n, p=p_grid)  # likelihood of k successes in n trials
posterior1 = prob_data * prob_p              # prob_data: likelihood, prob_p: prior prob
posterior1 = posterior1 / posterior1.sum()   # normalise so the grid values sum to 1

In [0]:
# Plot posterior1 against the grid. The x values come directly from p_grid, so
# the axis is right for any grid resolution. Columns of `aux`: index, prob, p.
aux = (
    pd.DataFrame({'prob': posterior1})
    .reset_index()
    .assign(p=p_grid)
)

alt.Chart(aux).mark_line().encode(
    x=alt.X('p', title='p'),
    y=alt.Y('prob', title='density'),
)

In [0]:
# Sample 5000 grid values of p with probabilities given by posterior1; the
# resulting frame has the draw number in 'index' and the value in 'prob'.
draws = np.random.choice(p_grid, size=5000, p=posterior1)
samples = pd.DataFrame({'prob': draws}).reset_index()

In [0]:
# Scatter of the draws in order (left) next to binned counts of the sampled p (right).
scatter = alt.Chart(samples).mark_point()
plot_1 = scatter.encode(
    x=alt.X('index', title='samples'),
    y=alt.Y('prob', title='parameter p of the posterior'),
)

histogram = alt.Chart(samples).mark_area(opacity=0.3, interpolate='step')
plot_2 = histogram.encode(
    alt.X(
        'prob:Q',
        bin=alt.Bin(maxbins=200),
        scale=alt.Scale(domain=(0, 1)),
        title='parameter p of the posterior',
    ),
    alt.Y('count()', stack=None, title='Number of records'),
)

alt.hconcat(plot_1, plot_2)

In [0]:
"""
Start over in 1, but now use a prior that is zero below p=0.5
and a constant above p=0.5. This corresponds to prior information
that a majority of the Earth's surface is water. What difference
does the better prior make? If it helps, compare posterior
distributions (using both priors) to the true value p=0.7.

"""

In [0]:
# Grid approximation of the posterior under a step prior.
p_grid = np.linspace(0, 1, 101)
# Step prior: zero for p < 0.5 and a constant (0.5) from p = 0.5 upwards. It is
# built from p_grid itself, so the step stays at p = 0.5 for any grid size.
prob_p = np.where(p_grid < 0.5, 0.0, 0.5)
prob_data = stats.binom.pmf(k, n, p=p_grid)  # likelihood of k successes in n trials
posterior2 = prob_data * prob_p              # unnormalised posterior = likelihood * prior
posterior2 = posterior2 / posterior2.sum()   # normalise so the grid values sum to 1

In [5]:
# Plot posterior2 against the grid. The x values come directly from p_grid, so
# the axis is right for any grid resolution. Columns of `aux`: index, prob, p.
aux = (
    pd.DataFrame({'prob': posterior2})
    .reset_index()
    .assign(p=p_grid)
)

alt.Chart(aux).mark_line().encode(
    x=alt.X('p', title='p'),
    y=alt.Y('prob', title='density'),
)

In [0]:
# 5000 posterior draws of p under the step prior: 'index' is the draw number,
# 'prob' is the sampled grid value.
draws = np.random.choice(p_grid, size=5000, p=posterior2)
samples = pd.DataFrame({'prob': draws}).reset_index()

In [7]:
# Draws in sampling order (plot_1) and their binned counts (plot_2), side by side.
x_draw_number = alt.X('index', title='samples')
y_sampled_p = alt.Y('prob', title='parameter p of the posterior')
plot_1 = alt.Chart(samples).mark_point().encode(x=x_draw_number, y=y_sampled_p)

x_binned_p = alt.X(
    'prob:Q',
    bin=alt.Bin(maxbins=200),
    scale=alt.Scale(domain=(0, 1)),
    title='parameter p of the posterior',
)
y_count = alt.Y('count()', stack=None, title='Number of records')
plot_2 = alt.Chart(samples).mark_area(opacity=0.3, interpolate='step').encode(
    x_binned_p,
    y_count,
)

alt.hconcat(plot_1, plot_2)

In [8]:
# Mean of the sampled p values, rounded to two decimals.
round(samples['prob'].mean(), 2)

0.68

In [9]:
# 0.5th and 99.5th percentiles of the sampled p values, i.e. the bounds of the
# central 99% interval of the samples.
interval_bounds = [0.5, 99.5]
pm.stats.quantiles(np.array(samples['prob']), qlist=interval_bounds)

{0.5: 0.5, 99.5: 0.93}

In [0]:
# The posterior probability is proportional to the prior probability times the likelihood of the data.

In [0]:
"""
Suppose you want to estimate the Earth's proportion of water very
 precisely. Specifically, you want the 99% percentile interval of
 the posterior distribution of p to be only 0.05 wide. This means
 the distance between the upper and lower bound of the interval 
 should be 0.05. How many times will you have to toss the globe 
 to do this?
"""

In [10]:
# For an increasing number of simulated tosses, approximate the posterior of p
# on a grid and report the 0.5%-99.5% percentile interval together with its
# width, which is the quantity the exercise targets (0.05).
p = 0.7                                # proportion used to simulate the tosses
p_grid = np.linspace(0, 1, 1001)       # finer grid; independent of n, so built once
prob_p = np.ones_like(p_grid)          # flat prior, sized to match the grid
for n in [20, 50, 100, 200, 500, 1000, 2000, 3000, 5000]:
    # Number of successes in n simulated tosses, drawn as one binomial variate.
    k = np.random.binomial(n, p)
    prob_data = stats.binom.pmf(k, n, p=p_grid)
    posterior3 = prob_data * prob_p
    posterior3 = posterior3 / posterior3.sum()
    samples = pd.DataFrame(np.random.choice(p_grid, 5000, p=posterior3))\
                .reset_index()\
                .rename({0:'prob'}, axis=1)
    # quantiles() returns a dict keyed by the requested percentiles.
    interval = pm.stats.quantiles(np.array(samples.prob), qlist=[0.5, 99.5])
    width = interval[99.5] - interval[0.5]
    print(f'Distribution size: {n}, PI0.5&99.5: {interval}, width: {width:.3f}')

Distribution size: 20, PI0.5&99.5: {0.5: 0.325, 99.5: 0.8310000000000001}
Distribution size: 50, PI0.5&99.5: {0.5: 0.422, 99.5: 0.756}
Distribution size: 100, PI0.5&99.5: {0.5: 0.677, 99.5: 0.875}
Distribution size: 200, PI0.5&99.5: {0.5: 0.5640000000000001, 99.5: 0.734}
Distribution size: 500, PI0.5&99.5: {0.5: 0.648, 99.5: 0.751}
Distribution size: 1000, PI0.5&99.5: {0.5: 0.681, 99.5: 0.752}
Distribution size: 2000, PI0.5&99.5: {0.5: 0.675, 99.5: 0.726}
Distribution size: 3000, PI0.5&99.5: {0.5: 0.684, 99.5: 0.727}
Distribution size: 5000, PI0.5&99.5: {0.5: 0.686, 99.5: 0.719}
