# Week 1 Homework

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from scipy import stats

## Exercise 1

In [None]:
# Grid approximation of the posterior for the water proportion,
# using 4 water observations in 4 + 11 trials.

# candidate values of the unknown parameter (proportion of water)
p_grid = np.linspace(0, 1, num=100)

# flat prior, rescaled so the grid weights sum to one
prior = np.ones_like(p_grid)
prior /= prior.sum()

# binomial likelihood of the data at every grid point, rescaled the same
# way so it is comparable with the prior and the posterior
prob_data = stats.binom.pmf(4, 4 + 11, p_grid)
prob_data /= prob_data.sum()

# prior times likelihood, normalised over the grid
posterior = prob_data * prior
posterior /= posterior.sum()

# draw grid values in proportion to their posterior weight
posterior_samples = np.random.choice(p_grid, size=5000, p=posterior)

In [None]:
# Plotting table: gather the grid and its three curves column by column,
# then reshape from wide to long. `p_grid` is kept as the identifier
# column; the curve values end up in the `density` column.
source = pd.DataFrame(
    dict(p_grid=p_grid, prior=prior, prob_data=prob_data, posterior=posterior)
).melt(id_vars="p_grid", value_name="density")

# show the first rows of the long table
source.head()

In [None]:
# Line chart of the long table: one line per curve, coloured by curve name.
prob_lines = (
    alt.Chart(source)
    .mark_line()
    .encode(
        x=alt.X("p_grid", title="water ratio"),
        y=alt.Y("density"),
        color=alt.Color("variable"),
    )
    .properties(title="probabilities")
)

# Histogram of the posterior draws in bins of width 0.05 over [0, 1].
samples_source = pd.DataFrame(dict(posterior_samples=posterior_samples))
hist = (
    alt.Chart(samples_source)
    .mark_bar(size=10)
    .encode(
        alt.X("posterior_samples", bin=alt.BinParams(step=0.05, extent=[0, 1])),
        alt.Y("count()"),
    )
)

# stack the two charts vertically (`&` is Altair's vertical concatenation)
prob_lines & hist

In [None]:
# Posterior mean computed two ways: directly on the grid and from the draws.
# `posterior` holds weights over `p_grid`, so the expected value is the
# weighted sum of the grid points.
posterior_mean = (p_grid * posterior).sum()
# Monte Carlo estimate of the same quantity from the sampled grid values
posterior_samples_mean = posterior_samples.mean()
print(
    f"posterior_mean={posterior_mean:.4f} posterior_samples_mean={posterior_samples_mean:.4f}"
)

## Exercise 2

In [None]:
# Grid approximation with a step prior that rules out values below 0.5,
# using 4 water observations in 4 + 2 trials.
p_grid = np.linspace(0, 1, num=100)

# step prior: zero weight below 0.5, equal weight elsewhere, then rescaled
# so the grid weights sum to one
prior = np.where(p_grid < 0.5, 0.0, 1.0)
prior /= prior.sum()

# binomial likelihood of the data at every grid point, rescaled to sum to one
prob_data = stats.binom.pmf(4, 4 + 2, p_grid)
prob_data /= prob_data.sum()

# prior times likelihood, normalised over the grid
posterior = prob_data * prior
posterior /= posterior.sum()

# draw grid values in proportion to their posterior weight
posterior_samples = np.random.choice(p_grid, size=5000, p=posterior)

In [None]:
# Plotting table: gather the grid and its three curves column by column,
# then reshape from wide to long. `p_grid` is kept as the identifier
# column; the curve values end up in the `density` column.
source = pd.DataFrame(
    dict(p_grid=p_grid, prior=prior, prob_data=prob_data, posterior=posterior)
).melt(id_vars="p_grid", value_name="density")

In [None]:
# Line chart of the long table: one line per curve, coloured by curve name.
prob_lines = (
    alt.Chart(source)
    .mark_line()
    .encode(
        x=alt.X("p_grid", title="water ratio"),
        y=alt.Y("density"),
        color=alt.Color("variable"),
    )
    .properties(title="probabilities")
)

# Histogram of the posterior draws in bins of width 0.05 over [0, 1].
samples_source = pd.DataFrame(dict(posterior_samples=posterior_samples))
hist = (
    alt.Chart(samples_source)
    .mark_bar(size=10)
    .encode(
        alt.X("posterior_samples", bin=alt.BinParams(step=0.05, extent=[0, 1])),
        alt.Y("count()"),
    )
)

# stack the two charts vertically (`&` is Altair's vertical concatenation)
prob_lines & hist

In [None]:
# Posterior mean computed two ways: directly on the grid and from the draws.
# `posterior` holds weights over `p_grid`, so the expected value is the
# weighted sum of the grid points.
posterior_mean = (p_grid * posterior).sum()
# Monte Carlo estimate of the same quantity from the sampled grid values
posterior_samples_mean = posterior_samples.mean()
print(
    f"posterior_mean={posterior_mean:.4f} posterior_samples_mean={posterior_samples_mean:.4f}"
)

## Exercise 3

In [None]:
import statreth as sr

In [None]:
# Summarise the posterior draws with two intervals at the same probability mass.
# NOTE(review): `sr.hdi` and `sr.pi` come from the `statreth` module, which is
# not visible here; presumably they return a highest-density interval and a
# percentile interval respectively — confirm against that module.
# `posterior_samples` is whatever the most recently executed sampling cell
# assigned; reading the file top to bottom, that is the Exercise 2 posterior.
prop_mass = 0.89
hdpi = sr.hdi(posterior_samples, prop_mass)
# NOTE(review): the printed label is "HDPI"; the usual acronym is "HPDI" — confirm intent.
print(f"HDPI={hdpi} at prop_mass={prop_mass}")
cred_int = sr.pi(posterior_samples, prop_mass)
print(f"PI={cred_int} at prop_mass={prop_mass}")

## Exercise 4

In [None]:
# Simulation settings for the measurement-error exercise.

# probabilities
p_water: float = 0.7  # true water proportion used to generate the data
p_error: float = 0.2  # error rate applied to the generated water counts

# sizes
n_sim: int = 1000  # number of simulated experiments
n_measurements: int = 20  # number of measurements per simulation

In [None]:
# Step 1: for each of the `n_sim` experiments, draw the number of water
# results out of `n_measurements` at the true rate `p_water`.
true_samples = np.random.binomial(n=n_measurements, p=p_water, size=n_sim)
# Step 2: thin each count, keeping every water result with probability
# 1 - p_error, so an observed count never exceeds its true count.
obs_samples = np.random.binomial(n=true_samples, p=1 - p_error)

In [None]:
# Average water count expressed as a fraction of the measurements per
# experiment: first for the true counts, then for the observed ones.
print("true sample mean:", np.mean(true_samples) / n_measurements)
print("obs sample mean:", np.mean(obs_samples) / n_measurements)

In [None]:
# Pick one simulated, error-affected water count to analyse.
k = obs_samples[1]
print("number of water samples:", k)

# candidate values of the unknown parameter (proportion of water)
p_grid = np.linspace(0, 1, num=100)

# Beta(1, 1) prior evaluated on the grid, rescaled to sum to one
prior = stats.beta.pdf(p_grid, a=1, b=1)
prior /= prior.sum()

# Error-aware likelihood: shrink every candidate proportion by the factor
# (1 - p_error) before evaluating the binomial, then rescale to sum to one.
p_grid_adj = (1 - p_error) * p_grid
prob_data_adj = stats.binom.pmf(k, n_measurements, p_grid_adj)
prob_data_adj /= prob_data_adj.sum()

# adjusted posterior: prior times error-aware likelihood, normalised
posterior_adj = prob_data_adj * prior
posterior_adj /= posterior_adj.sum()

# Naive posterior that ignores the error rate; the prior is taken to be
# uniform, so this is just the normalised likelihood.
posterior_biased = stats.binom.pmf(k, n_measurements, p_grid)
posterior_biased /= posterior_biased.sum()

In [None]:
# Plotting table: the grid plus both posteriors, reshaped from wide to long
# with `p_grid` kept as the identifier column and the values in `density`.
source = pd.DataFrame(
    dict(p_grid=p_grid, posterior_adj=posterior_adj, posterior_biased=posterior_biased)
).melt(id_vars="p_grid", value_name="density")

# one line per posterior, coloured by curve name
(
    alt.Chart(source)
    .mark_line()
    .encode(
        x=alt.X("p_grid", title="water ratio"),
        y=alt.Y("density"),
        color=alt.Color("variable"),
    )
    .properties(title="probabilities")
)