source: https://www.kaggle.com/gauravduttakiit/explore-the-poisson-regression/notebook

In [None]:
pip install pyro-ppl-1.8.0

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

import torch
from torch.distributions import constraints
from torch import tensor

import pyro
import pyro.distributions as dist
from pyro.infer import SVI,Trace_ELBO
from pyro.infer.autoguide  import AutoMultivariateNormal, AutoNormal, init_to_mean
from pyro.optim import ClippedAdam

assert pyro.__version__.startswith('1.8')
pyro.set_rng_seed(1)
torch.manual_seed(1)

# Set matplotlib settings
%matplotlib inline
plt.style.use('default')
plt.rcParams['figure.figsize'] = [8, 6]

In [None]:
# Load the competition-awards CSV and give its two columns descriptive names.
DATA_URL = (
    "https://raw.githubusercontent.com/seanreed1111/datasets/master/"
    "count-regression-datasets/competition_awards_data.csv"
)
df = pd.read_csv(DATA_URL)
df.columns = ['award_count', 'math_score']
df.head()

In [None]:
df.describe()

In [None]:
X = df.copy()

In [None]:
sns.stripplot(data=X, x='award_count', y='math_score', palette='colorblind');

In [None]:
sns.displot(data=X, x='award_count', palette='colorblind');

A Poisson distribution has equal mean and variance. Let's check whether that holds for our data:

In [None]:
X['award_count'].mean(), X['award_count'].var() # Variance larger than the mean. Overdispersed.

In [None]:
# Reference sample: 200 draws from a Poisson whose rate equals the empirical mean of the data.
samples_1 = dist.Poisson(X['award_count'].mean()).sample(tensor([200]))
# `sns.distplot` is deprecated; `histplot` is its axes-level replacement for a plain histogram.
sns.histplot(samples_1.numpy());

In [None]:
samples_2 = dist.Poisson(X['award_count'].var()).sample(tensor([200])) # create sample distribution where Poisson mean = data variance
sns.displot(samples_2.numpy());

### So, comparing the actual data with the two Poisson sampling distributions above, it appears our data is both zero-inflated AND overdispersed!

Let's make a Poisson model with `award_count ~ math_score`.

In [None]:
data = X[['math_score']]
target = X['award_count']

## Using sklearn

In [None]:
data.shape, target.shape

In [None]:
from sklearn.linear_model import PoissonRegressor  # Generalized Linear Model with a Poisson distribution and log link.

# PoissonRegressor defaults to alpha=1.0, i.e. an L2 penalty on the coefficients. Turn the
# penalty off so the fit is the unpenalised maximum-likelihood estimate that the rest of the
# notebook compares against. max_iter is raised as a precaution, since the feature is unscaled.
reg = PoissonRegressor(alpha=0, max_iter=1000).fit(data.values, target.values)

In [None]:
# these are MLE estimates of parameters we expect to recover
print(reg.intercept_)
print(reg.coef_)

In [None]:
# Score the fitted regressor on the training data. `data` is a one-column frame, so its
# `.values` is already 2-D and needs no reshape.
reg.score(data.values, target.values)

## Using statsmodels

In [None]:
import statsmodels.formula.api as smf
statsmod = smf.poisson(formula='award_count ~ math_score', data=X)
result = statsmod.fit()
print(result.summary())

## Using Bayesian Regression with SVI

In [None]:
# Convert features and target to float torch tensors.
# Build them from the source frame `X` rather than from `data`/`target`, which this cell
# rebinds: deriving them from themselves would make the cell fail when it is run a second time.
data = tensor(X[['math_score']].values, dtype=torch.float)
target = tensor(X['award_count'].values, dtype=torch.float)

In [None]:
data.size(), target.size()

In [None]:
from torch import nn
from pyro.nn import PyroSample, PyroModule

# need to pass the priors for all models as parameters to the object.
class BayesianPoissonRegression(PyroModule):
    """Bayesian Poisson regression with a log link.

    The linear predictor comes from an ``nn.Linear`` layer whose parameters are
    replaced by Normal priors; the Poisson rate is the exponential of that
    predictor.

    Args:
        in_features: number of input features per observation.
        out_features: output size of the linear layer (default 1).
        bias: if True, include an intercept with a ``Normal(0, 5)`` prior;
            if False, the linear layer is built without any bias term.
    """

    def __init__(self, in_features, out_features=1, bias=True):
        super().__init__()
        # Forward `bias` to nn.Linear. Without this, bias=False would still leave the
        # layer's default learnable bias in place, just without a prior on it.
        self.linear = PyroModule[nn.Linear](in_features, out_features, bias=bias)
        if bias:
            self.linear.bias = PyroSample(
                dist.Normal(0., 5.).expand([out_features]).to_event(1)
            )
        self.linear.weight = PyroSample(
            dist.Normal(0., 0.05).expand([out_features, in_features]).to_event(2)
        )

    def forward(self, x, y=None):
        """Sample (or condition on) the counts and return the Poisson rate.

        Args:
            x: input tensor; ``x.shape[0]`` is used as the size of the "data" plate.
            y: observed counts, or None to sample the "obs" site from the model.

        Returns:
            The Poisson rate: ``exp`` of the linear layer's output with its last
            dimension squeezed.
        """
        # Log link: apply its inverse (exp) to the linear predictor to get the rate.
        rate = self.linear(x).squeeze(-1).exp()
        with pyro.plate("data", x.shape[0]):
            pyro.sample("obs", dist.Poisson(rate), obs=y)
        return rate

In [None]:
model = BayesianPoissonRegression(data.size(1)) 

In [None]:
from pyro.infer.autoguide import AutoMultivariateNormal

guide = AutoMultivariateNormal(model, init_loc_fn=init_to_mean)

In [None]:
def train(model, guide, lr=0.01, n_steps=4000):
    """Fit `guide` to `model` with stochastic variational inference.

    Trains on the notebook-level `data` and `target` tensors (they are read as
    globals, not passed in). Before optimising, the Pyro RNG seed is reset to 1
    and the parameter store is cleared. The ELBO loss is printed every 500 steps
    and once more after the final step.

    Args:
        model: the Pyro model to fit.
        guide: the variational guide for `model`.
        lr: initial learning rate for ClippedAdam.
        n_steps: number of SVI steps; must be at least 1.

    Raises:
        ValueError: if `n_steps` is smaller than 1. The learning-rate decay
            divides by `n_steps`, and the final report needs at least one step.
    """
    if n_steps < 1:
        raise ValueError(f"n_steps must be a positive integer, got {n_steps!r}")

    pyro.set_rng_seed(1)
    pyro.clear_param_store()

    gamma = 0.01  # final learning rate will be gamma * initial_lr
    lrd = gamma ** (1 / n_steps)  # per-step multiplicative decay that reaches gamma after n_steps
    adam = pyro.optim.ClippedAdam({'lr': lr, 'lrd': lrd})

    svi = SVI(model, guide, adam, loss=Trace_ELBO())

    for i in range(n_steps):
        elbo = svi.step(data, target)
        if i % 500 == 0:
            print(f"Elbo loss: {elbo}")
    print(f"Final Elbo loss: {elbo}")

In [None]:
%%time
train(model, guide)

In [None]:
from pyro.infer import Predictive

num_samples = 1000
predictive = Predictive(model, guide=guide, num_samples=num_samples)

# Draw from the fitted guide, then keep every returned site except the observations,
# flattened to one row per draw and converted to a NumPy array.
posterior_draws = predictive(data, target)
svi_samples = {}
for site_name, site_values in posterior_draws.items():
    if site_name == "obs":
        continue
    flattened = site_values.reshape((num_samples, -1))
    svi_samples[site_name] = flattened.detach().cpu().numpy()

In [None]:
svi_samples.keys()

In [None]:
svi_samples['linear.bias'].mean()

In [None]:
svi_samples['linear.weight'].mean(axis=0)

In [None]:
guide.quantiles([0.05,0.50,0.95])

In [None]:
print(reg.intercept_)
print(reg.coef_)

In [None]:
sns.kdeplot(data = svi_samples['linear.bias']);

In [None]:
sns.kdeplot(data = svi_samples['linear.weight']);

In [None]:
# So all three methods seem to be in agreement about the central tendencies of the coefficients.

# What about MCMC?

In [None]:
from pyro.infer import MCMC, NUTS
nuts_kernel = NUTS(model)
mcmc = MCMC(nuts_kernel, num_samples=800, warmup_steps=500)

In [None]:
%%time
mcmc.run(data, target)

In [None]:
hmc_samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}

In [None]:
hmc_samples.keys()

In [None]:
np.median(hmc_samples['linear.bias'])

In [None]:
sns.kdeplot(data=hmc_samples['linear.bias']);

In [None]:
hmc_samples['linear.weight'].shape

In [None]:
print(np.median(hmc_samples['linear.weight'][:,0,0]))
sns.kdeplot(data=hmc_samples['linear.weight'][:,0,0]);

In [None]:
print(np.median(hmc_samples['linear.weight'][:,0,0]))
sns.kdeplot(data=hmc_samples['linear.weight'][:,0,0]);

In [None]:
# sklearn estimate
print(reg.intercept_)
print(reg.coef_)


In [None]:
import arviz as az

az_data = az.from_pyro(mcmc)
az.plot_trace(az_data, compact=False)
plt.tight_layout()