## 【一般化線形混合モデル】肉まんの販売個数

とあるエリアのコンビニエンスストア 10店舗の肉まんの販売個数のデータをモデル化する。特徴量としては気温データのみが与えられており、その他の特徴量はないため、店舗ごとの立地などの違いを「ランダム効果」として扱い、販売個数をポアソン分布でモデル化してみる。

In [None]:
import pymc as pm
import arviz as az

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide matplotlib defaults: base font size and default figure size.
plt.rcParams.update({
    'font.size': 12,
    'figure.figsize': [10, 4],
})

## Load Data

In [None]:
# Read the sales records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Preview the first 20 rows to sanity-check the columns and their ordering.
data.head(n=20)

## Preprocess & Visualize Data

In [None]:
# Pull the columns used by the model out of the frame as plain arrays.
store_id = data['store_id'].values
x = data['temperature'].values  # explanatory variable
y = data['num_sold'].values     # response (sales counts)

# Store IDs are treated as 0-based indices, so the largest ID plus one is
# taken as the number of stores.
num_stores = store_id.max() + 1

In [None]:
# Sales matrix with one row per store and one column per day.
# NOTE(review): this assumes the rows of `data` are grouped store by store and
# that every store lists the same days in the same order — confirm against
# data.csv.
num_sold = y.reshape(num_stores, -1)

# Take the number of days from the reshaped data instead of hard-coding it, so
# `temperature` always has exactly one entry per column of `num_sold`.
num_days = num_sold.shape[1]

# Under the ordering assumption above, the first `num_days` rows hold the
# daily temperatures (taken from the first store's block).
temperature = x[:num_days]

In [None]:
# Heatmap of the raw counts: one row per store, one column per day, with the
# count annotated in every cell.
fig, ax = plt.subplots(figsize=(10, 6))

sns.heatmap(num_sold, annot=True, cmap='jet', ax=ax)

ax.set_title('Number of Sold')
ax.set_xlabel('Date ID')
ax.set_ylabel('Store ID')

fig.tight_layout()

In [None]:
# Scatter of daily temperature against the sales summed over all stores.
fig, ax = plt.subplots(figsize=(8, 6))

total_sold = num_sold.sum(axis=0)  # collapse the store axis
sns.scatterplot(x=temperature, y=total_sold, s=100, ax=ax)

ax.set_title('Temperature vs Sold')

ax.set_xlabel('Temperature')
ax.set_ylabel('Number of Sold (Sum of all stores)');

## Scale Data

In [None]:
# Standardize the temperature to zero mean and unit (population) standard
# deviation; the statistics are kept so values can be mapped back later.
x_mu, x_sd = x.mean(), x.std()

x_scaled = (x - x_mu) / x_sd

## Define Model & Inference

In [None]:
# Poisson mixed model: the log of the expected sales is linear in the
# standardized temperature, plus one additive random effect per store.
with pm.Model() as model:

    # Fixed effects with Normal(0, 10) priors: `a` multiplies the scaled
    # temperature (slope) and `b` is the additive constant (intercept).
    a = pm.Normal('a', mu=0, sigma=10)
    b = pm.Normal('b', mu=0, sigma=10)

    # Store-level random effects: zero-mean normals that share one standard
    # deviation `s`, which itself gets a HalfCauchy prior.
    s = pm.HalfCauchy('s', 5)
    r = pm.Normal('r', mu=0, sigma=s, shape=(num_stores,))

    # Linear predictor for every observation; `r[store_id]` selects the random
    # effect of the store each row belongs to, so `store_id` must contain
    # integer indices in [0, num_stores).
    theta = a * x_scaled + r[store_id] + b

    # Log link: exponentiating keeps the Poisson rate positive.
    mu = pm.math.exp(theta)

    # Likelihood of the observed sales counts.
    obs = pm.Poisson('obs', mu=mu, observed=y)

In [None]:
# Run MCMC (3000 tuning + 3000 kept draws, high target acceptance rate).
# The raw trace is kept because later cells index it directly; it is also
# converted to InferenceData for the ArviZ diagnostics.
with model:

    trace = pm.sample(
        draws=3000,
        tune=3000,
        target_accept=0.99,
        return_inferencedata=False,
    )
    idata = pm.to_inference_data(trace)

## Check MCMC-samples

In [None]:
# Trace and density plots for every sampled variable; widen the gaps between
# panels so titles and tick labels do not overlap.
az.plot_trace(idata)
plt.subplots_adjust(wspace=0.5, hspace=0.5)

In [None]:
# Tabular posterior summary with convergence diagnostics for each variable.
az.summary(data=idata)

## Posterior Predictive Check

In [None]:
# Simulate replicated sales counts from the fitted model; the result is kept
# as a plain dict of arrays keyed by the observed variable name.
ppc = pm.sample_posterior_predictive(
    idata,
    model=model,
    return_inferencedata=False,
)

In [None]:
# Inspect the layout of the simulated counts; the plotting cell below indexes
# this array with three axes, the last one selecting the observation.
np.shape(ppc['obs'])

In [None]:
# Posterior predictive check for the first 30 observations: one panel each,
# showing the simulated distribution with the observed count as a dashed red
# line.
fig, axes = plt.subplots(10, 3, figsize=(12, 24))

title_template = 'Temperature = {}, Store ID = {}'

for k, ax in enumerate(axes.flat):

    az.plot_dist(ppc['obs'][:, :, k], ax=ax)
    ax.axvline(y[k], color='r', linestyle='dashed')
    ax.set_title(title_template.format(x[k], store_id[k]))

fig.tight_layout()

## Check Random Effects

In [None]:
# Violin plot of each store's random-effect posterior, laid out in one row.
r_posterior = idata.posterior['r']
az.plot_violin(r_posterior, grid=(1, num_stores), figsize=(12, 4));

## Compare with True Values

In [None]:
# Load the reference random effects to compare against the estimates, then
# display the table.
random_effects = pd.read_csv(filepath_or_buffer='random_effects.csv')
random_effects

In [None]:
# Reference random effects as a plain array.
r_true = random_effects.loc[:, 'random_effects'].values

In [None]:
# Point estimate of each random effect: average the stored samples of `r`
# over their first axis (presumably the draw axis — verify the trace layout).
r_mean = np.mean(trace['r'], axis=0)

In [None]:
# Overlay the reference and estimated random effects, store by store.
store_axis = np.arange(num_stores)
line_kw = {'markersize': 8}

plt.plot(store_axis, r_true, 'o-', label='True Values', **line_kw)
plt.plot(r_mean, 'o-', label='Estimated Values', **line_kw)

plt.xticks(store_axis)
plt.xlabel('Store ID')
plt.ylabel('Random Effects')
plt.legend()

plt.tight_layout()

In [None]:
# Scatter of the reference random effects against their estimates; points
# close to the diagonal indicate a good recovery.
fig = plt.figure(figsize=(8, 6))

# True values go on the x-axis and estimates on the y-axis so that the data
# match the axis labels below (they were previously swapped).
sns.scatterplot(x=r_true, y=r_mean, s=100)

plt.xlabel('True Value')
plt.ylabel('Estimated Value');