In [None]:
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import pymc3 as pm
import pandas as pd
import os

np.random.seed(123)

%matplotlib inline
plt.style.use('ggplot')

import matplotlib

text_size = 20

matplotlib.rcParams['figure.figsize'] = (15, 10)
matplotlib.rcParams['axes.titlesize'] = text_size
matplotlib.rcParams['axes.labelsize'] = text_size - 2
matplotlib.rcParams['xtick.labelsize'] = text_size - 4
matplotlib.rcParams['ytick.labelsize'] = text_size - 4

# UWAGI:
* `MaskedArray` z `numpy` pozawala wepchnac brakujace wartosci (podobnie `pandas`owy `DataFrame` z wartosciami `NaN`);

In [None]:
from datetime import datetime
DF_reddb = pd.read_csv('reddb.csv')

# przeprocesuj kolumne `Hour()`
DF_reddb['hour'] = DF_reddb['Hour()'].apply(lambda s: datetime.strptime(s, '%H %d/%m/%Y'))
DF_reddb['day'] = DF_reddb['Hour()'].apply(lambda s: datetime.strptime(s, '%H %d/%m/%Y'))

In [None]:
# wykresy statow

stats = ['imp', 'index', 'pro_index', 'pro_scroll_8_8', 'con1']

fig = plt.figure(figsize = (30, 5))

for stat_number, stat_name in enumerate(stats):
    ax = fig.add_subplot(1, len(stats), stat_number + 1)
    ax.hist(DF_reddb[stat_name])
    ax.set_xlabel('log10({0})'.format(stat_name))
    if stat_number == 0:
        ax.set_ylabel('log10(count)')
    ax.set_xscale('log')
    ax.set_yscale('log')

In [None]:
DF_reddb.show()

# Pierwsza proba stworzenia modelu

$$ \sigma \sim Exponential(50) $$

$$ \nu \sim Exponential(.1) $$

$$ s_i \sim Normal(s_{i-1}, \sigma^{-2}) $$

$$ log(\frac{y_i}{y_{i-1}}) \sim t(\nu, 0, exp(-2 s_i)) $$

In [8]:
import numpy as np
import pymc3 as pm
from pymc3.distributions.timeseries import GaussianRandomWalk

N = 10**3
succ = np.array([12, 12, 13, 12, 9, 7, 2, 0])
tries = np.array([1.0, 1.1, 1.5, 1.6, 1.1, 0.9, 0.9, 0.2]) * N
n = len(succ)

In [22]:
alpha = 1

with pm.Model() as test_model:
    sigma = pm.Exponential('sigma', 1./.02, testval = .1)
    
    betas = GaussianRandomWalk('betas', sigma**-2, shape = n)
    
    rates = pm.Beta('rate', alpha, betas, shape = n)
    
    for ind in range(n):
        clicks = pm.Binomial('clicks', tries[ind], rate[ind], observed = succ[ind])

Applied log-transform to sigma and added transformed sigma_log to model.
Applied logodds-transform to rate and added transformed rate_logodds to model.


In [25]:
with test_model:
    start = pm.find_MAP()
    step1 = pm.NUTS(start = start, vars = [sigma, betas, rate])
    step2 = pm.Metropolis(start = start, vars = [clicks])

    trace = pm.sample(1000, start = start, step = [step1, step2], progressbar = True)

ValueError: Optimization error: max, logp or dlogp at max have non-finite values. Some values may be outside of distribution support. max: {'betas': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), 'rate_logodds_': array([ inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf]), 'sigma_log_': array(-13.302585092994049)} logp: array(-inf) dlogp: array([-6.00008351,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan])Check that 1) you don't have hierarchical parameters, these will lead to points with infinite density. 2) your distribution logp's are properly specified. Specific issues: 
rate_logodds_.dlogp bad at idx: (array([0, 1, 2, 3, 4, 5, 6, 7]),) with values: [ nan  nan  nan  nan  nan  nan  nan  nan]
rate_logodds_.value bad: [ inf  inf  inf  inf  inf  inf  inf  inf]
rate_logodds_.logp bad: -inf