In [573]:
# Bernoulli-Poisson link CP decomposition
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy.linalg
import itertools
from scipy import sparse
from scipy.stats import norm
from numpy.random import *
from scipy import optimize

In [574]:
# 連続した日付を取得する関数
def daterange(_start, _end):
    for n in range((_end - _start).days):
        yield _start + timedelta(n)
        
# 切断ポアソン分布を生成する関数
def rtpois(mu, a, b, n):
    FA = scipy.stats.poisson.cdf(a, mu)
    FB = scipy.stats.poisson.cdf(b, mu)
    return np.array(scipy.stats.poisson.ppf(np.random.uniform(0, 1, n)*(FB-FA)+FA, mu), dtype="int")

# 多項分布の乱数を生成する関数
def rmnom(pr, n, k, pattern):
    if pattern==1:
        z_id = np.array(np.argmax(np.cumsum(pr, axis=1) >= np.random.uniform(0, 1, n)[:, np.newaxis], axis=1), dtype="int")
        Z = np.diag(np.repeat(1, k))[z_id, ]
        return z_id, Z
    z_id = np.array(np.argmax((np.cumsum(pr, axis=1) >= np.random.uniform(0, 1, n)[:, np.newaxis]), axis=1), dtype="int")
    return z_id

In [575]:
# データの定義
k = 10
hh = 3000
item = 2500
element = 500
Lambda = np.random.gamma(50.0, 1/0.25, hh)
pt1 = np.random.poisson(Lambda, hh)
hhpt = np.sum(pt1)
k_vec = np.repeat(1.0, k)

In [576]:
# idとインデックスを定義
# idの定義
d_id1 = np.repeat(np.arange(hh), pt1)
pt_id1 = np.array(list(itertools.chain(*[np.array(range(pt1[i]), dtype="int") for i in range(hh)])))

#インデックスを定義
d_list1 = [i for i in range(hh)]
d_vec1 = [i for i in range(hh)]
for i in range(hh):
    d_list1[i] = np.array(np.where(d_id1==i)[0], dtype="int")
    d_vec1[i] = np.repeat(1.0, d_list1[i].shape[0])

In [577]:
# itemとelementを選択
#トピックを生成
topic = 30
theta_topic = np.random.dirichlet(np.repeat(0.2, topic), hh)
phi_item = np.random.dirichlet(np.repeat(0.1, item), topic)
phi_element = np.random.dirichlet(np.repeat(0.1, element), topic)
z = np.array(rmnom(theta_topic[d_id1, ], hhpt, topic, 0), dtype="int16")

# 多項分布からplaceを生成
item_id1 = np.repeat(0, hhpt)
element_id1 = np.repeat(0, hhpt)
for i in range(hh):
    index = d_list1[i]
    item_id1[index] = rmnom(phi_item[z[index], ], pt1[i], item, 0)
    element_id1[index] = rmnom(phi_element[z[index], ], pt1[i], element, 0)
    
# インデックスを定義
item_list1 = [i for i in range(item)]
element_list1 = [i for i in range(element)]
item_vec1 = [i for i in range(item)]
element_vec1 = [i for i in range(element)]
item_n1 = np.repeat(0, item)
element_n1 = np.repeat(0, element)
for i in range(item):
    item_list1[i] = np.array(np.where(item_id1==i)[0], dtype="int")
    item_vec1[i] = np.repeat(1.0, item_list1[i].shape[0])
    item_n1[i] = item_list1[i].shape[0]
for i in range(element):
    element_list1[i] = np.array(np.where(element_id1==i)[0], dtype="int")
    element_vec1[i] = np.repeat(1.0, element_list1[i].shape[0])
    element_n1[i] = element_list1[i].shape[0]

In [608]:
# 応答変数を生成
# パラメータを生成
theta_u = np.random.gamma(1.0, 1/1.75, k*hh).reshape(hh, k)
theta_v = np.random.gamma(1.0, 1/1.75, k*item).reshape(item, k)
theta_e = np.random.gamma(0.75, 1/2.0, k*element).reshape(element, k)
thetat_u = theta_u.copy(); thetat_v = theta_v.copy(); thetat_e = theta_e.copy()

# Bernoulli Poisson linkからbinaryデータを生成
mu = np.dot(theta_u[d_id1, ] * theta_v[item_id1, ] * theta_e[element_id1, ], k_vec)
freq = np.random.poisson(mu, hhpt)
x = np.array(freq > 0, dtype="int")
index_x = np.array(np.where(x==1)[0], dtype="int")
N = index_x.shape[0]

In [609]:
# 新しいidとインデックスを定義
# idを定義
d_id2 = d_id1[index_x]
item_id2 = item_id1[index_x]
element_id2 = element_id1[index_x]

# インデックスを定義
d_list2 = [i for i in range(hh)]
item_list2 = [i for i in range(item)]
element_list2 = [i for i in range(element)]
d_vec2 = [i for i in range(hh)]
item_vec2 = [i for i in range(item)]
element_vec2 = [i for i in range(element)]
pt2 = np.repeat(0, hh)
item_n2 = np.repeat(0, item)
element_n2 = np.repeat(0, element)

for i in range(hh):
    d_list2[i] = np.array(np.where(d_id2==i)[0], dtype="int")
    d_vec2[i] = np.repeat(1.0, d_list2[i].shape[0])
    pt2[i] = d_list2[i].shape[0]
pt_id2 = np.array(list(itertools.chain(*[np.array(range(pt2[i]), dtype="int") for i in range(hh)])))
for i in range(item):
    item_list2[i] = np.array(np.where(item_id2==i)[0], dtype="int")
    item_vec2[i] = np.repeat(1.0, item_list2[i].shape[0])
    item_n2[i] = item_list2[i].shape[0]
for i in range(element):
    element_list2[i] = np.array(np.where(element_id2==i)[0], dtype="int")
    element_vec2[i] = np.repeat(1.0, element_list2[i].shape[0])
    element_n2[i] = element_list2[i].shape[0]

In [630]:
# Bernoulli-Poisson link CP decompositionのパラメータを推定
# MCMCの設定
R = 1000
keep = 2
burnin = int(500/keep)
iter = 0
disp = 10
freq_y = freq[index_x]

# 事前分布の定義
alpha1 = 0.1; beta1 = 0.1
alpha2 = 0.1; beta2 = 0.1
alpha3 = 0.1; beta3 = 0.1
s0 = 0.5; v0 = 0.001

In [631]:
# パラメータの真値
# モデルパラメータの定義
theta_u = thetat_u.copy()
theta_v = thetat_v.copy()
theta_e = thetat_e.copy()

# 期待値の真値
mu_deploy = theta_u[d_id1, ] * theta_v[item_id1, ] * theta_e[element_id1, ]
mu = np.dot(mu_deploy, k_vec)

In [632]:
# パラメータの初期値
# パラメータを生成
theta_u = np.random.gamma(1.0, 1/3.0, k*hh).reshape(hh, k)
theta_v = np.random.gamma(1.0, 1/3.0, k*item).reshape(item, k)
theta_e = np.random.gamma(1.0, 1/3.0, k*element).reshape(element, k)

# 期待値の初期値
mu_deploy = theta_u[d_id1, ] * theta_v[item_id1, ] * theta_e[element_id1, ]
mu = np.dot(mu_deploy, k_vec)

In [633]:
# パラメータの格納用配列
THETA_U = np.zeros((hh, k, int(R/keep)))
THETA_V = np.zeros((item, k, int(R/keep)))
THETA_E = np.zeros((element, k, int(R/keep)))

In [634]:
# 対数尤度の基準値
# 1パラメータモデルの対数尤度
LLst = np.sum(scipy.stats.poisson.logpmf(freq, np.mean(freq)))
print(LLst)

# 真値での対数尤度
mu_deploy = thetat_u[d_id1, ] * thetat_v[item_id1, ] * thetat_e[element_id1, ]
mut = np.dot(mu_deploy, k_vec)
LLbest = np.sum(scipy.stats.poisson.logpmf(freq, mut))
print(LLbest)

-975629.5927440615
-756541.9646464376


In [635]:
# ギブスサンプリングでパラメータをサンプリング
for rp in range(R):
    
    # Bernoulli-Poisson linkで潜在頻度を生成
    # 期待値を定義
    mu_deploy = theta_u[d_id2, ] * theta_v[item_id2, ] * theta_e[element_id2, ]
    mu = np.dot(mu_deploy, k_vec)

    # 切断ポアソン分布から潜在頻度を生成
    xy = np.repeat(0, hhpt)
    y = Prob * rtpois(mu, 0, np.inf, N)
    xy[index_x] = y
    
    
    # ユーザーの特徴行列を更新
    # 補助変数を定義
    Lambda = mu_deploy / mu[:, np.newaxis]
    lambda_y = y[:, np.newaxis] * Lambda
    lambda_h = theta_v[item_id1, ] * theta_e[element_id1, ]
    
    # 事後分布のパラメータ
    w1 = np.zeros((hh, k))
    w2 = np.zeros((hh, k))
    for i in range(hh):
        index1 = d_list1[i]; index2 = d_list2[i]
        w1[i, ] = np.dot(d_vec2[i].T, lambda_y[index2, ]) + alpha1
        w2[i, ] = np.dot(d_vec1[i].T, lambda_h[index1, ]) + beta1
    
    # ガンマ分布よりパラメータをサンプリング
    theta_u = np.random.gamma(w1, 1/w2)
    mu_deploy = theta_u[d_id2, ] * theta_v[item_id2, ] * theta_e[element_id2, ]
    mu = np.dot(mu_deploy, k_vec)

    # itemの特徴行列を更新
    # 補助変数を定義
    Lambda = mu_deploy / mu[:, np.newaxis]
    lambda_y = y[:, np.newaxis] * Lambda
    lambda_h = theta_u[d_id1, ] * theta_e[element_id1, ]
    
    # 事後分布のパラメータ
    w1 = np.zeros((item, k))
    w2 = np.zeros((item, k))
    for i in range(item):
        index1 = item_list1[i]; index2 = item_list2[i]
        w1[i, ] = np.dot(item_vec2[i].T, lambda_y[index2, ]) + alpha2
        w2[i, ] = np.dot(item_vec1[i].T, lambda_h[index1, ]) + beta2
    
    # ガンマ分布よりパラメータをサンプリング
    theta_v = np.random.gamma(w1, 1/w2)
    mu_deploy = theta_u[d_id2, ] * theta_v[item_id2, ] * theta_e[element_id2, ]
    mu = np.dot(mu_deploy, k_vec)
    
    # itemの特徴行列を更新
    # 補助変数を定義
    Lambda = mu_deploy / mu[:, np.newaxis]
    lambda_y = y[:, np.newaxis] * Lambda
    lambda_h = theta_u[d_id1, ] * theta_v[item_id1, ]
    
    # 事後分布のパラメータ
    w1 = np.zeros((element, k))
    w2 = np.zeros((element, k))
    for i in range(element):
        index1 = element_list1[i]; index2 = element_list2[i]
        w1[i, ] = np.dot(element_vec2[i].T, lambda_y[index2, ]) + alpha3
        w2[i, ] = np.dot(element_vec1[i].T, lambda_h[index1, ]) + beta3
    
    # ガンマ分布よりパラメータをサンプリング
    theta_e = np.random.gamma(w1, 1/w2)
    mu_deploy = theta_u[d_id2, ] * theta_v[item_id2, ] * theta_e[element_id2, ]
    mu = np.dot(mu_deploy, k_vec)
    
    
    if rp%disp==0:
        # 対数尤度を更新
        mu_deploy = theta_u[d_id1, ] * theta_v[item_id1, ] * theta_e[element_id1, ]
        mu = np.dot(mu_deploy, k_vec)
        LL = np.sum(scipy.stats.poisson.logpmf(freq, mu))

        print(rp)
        print([np.sum(y), np.sum(freq)])
        print(np.round([LL, LLst, LLbest], 1))

0
[130522.90802979896, 728542]
[-1760748.4  -975629.6  -756542. ]
10
[13626.846825047456, 728542]
[-4924565.8  -975629.6  -756542. ]


ValueError: shape < 0