In [1]:
#####Latent variables Poisson Factorized model#####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy
import scipy.linalg
import scipy.stats as ss
import itertools
from numpy.random import *
from scipy import optimize
from scipy.stats import norm
from scipy import sparse

In [2]:
## Random number generator for the truncated Poisson distribution
def rtpois(value, n, lower):
    """Draw ``n`` truncated-Poisson variates by inverting the CDF.

    Starting from ``k = lower``, each draw walks up the probability mass
    function until the cumulative probability reaches a uniform random number.
    The first mass is ``value * exp(-value) / (1 - exp(-value))``, i.e. the
    probability of 1 under a zero-truncated Poisson, so the result is an exact
    sample of that distribution when ``lower == 1``.

    Parameters
    ----------
    value : float or array_like
        Poisson rate(s). A scalar is broadcast to all ``n`` draws; an array
        must be broadcastable to shape ``(n,)``.
    n : int
        Number of variates to generate.
    lower : int
        Initial value of every counter (the smallest value returned).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n,)`` holding the sampled counts.

    Raises
    ------
    ValueError
        If a positive rate is so large that the first probability mass
        underflows to zero; the inversion cannot make progress in that case
        (the loop would otherwise never terminate).
    """
    rate = np.broadcast_to(np.asarray(value, dtype=float), (n,))
    k = np.repeat(lower, n)
    mu = np.exp(-rate)
    t = (mu / (1 - mu)) * rate   # probability mass at the current k
    if np.any((t == 0) & (rate > 0)):
        raise ValueError("rate is too large: exp(-value) underflows to zero")
    s = t.copy()                 # cumulative probability up to the current k
    u = np.random.uniform(0, 1, n)

    while True:
        # A draw stays active while its cumulative mass is still below u.
        # Once the mass t has underflowed to zero the cumulative sum can no
        # longer grow, so such draws are finished as well.
        index = np.where((s < u) & (t > 0))[0]
        if index.shape[0] == 0:
            break
        k[index] = k[index] + 1
        t[index] = t[index] * rate[index] / k[index]
        s[index] = s[index] + t[index]
    return k

In [3]:
#### Data generation ####
## Problem dimensions
k1, k2 = 10, 10            # number of latent factors on the user side / item side
hh, item = 5000, 3000      # number of users / number of items
hhpt = hh * item           # number of cells in the full user x item grid

# Integer vectors of ones, used with np.dot to sum over the latent dimension.
vec_k1 = np.full(k1, 1)
vec_k2 = np.full(k2, 1)

In [4]:
## ID and index setup
# Build the user id and item id of every cell of the full hh x item grid in
# row-major order: each user id is repeated `item` times in a row, while the
# item ids 0..item-1 are cycled `hh` times.
# np.tile replaces numpy.matlib.repmat (numpy.matlib is a deprecated module)
# and yields the same flat integer vector.
user_id0 = np.repeat(np.arange(hh), item)
item_id0 = np.tile(np.arange(item), hh)

In [5]:
# Index setup
# For every user (item), collect the positions of the full grid that belong to
# it, as a list with one integer array per user (item).
index = np.arange(hhpt)

# Grouping is done with one stable sort per id vector instead of one boolean
# scan of all hhpt cells per user/item. A stable sort keeps the positions in
# ascending order inside each id, so chunk i equals index[ids == i]; ids that
# never occur give empty arrays because bincount is padded with minlength.
user_order = np.argsort(user_id0, kind="stable")
user_index0 = np.split(user_order, np.cumsum(np.bincount(user_id0, minlength=hh))[:-1])
item_order = np.argsort(item_id0, kind="stable")
item_index0 = np.split(item_order, np.cumsum(np.bincount(item_id0, minlength=item))[:-1])

In [19]:
#### Generate the response variable ####
# Rejection loop: parameters and data are re-simulated until the observed data
# set satisfies the size/shape conditions checked at the bottom of the loop.
# The attempt counter rp is printed on every pass.
rp = 0
while True:
    rp = rp + 1
    print(rp)
    
    ## Generate the parameters
    # Gamma prior settings (shape alpha, rate beta; the sampler below is given scale = 1/beta)
    alpha01 = 0.2; beta01 = 0.9
    alpha02 = 0.175; beta02 = 0.9
    alpha03 = 0.125; beta03 = 0.75

    # Generate the model parameters
    W = np.random.gamma(alpha01, 1/beta01, hh*k1).reshape(hh, k1)   # user feature matrix
    H = np.random.gamma(alpha02, 1/beta02, item*k2).reshape(item, k2)   # item feature matrix
    omega = np.random.gamma(alpha03, 1/beta03, k1*k2).reshape(k1, k2)
    # Keep copies of the data-generating ("true") parameters.
    WT = W.copy(); HT = H.copy(); omegat = omega.copy()
    
    # Generate the data from a Poisson distribution
    # Lambda[n] = sum_k (W omega)[user, k] * H[item, k] for every cell of the full grid.
    Lambda = np.dot(np.dot(W, omega)[user_id0, ] * H[item_id0, ], vec_k2)
    x_comp = np.random.poisson(Lambda, hhpt)

    ## Generate purchase data with missing entries
    # Generate the purchase probabilities from beta distributions
    # NOTE: alpha03/beta03 are rebound here; the gamma-prior values assigned
    # above are restored at the top of the next pass of the loop.
    alpha03 = 9.5; beta03 = 12.0
    alpha04 = 7.5; beta04 = 10.0
    beta1 = np.random.beta(alpha03, beta03, hh)   # user purchase probability
    beta2 = np.random.beta(alpha04, beta04, item)   # item purchase probability

    # Generate the missingness vector
    # A cell is flagged with probability (user probability) * (item probability).
    z_vec0 = np.random.binomial(1, beta1[user_id0]*beta2[item_id0], hhpt)
    # Parsed as (z_vec0 * x_comp) > 0: True where the cell is flagged and its count is positive.
    z_vec = z_vec0 * x_comp > 0

    # Missingness indices
    index = np.array(range(hhpt))
    index_z0 = index[z_vec0==0]   # cells that were not flagged
    index_z1 = index[(z_vec0==1) & (x_comp > 0)]   # flagged cells with a positive count
    N = index_z1.shape[0]   # number of observed records

    # Convert to purchase / no-purchase vectors
    index = np.array(range(hhpt))
    user_id = user_id0[index_z1]
    item_id = item_id0[index_z1]
    x_vec = x_comp[index_z1]   # latent variable
    # Every x_vec entry is positive by construction of index_z1, so y_vec is all ones.
    y_vec = np.array(x_vec > 0, dtype="int")   # observed data
    
    # Accept the draw when there are fewer than hh*200 observed records, more
    # than a fifth of them have a count of at least 2, and no count reaches 100.
    if (N < (hh*200)) & (np.sum(x_vec >= 2) > (N/5)) & (np.max(x_vec) < 100):
        break

1
2
3


In [20]:
# Build the test data
# Every cell listed in index_z0 is selected for the test set with probability 0.15.
test_flag = np.random.binomial(1, 0.15, index_z0.shape[0])
index_test = np.array(np.flatnonzero(test_flag == 1), dtype="int")

# Grid positions of the selected cells, then their ids and simulated counts.
test_cells = index_z0[index_test]
user_id_test = user_id0[test_cells]
item_id_test = item_id0[test_cells]
x_test = x_comp[test_cells]
y_test = (x_test > 0).astype("int")

In [21]:
#### Estimate the incomplete-data NMF by Markov chain Monte Carlo ####
## Algorithm settings
R = 2000               # number of sampler iterations
keep = 2               # thinning interval: every keep-th draw is stored
burnin = 500 // keep   # presumably the number of stored draws to discard as burn-in
iter = 0               # NOTE(review): shadows the builtin iter(); not used in the visible cells
disp = 10              # progress is printed every disp iterations

In [22]:
## Build the user and item indices over the N observed records
index = np.arange(N)

# Number of observed records per user / per item (zero for ids that never occur).
user_n = np.bincount(user_id, minlength=hh)
item_n = np.bincount(item_id, minlength=item)

# Record positions belonging to each user / item, one integer array per id.
# A single stable sort per id vector replaces one boolean scan of all N
# records per user/item; stability keeps positions ascending inside each id,
# so chunk i equals index[ids == i].
user_index = np.split(np.argsort(user_id, kind="stable"), np.cumsum(user_n)[:-1])
item_index = np.split(np.argsort(item_id, kind="stable"), np.cumsum(item_n)[:-1])

# Sparse 0/1 indicator matrices for taking sums: multiplying user_dt (hh x N)
# or item_dt (item x N) by an (N, k) array adds up the rows of each user/item.
record_pos = np.arange(N)
user_dt = sparse.coo_matrix((np.repeat(1, N), (user_id, record_pos)), shape=(hh, N)).tocsr()
item_dt = sparse.coo_matrix((np.repeat(1, N), (item_id, record_pos)), shape=(item, N)).tocsr()

In [31]:
# Prior settings: gamma hyperparameters (alpha is added to the shape, beta to the rate)
# NOTE: beta1 and beta2 rebind the names that held the purchase probabilities
# during data generation.
alpha1, beta1 = 0.1, 0.1   # prior for the user feature matrix W
alpha2, beta2 = 0.1, 0.1   # prior for the item feature matrix H
alpha3, beta3 = 0.1, 1.0   # prior for omega

In [32]:
# True parameter values
# Point the working names at the data-generating parameters (no copy is made)
# and compute the expected count of every observed record under them.
W, H, omega = WT, HT, omegat
mu_vec = np.dot(np.dot(W, omega)[user_id] * H[item_id], vec_k2)

In [33]:
# Initial parameter values
# Random gamma starting points (scale 1.0) for the sampler, followed by the
# expected count of every observed record under these starting values.
W = np.random.gamma(0.15, 1.0, size=hh * k1).reshape(hh, k1)
H = np.random.gamma(0.15, 1.0, size=item * k2).reshape(item, k2)
omega = np.random.gamma(0.2, 1.0, size=k1 * k2).reshape(k1, k2)
mu_vec = np.dot(np.dot(W, omega)[user_id] * H[item_id], vec_k2)

In [34]:
# Arrays that store the sampling results
# The last axis indexes the stored (thinned) draws: one slot per `keep` iterations.
n_keep = R // keep
W_array = np.zeros((hh, k1, n_keep))
H_array = np.zeros((item, k2, n_keep))
OMEGA = np.zeros((k1, k2, n_keep))

In [35]:
## Reference values for the log-likelihood
# Log-likelihood under the true parameters
# Expected counts of the observed records and of the test records under the
# data-generating parameters.
lambda_true = np.dot(np.dot(WT, omegat)[user_id] * HT[item_id], vec_k2)
lambda_true_test = np.dot(np.dot(WT, omegat)[user_id_test] * HT[item_id_test], vec_k2)

# Draw the 0/1 indicator with probability 1 - exp(-lambda), as the sampler does,
# then evaluate the Poisson log-likelihood of the masked counts.
z_best = np.random.binomial(1, 1 - np.exp(-lambda_true), N)
LLbest = np.sum(scipy.stats.poisson.logpmf(z_best * x_vec, lambda_true))
LLbest_test = np.sum(scipy.stats.poisson.logpmf(x_test, lambda_true_test))

In [36]:
#### Sample the parameters by Gibbs sampling ####
# Each iteration draws the latent indicator z, then W, H and omega in turn from
# gamma distributions. Per-user and per-item sums are taken with the sparse
# indicator matrices user_dt / item_dt instead of Python loops over index lists.
for rp in range(R):

    ## Latent indicator z and the working counts x_new
    # Expected count of every observed record under the current parameters
    lambda_h = np.dot(H, omega.T)[item_id, ]   # (N, k1): item side of the rate, reused for W below
    mu = lambda_h * W[user_id, ]
    mu_vec = np.dot(mu, vec_k1)
    Prob = 1 - np.exp(-mu_vec)   # probability that a Poisson(mu_vec) count is positive

    # Draw z and mask the observed counts with it
    z = np.random.binomial(1, Prob, N)
    x_new = z * x_vec


    ## Sample the user feature matrix W from a gamma distribution
    # Update the auxiliary variable lambda: share of each latent factor in the rate
    Lambda = mu / mu_vec.reshape(N, 1)

    # Gamma parameters per user: shape = allocated counts, rate = item-side exposure
    lambda_y = Lambda * x_new.reshape(N, 1)
    W1 = np.asarray(user_dt.dot(lambda_y)) + alpha1
    W2 = np.asarray(user_dt.dot(lambda_h)) + beta1

    # Sample the parameters
    W = np.random.gamma(W1.reshape(-1), 1/W2.reshape(-1), hh*k1).reshape(hh, k1)
    
    ## Sample the item feature matrix H from a gamma distribution
    # Update the auxiliary variable lambda with the new W
    lambda_w = np.dot(W, omega)[user_id, ]   # (N, k2): user side of the rate, reused below
    mu = lambda_w * H[item_id, ]
    mu_vec = np.dot(mu, vec_k2)
    Lambda = mu / mu_vec.reshape(N, 1)

    # Gamma parameters per item: shape = allocated counts, rate = user-side exposure
    lambda_y = Lambda * x_new.reshape(N, 1)
    H1 = np.asarray(item_dt.dot(lambda_y)) + alpha2
    H2 = np.asarray(item_dt.dot(lambda_w)) + beta2

    # Sample the parameters
    H = np.random.gamma(H1.reshape(-1), 1/H2.reshape(-1), item*k2).reshape(item, k2)
    
    ## Sample omega from a gamma distribution
    # Update the auxiliary variable lambda
    W_vec = W[user_id, ]; H_vec = H[item_id, ]
    omega_block = np.zeros((k1, k2))
    WH_block = np.zeros((k1, k2))
    for j in range(k1):
        WH_vec = W_vec[:, j].reshape(N, 1) * H_vec
        # omega[j, ] broadcasts over the N rows; no N x k2 copy is needed.
        omega_block[j, ] = np.sum(WH_vec * omega[j, ], axis=0)
        WH_block[j, ] = np.sum(WH_vec, axis=0)
    Lambda = omega_block / np.sum(omega_block)   # auxiliary variable

    # Gamma parameters
    # NOTE(review): the shape spreads the total count sum(x_new) over (j, k) in
    # proportion to the aggregated products, whereas the W and H updates weight
    # each record's count by its own allocation -- confirm this is intended.
    omega1 = Lambda * np.sum(x_new) + alpha3
    omega2 = WH_block + beta3

    # Sample the parameters
    omega = np.random.gamma(omega1.reshape(-1), 1/omega2.reshape(-1), k1*k2).reshape(k1, k2)

    
    ## Store the parameters and display the sampling progress
    # Store the sampling results (every keep-th iteration)
    if rp%keep==0:
        mkeep = rp//keep
        W_array[:, :, mkeep] = W
        H_array[:, :, mkeep] = H
        OMEGA[:, :, mkeep] = omega

    if rp%disp==0:
        # Update the log-likelihood on the training records and on the test records
        LL = np.sum(scipy.stats.poisson.logpmf(x_new, np.dot(np.dot(W, omega)[user_id, ] * H[item_id, ], vec_k2)))
        LL_test = np.sum(scipy.stats.poisson.logpmf(x_test, np.dot(np.dot(W, omega)[user_id_test, ] * H[item_id_test, ], vec_k2)))

        # Display: iteration, mean of z, then [current, true-parameter] log-likelihoods
        print(rp)
        print(np.round(np.mean(z), 3))
        print(np.round(np.array([LL, LLbest]), 1))
        print(np.round(np.array([LL_test, LLbest_test]), 1))

0
0.251
[-645111.3 -960921.3]
[-2603096.3 -1274124.7]
10
0.508
[-1006649.5  -960921.3]
[-1650000.2 -1274124.7]
20
0.538
[-1026264.4  -960921.3]
[-1614086.9 -1274124.7]
30
0.545
[-1024803.8  -960921.3]
[-1596696.2 -1274124.7]
40
0.545
[-1012938.4  -960921.3]
[-1575200.1 -1274124.7]
50
0.544
[-1000908.9  -960921.3]
[-1549467.5 -1274124.7]
60
0.542
[-991328.6 -960921.3]
[-1529786.3 -1274124.7]
70
0.541
[-985580.3 -960921.3]
[-1516898.3 -1274124.7]
80
0.54
[-980211.7 -960921.3]
[-1501147.2 -1274124.7]
90
0.538
[-975413.2 -960921.3]
[-1495294.3 -1274124.7]
100
0.537
[-971536.4 -960921.3]
[-1486692.8 -1274124.7]
110
0.535
[-968748.3 -960921.3]
[-1484875.3 -1274124.7]
120
0.535
[-965202.4 -960921.3]
[-1476769.8 -1274124.7]
130
0.534
[-963020.5 -960921.3]
[-1470004.3 -1274124.7]
140
0.531
[-959115.  -960921.3]
[-1466126.1 -1274124.7]
150
0.531
[-957084.6 -960921.3]
[-1463053.4 -1274124.7]
160
0.53
[-954912.5 -960921.3]
[-1455453.8 -1274124.7]
170
0.53
[-953814.  -960921.3]
[-1454442.1 -1274124

1460
0.522
[-939240.3 -960921.3]
[-1430418.4 -1274124.7]
1470
0.524
[-941615.9 -960921.3]
[-1429391.1 -1274124.7]
1480
0.523
[-940405.9 -960921.3]
[-1430022.9 -1274124.7]
1490
0.524
[-941516.3 -960921.3]
[-1432543.3 -1274124.7]
1500
0.523
[-939918.  -960921.3]
[-1432787.6 -1274124.7]
1510
0.523
[-940682.8 -960921.3]
[-1435188.3 -1274124.7]
1520
0.523
[-941326.  -960921.3]
[-1430299.9 -1274124.7]
1530
0.523
[-940503.  -960921.3]
[-1431040.2 -1274124.7]
1540
0.523
[-940230.7 -960921.3]
[-1431304.  -1274124.7]
1550
0.524
[-941351.8 -960921.3]
[-1425322.  -1274124.7]
1560
0.524
[-941986.8 -960921.3]
[-1429260.9 -1274124.7]
1570
0.523
[-940118.3 -960921.3]
[-1432858.5 -1274124.7]
1580
0.524
[-941407.1 -960921.3]
[-1435202.  -1274124.7]
1590
0.524
[-940008.4 -960921.3]
[-1433813.9 -1274124.7]
1600
0.524
[-940880.9 -960921.3]
[-1437294.9 -1274124.7]
1610
0.522
[-939929.5 -960921.3]
[-1428536.5 -1274124.7]
1620
0.524
[-941613.9 -960921.3]
[-1430999.6 -1274124.7]
1630
0.522
[-939466.6 -960921.3