In [1]:
#####Latent variables Poisson Factorized model#####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy
import scipy.linalg
import scipy.stats as ss
import itertools
from numpy.random import *
from scipy import optimize
from scipy.stats import norm
from scipy import sparse

In [2]:
## Draw random numbers from a lower-truncated Poisson distribution
def rtpois(value, n, lower):
    """Draw ``n`` Poisson variates conditioned on being at least ``lower``.

    Sampling is done by inversion: starting at ``k = lower`` the conditional
    probability mass is accumulated with the recurrence
    ``pmf(k + 1) = pmf(k) * rate / (k + 1)`` until the running CDF passes a
    uniform draw.

    Parameters
    ----------
    value : float or array-like
        Poisson rate(s). A scalar, or anything broadcastable to length ``n``.
    n : int
        Number of variates to draw.
    lower : int
        Inclusive lower truncation point (``1`` gives the zero-truncated
        Poisson, ``0`` the ordinary Poisson).

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n`` whose entries are all ``>= lower``.

    Raises
    ------
    ValueError
        If ``lower`` is negative.

    Notes
    -----
    One call consumes exactly ``n`` uniforms from the global ``np.random``
    state. If the probability mass of an entry underflows to zero (or is not
    a number) the search for that entry stops, so the function always
    terminates; the value returned for such an entry is not meaningful.
    """
    lower = int(lower)
    if lower < 0:
        raise ValueError("lower must be a non-negative integer")

    # Broadcasting lets a scalar rate be used as well as a length-n array.
    lam = np.broadcast_to(np.asarray(value, dtype=float), (n,))
    k = np.full(n, lower, dtype=int)

    # P(X = lower) and P(X < lower), built with the Poisson recurrence.
    pmf = np.exp(-lam)
    below = np.zeros(n)
    for j in range(lower):
        below = below + pmf
        pmf = pmf * lam / (j + 1)

    t = pmf / (1 - below)   # P(X = lower | X >= lower)
    s = t.copy()            # running conditional CDF
    u = np.random.uniform(0, 1, n)

    while True:
        # Entries whose CDF is still below the uniform draw. An entry whose
        # mass term reached zero can never catch up, so it is dropped too.
        active = np.flatnonzero((s < u) & (t > 0))
        if active.size == 0:
            break
        k[active] += 1
        t[active] = t[active] * lam[active] / k[active]
        s[active] = s[active] + t[active]
    return k

In [3]:
#### Data generation ####
## Problem dimensions
k1, k2 = 10, 10   # number of columns of the user matrix W / the item matrix H
hh = 5000   # number of users
item = 3000   # number of items
hhpt = hh * item   # number of (user, item) pairs
# Vectors of ones, used to sum over the factor axis with a dot product
vec_k1 = np.ones(k1, dtype=int)
vec_k2 = np.ones(k2, dtype=int)

In [4]:
## ID and index setup
# IDs for the full user x item grid, one entry per pair.
# The user ID varies slowest: user 0 with every item, then user 1, and so on.
# np.tile gives the same vector as np.matlib.repmat(...).reshape(-1) without
# going through the pending-deprecated numpy.matlib module.
user_id0 = np.repeat(np.arange(hh), item)
item_id0 = np.tile(np.arange(item), hh)

In [5]:
# Index setup: for every user / item, the positions of its rows in the
# full pair table.
index = np.arange(hhpt)

# A stable sort groups equal IDs while keeping the positions inside each
# group in ascending order, which is exactly what index[ids == g] returns.
# Splitting the sorted positions at the cumulative group sizes replaces one
# boolean mask over all hhpt pairs per user and per item.
user_order = np.argsort(user_id0, kind="stable")
user_bounds = np.cumsum(np.bincount(user_id0, minlength=hh)[:hh])
user_index0 = np.split(user_order, user_bounds)[:hh]

item_order = np.argsort(item_id0, kind="stable")
item_bounds = np.cumsum(np.bincount(item_id0, minlength=item)[:item])
item_index0 = np.split(item_order, item_bounds)[:item]

In [6]:
#### Generate the response variable ####
# Parameters and data are redrawn until the data set passes the acceptance
# test at the bottom of the loop; rp counts the attempts.
rp = 0
while True:
    rp += 1
    print(rp)

    ## Draw the model parameters
    # Gamma hyperparameters (shape alpha, rate beta)
    alpha01, beta01 = 0.2, 0.9
    alpha02, beta02 = 0.175, 0.9
    alpha03, beta03 = 0.125, 0.75

    # Model parameters (np.random.gamma takes a scale, hence 1/beta)
    W = np.random.gamma(alpha01, 1/beta01, size=(hh, k1))   # user feature matrix
    H = np.random.gamma(alpha02, 1/beta02, size=(item, k2))   # item feature matrix
    omega = np.random.gamma(alpha03, 1/beta03, size=(k1, k2))
    WT, HT, omegat = W.copy(), H.copy(), omega.copy()   # keep the true values

    # Poisson rate of every (user, item) pair and the complete count data
    Lambda = np.dot(np.dot(W, omega)[user_id0] * H[item_id0], vec_k2)
    x_comp = np.random.poisson(Lambda, size=hhpt)

    ## Build purchase data with missing entries
    # Purchase probabilities drawn from Beta distributions.
    # alpha03 / beta03 are reused here and overwrite the Gamma values above.
    alpha03, beta03 = 9.5, 12.0
    alpha04, beta04 = 7.5, 10.0
    beta1 = np.random.beta(alpha03, beta03, size=hh)   # per-user purchase probability
    beta2 = np.random.beta(alpha04, beta04, size=item)   # per-item purchase probability

    # Missingness vector: 1 with probability beta1[user] * beta2[item]
    z_vec0 = np.random.binomial(1, beta1[user_id0] * beta2[item_id0], size=hhpt)
    z_vec = z_vec0 * x_comp > 0

    # Positions with z_vec0 == 0, and positions with z_vec0 == 1 and a positive count
    index = np.arange(hhpt)
    index_z0 = index[z_vec0 == 0]
    index_z1 = index[(z_vec0 == 1) & (x_comp > 0)]
    N = index_z1.shape[0]

    # Restrict the IDs and counts to the retained pairs
    user_id = user_id0[index_z1]
    item_id = item_id0[index_z1]
    x_vec = x_comp[index_z1]   # latent variable
    y_vec = (x_vec > 0).astype("int")   # observed data

    # Accept the draw when there are fewer than 200 * hh retained pairs, more
    # than a fifth of them have a count of at least 2, and every count is
    # below 100. All three checks are always evaluated.
    few_enough = N < (hh * 200)
    enough_repeats = np.sum(x_vec >= 2) > (N / 5)
    bounded = np.max(x_vec) < 100
    if few_enough & enough_repeats & bounded:
        break

1
2


In [7]:
# Build the test data: each pair in index_z0 (z_vec0 == 0) is selected
# with probability 0.15.
test_mask = np.random.binomial(1, 0.15, size=index_z0.shape[0]) == 1
index_test = np.flatnonzero(test_mask).astype("int")
test_pos = index_z0[index_test]   # positions in the full pair table
user_id_test = user_id0[test_pos]
item_id_test = item_id0[test_pos]
x_test = x_comp[test_pos]
y_test = (x_test > 0).astype("int")

In [8]:
#### Estimate the incomplete-data NMF by Markov chain Monte Carlo ####
## Algorithm settings
R = 2000   # number of sampler iterations
keep = 2   # every keep-th draw is stored
burnin = 500 // keep   # presumably the number of stored draws to discard; not used in the cells shown
iter = 0   # NOTE(review): shadows the built-in iter() and is not used in the cells shown
disp = 10   # diagnostics are printed every disp-th iteration

In [9]:
## Build the per-user and per-item indices of the observed data
# user_index[i] / item_index[j] hold the ascending positions (0..N-1) of the
# observations of user i / item j; user_n / item_n hold their counts.
index = np.arange(N)

# Counts come from bincount. The position lists come from a stable sort
# (equal IDs stay in ascending position order) split at the cumulative
# counts. This replaces one boolean mask over all N observations per user
# and per item.
user_n = np.bincount(user_id, minlength=hh)[:hh]
user_index = np.split(np.argsort(user_id, kind="stable"), np.cumsum(user_n))[:hh]
item_n = np.bincount(item_id, minlength=item)[:item]
item_index = np.split(np.argsort(item_id, kind="stable"), np.cumsum(item_n))[:item]

# Sparse indicator matrices for taking sums: entry (g, n) is 1 when
# observation n belongs to user / item g.
obs_pos = np.arange(N)
obs_ones = np.ones(N, dtype=int)
user_dt = sparse.coo_matrix((obs_ones, (user_id, obs_pos)), shape=(hh, N)).tocsr()
item_dt = sparse.coo_matrix((obs_ones, (item_id, obs_pos)), shape=(item, N)).tocsr()

In [10]:
# Prior settings: the sampler adds alpha to the Gamma shape parameter and
# beta to the rate parameter.
# beta1 and beta2 replace the purchase-probability arrays of the same name
# that the data-generation cell created.
alpha1, beta1 = 0.1, 0.1   # user feature matrix W
alpha2, beta2 = 0.1, 0.1   # item feature matrix H
alpha3, beta3 = 0.1, 1.0   # omega

In [11]:
# True parameter values (the names are bound to the stored arrays, no copy is made)
W, H, omega = WT, HT, omegat
# Poisson rate of every observed pair under the true parameters
mu_vec = np.dot(np.dot(W, omega)[user_id] * H[item_id], vec_k2)

In [12]:
# Initial parameter values, drawn from Gamma distributions with scale 1
W = np.random.gamma(0.15, 1/1.0, size=(hh, k1))
H = np.random.gamma(0.15, 1/1.0, size=(item, k2))
omega = np.random.gamma(0.2, 1/1.0, size=(k1, k2))
# Poisson rate of every observed pair under the initial values
mu_vec = np.dot(np.dot(W, omega)[user_id] * H[item_id], vec_k2)

In [13]:
# Arrays that store the sampled draws; the last axis indexes the stored draws
n_keep = int(R / keep)
W_array = np.zeros((hh, k1, n_keep))
H_array = np.zeros((item, k2, n_keep))
OMEGA = np.zeros((k1, k2, n_keep))

In [14]:
## Reference values for the log-likelihood
# Log-likelihood under the true parameters
# Poisson rates of the observed pairs and of the test pairs
mu_best = np.dot(np.dot(WT, omegat)[user_id] * HT[item_id], vec_k2)
mu_best_test = np.dot(np.dot(WT, omegat)[user_id_test] * HT[item_id_test], vec_k2)

# z_best is drawn with probability 1 - exp(-rate), the same way the sampler draws z
z_best = np.random.binomial(1, 1 - np.exp(-mu_best), size=N)
LLbest = np.sum(scipy.stats.poisson.logpmf(z_best * x_vec, mu_best))
LLbest_test = np.sum(scipy.stats.poisson.logpmf(x_test, mu_best_test))

In [None]:
#### Sample the parameters by Gibbs sampling ####
# Each iteration draws z, W, H and omega in turn. The sums over the
# observations of each user / item use the sparse indicator matrices
# user_dt / item_dt instead of a Python loop over every user and item.
for rp in range(R):

    ## Sample the latent variable z
    # Assignment probability of z: 1 - exp(-rate) for every observation
    lambda_h = np.dot(H, omega.T)[item_id]   # (H omega^T) row of each observation's item
    mu = lambda_h * W[user_id]               # rate split over the k1 user-side factors
    mu_vec = np.dot(mu, vec_k1)              # total rate per observation
    Prob = 1 - np.exp(-mu_vec)

    # Build the new count variable from z
    z = np.random.binomial(1, Prob, N)
    x_new = z * x_vec


    ## Sample the user feature matrix W from Gamma distributions
    # Update the auxiliary variable lambda: share of each factor in the rate
    Lambda = mu / mu_vec.reshape(N, 1)

    # Gamma parameters per user: sums over the user's observations
    lambda_y = Lambda * x_new.reshape(N, 1)
    W1 = user_dt.dot(lambda_y) + alpha1
    W2 = user_dt.dot(lambda_h) + beta1

    # Sample the parameters
    W = np.random.gamma(W1.reshape(-1), 1/W2.reshape(-1), hh*k1).reshape(hh, k1)

    ## Sample the item feature matrix H from Gamma distributions
    # Update the auxiliary variable lambda with the new W
    lambda_w = np.dot(W, omega)[user_id]     # (W omega) row of each observation's user
    mu = lambda_w * H[item_id]
    mu_vec = np.dot(mu, vec_k2)
    Lambda = mu / mu_vec.reshape(N, 1)

    # Gamma parameters per item: sums over the item's observations
    lambda_y = Lambda * x_new.reshape(N, 1)
    H1 = item_dt.dot(lambda_y) + alpha2
    H2 = item_dt.dot(lambda_w) + beta2

    # Sample the parameters
    H = np.random.gamma(H1.reshape(-1), 1/H2.reshape(-1), item*k2).reshape(item, k2)

    ## Sample omega from Gamma distributions
    # WH_block[a, b] = sum over observations of W[user, a] * H[item, b]
    W_vec = W[user_id]; H_vec = H[item_id]
    WH_block = np.dot(W_vec.T, H_vec)
    omega_block = omega * WH_block
    # NOTE(review): unlike the W and H updates, these shares are computed from
    # totals over all observations rather than per observation. Confirm this
    # aggregated allocation is intended.
    Lambda = omega_block / np.sum(omega_block)   # auxiliary variable

    # Gamma parameters
    omega1 = Lambda * np.sum(x_new) + alpha3
    omega2 = WH_block + beta3

    # Sample the parameters
    omega = np.random.gamma(omega1.reshape(-1), 1/omega2.reshape(-1), k1*k2).reshape(k1, k2)


    ## Store the parameters and display the sampling results
    # Store every keep-th draw
    if rp%keep==0:
        mkeep = rp//keep
        W_array[:, :, mkeep] = W
        H_array[:, :, mkeep] = H
        OMEGA[:, :, mkeep] = omega

    if rp%disp==0:
        # Update the log-likelihood on the training counts and on the test counts
        LL = np.sum(scipy.stats.poisson.logpmf(x_new, np.dot(np.dot(W, omega)[user_id] * H[item_id], vec_k2)))
        LL_test = np.sum(scipy.stats.poisson.logpmf(x_test, np.dot(np.dot(W, omega)[user_id_test] * H[item_id_test], vec_k2)))

        # Display: iteration, mean of z, then [current, true-parameter]
        # log-likelihoods for the training and the test data
        print(rp)
        print(np.round(np.mean(z), 3))
        print(np.round(np.array([LL, LLbest]), 1))
        print(np.round(np.array([LL_test, LLbest_test]), 1))

0
0.253
[ -830731.2 -1222297.8]
[-3172677.4 -1465768.2]
10
0.561
[-1304784.6 -1222297.8]
[-1854931.4 -1465768.2]
20
0.587
[-1318818.8 -1222297.8]
[-1815401.7 -1465768.2]
30
0.591
[-1305728.5 -1222297.8]
[-1785884.8 -1465768.2]
40
0.593
[-1287492.8 -1222297.8]
[-1745954.8 -1465768.2]
50
0.592
[-1273339.4 -1222297.8]
[-1718800.2 -1465768.2]
60
0.59
[-1262336.7 -1222297.8]
[-1698744.4 -1465768.2]
70
0.59
[-1257933.7 -1222297.8]
[-1684131.1 -1465768.2]
80
0.587
[-1250297.9 -1222297.8]
[-1673771.9 -1465768.2]
90
0.585
[-1244909.1 -1222297.8]
[-1667201.4 -1465768.2]
100
0.586
[-1244119.1 -1222297.8]
[-1661960.1 -1465768.2]
110
0.584
[-1241184.  -1222297.8]
[-1656919.6 -1465768.2]
120
0.584
[-1237183.8 -1222297.8]
[-1655418.1 -1465768.2]
130
0.583
[-1236095.8 -1222297.8]
[-1650821.4 -1465768.2]
140
0.581
[-1232744.3 -1222297.8]
[-1648983.7 -1465768.2]
150
0.581
[-1231408.2 -1222297.8]
[-1645114.4 -1465768.2]
160
0.581
[-1230107.4 -1222297.8]
[-1642654.5 -1465768.2]
170
0.58
[-1229269.5 -12222

1410
0.574
[-1209047.6 -1222297.8]
[-1604677.6 -1465768.2]
1420
0.575
[-1209827.7 -1222297.8]
[-1604557.  -1465768.2]
1430
0.575
[-1210639.7 -1222297.8]
[-1604013.1 -1465768.2]
1440
0.572
[-1207649.8 -1222297.8]
[-1602317.1 -1465768.2]
1450
0.574
[-1208589.1 -1222297.8]
[-1606564.7 -1465768.2]
1460
0.574
[-1208705.8 -1222297.8]
[-1605675.4 -1465768.2]
1470
0.574
[-1208440.5 -1222297.8]
[-1605456.1 -1465768.2]
1480
0.574
[-1208872.8 -1222297.8]
[-1605642.2 -1465768.2]
1490
0.573
[-1207969.1 -1222297.8]
[-1603552.7 -1465768.2]
1500
0.573
[-1206812.8 -1222297.8]
[-1606885.4 -1465768.2]
1510
0.573
[-1207134.  -1222297.8]
[-1607959.5 -1465768.2]
1520
0.575
[-1209772.7 -1222297.8]
[-1606348.2 -1465768.2]
1530
0.574
[-1208889.3 -1222297.8]
[-1604316.9 -1465768.2]
1540
0.574
[-1209763.8 -1222297.8]
[-1602765.9 -1465768.2]
1550
0.573
[-1208099.  -1222297.8]
[-1606336.3 -1465768.2]
1560
0.572
[-1208243.2 -1222297.8]
[-1606608.  -1465768.2]
1570
0.573
[-1207982.  -1222297.8]
[-1608429.9 -1465768.