In [8]:
#####Bayesian Personalized Ranking#####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy.linalg
import itertools
from scipy import sparse
from scipy.stats import norm
from pandas.tools.plotting import scatter_matrix
from numpy.random import *
from scipy import optimize

In [9]:
##多項分布の乱数を生成する関数
def rmnom(pr, n, k, no, pattern):
    """Draw one category per row of `pr` by inverting the cumulative distribution.

    For every row, a uniform(0, 1) number is drawn and the first column whose
    cumulative probability reaches that number is selected. If no column
    reaches it, `argmax` falls back to column 0.

    Parameters
    ----------
    pr : 2-D array of per-row category probabilities (cumulated along axis 1).
    n : number of rows to draw for (size of the uniform sample).
    k : number of categories; only used as the column count of the sparse matrix.
    no : row indices used to place the ones in the sparse indicator matrix.
    pattern : when equal to 1 the sparse indicator matrix is returned as well.

    Returns
    -------
    z_id when pattern != 1, otherwise the tuple (z_id, Z), where Z is an
    (n, k) scipy COO matrix holding a 1 at (no[i], z_id[i]).
    """
    cum_pr = np.cumsum(pr, axis=1)
    u = np.random.uniform(0, 1, n).reshape(-1, 1)
    z_id = (cum_pr >= u).argmax(axis=1)
    if pattern != 1:
        return z_id

    # Sparse one-hot encoding of the sampled categories
    ones = np.ones(n, dtype=int)
    Z = sparse.coo_matrix((ones, (no, np.array(z_id))), shape=(n, k))
    return z_id, Z

In [10]:
####Data generation####
##Simulation settings
k = 10        # width of the latent vectors theta_u / theta_v
f = 150       # number of rows of the item feature matrix F
hh = 4000     # number of users
item = 2500   # number of items
# Per-user rate, then the Poisson count drawn from it (length-hh vectors)
Lambda = np.random.gamma(shape=30, scale=1/0.3, size=hh)
pt = np.random.poisson(lam=Lambda, size=hh)
hhpt = pt.sum()       # total count over all users
k_vec = np.ones(k)    # vector of ones, used to sum over the k latent dimensions

In [11]:
##IDs and indices
#IDs: one row per (user, occasion) pair
no = np.arange(hhpt)
d_id = np.repeat(np.arange(hh), pt)
# Within-user counter 0..pt[i]-1: global row number minus the row at which
# the user's block of rows starts.
occasion_start = np.cumsum(pt) - pt
pt_id = no - np.repeat(occasion_start, pt)

#Indices: d_list[i] holds the rows of `no` that belong to user i
# d_id is sorted by construction (np.repeat over np.arange), so every user's
# rows are contiguous. One split at the cumulative counts replaces a full
# scan of d_id per user.
d_list = [rows.copy() for rows in np.split(no, np.cumsum(pt)[:-1])]

In [12]:
##Generate the item sets
#Topic-level parameters
topic = 50
# One Poisson rate per topic (used for the item-set size)
Gamma = np.random.gamma(shape=5.5, scale=1/0.7, size=topic)
# Item distribution of every topic: (topic, item), rows sum to 1
phi_topic = np.random.dirichlet(alpha=np.full(item, 0.1), size=topic)
# Topic distribution of every user: (hh, topic), rows sum to 1
theta_topic = np.random.dirichlet(alpha=np.full(topic, 0.3), size=hh)

#Generate the item sets from multinomial draws
max_item = 25   # upper bound on the number of items in one set
set_list = [i for i in range(hhpt)]   # placeholder list; entry j is replaced by the item ids of set j
set_dt = np.array(np.full((hhpt, max_item), item), dtype="int16")   # padded table of sets; unused slots keep the value `item`
item_no = np.arange(item)
m = np.repeat(0, hhpt)   # number of items in every set
for i in range(hh):
    # Progress output every 1000 users
    if i%1000==0:
        print(i)
    target_no = no[d_list[i]]   # rows (sets) of user i
    # One topic index per set: one-hot multinomial draws dotted with 0..topic-1
    z = np.dot(np.random.multinomial(1, theta_topic[i, ], pt[i]), np.arange(topic))
    # Set sizes: Poisson with the topic's rate, clipped to the range [2, max_item]
    m_vec = np.random.poisson(Gamma[z], pt[i])
    m_vec[m_vec < 2] = 2; m_vec[m_vec > max_item] = max_item
    m[d_list[i]] = m_vec
    for j in range(pt[i]):
        # Rejection sampling: redraw the whole set until no item is drawn more than once
        while True:
            sets = np.random.multinomial(1, phi_topic[z[j], ], m_vec[j])   # one one-hot row per drawn item
            if np.max(np.sum(sets, axis=0))==1:
                sets_id = np.dot(sets, item_no)   # one-hot rows -> item ids
                set_list[target_no[j]] = sets_id
                set_dt[target_no[j], np.arange(m_vec[j])] = sets_id
                break
                
#Flatten the per-set lists into long-format vectors (one row per item of a set)
sets_id = np.array(list(itertools.chain(*[set_list[i] for i in range(hhpt)])))   # item id of every row
no_long = np.repeat(np.arange(hhpt), m)   # set number of every row
d_long = np.repeat(d_id, m)   # user id of every row
N = d_long.shape[0]   # total number of rows
No = np.arange(N)

0
1000
2000
3000


In [13]:
#Define the indices
# no_list[i]: long-format rows of set i. Set i owns m[i] consecutive rows, so
# the ranges follow directly from the cumulative sizes. This also copes with
# an empty set, where taking np.max of the previous range would fail.
set_end = np.cumsum(m)
set_start = set_end - m
no_list = [np.arange(set_start[i], set_end[i]) for i in range(hhpt)]

# d_list[i]: long-format rows of user i, in ascending order.
# A stable sort groups the rows by user while keeping their original order,
# which matches np.where(d_long==i)[0] without scanning d_long once per user.
row_order = np.argsort(d_long, kind="mergesort").astype("int", copy=False)
user_count = np.bincount(d_long, minlength=hh)[:hh]
d_list = np.split(row_order, np.cumsum(user_count))[:hh]

In [14]:
##Generate the response variable
#High-dimensional feature matrix: (f, item), absolute values of normal draws, stored as float32
F = np.abs(np.random.normal(0, 0.3, size=(f, item))).astype("float32")

#Model parameters
Sigma = np.array([1.0])                               # standard deviation of the utility noise
beta_u = np.random.normal(0, 0.5, size=hh)            # user intercepts
beta_v = np.random.normal(0, 0.75, size=item)         # item intercepts
theta_u = np.random.normal(0, 0.6, size=(hh, k))      # user latent vectors
theta_v = np.random.normal(0, 0.75, size=(item, k))   # item latent vectors
omega = np.random.normal(0, 0.25, size=(k, f))        # maps the f features to the k latent dimensions
# Keep copies of the generating values under the *t names
betat_u, betat_v = beta_u.copy(), beta_v.copy()
thetat_u, thetat_v, omegat = theta_u.copy(), theta_v.copy(), omega.copy()

#Expected utility of every long-format row (user, candidate item)
# Latent interaction: row-wise inner product of the user and item vectors
uv1 = (theta_u[d_long, ] * theta_v[sets_id, ]).dot(k_vec)
# Feature interaction: item features projected by omega, then the same inner product
uv2 = (theta_u[d_long, ] * omega.dot(F[:, sets_id]).T).dot(k_vec)
mu = beta_u[d_long] + beta_v[sets_id] + uv1 + uv2

#Generate the chosen item: the candidate with the largest noisy utility in each set
U = mu + np.random.normal(0, Sigma, N)
item_id = np.zeros(hhpt, dtype=int)        # chosen item of every set
target_index = np.zeros(hhpt, dtype=int)   # long-format row of the chosen item
for i, index1 in enumerate(no_list):
    index2 = U[index1].argmax()
    item_id[i] = sets_id[index1[index2]]
    target_index[i] = No[index1[index2]]
item_long = item_id.repeat(m)   # chosen item repeated on every row of its set

#Remove the chosen item from every choice set
# Keep only the rows whose candidate differs from the set's chosen item
get_index = np.flatnonzero(item_long != sets_id).astype("int")
n = m - 1   # set sizes after dropping one row per set
no_long = np.arange(hhpt).repeat(n)
# All three long-format vectors are filtered with the same row selection
d_long, item_long, sets_id = d_long[get_index], item_long[get_index], sets_id[get_index]
N = get_index.shape[0]
No = np.arange(N)

#Feature matrices aligned with the remaining rows
F1 = F[:, item_long]   # features of the chosen item
F2 = F[:, sets_id]     # features of the non-chosen candidate

In [15]:
##Define the new indices (after the chosen items were removed)
#User indices
# no_list[i]: rows of set i, which owns n[i] consecutive long-format rows.
set_end = np.cumsum(n)
set_start = set_end - n
no_list = [np.arange(set_start[i], set_end[i]) for i in range(hhpt)]

# d_list[i]: rows of user i in ascending order. A stable sort groups the rows
# by user and keeps their original order, which matches
# np.where(d_long==i)[0] without one full scan of d_long per user.
user_order = np.argsort(d_long, kind="mergesort").astype("int", copy=False)
user_count = np.bincount(d_long, minlength=hh)[:hh]
d_list = np.split(user_order, np.cumsum(user_count))[:hh]

#Item indices
# item_list1[i] / item_n1[i]: rows where item i is the chosen item, and their count.
# item_list2[i] / item_n2[i]: rows where item i is the non-chosen candidate, and their count.
# Same stable-sort grouping as above, replacing two full scans per item.
chosen_order = np.argsort(item_long, kind="mergesort").astype("int", copy=False)
item_n1 = np.bincount(item_long, minlength=item)[:item].astype("int", copy=False)
item_list1 = np.split(chosen_order, np.cumsum(item_n1))[:item]

other_order = np.argsort(sets_id, kind="mergesort").astype("int", copy=False)
item_n2 = np.bincount(sets_id, minlength=item)[:item].astype("int", copy=False)
item_list2 = np.split(other_order, np.cumsum(item_n2))[:item]

In [16]:
####Bayesian Personalized Rankingを推定####
##パラメータ推定のための関数を定義
#対数事後分布の和を定義
def log_posterior(mu, beta_u, beta_v, theta_u, theta_v, omega, tau_u, tau_v, inv_Cov_u, inv_Cov_v, inv_Cov_g, k_vec):
    """Return the (unnormalised) log posterior summed over all observations and parameters.

    The likelihood term is sum(log(sigmoid(mu))). The prior terms are zero-mean
    Gaussian quadratic forms without their normalising constants.

    Parameters
    ----------
    mu : array of logits (utility differences).
    beta_u, beta_v : intercept vectors, penalised with variances tau_u / tau_v.
    theta_u, theta_v : 2-D arrays whose rows are penalised with the precision
        matrices inv_Cov_u / inv_Cov_v.
    omega : 2-D array; the rows of omega.T are penalised with inv_Cov_g.
    tau_u, tau_v : prior variances of the intercepts.
    inv_Cov_u, inv_Cov_v, inv_Cov_g : prior precision matrices.
    k_vec : vector of ones used to sum each quadratic form over its columns.

    Returns
    -------
    Scalar log likelihood plus log prior.
    """
    mu = np.asarray(mu)

    #Log-likelihood of the logit model
    # log(exp(mu) / (1 + exp(mu))) == -log(1 + exp(-mu)). logaddexp evaluates it
    # without overflowing exp(): the naive form gives nan for large positive mu
    # and -inf for large negative mu.
    LLho = -np.sum(np.logaddexp(0, -mu))

    #Log prior of the model parameters
    Prior_u1 = np.sum(-0.5 * (np.power(beta_u, 2) / tau_u))
    Prior_v1 = np.sum(-0.5 * (np.power(beta_v, 2) / tau_v))
    Prior_u2 = np.sum(-0.5 * np.dot(np.dot(theta_u, inv_Cov_u) * theta_u, k_vec))
    Prior_v2 = np.sum(-0.5 * np.dot(np.dot(theta_v, inv_Cov_v) * theta_v, k_vec))
    Prior_g = np.sum(-0.5 * np.dot(np.dot(omega.T, inv_Cov_g) * (omega.T), k_vec))
    Prior = Prior_u1 + Prior_v1 + Prior_u2 + Prior_v2 + Prior_g

    #Sum of the log posterior
    Posterior = LLho + Prior
    return Posterior

In [13]:
##アルゴリズムの設定
#確率的勾配法のパラメータを設定


In [14]:
#Regularisation parameters
# Variances for the user / item intercepts
tau_u = np.array([0.5])
tau_v = np.array([0.5])
# Three separate k x k diagonal covariance matrices (0.3 on the diagonal) and their inverses
Cov_u, Cov_v, Cov_g = (np.diag(np.full(k, 0.3)) for _ in range(3))
inv_Cov_u, inv_Cov_v, inv_Cov_g = (np.linalg.inv(C) for C in (Cov_u, Cov_v, Cov_g))

In [20]:
#Initial values of the model parameters (random normal draws)
beta_u = np.random.normal(0, 0.25, size=hh)
beta_v = np.random.normal(0, 0.25, size=item)
theta_u = np.random.normal(0, 0.25, size=(hh, k))
theta_v = np.random.normal(0, 0.25, size=(item, k))
omega = np.random.normal(0, 0.2, size=(k, f))

In [16]:
#True values of the model parameters
# NOTE(review): in file order this cell follows the random-initial-value cell
# and assigns the same five names, so a top-to-bottom run starts from the
# generating parameters. Confirm that is intended.
beta_u, beta_v = betat_u.copy(), betat_v.copy()
omega = omegat.copy()
theta_u, theta_v = thetat_u.copy(), thetat_v.copy()

In [21]:
#Expected values of the model
# User terms expanded to one entry per long-format row
beta_long = beta_u[d_long]
theta_long = theta_u[d_long, ]
# Latent interaction with the chosen item (11) and the non-chosen candidate (12)
uv11 = (theta_long * theta_v[item_long, ]).dot(k_vec)
uv12 = (theta_long * theta_v[sets_id, ]).dot(k_vec)
# Feature interaction with the chosen item (21) and the non-chosen candidate (22)
uv21 = (theta_long * omega.dot(F1).T).dot(k_vec)
uv22 = (theta_long * omega.dot(F2).T).dot(k_vec)
# Utility of the chosen item, of the non-chosen candidate, and their difference
mu1 = beta_long + beta_v[item_long] + uv11 + uv21
mu2 = beta_long + beta_v[sets_id] + uv12 + uv22
mu = mu1 - mu2

In [22]:
##ユーザーパラメータを更新


-1907050.4660187056

In [None]:
-563157.4814029746

In [None]:
# Histogram of the per-row log-likelihood log(sigmoid(mu)).
# NOTE(review): the original cell read `logit_exp`, which is only ever assigned
# as a local variable inside functions, so it raised NameError on a fresh
# kernel. It is rebuilt here from `mu`, assuming logit_exp = np.exp(mu) as in
# log_posterior. Confirm.
plt.hist(-np.logaddexp(0, -mu));

In [None]:
#対数事後分布の和を計算する関数
def Posterior(y, mu, theta, alpha_mu, inv_Cov, index, n, k, pattern1, pattern2):
    """Log posterior of a binary logit model with a Gaussian prior on `theta`.

    Parameters
    ----------
    y : array of 0/1 responses.
    mu : array of logits, same length as `y`.
    theta : parameter value(s) the prior is evaluated at.
    alpha_mu : prior mean subtracted from `theta`.
    inv_Cov : prior precision; a matrix when pattern1 == 1, otherwise
        multiplied element-wise (scalar / per-element precision).
    index : sequence whose i-th entry selects the observations of group i
        (only read when pattern2 == 1).
    n : number of groups (only read when pattern2 == 1).
    k : number of columns summed over in the quadratic form (pattern1 == 1).
    pattern1 : 1 for the matrix quadratic form, anything else for the
        element-wise form.
    pattern2 : 1 to return one value per group, anything else to return the
        total log-likelihood plus the prior term.

    Returns
    -------
    (LL, LLho) : the log posterior and the per-observation log-likelihood.
    """
    mu = np.asarray(mu)
    y = np.asarray(y)

    #Log-likelihood of the logit model
    # log(p) = -log(1 + exp(-mu)) and log(1 - p) = -log(1 + exp(mu)).
    # logaddexp keeps both finite for large |mu|. The naive exp/log form
    # produced inf/inf = nan and 0 * -inf = nan there.
    log_p1 = -np.logaddexp(0, -mu)
    log_p0 = -np.logaddexp(0, mu)
    LLho = y*log_p1 + (1-y)*log_p0

    #Log prior of the (multivariate) normal distribution, without its constant
    er = theta - alpha_mu   #deviation of the parameters from the prior mean
    if pattern1==1:
        LLi_mvn = -0.5 * np.dot(np.dot(er, inv_Cov) * er, np.ones(k))
    else:
        LLi_mvn = -0.5 * er * inv_Cov * er

    #Sum of the log posterior
    if pattern2==1:
        LL = np.zeros(n)
        for i in range(n):
            LL[i] = np.sum(LLho[index[i], ]) + LLi_mvn[i]
    else:
        LL = np.sum(LLho) + LLi_mvn
    return LL, LLho