In [1]:
# 可変基底非負値テンソル分解
# ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy.linalg
import itertools
from scipy import sparse
from scipy.stats import norm
from numpy.random import *
from scipy import optimize

#np.random.seed(98537)

In [2]:
# Draw categorical (single-trial multinomial) samples, one per row of a probability matrix
def rmnom(pr, n, k, pattern):
    """Sample one category index per row of `pr` by inverse-CDF sampling.

    Parameters
    ----------
    pr : 2-D array whose rows hold category probabilities (cumulated along axis 1).
    n : number of rows to sample; one uniform draw is generated per row.
    k : number of categories, used only to size the one-hot matrix.
    pattern : when equal to 1, the one-hot encoding is returned as well.

    Returns
    -------
    z_id : int array of length n with the sampled category index of each row.
    (z_id, Z) : when pattern == 1, additionally the (n, k) one-hot matrix.
    """
    # The sampled category is the first column whose cumulative probability
    # reaches the row's uniform draw.
    cumulative = np.cumsum(pr, axis=1)
    draws = np.random.uniform(0, 1, n)[:, np.newaxis]
    z_id = np.array(np.argmax(cumulative >= draws, axis=1), dtype="int")
    if pattern != 1:
        return z_id
    # Pick rows of the identity matrix to obtain the one-hot encoding.
    Z = np.eye(k, dtype="int")[z_id, ]
    return z_id, Z

# Sample from a Poisson distribution truncated to the integers a+1, ..., b
def rtpois(mu, a, b, n, flag=0):
    """Draw `n` truncated-Poisson variates by inverse-CDF sampling.

    Parameters
    ----------
    mu : Poisson mean(s); scalar or array broadcastable against `n` draws.
    a : exclusive lower bound (values <= a are excluded).
    b : inclusive upper bound (np.inf for no upper truncation).
    n : number of uniform draws to generate.
    flag : when equal to 1 the result is returned as a torch.Tensor.

    Returns
    -------
    Integer array of draws (or a torch.Tensor when flag == 1).
    """
    FA = scipy.stats.poisson.cdf(a, mu)
    FB = scipy.stats.poisson.cdf(b, mu)
    # Map a uniform draw into the CDF interval [FA, FB) and invert it.
    quantile = np.random.uniform(0, 1, n) * (FB - FA) + FA
    draws = scipy.stats.poisson.ppf(quantile, mu)
    # Floating-point rounding can push the quantile onto FA (giving the
    # excluded value a) or onto 1.0 (giving +inf, which cannot be cast to
    # int). Both cases collapse to the smallest admissible value a + 1.
    lower = a + 1
    draws = np.where(np.isposinf(draws), lower, np.maximum(draws, lower))
    x = np.array(draws, dtype="int")
    if flag==1:
        # torch is not imported at file level, so import it only when needed.
        import torch
        x = torch.Tensor(x)
    return x

# Faster truncated-Poisson sampler: plain Poisson draws, redrawing only the rejected ones
def rtpois_lw(mu, a, b, N, flag=0):
    """Draw `N` variates from a Poisson truncated to a+1, ..., b.

    Ordinary Poisson draws are generated first; only the draws that fall
    outside the truncation interval are replaced by inverse-CDF draws from
    `rtpois`. For a = 0 and b = inf this reduces to redrawing the zeros.

    Parameters
    ----------
    mu : Poisson mean(s); a scalar or an array broadcastable to length N.
    a : exclusive lower bound.
    b : inclusive upper bound (np.inf for no upper truncation).
    N : number of draws.
    flag : when equal to 1 the result is returned as a torch.Tensor.

    Returns
    -------
    Integer array of length N (or a torch.Tensor when flag == 1).
    """
    # Broadcast so that a scalar mean can be indexed like a per-draw array.
    mu = np.broadcast_to(np.asarray(mu, dtype="float"), (N,))
    x = np.random.poisson(mu, N)
    # Positions whose draw violates the truncation bounds.
    index = np.where((x <= a) | (x > b))[0]
    if index.shape[0] > 0:
        x[index] = rtpois(mu[index], a, b, index.shape[0], 0)
    if flag==1:
        # torch is not imported at file level, so import it only when needed.
        import torch
        x = torch.Tensor(x)
    return x

# Sample from a gamma distribution
def Gamma(w1, w2, flag=0):
    """Draw gamma variates with shape `w1` and scale `w2`.

    Parameters
    ----------
    w1 : shape parameter(s), passed to np.random.gamma as `shape`.
    w2 : scale parameter(s), passed to np.random.gamma as `scale`.
    flag : when equal to 1 the result is returned as a torch.Tensor.

    Returns
    -------
    The draws, with the broadcast shape of `w1` and `w2`.
    """
    x = np.random.gamma(w1, w2)
    if flag==1:
        # torch is not imported at file level, so import it only when needed.
        import torch
        x = torch.Tensor(x)
    return x

# データの生成 

In [3]:
# Data dimensions and calendar (date id) setup
# Problem sizes
k = 10                      # number of latent factors per parameter row
k_vec = np.repeat(1.0, k)   # ones vector; np.dot(X, k_vec) sums X over the k factors
mode = 3                    # number of date granularities that get their own factor matrix
m = 3                       # number of years covered by the calendar
hh = 3000                   # number of users (rows of theta_u)
location = 2000             # number of locations (rows of theta_v)
week = 7                    # days per week
month = 31                  # not referenced elsewhere in the visible notebook
year = 12                   # months per year
max_week = 5                # maximum number of (partial) weeks within a month
month_days = np.array([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
year_days = np.sum(month_days)
Lambda = np.random.gamma(30.0, 1/0.2, hh)   # per-user expected number of observations
pt = np.random.poisson(Lambda, hh)          # number of observations per user
N = np.sum(pt)                              # total number of observations

# Calendar: one entry per day over m years
day_dt = np.tile(np.arange(week), int((year_days*m)/week) + 1)[:m*year_days]
# Week-of-month index: days 1-7 -> 0, 8-14 -> 1, ..., cut at each month's length
week_in_year = np.concatenate([np.repeat(np.arange(max_week), week)[:month_days[j]] for j in range(year)])
week_dt = np.tile(week_in_year, m)
month_dt = np.tile(np.repeat(np.arange(year), month_days), m)
year_dt = np.repeat(np.arange(m), year_days)
date_dt = [day_dt, week_dt, month_dt, year_dt]
date_n = np.array([week, max_week, year, m])
# Number of calendar days per week-of-month / month, indexed by the id itself.
# (value_counts() orders by frequency, which would misalign counts and ids.)
week_freq = np.bincount(week_dt, minlength=max_week)
month_freq = np.bincount(month_dt, minlength=year)

In [4]:
# Build observation-level ids and per-user index lists
# User id of every observation, and its sequence number within the user
d_id = np.repeat(np.arange(hh), pt)
pt_id = np.array(list(itertools.chain.from_iterable(np.arange(n_obs, dtype="int") for n_obs in pt)))

# Row positions of the observations that belong to each user
d_list = [np.array(np.where(d_id == user)[0], dtype="int") for user in range(hh)]

In [5]:
# Generate a location for every observation
# Topic proportions per user, location distribution per topic, and one topic per observation
topic = 30
theta_topic = np.random.dirichlet(np.repeat(0.2, topic), hh)
phi_location = np.random.dirichlet(np.repeat(0.2, location), topic)
z = rmnom(theta_topic[d_id, ], N, topic, 0)

# Draw each observation's location from the distribution of its topic, one user at a time
location_id = np.zeros(N, dtype="int")
for user, rows in enumerate(d_list):
    location_id[rows] = rmnom(phi_location[z[rows], ], pt[user], location, 0)

# Row positions and observation counts per location
location_list = [np.array(np.where(location_id == loc)[0], dtype="int") for loc in range(location)]
location_n = np.array([rows.shape[0] for rows in location_list], dtype="int")

In [6]:
# Generate the date ids of every observation
# Per-topic distributions over day-of-week, week-of-month, month and year.
# The week and month distributions are weighted by the number of calendar
# days in each category (week_freq / month_freq) and then renormalised.
phi_day = np.random.dirichlet(np.repeat(0.2, week), topic)
phi_week = week_freq * np.random.dirichlet(np.repeat(0.2, max_week), topic)
phi_week = phi_week / np.sum(phi_week, axis=1)[:, np.newaxis]
phi_month = month_freq * np.random.dirichlet(np.repeat(0.2, year), topic)
# Normalise phi_month by its own row sums (the numerator must be phi_month,
# not phi_week, or the result has max_week columns and rows that do not sum to 1).
phi_month = phi_month / np.sum(phi_month, axis=1)[:, np.newaxis]
phi_year = np.random.dirichlet(np.repeat(2.5, m), topic)

# Draw a fresh topic per observation, then the date ids from that topic
z = rmnom(theta_topic[d_id, ], N, topic, 0)
year_id = np.repeat(0, N)
month_id = np.repeat(0, N)
week_id = np.repeat(0, N)
day_id = np.repeat(0, N)
for i in range(hh):
    index = d_list[i]
    year_id[index] = rmnom(phi_year[z[index], ], pt[i], m, 0)
    month_id[index] = rmnom(phi_month[z[index], ], pt[i], year, 0)
    week_id[index] = rmnom(phi_week[z[index], ], pt[i], max_week, 0)
    day_id[index] = rmnom(phi_day[z[index], ], pt[i], week, 0)
# Columns are ordered day, week, month, year, matching date_n
date_id = np.array([day_id, week_id, month_id, year_id]).T

# Row positions and observation counts for each value of every date id
year_list = [i for i in range(m)]
year_n = np.repeat(0, m)
for i in range(m):
    year_list[i] = np.array(np.where(year_id==i)[0], dtype="int")
    year_n[i] = year_list[i].shape[0]

month_list = [i for i in range(year)]
month_n = np.repeat(0, year)
for i in range(year):
    month_list[i] = np.array(np.where(month_id==i)[0], dtype="int")
    month_n[i] = month_list[i].shape[0]

week_list = [i for i in range(max_week)]
week_n = np.repeat(0, max_week)
for i in range(max_week):
    week_list[i] = np.array(np.where(week_id==i)[0], dtype="int")
    week_n[i] = week_list[i].shape[0]

day_list = [i for i in range(week)]
day_n = np.repeat(0, week)
for i in range(week):
    day_list[i] = np.array(np.where(day_id==i)[0], dtype="int")
    day_n[i] = day_list[i].shape[0]

In [7]:
# Generate the true model parameters
# Hyper-parameters used for the simulation
gamma = np.array([3.0, 1.0, 0.25, 0.5])
alpha1, alpha2 = 0.35, 0.2
beta1, beta2 = 0.75, 0.3

# Mixing proportions per user and gamma-distributed factor matrices
omega = np.random.dirichlet(gamma, hh)
theta_u = np.random.gamma(alpha1, 1/beta1, (hh, k))
theta_v = np.random.gamma(alpha1, 1/beta1, (location, k))
# One factor matrix per date granularity, plus a vector of ones for the last component
theta_d = [np.random.gamma(alpha2, 1/beta2, (date_n[j], k)) for j in range(mode)]
theta_d.append(np.repeat(1.0, k))

# Keep copies of the generating values (theta_d is copied shallowly: the list is new, the arrays are shared)
omegat = omega.copy()
thetat_u, thetat_v, thetat_d = theta_u.copy(), theta_v.copy(), theta_d.copy()

In [8]:
# Generate the training responses
# Location factor of each observation, multiplied by the date factor of
# component j for the date-dependent components (j < mode). The last
# component (j == mode) uses the location factor alone. The branch must test
# the component index j; testing k (the number of factors) never selects the
# date factors.
joint_theta = [j for j in range(mode+1)]
for j in range(mode+1):
    if j < mode:
        joint_theta[j] = theta_v[location_id, ] * theta_d[j][date_id[:, j], ]
    else:
        joint_theta[j] = theta_v[location_id, ]

# Expected value of every observation under its sampled component
Z = rmnom(omega[d_id, ], N, mode+1, 1)[1]
uv = np.zeros((N, mode+1))
for j in range(mode+1):
    uv[:, j] = np.dot(theta_u[d_id, ] * joint_theta[j], k_vec)
mu = np.sum(Z * uv, axis=1)

# x: zero-truncated Poisson draw; y: x masked by a Bernoulli(1 - exp(-mu)) indicator
x = rtpois(mu, 0, np.inf, N, flag=0)
y = x * np.random.binomial(1, 1 - np.exp(-mu), N)

# テストデータの生成

In [None]:
# Test-data sizes
# Observations per user (drawn with the same per-user rates as the training data) and their total
pt0 = np.random.poisson(lam=Lambda, size=hh)
N0 = pt0.sum()

In [None]:
# Build observation-level ids and per-user index lists for the test data
# User id of every test observation, and its sequence number within the user
d_id0 = np.repeat(np.arange(hh), pt0)
pt_id0 = np.array(list(itertools.chain.from_iterable(np.arange(n_obs, dtype="int") for n_obs in pt0)))

# Row positions of the test observations that belong to each user
d_list0 = [np.array(np.where(d_id0 == user)[0], dtype="int") for user in range(hh)]

In [None]:
# Generate a location for every test observation
# One topic per test observation, drawn from the user's topic proportions
topic = 30
z = rmnom(theta_topic[d_id0, ], N0, topic, 0)

# Draw each observation's location from the distribution of its topic, one user at a time
location_id0 = np.zeros(N0, dtype="int")
for user, rows in enumerate(d_list0):
    location_id0[rows] = rmnom(phi_location[z[rows], ], pt0[user], location, 0)

# Row positions and observation counts per location
location_list0 = [np.array(np.where(location_id0 == loc)[0], dtype="int") for loc in range(location)]
location_n0 = np.array([rows.shape[0] for rows in location_list0], dtype="int")

In [None]:
# Generate the date ids of every test observation
# Draw a fresh topic per observation, then the date ids from that topic
z = rmnom(theta_topic[d_id0, ], N0, topic, 0)
year_id0 = np.zeros(N0, dtype="int")
month_id0 = np.zeros(N0, dtype="int")
week_id0 = np.zeros(N0, dtype="int")
day_id0 = np.zeros(N0, dtype="int")
for user, rows in enumerate(d_list0):
    topics = z[rows]
    n_obs = pt0[user]
    year_id0[rows] = rmnom(phi_year[topics, ], n_obs, m, 0)
    month_id0[rows] = rmnom(phi_month[topics, ], n_obs, year, 0)
    week_id0[rows] = rmnom(phi_week[topics, ], n_obs, max_week, 0)
    day_id0[rows] = rmnom(phi_day[topics, ], n_obs, week, 0)
# Columns are ordered day, week, month, year
date_id0 = np.vstack((day_id0, week_id0, month_id0, year_id0)).T

# Row positions and observation counts for each value of every date id
year_list0 = [np.array(np.where(year_id0 == value)[0], dtype="int") for value in range(m)]
year_n0 = np.array([rows.shape[0] for rows in year_list0], dtype="int")

month_list0 = [np.array(np.where(month_id0 == value)[0], dtype="int") for value in range(year)]
month_n0 = np.array([rows.shape[0] for rows in month_list0], dtype="int")

week_list0 = [np.array(np.where(week_id0 == value)[0], dtype="int") for value in range(max_week)]
week_n0 = np.array([rows.shape[0] for rows in week_list0], dtype="int")

day_list0 = [np.array(np.where(day_id0 == value)[0], dtype="int") for value in range(week)]
day_n0 = np.array([rows.shape[0] for rows in day_list0], dtype="int")

In [None]:
# Generate the test responses
# Location factor of each test observation, multiplied by the date factor of
# component j for the date-dependent components (j < mode). The last
# component (j == mode) uses the location factor alone. The branch must test
# the component index j; testing k (the number of factors) never selects the
# date factors.
joint_theta = [j for j in range(mode+1)]
for j in range(mode+1):
    if j < mode:
        joint_theta[j] = theta_v[location_id0, ] * theta_d[j][date_id0[:, j], ]
    else:
        joint_theta[j] = theta_v[location_id0, ]

# Expected value of every test observation under its sampled component
Z0 = rmnom(omega[d_id0, ], N0, mode+1, 1)[1]
uv = np.zeros((N0, mode+1))
for j in range(mode+1):
    uv[:, j] = np.dot(theta_u[d_id0, ] * joint_theta[j], k_vec)
mu = np.sum(Z0 * uv, axis=1)

# Test responses are drawn from an ordinary (non-truncated) Poisson distribution
y0 = np.random.poisson(mu, N0)

# 可変基底非負値テンソル分解のパラメータを推定

In [None]:
# Sampler settings
# R: number of Gibbs iterations; keep: thinning interval; disp: print interval
R, keep, disp = 2000, 2, 10
# 500 iterations expressed in units of the thinning interval
burnin = 500 // keep
# NOTE(review): `iter` shadows the Python builtin and is not referenced in the visible cells
iter = 0

In [None]:
# Prior hyper-parameters used by the sampler
# gamma has one entry per mixture component (mode + 1 of them)
gamma = np.array([1.5, 1.0, 0.5, 0.5])
# (shape, rate) pairs added to the gamma posteriors of theta_u, theta_v and theta_d respectively
alpha1, beta1 = 0.1, 0.1
alpha2, beta2 = 0.1, 0.1
alpha3, beta3 = 0.1, 1.0

In [None]:
# Start from the true parameter values
# Restore the generating values of the model parameters
omega = omegat.copy()
theta_u, theta_v = thetat_u.copy(), thetat_v.copy()
theta_d = thetat_d.copy()

# True component assignment: one-hot matrix and the corresponding component index
Zi = Z.copy()
z_vec = Zi @ np.arange(mode+1)

In [None]:
# Initial values for the sampler
# Random starting point for the model parameters (overrides the true values set above)
omega = np.random.dirichlet(gamma, hh)
theta_u = np.random.gamma(0.5, 1/1.0, (hh, k))
theta_v = np.random.gamma(0.5, 1/1.0, (location, k))
# One factor matrix per date granularity, plus a vector of ones for the last component
theta_d = [np.random.gamma(0.5, 1/1.0, (date_n[j], k)) for j in range(mode)]
theta_d.append(np.repeat(1.0, k))

# Initial component assignment drawn from omega, stored compactly as int8
res = rmnom(omega[d_id, ], N, mode+1, 1)
z_vec = res[0].astype("int8")
Zi = res[1].astype("int8")

In [None]:
# Storage for the sampling results
# One slice along the last axis per retained draw (R / keep of them)
OMEGA = np.zeros((hh, mode+1, R // keep))
THETA_U = np.zeros((hh, k, R // keep))
THETA_V = np.zeros((location, k, R // keep))
THETA_D1, THETA_D2, THETA_D3 = (np.zeros((date_n[j], k, R // keep)) for j in range(3))

# Accumulators for the latent variables s and the component assignments
S = np.zeros(N)
SEG = np.zeros((N, mode+1), dtype="int")

In [None]:
# Reference log-likelihoods
# Test data under a one-parameter Poisson model (common mean)
LLst0 = np.sum(scipy.stats.poisson.logpmf(y0, np.mean(y0)))
print(LLst0)

# Training data at the true parameter values.
# Date factors apply to the date-dependent components only (j < mode); the
# branch must test the component index j, not the number of factors k.
joint_theta = [j for j in range(mode+1)]
uv = np.zeros((N, mode+1))
for j in range(mode+1):
    if j < mode:
        joint_theta[j] = thetat_v[location_id, ] * thetat_d[j][date_id[:, j], ]
    else:
        joint_theta[j] = thetat_v[location_id, ]
    uv[:, j] = np.dot(thetat_u[d_id, ] * joint_theta[j], k_vec)
mu = np.sum(Z * uv, axis=1)
# NOTE(review): this redraws the Bernoulli mask and therefore replaces the
# training response y generated earlier - confirm this is intended.
y = x * np.random.binomial(1, 1 - np.exp(-mu), N)
# NOTE(review): evaluated on x with an untruncated Poisson pmf, whereas the
# one-parameter baseline below is evaluated on y - confirm which is intended.
LLbest = np.sum(scipy.stats.poisson.logpmf(x, mu))
print(LLbest)

# Training data under a one-parameter Poisson model (common mean)
LLst = np.sum(scipy.stats.poisson.logpmf(y, np.mean(y)))
print(LLst)

# Test data at the true parameter values
joint_theta = [j for j in range(mode+1)]
uv = np.zeros((N0, mode+1))
for j in range(mode+1):
    if j < mode:
        joint_theta[j] = thetat_v[location_id0, ] * thetat_d[j][date_id0[:, j], ]
    else:
        joint_theta[j] = thetat_v[location_id0, ]
    uv[:, j] = np.dot(thetat_u[d_id0, ] * joint_theta[j], k_vec)
mu = np.sum(Z0 * uv, axis=1)
LLbest0 = np.sum(scipy.stats.poisson.logpmf(y0, mu))
print(LLbest0)

In [None]:
# Sample the parameters by Gibbs sampling
#
# Changes relative to the previous version of this cell, which could not run:
#   * names that are not defined anywhere in the notebook (d_id1, location_id1,
#     date_id1, N1, y_new, y1, d_vec, location_vec, date_list) are replaced by
#     the training-data objects built in the earlier cells;
#   * component rates are computed from the CURRENT parameters and assignment
#     (not from the true values thetat_* / Z) and before their first use;
#   * rmnom is called with its actual four-argument signature and its dense
#     one-hot result is used directly;
#   * the mixing proportions omega are resampled (they were stored but never
#     updated);
#   * the burn-in test is applied to the retained-draw counter, the unit in
#     which burnin is expressed.
def _component_rates(theta_u_rows, theta_v_rows, theta_d, date_ids, n_mode):
    """Return the Poisson rate of every observation under every component.

    theta_u_rows / theta_v_rows are the (n, k) user and location factors
    already expanded to observation level, theta_d is the list of date factor
    matrices and date_ids the (n, >= n_mode) matrix of date ids.

    Returns (rates, terms): rates is (n, n_mode + 1); terms[j] is the (n, k)
    factor-wise contribution of component j, with terms[j].sum(axis=1) equal
    to rates[:, j].
    """
    joint_uv = theta_u_rows * theta_v_rows
    terms = []
    for j in range(n_mode + 1):
        if j < n_mode:
            terms.append(joint_uv * theta_d[j][date_ids[:, j], ])
        else:
            # The last component carries no date factor.
            terms.append(joint_uv.copy())
    rates = np.stack([np.sum(term, axis=1) for term in terms], axis=1)
    return rates, terms


# Index lists per date granularity, ordered like the columns of date_id and the entries of theta_d
date_list = [day_list, week_list, month_list]

for rp in range(R):

    # Rates under the current parameters
    mu, mu_deploy = _component_rates(theta_u[d_id, ], theta_v[location_id, ], theta_d, date_id, mode)

    # Sample the latent indicator s
    # Success probability given the currently assigned component
    Prob = 1 - np.exp(-np.sum(Zi * mu, axis=1))

    # Build the working response from s
    s = np.random.binomial(1, Prob, N)
    y = s * x

    # Sample the component assignment
    # Posterior assignment probabilities, normalised in log space to avoid 0/0 on underflow
    log_post = np.log(omega[d_id, ]) + scipy.stats.poisson.logpmf(y[:, np.newaxis], mu)
    log_post = log_post - np.max(log_post, axis=1)[:, np.newaxis]
    Prob2 = np.exp(log_post)
    Prob2 = Prob2 / np.sum(Prob2, axis=1)[:, np.newaxis]

    # Draw the assignment from the categorical distribution
    res = rmnom(Prob2, N, mode+1, 1)
    z_vec = np.array(res[0], dtype="int8")
    Zi = np.array(res[1], dtype="int8")

    # Sample the mixing proportions: Dirichlet(counts + gamma) via normalised gamma draws
    z_count = np.zeros((hh, mode+1))
    for i in range(hh):
        z_count[i, ] = np.sum(Zi[d_list[i], ], axis=0)
    omega_raw = np.random.gamma(z_count + gamma, 1.0)
    omega = omega_raw / np.sum(omega_raw, axis=1)[:, np.newaxis]


    # Sample the user factors
    # Auxiliary allocation delta of each count over the k factors
    theta_location = theta_v[location_id, ]
    # Date factor of the assigned component (ones for the component without a date factor)
    theta_date = np.zeros((N, k)); theta_date[Zi[:, mode]==1, ] = 1.0
    lambda_deploy = np.zeros((N, k))
    for j in range(mode+1):
        z_target = Zi[:, j][:, np.newaxis]
        lambda_deploy += mu_deploy[j] * z_target
        if j < mode:
            theta_date += theta_d[j][date_id[:, j], ] * z_target
    delta = lambda_deploy / np.sum(Zi * mu, axis=1)[:, np.newaxis]

    # Gamma posterior parameters: allocated counts and exposure summed per user
    delta_y = delta * y[:, np.newaxis]
    W1 = np.zeros((hh, k)); W2 = np.zeros((hh, k))
    for i in range(hh):
        index = d_list[i]
        W1[i, ] = np.sum(delta_y[index, ], axis=0)
        W2[i, ] = np.sum(theta_location[index, ] * theta_date[index, ], axis=0)

    # Draw from the gamma posterior
    theta_u = np.random.gamma(W1+alpha1, 1/(W2+beta1))
    theta_user = theta_u[d_id, ]


    # Sample the location factors
    # Refresh delta with the new user factors
    lambda_deploy = theta_user * theta_location * theta_date
    delta = lambda_deploy / np.sum(lambda_deploy, axis=1)[:, np.newaxis]

    # Gamma posterior parameters summed per location
    delta_y = delta * y[:, np.newaxis]
    H1 = np.zeros((location, k)); H2 = np.zeros((location, k))
    for i in range(location):
        index = location_list[i]
        H1[i, ] = np.sum(delta_y[index, ], axis=0)
        H2[i, ] = np.sum(theta_user[index, ] * theta_date[index, ], axis=0)

    # Draw from the gamma posterior
    theta_v = np.random.gamma(H1+alpha2, 1/(H2+beta2))
    theta_location = theta_v[location_id, ]


    # Sample the date factors
    # Refresh delta with the new location factors
    lambda_deploy = theta_user * theta_location * theta_date
    delta = lambda_deploy / np.sum(lambda_deploy, axis=1)[:, np.newaxis]

    # Gamma posterior parameters: only observations assigned to component i contribute (weight Zi[:, i])
    delta_y = delta * y[:, np.newaxis]
    for i in range(mode):
        C1 = np.zeros((date_n[i], k)); C2 = np.zeros((date_n[i], k))
        for j in range(date_n[i]):
            index = date_list[i][j]
            C1[j, ] = np.dot(delta_y[index, ].T, Zi[index, i])
            C2[j, ] = np.dot((theta_user[index, ] * theta_location[index, ]).T, Zi[index, i])

        # Draw from the gamma posterior
        theta_d[i] = np.random.gamma(C1+alpha3, 1/(C2+beta3))


    # Store and report the results
    # Store every keep-th draw
    if rp%keep==0:
        mkeep = rp//keep
        OMEGA[:, :, mkeep] = omega
        THETA_U[:, :, mkeep] = theta_u
        THETA_V[:, :, mkeep] = theta_v
        THETA_D1[:, :, mkeep] = theta_d[0]
        THETA_D2[:, :, mkeep] = theta_d[1]
        THETA_D3[:, :, mkeep] = theta_d[2]

        # Accumulate the latent variables after burn-in (burnin counts retained draws)
        if mkeep >= burnin:
            S += s
            SEG += Zi

    if rp%disp==0:
        # Mixture log-likelihood of the training data under the updated parameters
        mu_train = _component_rates(theta_u[d_id, ], theta_v[location_id, ], theta_d, date_id, mode)[0]
        LL = np.sum(np.log(np.sum(omega[d_id, ] * scipy.stats.poisson.pmf(y[:, np.newaxis], mu_train), axis=1)))

        # Mixture log-likelihood of the test data
        mu0 = _component_rates(theta_u[d_id0, ], theta_v[location_id0, ], theta_d, date_id0, mode)[0]
        LL0 = np.sum(np.log(np.sum(omega[d_id0, ] * scipy.stats.poisson.pmf(y0[:, np.newaxis], mu0), axis=1)))

        # Print the iteration number and the log-likelihoods next to their reference values
        print(rp)
        print(np.round([LL, LLbest, LLst], 1))
        print(np.round([LL0, LLbest0, LLst0], 1))