In [None]:
#####Semisupervised Attribute Estimation#####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy.linalg
import itertools
from scipy import sparse
from scipy.stats import norm
from pandas.tools.plotting import scatter_matrix
from numpy.random import *
from scipy import optimize

#np.random.seed(98537)

In [None]:
## Draw categorical (single-trial multinomial) samples, one per row of a probability matrix
def rmnom(pr, n, k, pattern):
    """Sample one category index per row of ``pr`` by inverse-CDF lookup.

    Parameters
    ----------
    pr : 2-D array whose rows are probability vectors; the cumulative sum is
        taken along axis 1 and compared against ``n`` uniform draws, so it
        must have ``n`` rows (or broadcast against them).
    n : number of uniform draws, i.e. number of samples.
    k : number of categories; only used to build the one-hot matrix when
        ``pattern == 1``.
    pattern : if 1, return ``(z_id, Z)`` where ``Z`` is the one-hot encoding
        of ``z_id``; for any other value return only the index vector.

    Returns
    -------
    z_id, or the tuple (z_id, Z) when ``pattern == 1``.
    """
    # A single uniform draw per row; the sampled category is the first column
    # whose cumulative probability reaches that draw.
    cdf = np.cumsum(pr, axis=1)
    draws = np.random.uniform(0, 1, n)[:, np.newaxis]
    first_hit = np.argmax(cdf >= draws, axis=1)

    if pattern != 1:
        return first_hit

    z_id = first_hit.astype("int")
    Z = np.eye(k, dtype="int")[z_id]
    return z_id, Z

In [None]:
#### Data generation ####
## Problem dimensions
k1, k2 = 12, 12      # latent dimensions on the user side / place side (omega is k1 x k2)
hh = 5000            # number of users
place = 2500         # number of places

# Per-user rate drawn from Gamma(shape=25, scale=1/0.15), then the number of
# records per user drawn from Poisson(rate).
Lambda = np.random.gamma(25, 1/0.15, hh)
pt = np.random.poisson(Lambda, hh)
hhpt = pt.sum()      # total number of records

# Integer all-ones vectors, used with np.dot to sum over the latent dimension
k_vec1 = np.ones(k1, dtype="int")
k_vec2 = np.ones(k2, dtype="int")

In [None]:
## IDs and index lists
# Record-level IDs
d_id = np.repeat(np.arange(hh), pt)                    # user index of every record
_n_rec = d_id.shape[0]
_starts = np.cumsum(pt) - pt                           # first record position of each user
# Position of each record within its user's run (0, 1, ..., pt[i]-1)
pt_id = np.arange(_n_rec, dtype="int") - np.repeat(_starts, pt)

# Index lists: d_list[i] holds the record positions that belong to user i.
# d_id is non-decreasing (it comes from np.repeat over arange), so each user's
# records form one contiguous run and the list can be cut out in a single pass
# instead of scanning the whole d_id once per user.
d_list = np.split(np.arange(_n_rec, dtype="int"), np.cumsum(pt)[:-1])

In [None]:
## Generate the place assignment of every record
# Draw a topic for every record
topic = 25
phi_place = np.random.dirichlet(np.repeat(0.1, place), topic)      # topic -> place distribution
theta_place = np.random.dirichlet(np.repeat(0.25, topic), hh)      # user -> topic distribution
z = np.array(rmnom(theta_place[d_id, ], hhpt, topic, 0), dtype="int16")

# Draw the place of every record from the distribution of its topic.
# Done user by user so that only a (pt[i], place) probability block is
# materialised at a time.
place_id = np.zeros(hhpt, dtype="int")
for i in range(hh):
    place_id[d_list[i]] = rmnom(phi_place[z[d_list[i]], ], pt[i], place, 0)
place_dt = sparse.coo_matrix((np.repeat(1, hhpt), (place_id, range(hhpt))), shape=(place, hhpt)).tocsr()
place_n = np.array(np.sum(place_dt, axis=1)).reshape(-1)           # number of records per place

# Index lists: place_list[i] holds the (ascending) record positions at place i.
# A stable sort groups the records by place while keeping their original order
# inside each group, which is the same result as np.where(place_id == i)[0]
# but without scanning place_id once per place.
_order = np.argsort(place_id, kind="stable")
_bounds = np.cumsum(np.bincount(place_id, minlength=place))[:-1]
place_list = np.split(_order, _bounds)

In [None]:
#### Generate the response variable ####
# Hyperparameters of the generating distributions (constant across retries)
alpha01, beta01 = 0.25, 0.6
alpha02, beta02 = 0.2, 0.5
alpha03, beta03 = 0.15, 0.5
s1, v1 = 9.5, 10.0

# Redraw parameters and data until the simulated counts look reasonable
rp = 0
while True:
    rp += 1
    print(rp)

    ## Draw the model parameters
    gamma = np.random.beta(s1, v1, place)                              # attribute ratio per place
    theta_u = np.random.gamma(alpha01, 1/beta01, (hh, k1))             # user features
    theta_v1 = np.random.gamma(alpha02, 1/beta02, (place, k2))         # place features, component 1
    theta_v2 = np.random.gamma(alpha02, 1/beta02, (place, k2))         # place features, component 2
    omega = np.random.gamma(alpha03, 1/beta03, (k1, k2))               # bilinear weight matrix

    # Keep copies of the generating ("true") values
    gammat = gamma.copy()
    thetat_u = theta_u.copy()
    thetat_v1 = theta_v1.copy()
    thetat_v2 = theta_v2.copy()
    omegat = omega.copy()

    ## Draw the counts from a Poisson distribution
    # Mean is a gamma-weighted mixture of two bilinear terms
    gamma_vec = gamma[place_id]
    ar = np.dot(theta_u, omega)[d_id, ]
    mu1 = gamma_vec * np.dot(ar * theta_v1[place_id, ], k_vec2)
    mu2 = (1-gamma_vec) * np.dot(ar * theta_v2[place_id, ], k_vec2)
    mu = mu1 + mu2
    mut = mu.copy()
    y = np.random.poisson(mu, hhpt)

    ## Stop once the maximum count and the number of zeros are within bounds
    y_max = np.max(y)
    n_zero = np.sum(y==0)
    if 75 < y_max < 200 and 25000 < n_zero < 200000:
        break

In [None]:
# Randomly choose the places whose attribute ratio is treated as observed
Pr = np.array([0.3])                                   # probability that a place is observed
flag = np.random.binomial(1, Pr, place)
_is_observed = flag == 1
index_flag1 = np.flatnonzero(_is_observed).astype("int")   # places with flag == 1
index_flag0 = np.flatnonzero(flag == 0).astype("int")      # places with flag == 0
m1 = len(index_flag1)
m0 = len(index_flag0)

In [None]:
#### Estimate the semi-supervised attribute model by Markov chain Monte Carlo ####
# Logistic function
def logit(theta):
    """Map log-odds ``theta`` to a probability, i.e. exp(theta) / (1 + exp(theta)).

    Despite its name this is the inverse of the logit transform (the logistic
    sigmoid); the name is kept because callers in this file rely on it.

    The naive formula evaluates to inf/inf = NaN once ``exp(theta)`` overflows,
    so the numerically stable ``scipy.special.expit`` is used instead.

    Parameters
    ----------
    theta : scalar or array of log-odds.

    Returns
    -------
    Probability (or array of probabilities) with the same shape as ``theta``.
    """
    from scipy.special import expit
    return expit(theta)

In [None]:
## Sampler settings
R = 2000                     # number of MCMC iterations
keep = 2                     # thinning interval: store every keep-th draw
burnin = 500 // keep         # burn-in length expressed in stored draws
iters = 0
disp = 10                    # print progress every disp iterations

In [None]:
# Index vectors that unroll the (k1, k2) grid row by row:
# flat position c corresponds to row c // k2 and column c % k2.
allocation_u, allocation_v = np.divmod(np.arange(k1 * k2), k2)
# Row j lists every index in 0..k1-1 except j
allocation_omega = np.array([[c for c in range(k1) if c != j] for j in range(k1)])

In [None]:
# Prior / proposal settings
epsilon = 0.05       # standard deviation of the random-walk proposal for gamma
# Gamma prior (shape, rate) pairs
alpha01, beta01 = 0.2, 0.2
alpha02, beta02 = 0.2, 0.2
alpha03, beta03 = 0.2, 0.2
# Beta prior parameters for gamma
s1 = 25.0
v1 = 25.0

In [None]:
## True parameter values
# Set every model parameter to the value that generated the data.
# (The earlier half-initialisation of gamma with 0.5 was overwritten on the
# very next statement and has been removed as dead code.)
gamma = gammat.copy()
theta_u = thetat_u.copy()
theta_v1 = thetat_v1.copy()
theta_v2 = thetat_v2.copy()
omega = omegat.copy()

# Poisson mean under the true parameters
gamma_vec = gamma[place_id]
ar = np.dot(theta_u, omega)[d_id, ]
mu1 = gamma_vec * np.dot(ar * theta_v1[place_id, ], k_vec2)
mu2 = (1-gamma_vec) * np.dot(ar * theta_v2[place_id, ], k_vec2)
mu = mu1 + mu2
mut = mu.copy()

In [None]:
# Gamma prior (shape, rate) pairs used by the sampler.
# NOTE: these assignments replace the alpha0x/beta0x values set in the
# earlier prior-settings cell when the cells are run top to bottom.
alpha01, beta01 = 0.25, 0.8
alpha02, beta02 = 0.2, 0.7
alpha03, beta03 = 0.15, 0.75

In [None]:
## Initial parameter values
# gamma starts at 0.5 everywhere, except at observed places (flag == 1)
# where it is set to the generating value
gamma = np.full(place, 0.5)
gamma[index_flag1] = gammat[index_flag1]

# Feature matrices and omega start from random gamma draws
theta_u = np.random.gamma(0.25, 1/0.8, (hh, k1))
theta_v1 = np.random.gamma(0.2, 1/0.7, (place, k2))
theta_v2 = np.random.gamma(0.2, 1/0.7, (place, k2))
omega = np.random.gamma(0.15, 1/0.75, (k1, k2))

# Poisson mean at the initial values
gamma_vec = gamma[place_id]
ar = np.dot(theta_u, omega)[d_id, ]
_v1_rec = theta_v1[place_id, ]
_v2_rec = theta_v2[place_id, ]
mu1 = gamma_vec * np.dot(ar * _v1_rec, k_vec2)
mu2 = (1-gamma_vec) * np.dot(ar * _v2_rec, k_vec2)
mu = mu1 + mu2

In [None]:
## Arrays that store the kept draws
_n_draws = int(R/keep)                         # one slot per kept iteration
GAMMA = np.zeros((_n_draws, place))            # draw index on axis 0
THETA_U = np.zeros((hh, k1, _n_draws))         # draw index on the last axis
THETA_V1 = np.zeros((place, k2, _n_draws))
THETA_V2 = np.zeros((place, k2, _n_draws))
OMEGA = np.zeros((k1, k2, _n_draws))

In [None]:
## Reference log-likelihoods
# Single-parameter model: every count shares the sample mean as its Poisson rate
_y_bar = np.mean(y)
LLst = scipy.stats.poisson.logpmf(y, _y_bar).sum()
print(LLst)

# Log-likelihood at the generating (true) Poisson means
LLbest = scipy.stats.poisson.logpmf(y, mut).sum()
print(LLbest)

In [None]:
#### Sample the parameters by Gibbs sampling ####
for rp in range(R):

    ## Sample the attribute ratios gamma with a Metropolis-Hastings step
    # Propose new values by a Gaussian random walk on the log-odds scale.
    # Only places with flag == 0 are proposed; the others keep their value.
    gammad = gamma.copy(); gamman = gamma.copy()
    gamman[index_flag0] = logit(np.log(gammad[index_flag0]/(1-gammad[index_flag0])) + np.random.normal(0, epsilon, m0))

    # Poisson mean under the proposal (mu holds the mean at the current state)
    mu_old = mu.copy()
    mu_new1 = gamman[place_id] * np.dot(ar * theta_v1[place_id, ], k_vec2)
    mu_new2 = (1-gamman[place_id]) * np.dot(ar * theta_v2[place_id, ], k_vec2)
    mu_new = mu_new1 + mu_new2

    # Log-likelihood per record and log-prior per place
    lognew = scipy.stats.poisson.logpmf(y, mu_new)
    logold = scipy.stats.poisson.logpmf(y, mu_old)
    logpnew = scipy.stats.beta.logpdf(gamman, s1, v1)
    logpold = scipy.stats.beta.logpdf(gammad, s1, v1)

    # Acceptance probability for each proposed place.
    # NOTE(review): the proposal is symmetric in log-odds but the Beta prior is
    # evaluated on gamma itself and no Jacobian term log(gamma*(1-gamma)) enters
    # the ratio -- confirm whether that is intended.
    alpha = np.repeat(0.0, m0)
    for i in range(m0):
        j = index_flag0[i]
        index = place_list[j]
        log_ratio = np.sum(lognew[index]) + logpnew[j] - np.sum(logold[index]) - logpold[j]
        # exp(min(0, r)) equals min(1, exp(r)) but cannot overflow;
        # np.minimum propagates NaN so an invalid proposal is still rejected.
        alpha[i] = np.exp(np.minimum(0.0, log_ratio))

    # Accept or reject each proposal
    rand = np.random.uniform(0, 1, m0)
    accept = np.array(alpha > rand, dtype="int")
    gamma[index_flag0] = accept*gamman[index_flag0] + (1-accept)*gammad[index_flag0]
    gamma_vec = gamma[place_id]
    accept_prob = np.mean(accept)


    ## Sample the user feature matrix theta_u
    # Per-record coefficient of each user factor, split by mixture component
    ar1 = gamma_vec[:, np.newaxis] * np.dot(theta_v1, omega.T)[place_id, ]
    ar2 = (1-gamma_vec[:, np.newaxis]) * np.dot(theta_v2, omega.T)[place_id, ]
    mu_deploy = ar1 * theta_u[d_id, ] + ar2 * theta_u[d_id, ]
    mu = np.dot(mu_deploy, k_vec1)

    # Auxiliary variable: share of each user factor in the record mean
    Lambda = mu_deploy / mu[:, np.newaxis]

    # Posterior parameters: shape = prior shape + allocated counts,
    # rate = prior rate + summed coefficients, both accumulated per user
    lambda_y = Lambda * y[:, np.newaxis]
    lambda_h = ar1 + ar2
    W1 = np.zeros((hh, k1)); W2 = np.zeros((hh, k1))
    for i in range(hh):
        W1[i, ] = np.sum(lambda_y[d_list[i], ], axis=0)
        W2[i, ] = np.sum(lambda_h[d_list[i], ], axis=0)
    W1 = W1 + alpha01; W2 = W2 + beta01

    # Draw from the gamma distribution
    theta_u = np.random.gamma(W1.reshape(-1), 1/W2.reshape(-1), hh*k1).reshape(hh, k1)


    ## Sample the place feature matrices theta_v1 and theta_v2
    # Per-record coefficient of each place factor, split by mixture component
    ar = np.dot(theta_u, omega)[d_id, ]
    ar1 = gamma_vec[:, np.newaxis] * ar
    ar2 = (1-gamma_vec[:, np.newaxis]) * ar
    mu_deploy1 = gamma_vec[:, np.newaxis]*ar*theta_v1[place_id, ]
    mu_deploy2 = (1-gamma_vec)[:, np.newaxis]*ar*theta_v2[place_id, ]
    mu = np.dot(mu_deploy1 + mu_deploy2, k_vec2)

    # Auxiliary variables: share of each (component, factor) in the record mean
    Lambda1 = mu_deploy1 / mu[:, np.newaxis]
    Lambda2 = mu_deploy2 / mu[:, np.newaxis]

    # Posterior parameters accumulated per place
    lambda_y1 = Lambda1 * y[:, np.newaxis]
    lambda_y2 = Lambda2 * y[:, np.newaxis]
    W11 = np.zeros((place, k2)); W12 = np.zeros((place, k2))
    W21 = np.zeros((place, k2)); W22 = np.zeros((place, k2))
    for i in range(place):
        W11[i, ] = np.sum(lambda_y1[place_list[i], ], axis=0)
        W12[i, ] = np.sum(lambda_y2[place_list[i], ], axis=0)
        W21[i, ] = np.sum(ar1[place_list[i], ], axis=0)
        W22[i, ] = np.sum(ar2[place_list[i], ], axis=0)
    W11 = W11 + alpha02; W12 = W12 + alpha02; W21 = W21 + beta02; W22 = W22 + beta02

    # Draw from the gamma distribution
    theta_v1 = np.random.gamma(W11.reshape(-1), 1/W21.reshape(-1), place*k2).reshape(place, k2)
    theta_v2 = np.random.gamma(W12.reshape(-1), 1/W22.reshape(-1), place*k2).reshape(place, k2)


    ## Sample the bilinear weight matrix omega
    # Unroll the (k1, k2) grid per record: column c pairs user factor
    # allocation_u[c] with place factor allocation_v[c]
    theta_deploy1 = theta_u[:, allocation_u][d_id, ]
    theta_deploy21 = theta_v1[:, allocation_v][place_id, ]
    theta_deploy22 = theta_v2[:, allocation_v][place_id, ]
    # Mixture of the two place components: gamma weights theta_v1 and
    # (1-gamma) weights theta_v2, matching the definition of mu.
    # (Previously theta_v2 was used in both terms and theta_v1 was ignored.)
    uv_block = np.sum(gamma_vec[:, np.newaxis]*theta_deploy1*theta_deploy21 +
                      (1-gamma_vec)[:, np.newaxis]*theta_deploy1*theta_deploy22, axis=0).reshape(k1, k2)
    omega_block = omega * uv_block
    # NOTE(review): the counts are allocated to omega cells using one aggregate
    # share over all records, unlike the per-record shares used for theta_u and
    # theta_v above -- confirm this approximation is intended.
    Lambda = omega_block / np.sum(omega_block)

    # Posterior parameters
    omega1 = Lambda * np.sum(y) + alpha03
    omega2 = uv_block + beta03

    # Draw from the gamma distribution
    omega = np.random.gamma(omega1.reshape(-1), 1/omega2.reshape(-1), k1*k2).reshape(k1, k2)

    # Refresh the Poisson mean at the new state (used by the next MH step)
    ar = np.dot(theta_u, omega)[d_id, ]
    mu1 = gamma_vec * np.dot(ar * theta_v1[place_id, ], k_vec2)
    mu2 = (1-gamma_vec) * np.dot(ar * theta_v2[place_id, ], k_vec2)
    mu = mu1 + mu2


    ## Store the draw and report progress
    # Store every keep-th draw in its own slot.
    # (rp % keep is always 0 here, which made every draw overwrite slot 0.)
    if rp%keep==0:
        mkeep = rp//keep
        GAMMA[mkeep, ] = gamma
        THETA_U[:, :, mkeep] = theta_u
        THETA_V1[:, :, mkeep] = theta_v1
        THETA_V2[:, :, mkeep] = theta_v2
        OMEGA[:, :, mkeep] = omega

    if rp%disp==0:
        # Log-likelihood at the current state
        LL = np.sum(scipy.stats.poisson.logpmf(y, mu))

        # Report iteration, MH acceptance rate and log-likelihoods
        # (current, one-parameter baseline, true parameters)
        print(rp)
        print(np.round(accept_prob, 3))
        print(np.round(np.array([LL, LLst, LLbest]), 1))