In [96]:
#####Semisupervised Attribute Estimation#####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy.linalg
import itertools
from scipy import sparse
from scipy.stats import norm
from pandas.tools.plotting import scatter_matrix
from numpy.random import *
from scipy import optimize

#np.random.seed(98537)

In [97]:
##Draw categorical (one-trial multinomial) samples by inverse-CDF lookup
def rmnom(pr, n, k, pattern):
    """Draw one category index per row of ``pr``.

    Parameters
    ----------
    pr : 2-D array, one row of category probabilities per draw.
    n : number of draws; must equal the number of rows of ``pr``.
    k : number of categories, used to size the one-hot matrix when
        ``pattern == 1``.
    pattern : if 1, return ``(z_id, Z)``; for any other value return
        ``z_id`` only.

    Returns
    -------
    z_id : 1-D integer array with the sampled category of every row.
    Z : (only when ``pattern == 1``) integer one-hot matrix whose row i has
        a 1 in column ``z_id[i]``.
    """
    cum_pr = np.cumsum(pr, axis=1)
    u = np.random.uniform(0, 1, n)[:, np.newaxis]

    # First column whose cumulative probability reaches the uniform draw.
    hit = cum_pr >= u
    z_id = np.argmax(hit, axis=1)

    # Rounding can leave the last cumulative value slightly below the draw;
    # argmax over an all-False row would then silently return category 0.
    # Such a draw lies in the upper tail, so assign the last category.
    z_id[~hit.any(axis=1)] = cum_pr.shape[1] - 1

    if pattern == 1:
        z_id = z_id.astype("int")
        Z = np.eye(k, dtype="int")[z_id, ]
        return z_id, Z
    return z_id

In [98]:
####Data generation####
##Problem dimensions
k1 = k2 = 12                # column counts of the user / place factor matrices
hh, place = 10000, 2000     # number of users and number of places

# Per-user intensity (gamma) and the resulting number of records per user.
Lambda = np.random.gamma(shape=15, scale=1/0.15, size=hh)
pt = np.random.poisson(lam=Lambda, size=hh)
hhpt = pt.sum()             # total number of records

# All-ones vectors; a dot product with them sums over the factor axis.
k_vec1 = np.ones(k1, dtype="int")
k_vec2 = np.ones(k2, dtype="int")

In [99]:
##IDs and row indices
#d_id: user index of every record; pt_id: 0-based position of the record
#within its user's block of records.
d_id = np.repeat(np.arange(hh), pt)
user_end = np.cumsum(pt)                      # one-past-the-last row of each user
pt_id = np.arange(hhpt, dtype="int") - np.repeat(user_end - pt, pt)

#d_list[i]: row indices of the records that belong to user i.
#d_id is built with np.repeat over an increasing range, so each user's rows
#are contiguous; cutting 0..hhpt-1 at the cumulative counts gives the same
#arrays as scanning d_id == i for every i, without the O(hh * hhpt) cost.
#Splitting at every cumulative count leaves an empty trailing piece, hence [:hh].
d_list = np.split(np.arange(hhpt, dtype="int"), user_end)[:hh]

In [100]:
##Generate the place of every record
#Draw a latent topic for every record from its user's topic distribution
topic = 25
phi_place = np.random.dirichlet(np.repeat(0.1, place), topic)
theta_place = np.random.dirichlet(np.repeat(0.25, topic), hh)
z = np.array(rmnom(theta_place[d_id, ], hhpt, topic, 0), dtype="int16")

#Draw the place of every record from the place distribution of its topic.
#One rmnom call per user, so each call only materialises that user's rows
#of phi_place.
place_id = np.zeros(hhpt, dtype="int")
for i in range(hh):
    place_id[d_list[i]] = rmnom(phi_place[z[d_list[i]], ], pt[i], place, 0)

#Sparse place-by-record indicator matrix and number of records per place
place_dt = sparse.coo_matrix((np.repeat(1, hhpt), (place_id, range(hhpt))), shape=(place, hhpt)).tocsr()
place_n = np.array(np.sum(place_dt, axis=1)).reshape(-1)

#place_list[j]: row indices (ascending) of the records observed at place j.
#A stable argsort groups the rows by place while keeping them in ascending
#order inside each group, so cutting it at the cumulative per-place counts
#yields the same arrays as np.where(place_id == j) for every j, without
#scanning all records once per place.
place_order = np.argsort(place_id, kind="stable")
place_end = np.cumsum(np.bincount(place_id, minlength=place))
place_list = np.split(place_order, place_end)[:place]

In [157]:
####Generate the response variable####
##Hyper-parameters used for the simulation (constant across retries)
alpha01, beta01 = 0.25, 0.8     # gamma shape / rate for theta_u
alpha02, beta02 = 0.2, 0.7      # gamma shape / rate for theta_v1 and theta_v2
alpha03, beta03 = 0.15, 0.75    # gamma shape / rate for omega
s1, v1 = 9.5, 10.0              # beta parameters for gamma

#Redraw parameters and data until y falls inside the accepted range
rp = 0
while True:
    rp += 1
    print(rp)

    ##Draw the model parameters
    gamma = np.random.beta(s1, v1, place)
    theta_u = np.random.gamma(alpha01, 1/beta01, hh*k1).reshape(hh, k1)
    theta_v1 = np.random.gamma(alpha02, 1/beta02, place*k2).reshape(place, k2)
    theta_v2 = np.random.gamma(alpha02, 1/beta02, place*k2).reshape(place, k2)
    omega = np.random.gamma(alpha03, 1/beta03, k1*k2).reshape(k1, k2)

    #Keep copies of the generating values under the *t names
    gammat = gamma.copy()
    thetat_u = theta_u.copy()
    thetat_v1 = theta_v1.copy()
    thetat_v2 = theta_v2.copy()
    omegat = omega.copy()

    ##Draw the counts from a Poisson whose mean mixes two place factor sets
    gamma_vec = gamma[place_id]
    ar = np.dot(theta_u, omega)[d_id, ]
    mu1 = gamma_vec * np.dot(ar * theta_v1[place_id, ], k_vec2)
    mu2 = (1-gamma_vec) * np.dot(ar * theta_v2[place_id, ], k_vec2)
    mu = mu1 + mu2
    y = np.random.poisson(mu, hhpt)

    ##Stop once the largest count and the number of zeros are in range
    y_max = np.max(y)
    n_zero = np.sum(y==0)
    if 75 < y_max < 200 and 25000 < n_zero < 200000:
        break

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [158]:
#Flag the places whose attribute ratio is treated as observed
Pr = np.array([0.3])
flag = np.random.binomial(1, Pr, place)

#Place indices split by flag value, and how many fall on each side
index_flag1 = np.flatnonzero(flag == 1).astype("int")
index_flag0 = np.flatnonzero(flag == 0).astype("int")
m1 = len(index_flag1)
m0 = len(index_flag0)

In [159]:
####Estimate the semi-supervised attribute model by MCMC####
#Logistic function
def logit(theta):
    """Map log-odds ``theta`` to a probability, ``exp(theta) / (1 + exp(theta))``.

    Despite its name this is the logistic (inverse-logit) function; the name
    is kept because other cells call it.

    Parameters
    ----------
    theta : scalar or array-like of log-odds.

    Returns
    -------
    Probabilities with the same shape as ``theta``.

    Notes
    -----
    The direct formula overflows for large positive ``theta`` (``exp`` returns
    inf and inf / inf is nan). ``scipy.special.expit`` evaluates the same
    function without that failure, returning 1.0 in the upper tail and 0.0 in
    the lower tail.
    """
    # Local import so the fix does not depend on the notebook's import cell.
    from scipy.special import expit
    return expit(theta)

In [160]:
##Algorithm settings
#NOTE(review): none of these names is used in the cells visible here; the
#meanings below are presumed from the names and should be confirmed.
R, keep = 2000, 2             # presumably: total iterations and thinning interval
burnin = int(500 / keep)      # presumably: burn-in length in kept draws
iters, disp = 0, 10           # presumably: iteration counter and display interval

In [161]:
#Prior settings for the sampler.
#These re-bind the names that the data-simulation cell also assigns.
epsilon = 0.025                 # std. dev. of the random-walk proposal for gamma
alpha01, beta01 = 0.25, 0.8
alpha02, beta02 = 0.25, 0.8
alpha03, beta03 = 0.15, 0.75
s1, v1 = 25.0, 25.0             # beta prior parameters for gamma

In [162]:
##Start every parameter at its generating ("true") value
#NOTE(review): the previous version first set gamma to 0.5 everywhere with the
#generating value at index_flag1, then immediately replaced it with a full copy
#of gammat. Only that final assignment had any effect, so it is the one kept.
#Confirm which initialisation is intended.
gamma = gammat.copy()
theta_u = thetat_u.copy()
theta_v1 = thetat_v1.copy()
theta_v2 = thetat_v2.copy()
omega = omegat.copy()

#Expected counts implied by those values
gamma_vec = gamma[place_id]
gamma_col = gamma_vec[:, np.newaxis]
ar = np.dot(theta_u, omega)[d_id, ]
mu1 = np.dot(gamma_col * ar * theta_v1[place_id, ], k_vec2)
mu2 = np.dot((1-gamma_col) * ar * theta_v2[place_id, ], k_vec2)
mu = mu1 + mu2

In [153]:
####Sample the parameters####

##Metropolis-Hastings update of the attribute ratio gamma
#Propose new values by a Gaussian random walk on the logit scale.
#Only the places in index_flag0 are proposed; the rest keep their value.
gammad = gamma.copy(); gamman = gamma.copy()
logit_old = np.log(gammad[index_flag0]/(1-gammad[index_flag0]))
gamman[index_flag0] = logit(logit_old + np.random.normal(0, epsilon, m0))

#Expected counts under the current and the proposed gamma
mu_old = mu.copy()
mu_new1 = gamman[place_id] * np.dot(ar * theta_v1[place_id, ], k_vec2)
mu_new2 = (1-gamman[place_id]) * np.dot(ar * theta_v2[place_id, ], k_vec2)
mu_new = mu_new1 + mu_new2

#Log-likelihood of every record and log-prior of every place
lognew = scipy.stats.poisson.logpmf(y, mu_new)
logold = scipy.stats.poisson.logpmf(y, mu_old)
logpnew = scipy.stats.beta.logpdf(gamman, s1, v1)
logpold = scipy.stats.beta.logpdf(gammad, s1, v1)

#Log acceptance ratio of every proposed place.
#bincount adds the per-record log-likelihood differences of each place
#(0 for a place without records).
loglik_diff = np.bincount(place_id, weights=lognew - logold, minlength=place)[index_flag0]
#The random walk is symmetric in logit(gamma), not in gamma, so the ratio
#needs the change-of-variables term gamma*(1-gamma) for both states.
log_jacobian = (np.log(gamman[index_flag0]) + np.log1p(-gamman[index_flag0])
                - np.log(gammad[index_flag0]) - np.log1p(-gammad[index_flag0]))
log_ratio = loglik_diff + logpnew[index_flag0] - logpold[index_flag0] + log_jacobian

#Acceptance probability min(1, exp(log_ratio)); capping before exp avoids overflow
alpha = np.exp(np.minimum(log_ratio, 0.0))

#Accept or reject every proposal
rand = np.random.uniform(0, 1, m0)
accept = np.array(alpha > rand, dtype="int")
gamma[index_flag0] = accept*gamman[index_flag0] + (1-accept)*gammad[index_flag0]
accept_prob = np.mean(accept)

In [154]:
##Sample the user feature matrix
#Refresh the expected counts from the current gamma, theta_u, theta_v1,
#theta_v2 and omega.
gamma_vec = gamma[place_id]
ar = np.dot(theta_u, omega)[d_id, ]
mu1 = np.dot(gamma_vec[:, np.newaxis] * ar * theta_v1[place_id, ], k_vec2)
mu2 = np.dot((1-gamma_vec[:, np.newaxis]) * ar * theta_v2[place_id, ], k_vec2)
#This line used to assign Lambda from Lambda1 + Lambda2, two names that are
#never defined (NameError), and it would also have overwritten the Poisson
#rate array Lambda. The total expected count is mu1 + mu2, as in the
#initialisation cell.
mu = mu1 + mu2

#TODO: update the auxiliary variable lambda (not implemented yet)


In [None]:
##Sample the user parameters from a gamma distribution
#NOTE(review): this cell has never been executed (In [None]) and uses names
#that no visible cell defines: beta_u, theta_v, y_new, f, d, k, alpha11 and
#alpha12. It looks carried over from a different model - confirm and adapt
#before running.
#Update the auxiliary variable lambda
#Columns: beta_u followed by the element-wise product theta_u * theta_v,
#each divided by mu1 of the same row.
uv_deploy = np.hstack((beta_u[:, np.newaxis], theta_u * theta_v))
Lambda = uv_deploy / mu1[:, np.newaxis]

#Parameters of the posterior distribution
#W1 / W2 accumulate, per index i in range(d), the column sums of lambda_y /
#lambda_h over the rows listed in d_list[i]; alpha11 / alpha12 are then added.
lambda_y = Lambda * y_new[:, np.newaxis]
lambda_h = np.hstack((np.repeat(1, f)[:, np.newaxis], theta_v))
W1 = np.zeros((d, k+1)); W2 = np.zeros((d, k+1))
for i in range(d):
    W1[i, ] = np.sum(lambda_y[d_list[i], ], axis=0)
    W2[i, ] = np.sum(lambda_h[d_list[i], ], axis=0)
W1 = W1 + alpha11; W2 = W2 + alpha12

#Sample the parameters
#Gamma draws with shape W1 and scale 1/W2; column 0 goes to beta1 and the
#remaining columns to W.
result = np.random.gamma(W1.reshape(-1), 1/W2.reshape(-1), d*(k+1)).reshape(d, k+1)
beta1 = result[:, 0]
W = result[:, 1:]
