In [1]:
#####Semisupervised Attribute Estimation#####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy.linalg
import itertools
from scipy import sparse
from scipy.stats import norm
from pandas.tools.plotting import scatter_matrix
from numpy.random import *
from scipy import optimize

#np.random.seed(98537)

In [2]:
## Draw categorical (single-trial multinomial) random variates, one per row of pr
def rmnom(pr, n, k, pattern):
    """Sample one category index per row of ``pr`` by inverse-CDF sampling.

    Parameters
    ----------
    pr : ndarray, shape (n, K)
        Row-wise category probabilities (non-negative, each row summing to 1).
    n : int
        Number of uniform draws; must match the number of rows of ``pr``.
    k : int
        Size of the one-hot encoding; only used when ``pattern == 1``.
    pattern : int
        ``1`` returns ``(z_id, Z)``; any other value returns ``z_id`` only.

    Returns
    -------
    z_id : ndarray of int, shape (n,)
        Sampled category index for every row.
    Z : ndarray of int, shape (n, k)
        One-hot encoding of ``z_id`` (only when ``pattern == 1``).
    """
    cum_pr = np.cumsum(pr, axis=1)
    u = np.random.uniform(0, 1, n)[:, np.newaxis]

    # For a non-decreasing cumulative sum, the number of entries strictly below
    # u is the index of the first entry that is >= u.
    z_id = np.sum(cum_pr < u, axis=1)

    # Rounding can leave the last cumulative value marginally below 1, so a
    # large u may exceed every entry. Map that case to the last category
    # instead of silently falling back to category 0.
    z_id = np.minimum(z_id, cum_pr.shape[1] - 1)

    if pattern == 1:
        Z = np.eye(k, dtype="int")[z_id, ]
        return z_id, Z
    return z_id

In [3]:
#### Data generation ####
## Data settings
k1, k2 = 12, 12              # latent dimensions (columns of theta_u / theta_v* further down)
hh, place = 10000, 2000      # number of units indexed by d_id, and number of places

# Number of records per unit: Poisson counts with gamma-distributed rates
Lambda = np.random.gamma(shape=15, scale=1/0.15, size=hh)
pt = np.random.poisson(lam=Lambda, size=hh)
hhpt = pt.sum()              # total number of records

# All-ones vectors, used as the right-hand side of dot products to sum over latent dimensions
k_vec1 = np.ones(k1, dtype="int")
k_vec2 = np.ones(k2, dtype="int")

In [4]:
## IDs and index lists
# IDs: d_id repeats unit i exactly pt[i] times, so records of one unit are contiguous;
# pt_id is the position of each record within its unit (0, 1, ..., pt[i]-1).
d_id = np.repeat(np.arange(hh), pt)
pt_id = np.arange(hhpt) - np.repeat(np.cumsum(pt) - pt, pt)

# Index lists: d_list[i] holds the record positions belonging to unit i.
# Because d_id is sorted, cutting 0..hhpt-1 at the cumulative counts gives the
# same arrays as scanning d_id == i for every unit, without the hh full passes.
d_list = np.split(np.arange(hhpt), np.cumsum(pt)[:-1])

In [5]:
## Generate the place of every record
# Topic assignments: each record draws a topic from its unit's topic distribution
topic = 25
phi_place = np.random.dirichlet(np.repeat(0.1, place), topic)
theta_place = np.random.dirichlet(np.repeat(0.25, topic), hh)
z = np.array(rmnom(theta_place[d_id, ], hhpt, topic, 0), dtype="int16")

# Draw the place of each record from the place distribution of its topic.
# Done unit by unit: a single call would need an (hhpt x place) probability matrix.
place_id = np.zeros(hhpt, dtype="int")
for i in range(hh):
    place_id[d_list[i]] = rmnom(phi_place[z[d_list[i]], ], pt[i], place, 0)
place_dt = sparse.coo_matrix((np.repeat(1, hhpt), (place_id, range(hhpt))), shape=(place, hhpt)).tocsr()
place_n = np.array(np.sum(place_dt, axis=1)).reshape(-1)

# Place index: place_list[p] holds the record positions assigned to place p, ascending.
# A stable argsort groups the records by place while keeping their original order,
# so splitting at the cumulative place counts reproduces np.where(place_id == p)[0]
# for every p in one pass instead of `place` full scans.
place_list = np.split(
    np.argsort(place_id, kind="stable"),
    np.cumsum(np.bincount(place_id, minlength=place))[:-1],
)

In [6]:
#### Generate the response variable ####
## Prior hyperparameters used to simulate the parameters
alpha01, beta01 = 0.25, 0.8
alpha02, beta02 = 0.2, 0.7
alpha03, beta03 = 0.15, 0.75
s1, v1 = 9.5, 10.0

# Redraw parameters and data until the simulated counts pass the checks at the bottom
rp = 0
while True:
    rp += 1
    print(rp)

    ## Draw the model parameters
    gamma = np.random.beta(s1, v1, place)
    theta_u = np.random.gamma(alpha01, 1/beta01, (hh, k1))
    theta_v1 = np.random.gamma(alpha02, 1/beta02, (place, k2))
    theta_v2 = np.random.gamma(alpha02, 1/beta02, (place, k2))
    omega = np.random.gamma(alpha03, 1/beta03, (k1, k2))

    # Keep copies of the generating values; the un-suffixed names are overwritten later
    gammat = gamma.copy()
    thetat_u = theta_u.copy()
    thetat_v1 = theta_v1.copy()
    thetat_v2 = theta_v2.copy()
    omegat = omega.copy()

    ## Draw the data from a Poisson distribution
    # Expected count of a record = gamma-weighted mix of two place-specific components
    gamma_vec = gamma[place_id]
    ar = theta_u.dot(omega)[d_id, ]
    Lambda1 = (gamma_vec[:, np.newaxis] * ar * theta_v1[place_id, ]).dot(k_vec2)
    Lambda2 = ((1-gamma_vec[:, np.newaxis]) * ar * theta_v2[place_id, ]).dot(k_vec2)
    Lambda = Lambda1 + Lambda2
    y = np.random.poisson(Lambda, hhpt)

    # Stop once the maximum count and the number of zeros fall inside the accepted ranges
    if 75 < np.max(y) < 200 and 25000 < np.sum(y==0) < 200000:
        break

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57


In [190]:
# Mark which places have an observed attribute ratio (Bernoulli draw per place)
Pr = np.array([0.3])
flag = np.random.binomial(n=1, p=Pr, size=place)
index_flag1 = np.flatnonzero(flag == 1).astype("int")   # places flagged 1
index_flag0 = np.flatnonzero(flag == 0).astype("int")   # places flagged 0

In [191]:
#### Estimate the semi-supervised attribute model by Markov chain Monte Carlo ####
## Algorithm settings
# NOTE(review): presumably R = number of iterations, keep = thinning interval,
# disp = progress-print interval; the sampler loop is not visible here - confirm.
R, keep = 2000, 2
burnin = 500 // keep
iters, disp = 0, 10

In [192]:
# Prior hyperparameters, one pair per line
alpha01, beta01 = 0.25, 0.8
alpha02, beta02 = 0.25, 0.8
alpha03, beta03 = 0.15, 0.75
s1, v1 = 9.5, 10.0

In [193]:
## True parameter values
# Model parameters: gamma takes the generating value only at flagged places and
# 0.5 everywhere else; the remaining parameters are copies of the generating values.
gamma = np.full(place, 0.5)
gamma[index_flag1] = gammat[index_flag1]
theta_u, theta_v1 = thetat_u.copy(), thetat_v1.copy()
theta_v2, omega = thetat_v2.copy(), omegat.copy()

# Expected counts implied by these values
gamma_vec = gamma[place_id]
ar = theta_u.dot(omega)[d_id, ]
Lambda1 = (gamma_vec[:, np.newaxis] * ar * theta_v1[place_id, ]).dot(k_vec2)
Lambda2 = ((1-gamma_vec[:, np.newaxis]) * ar * theta_v2[place_id, ]).dot(k_vec2)
Lambda = Lambda1 + Lambda2

In [205]:


#### Sample the parameters by Gibbs sampling ####

## Sample the attribute ratio with a Metropolis-Hastings step
# Pick one place flagged 0 and collect the records assigned to it
i = 10
j = index_flag0[i]
index = place_list[j]

# Poisson log-likelihood of those records under the current expected counts
scipy.stats.poisson.logpmf(y[index], Lambda[index]).sum()

-1404.7543825426683

In [206]:
# Expected counts of place j's records when the generating gamma value is plugged in
Lambda_new1 = (gammat[j] * ar[index, ] * theta_v1[j, ]).dot(k_vec2)
Lambda_new2 = ((1-gammat[j]) * ar[index, ] * theta_v2[j, ]).dot(k_vec2)
Lambda_new = Lambda_new1 + Lambda_new2

# Poisson log-likelihood of the same records under those expected counts
scipy.stats.poisson.logpmf(y[index], Lambda_new).sum()

-1393.751319633987

In [207]:
# Compare the two expected counts for a single record of place j
i = 5
print(Lambda_new[i], Lambda[index[i]], sep="\n")

0.9420150960801877
1.0032961104811535


In [208]:
# Generating value of gamma for place j (gamma itself holds 0.5 for places flagged 0)
gammat[j]

0.5508645116342759

array([[0.32447142, 1.53932952, 0.28916842, ..., 0.08058726, 0.08568389,
        0.35326907],
       [7.60470531, 1.11440237, 0.70883764, ..., 1.88226858, 0.23800483,
        2.44149563],
       [0.47059416, 0.86534633, 0.22934645, ..., 0.16315301, 0.20120113,
        0.85858266],
       ...,
       [1.34975245, 0.40099885, 1.32253339, ..., 1.67257465, 0.40682703,
        2.62942084],
       [1.11984582, 0.88675651, 0.24021765, ..., 1.05470734, 0.19879013,
        0.46704477],
       [0.29897382, 0.34251878, 0.14120143, ..., 0.24244109, 0.16000365,
        0.10938484]])

array([0.5       , 0.5       , 0.47475315, ..., 0.5       , 0.5       ,
       0.44324396])