In [23]:
#####Latent Count Mixed membership Block model#####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy.linalg
import itertools
from scipy import sparse
from scipy.stats import norm
from numpy.random import *
from scipy import optimize

In [24]:
#多項分布の乱数を生成する関数
def rmnom(pr, n, k, pattern):
    if pattern==1:
        z_id = np.array(np.argmax(np.cumsum(pr, axis=1) >= np.random.uniform(0, 1, n)[:, np.newaxis], axis=1), dtype="int")
        Z = np.diag(np.repeat(1, k))[z_id, ]
        return z_id, Z
    z_id = np.array(np.argmax((np.cumsum(pr, axis=1) >= np.random.uniform(0, 1, n)[:, np.newaxis]), axis=1), dtype="int")
    return z_id

#切断ポアソン分布の乱数を生成する関数
def rtpois(mu, a, b, n):
    if a==0:
        FA = np.exp(-mu)
    else:
        FA = scipy.stats.poisson.cdf(a, mu)
    if b==np.inf:
        FB = np.repeat(1, n)
    else:
        FB = scipy.stats.poisson.cdf(b, mu)
    return np.array(scipy.stats.poisson.ppf(np.random.uniform(0, 1, n)*(FB-FA)+FA, mu), dtype="int")

#ポアソン分布の対数尤度関数
def loglike(mu, y, y_factorial):
    LL = y * np.log(mu) - mu - y_factorial
    return LL

#ポアソン分布の尤度関数
def Poisson_pmf(phi, y, y_factorial):
    lambda_y = np.power(phi, y)
    Lho = lambda_y / y_factorial * np.exp(-phi_deploy)
    return Lho

In [25]:
####学習データの生成####
##データの設定
#データの定義
g = 4   #グラフ数
k1 = 6; k2 = 5   #トピック数
d1 = 2500; d2 = 2000   #ノード数 
Lambda = np.random.gamma(30.0, 1/0.225, d1)
n = np.random.poisson(Lambda, d1)
N = np.sum(n)
k_vec1 = np.repeat(1.0, k1)
k_vec2 = np.repeat(1.0, k2)

#idを定義
d_id1 = np.repeat(np.arange(d1), n)
pt_id1 = np.array(list(itertools.chain(*[np.array(range(n[i]), dtype="int") for i in range(d1)])))
d_list1 = [i for i in range(d1)]
for i in range(d1):
    d_list1[i] = np.array(np.where(d_id1==i)[0], "int")

In [26]:
##ノードの割当を生成
#セグメント割当を生成
topic = 30
phi0 = np.random.dirichlet(np.repeat(1.0, d2), topic)
theta0 = np.random.dirichlet(np.repeat(0.25, topic), d1)
z = rmnom(theta0[d_id1, ], N, topic, 0)

#多項分布からアイテムを生成
d_id2 = np.zeros(N, dtype='int')
for i in range(d1):
    d_id2[d_list1[i]] = rmnom(phi0[z[d_list1[i]], ], n[i], d2, 0)

In [27]:
##パラメータを生成
#トピック分布を生成
theta1 = np.random.dirichlet(np.repeat(0.2, k1), d1)
theta2 = np.random.dirichlet(np.repeat(0.2, k2), d2)
thetat1 = theta1.copy(); thetat2 = theta2.copy()

#モデルパラメータを生成
phi = np.random.gamma(0.5, 3.5, k1*k2*g).reshape(k1, k2, g)
phit = phi.copy()

##応答変数を生成
#トピックを生成
z1, Z1 = rmnom(theta1[d_id1, ], N, k1,  1)
z2, Z2 = rmnom(theta2[d_id2, ], N, k2,  1)

#ポアソン分布からリンクを生成
mu = np.zeros((N, g))
freq = np.zeros((N, g), dtype="int")
for j in range(g):
    mu[:, j] = np.sum(phi[z1, :, j] * Z2, axis=1)
    freq[:, j] = np.random.poisson(mu[:, j], N)
y = np.array(freq > 0, dtype="int")
rate = np.mean(y, axis=0)


#インデックスを定義
y_list = [j for j in range(g)]
n1 = np.repeat(0, d1)
n2 = np.repeat(0, d2)
M = np.repeat(0, g)
d_list1 = [i for i in range(d1)]
d_list2 = [i for i in range(d2)]
d_vec1 = [i for i in range(d1)]
d_vec2 = [i for i in range(d2)]
for j in range(g):
    y_list[j] = np.array(np.where(y[:, j]==1)[0], dtype="int")
    M[j] = y_list[j].shape[0]
for i in range(d1):
    d_list1[i] = np.array(np.where(d_id1==i)[0], "int")
    d_vec1[i] = np.repeat(1.0, d_list1[i].shape[0])
    n1[i] = d_list1[i].shape[0]
for i in range(d2):
    d_list2[i] = np.array(np.where(d_id2==i)[0], "int")
    d_vec2[i] = np.repeat(1.0, d_list2[i].shape[0])
    n2[i] = d_list2[i].shape[0]

In [28]:
####テストデータの生成####
##データの設定
#データの定義
n0 = np.random.poisson(Lambda, d1)
N0 = np.sum(n0)

#idを定義
d_id01 = np.repeat(np.arange(d1), n0)
pt_id01 = np.array(list(itertools.chain(*[np.array(range(n0[i]), dtype="int") for i in range(d1)])))
d_list01 = [i for i in range(d1)]
for i in range(d1):
    d_list01[i] = np.array(np.where(d_id01==i)[0], "int")
    
##ノードの割当を生成
#セグメント割当を生成
z = rmnom(theta0[d_id01, ], N0, topic, 0)

#多項分布からアイテムを生成
d_id02 = np.zeros(N0, dtype='int')
for i in range(d1):
    d_id02[d_list01[i]] = rmnom(phi0[z[d_list01[i]], ], n0[i], d2, 0)
    
    
##応答変数を生成
#トピックを生成
z01, Z01 = rmnom(theta1[d_id01, ], N0, k1, 1)
z02, Z02 = rmnom(theta2[d_id02, ], N0, k2, 1)

#ポアソン分布からリンクを生成
mu0 = np.zeros((N0, g))
freq0 = np.zeros((N0, g), dtype="int")
for j in range(g):
    mu0[:, j] = np.sum(phi[z01, :, j] * Z02, axis=1)
    freq0[:, j] = np.random.poisson(mu0[:, j], N0)
y0 = np.array(freq0 > 0, dtype="int")

#インデックスを定義
y_list0 = [j for j in range(g)]
for j in range(g):
    y_list0[j] = np.array(np.where(y0[:, j]==1)[0], dtype="int")
y_index0 = np.array(np.where(np.sum(y0, axis=1) > 0)[0], dtype="int")

In [29]:
####ギブスサンプリングでLatent Count Mixed membership Block modelを推定####
##アルゴリズムの設定
#MCMCの設定
R = 2000
keep = 2
burnin = int(1000/keep)
iter = 0
disp = 10

#ブロックを展開したトピックの割当
allocation_z1 = np.repeat(np.arange(k1), k2)
allocation_z2 = np.tile(np.arange(k2), k1)
k = k1*k2
k_vec = np.repeat(1.0, k)

#事前分布の設定
alpha = 0.1
s0 = 0.5
v0 = 0.5

In [35]:
##パラメータの真値
#モデルパラメータの真値
theta1 = thetat1.copy()
theta2 = thetat2.copy()
phi = phit.copy()

#トピックの真値
Zi1 = Z1.copy()
Zi2 = Z2.copy()
z_vec1 = np.dot(Zi1, np.arange(k1))
z_vec2 = np.dot(Zi2, np.arange(k2))

In [36]:
##パラメータの初期値
#モデルパラメータの初期値
theta1 = np.random.dirichlet(np.repeat(1.0, k1), d1)
theta2 = np.random.dirichlet(np.repeat(1.0, k2), d2)
phi = np.random.gamma(0.5, 1.0, k1*k2*g).reshape(k1, k2, g)

#トピックを生成
z_vec1, Zi1 = rmnom(theta1[d_id1, ], N, k1,  1)
z_vec2, Zi2 = rmnom(theta2[d_id2, ], N, k2,  1)

In [37]:
##パラメータの格納用配列
#モデルパラメータの格納用配列
THETA1 = np.zeros((d1, k1, int(R/keep)))
THETA2 = np.zeros((d2, k2, int(R/keep)))
PHI = np.zeros((k1, k2, g, int(R/keep)))

#トピックの格納用配列
SEG1 = np.zeros((int(R/keep), N), dtype="int8")
SEG2 = np.zeros((int(R/keep), N), dtype="int8")

In [38]:
##対数尤度の基準値
#1パラメータモデルでの対数尤度
LLst = np.sum(scipy.stats.poisson.logpmf(freq, np.mean(freq, axis=0)), axis=0)
print(np.round(LLst, 1))

#真値での対数尤度
mut = np.zeros((N, g))
LLbest = np.repeat(0.0, g)
for j in range(g):
    mut[:, j] = np.sum(phit[np.dot(Z1, np.arange(k1)), :, j] * Z2, axis=1)
    LLbest[j] = np.sum(scipy.stats.poisson.logpmf(freq[:, j], mut[:, j]))
print(np.round(LLbest, 1))

[-857531.5 -819141.2 -846166.7 -660317.2]
[-448847.2 -352976.1 -420057.3 -340752.8]


In [39]:
####ギブスサンプリングでパラメータをサンプリング####
for rp in range(R):
    
    ##Bernoulli-Poisson Linkで潜在頻度を生成
    #データの格納用配列
    mu = np.zeros((N, g))
    x = np.zeros((N, g), dtype="int")

    #切断ポアソン分布から潜在頻度を生成
    for j in range(g):
        index = y_list[j]
        mu[:, j] = np.dot(phi[z_vec1, :, j] * Zi2, k_vec2)   #モデルの期待値
        x[index, j] = rtpois(mu[index, j], 0, np.inf, M[j])
    x_factorial = scipy.special.gamma(x + 1)


    ##ノード間の結合トピックをサンプリング
    #ノード間の事後分布を定義
    theta_long1 = theta1[:, allocation_z1][d_id1, ]
    theta_long2 = theta2[:, allocation_z2][d_id2, ]
    Lho = np.full((N, k), 1.0)
    for j in range(g):
        phi_deploy = phi[:, :, j].reshape(-1)
        Lho *= Poisson_pmf(phi_deploy, x[:, j][:, np.newaxis], x_factorial[:, j][:, np.newaxis])
    Posterior = theta_long1 * theta_long2 * Lho
        
    #多項分布からトピックをサンプリング
    Prob = Posterior / np.dot(Posterior, k_vec)[:, np.newaxis]
    Zi = rmnom(Prob, N, k, 1)[1]
    z_vec1 = np.array(np.dot(allocation_z1 * Zi, k_vec), dtype="int")
    z_vec2 = np.array(np.dot(allocation_z2 * Zi, k_vec), dtype="int")
    Zi1 = np.diag(np.repeat(1, k1))[z_vec1, ]
    Zi2 = np.diag(np.repeat(1, k2))[z_vec2, ]

    
    #トピックの真値
    Zi1 = Z1.copy()
    Zi2 = Z2.copy()
    z_vec1 = np.dot(Zi1, np.arange(k1))
    z_vec2 = np.dot(Zi2, np.arange(k2))

    ##トピック分布をサンプリング
    #始点ノードのトピック分布をサンプリング
    wsum1 = np.zeros((d1, k1))
    for i in range(d1):
        wsum1[i, ] = np.dot(Zi1[d_list1[i], ].T, d_vec1[i]) + alpha
        theta1[i, ] = np.random.dirichlet(wsum1[i, ], 1)

    #終点ノードのトピック分布をサンプリング
    wsum2 = np.zeros((d2, k2))
    for i in range(d2):
        wsum2[i, ] = np.dot(Zi2[d_list2[i], ].T, d_vec2[i]) + alpha
        theta2[i, ] = np.random.dirichlet(wsum2[i, ], 1)


    ##モデルパラメータをサンプリング
    #ガンマ分布のパラメータ
    v = np.dot(Zi1.T, Zi2) + v0
    for j in range(g):
        index = y_list[j]
        s = np.dot((Zi1[index, ] * x[index, j][:, np.newaxis]).T, Zi2[index, ]) + s0
        
        #パラメータをサンプリング
        phi[:, :, j] = np.random.gamma(s, 1/v)


    ##サンプリング結果の格納と表示
    #サンプリング結果の格納
    if rp%keep==0:
        mkeep = int(rp/keep)
        THETA1[:, :, mkeep] = theta1
        THETA2[:, :, mkeep] = theta2
        PHI[:, :, :, mkeep] = phi
        SEG1[mkeep, ] = z_vec1
        SEG2[mkeep, ] = z_vec2

    #対数尤度の更新とサンプリング結果の表示
    if rp%disp==0:
        #対数尤度の更新
        mu = np.zeros((N, g))
        LL = np.repeat(0.0, g)
        for j in range(g):
            mu[:, j] = np.sum(phi[z_vec1, :, j] * Zi2, axis=1)
            LL[j] = np.sum(scipy.stats.poisson.logpmf(freq[:, j], mu[:, j]))

        #サンプリング結果を表示
        print(rp)
        print(np.vstack((np.sum(x, axis=0), np.sum(freq, axis=0))))
        print(np.round([np.append(LL, np.sum(LL)), np.append(LLst, np.sum(LLst)), np.append(LLbest, np.sum(LLbest))], 1))

0
[[257468 207565 251284 209181]
 [730009 552926 672928 413218]]
[[ -869960.1  -686195.1  -792894.6  -517782.6 -2866832.4]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
10
[[517657 372614 487533 324865]
 [730009 552926 672928 413218]]
[[ -518227.4  -424698.9  -501163.9  -383702.7 -1827793. ]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
20
[[579000 418439 536767 345417]
 [730009 552926 672928 413218]]
[[ -485708.6  -391374.1  -472427.4  -366997.5 -1716507.6]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
30
[[613881 440145 559369 356852]
 [730009 552926 672928 413218]]
[[ -473304.5  -380377.3  -459341.3  -361305.7 -1674328.8]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
40
[[636081 455374 576398 361511]
 [7

340
[[714378 537970 629579 391544]
 [730009 552926 672928 413218]]
[[ -450637.9  -353787.2  -432143.3  -343426.  -1579994.5]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
350
[[714499 543056 629505 392083]
 [730009 552926 672928 413218]]
[[ -450247.4  -353873.   -432365.7  -343411.  -1579897.1]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
360
[[716706 546695 626944 395147]
 [730009 552926 672928 413218]]
[[ -450007.4  -353870.5  -433326.1  -342993.2 -1580197.1]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
370
[[715995 540588 631985 395093]
 [730009 552926 672928 413218]]
[[ -449724.1  -354002.3  -431781.   -343214.  -1578721.5]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
380
[[720910 540782 630021 39500

680
[[729092 551709 639247 401083]
 [730009 552926 672928 413218]]
[[ -449435.2  -353634.   -427798.9  -341739.7 -1572607.8]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
690
[[729405 550248 636366 400725]
 [730009 552926 672928 413218]]
[[ -449539.4  -353691.5  -427545.4  -341755.  -1572531.4]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
700
[[729429 549507 635392 402701]
 [730009 552926 672928 413218]]
[[ -449533.9  -354147.3  -427904.9  -341561.6 -1573147.6]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
710
[[724648 551118 638036 403795]
 [730009 552926 672928 413218]]
[[ -449680.5  -354271.3  -428009.2  -341422.7 -1573383.7]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
720
[[724034 548793 638365 40613

1020
[[723839 540312 648697 409949]
 [730009 552926 672928 413218]]
[[ -449644.8  -354149.6  -422823.2  -341144.  -1567761.6]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1030
[[726893 540776 651901 409623]
 [730009 552926 672928 413218]]
[[ -449722.4  -353829.5  -423245.   -341106.3 -1567903.3]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1040
[[728140 542255 646833 408448]
 [730009 552926 672928 413218]]
[[ -449574.2  -353671.9  -422994.5  -341116.2 -1567356.9]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1050
[[730604 543493 649182 410661]
 [730009 552926 672928 413218]]
[[ -449897.3  -353417.5  -423018.   -340881.9 -1567214.7]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1060
[[733358 542225 648540 

1360
[[720620 543850 652388 415220]
 [730009 552926 672928 413218]]
[[ -449345.8  -353603.1  -422065.1  -340800.6 -1565814.6]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1370
[[718347 544587 650894 412569]
 [730009 552926 672928 413218]]
[[ -449682.5  -353551.7  -421923.5  -340770.9 -1565928.6]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1380
[[720967 545592 646709 411180]
 [730009 552926 672928 413218]]
[[ -449784.4  -353425.4  -422310.8  -340812.7 -1566333.3]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1390
[[718646 544442 645729 413311]
 [730009 552926 672928 413218]]
[[ -449868.1  -353497.9  -422611.3  -340795.7 -1566773. ]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1400
[[721013 545275 651327 

1700
[[718931 543917 663124 406575]
 [730009 552926 672928 413218]]
[[ -449815.4  -353635.4  -421565.7  -340986.4 -1566002.9]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1710
[[723164 543772 662636 406579]
 [730009 552926 672928 413218]]
[[ -449447.1  -353453.   -421517.1  -341083.4 -1565500.7]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1720
[[727712 548225 659072 406933]
 [730009 552926 672928 413218]]
[[ -449632.6  -353490.4  -421442.3  -341015.4 -1565580.8]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1730
[[727338 549028 662261 407029]
 [730009 552926 672928 413218]]
[[ -449337.5  -353541.5  -421404.5  -341085.  -1565368.5]
 [ -857531.5  -819141.2  -846166.7  -660317.2 -3183156.6]
 [ -448847.2  -352976.1  -420057.3  -340752.8 -1562633.4]]
1740
[[727768 549380 664661 

In [None]:
##ノード間の結合トピックをサンプリング
#ノード間の事後分布を定義
theta_long1 = theta1[:, allocation_z1][d_id1, ]
theta_long2 = theta2[:, allocation_z2][d_id2, ]
Lho = np.full((N, k), 1.0)
for j in range(g):
    Lho *= scipy.stats.poisson.pmf(x[:, j][:, np.newaxis], phi[:, :, j].reshape(-1))
Posterior = theta_long1 * theta_long2 * Lho