In [90]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import scipy.linalg
import itertools
import time
import torch
import torch.nn as nn
import torch.optim as optimizers
from scipy.stats import norm
from numpy.random import *
from scipy import optimize

# One shared seed for both NumPy's and PyTorch's global generators.
SEED = 9837
np.random.seed(SEED)
torch.manual_seed(SEED)

# Raise the pandas display limits to 250 rows and 100 columns.
pd.set_option("display.max_rows", 250)
pd.set_option("display.max_columns", 100)

In [91]:
# Draw categorical (single-trial multinomial) samples by inverse-CDF lookup
def rmnom(pr, n, k, pattern):
    """Sample one category index per row of a probability matrix.

    One uniform number is drawn per row with ``np.random.uniform`` and the
    sampled category is the first column whose cumulative probability reaches
    that number.

    Parameters
    ----------
    pr : array-like, 2-D
        Row-wise category probabilities; one row per sample.
    n : int
        Number of uniform draws; must match the number of rows of ``pr``.
    k : int
        Number of categories, used as the width of the one-hot matrix.
    pattern : int
        When equal to 1 the one-hot matrix is returned as well.

    Returns
    -------
    z_id : ndarray of int
        Sampled category index for each row.
    Z : ndarray of int, shape (len(z_id), k)
        One-hot encoding of ``z_id``; only returned when ``pattern == 1``.
    """
    u = np.random.uniform(0, 1, n)[:, np.newaxis]
    reached = np.cumsum(np.asarray(pr), axis=1) >= u

    # If rounding (or an unnormalised row) leaves the cumulative sum below the
    # uniform draw, no column is flagged and argmax would silently return
    # category 0.  Forcing the last column makes such rows fall into the final
    # category, which is what the inverse-CDF rule prescribes.
    reached[:, -1] = True

    z_id = np.array(np.argmax(reached, axis=1), dtype="int")
    if pattern == 1:
        Z = np.eye(k, dtype="int")[z_id]
        return z_id, Z
    return z_id

# データの生成

In [172]:
# Data definition
# Problem dimensions
k = 12
item = 250
m = k*item    # number of (item, row) cells

# Simulate the per-cell totals.
# Each cell's gamma scale is the product of a row factor (beta1) and an item
# factor (beta2), laid out item-major so it matches the (item, k) reshape below.
alpha = 2.0
beta1 = np.random.gamma(2.5, 2.5, k)
beta2 = np.random.gamma(2.0, 2.5, item)
beta = np.outer(beta2, beta1).ravel()
rate = np.random.gamma(alpha, beta, m)
n = np.random.poisson(rate, m).reshape(item, k)
N = n.sum()

In [173]:
# Generate the ground-truth parameters
# Gamma (shape, scale) pairs used to simulate the two factor matrices
alpha1 = np.array([0.25, 3.0])
alpha2 = np.array([1.5, 1/1.5])
# These two scalars replace the beta1/beta2 arrays; they are not read in this cell.
beta1 = 0.25
beta2 = 0.25

# Simulate the model parameters
theta1 = np.random.gamma(alpha1[0], alpha1[1], (k, k))
theta2 = np.random.gamma(alpha2[0], alpha2[1], (item, k))

# pi[:, :, i] is theta1 scaled column-wise by item i's factors, with every row
# normalised to sum to one
pi = np.zeros((k, k, item))
for i, item_factor in enumerate(theta2):
    mu = theta1 * item_factor
    pi[:, :, i] = mu / mu.sum(axis=1, keepdims=True)

# Keep untouched copies of the simulated values as the ground truth
thetat1 = theta1.copy()
thetat2 = theta2.copy()
pit = pi.copy()

In [174]:
# Simulate the response counts: for every item i and row j, the total n[i, j]
# is split across the k columns with probabilities pi[j, :, i]
y = np.zeros((k, k, item), dtype="int")
for i, j in itertools.product(range(item), range(k)):
    y[j, :, i] = np.random.multinomial(n[i, j], pi[j, :, i], 1).ravel()

# パラメータの推定

## アルゴリズムの設定

In [175]:
# Sampler settings
R = 500                   # number of Gibbs sweeps
keep = 2                  # every keep-th sweep is stored
burnin = 200              # sweeps before this index are not stored
skeep = burnin // keep    # number of thinned slots that fall inside the burn-in
iters = 0
disp = 100                # progress is printed every disp sweeps

# Prior hyperparameters: alpha* is added to the gamma shape and beta* to the
# gamma rate in the conditional updates (index 1 -> theta1, index 2 -> theta2)
alpha1 = 1.0
alpha2 = 0.25
beta1 = 1.0
beta2 = 0.25

# Initial parameter values
theta1 = np.random.gamma(1.0, 1.0, (k, k))
theta2 = np.random.gamma(1.0, 1.0, (item, k))

In [176]:
# Storage for the retained draws
# Thinned sweep indices that lie after the burn-in
RS = np.arange(skeep, R // keep)
rs = len(RS)

# One slice along the last axis per retained draw of theta1 and theta2
THETA1 = np.zeros((k, k, rs))
THETA2 = np.zeros((item, k, rs))

## パラメータの推定

In [177]:
# Sample the parameters by Gibbs sampling.
#
# Both conditionals are gamma distributions:
#   theta2[i, l] ~ Gamma(shape = sum_j y[j, l, i] + alpha2,
#                        rate  = sum_j n[i, j] * theta1[j, l] + beta2)
#   theta1[j, l] ~ Gamma(shape = sum_i y[j, l, i] + alpha1,
#                        rate  = sum_i n[i, j] * theta2[i, l] + beta1)
# np.random.gamma is parameterised by scale, hence the 1/rate below.

# The count statistics do not depend on the current parameter values, so they
# are computed once here instead of being re-accumulated on every sweep.
y_sum_rows = y.sum(axis=0).T     # (item, k): sum_j y[j, l, i]
y_sum_items = y.sum(axis=2)      # (k, k):    sum_i y[j, l, i]

for rp in range(R):
    if rp % disp == 0:
        print(rp)

    # Accumulator for the rate term of the theta1 update
    v1 = np.zeros((k, k))

    # Update the item-level parameters one item at a time
    for i in range(item):
        s2 = y_sum_rows[i] + alpha2
        v2 = np.sum(n[i, ][:, np.newaxis] * theta1, axis=0) + beta2
        theta2[i, ] = np.random.gamma(s2, 1/v2, k)

        # The freshly drawn theta2[i] feeds the theta1 rate
        v1 += n[i, ][:, np.newaxis] * theta2[i, ]

    # Update the global parameters
    s1 = y_sum_items + alpha1
    v1 = v1 + beta1
    theta1 = np.random.gamma(s1, 1/v1)

    # Store every keep-th sweep once the burn-in is over
    if rp % keep == 0 and rp >= burnin:
        mkeep = rp // keep - skeep
        THETA1[:, :, mkeep] = theta1
        THETA2[:, :, mkeep] = theta2

0
100
200
300
400


## 結果の要約と評価

In [178]:
# Summarise the estimated parameters
# Thinned sweep indices after the burn-in (same definition as for the storage arrays)
RS = np.arange(skeep, R // keep)
rs = len(RS)

# Posterior means over the retained draws; note that theta1/theta2 now hold
# these averages rather than the sampler's last state
theta1 = THETA1.mean(axis=2)
theta2 = THETA2.mean(axis=2)

In [179]:
# Check the fit through residual norms
def _residual_norm_total(factor1, factor2, normalize):
    """Sum over items of the root of the summed squared residuals.

    For item i the fitted table is ``n[i, :, None] * mu`` with
    ``mu = factor1 * factor2[i]``; when ``normalize`` is true each row of
    ``mu`` is first divided by its row sum.  The residual against
    ``y[:, :, i]`` is squared, summed, square-rooted, and the per-item values
    are added up.
    """
    total = 0.0
    for idx in range(item):
        fitted = factor1 * factor2[idx, ]
        if normalize:
            fitted = fitted / np.sum(fitted, axis=1)[:, np.newaxis]
        residual = y[:, :, idx] - n[idx, ][:, np.newaxis] * fitted
        total += np.sqrt(np.sum(np.power(residual, 2)))
    return total


# Error of the expectation built from the raw estimated parameters
MSE1 = _residual_norm_total(theta1, theta2, normalize=False)

# Error with the estimated parameters normalised to row probabilities
MSE2 = _residual_norm_total(theta1, theta2, normalize=True)

# Error with the true parameters (normalised)
MSE3 = _residual_norm_total(thetat1, thetat2, normalize=True)

# Show the three values
print(np.round([MSE1, MSE2, MSE3], 1))

[6817.7 5050.8 4295.1]


In [189]:
# Inspect the result as a data frame
# For one item, put the observed counts next to the estimated row
# probabilities (top half) and next to the true ones (bottom half)
i = 121
mu = theta1 * theta2[i, ]
pi = mu / mu.sum(axis=1, keepdims=True)
observed = pd.DataFrame(y[:, :, i])
res1 = pd.concat([observed, pd.DataFrame(pi).round(3)], axis=1)
res2 = pd.concat([observed, pd.DataFrame(pit[:, :, i]).round(3)], axis=1)
pd.concat([res1, res2], axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1,10.1,11.1
0,0,0,6,2,3,0,2,0,1,0,0,3,0.0,0.001,0.327,0.076,0.211,0.02,0.064,0.0,0.091,0.008,0.003,0.2
1,0,0,0,0,0,1,0,0,1,0,0,0,0.0,0.092,0.0,0.071,0.125,0.204,0.0,0.065,0.43,0.004,0.0,0.008
2,0,0,4,6,1,14,1,5,1,0,0,0,0.0,0.0,0.101,0.189,0.009,0.357,0.05,0.184,0.037,0.07,0.004,0.0
3,0,2,2,37,1,0,14,0,0,0,11,0,0.0,0.053,0.032,0.53,0.014,0.0,0.291,0.001,0.004,0.0,0.067,0.007
4,1,1,0,0,0,1,0,0,0,7,25,0,0.04,0.017,0.0,0.0,0.013,0.03,0.032,0.0,0.001,0.211,0.655,0.001
5,0,0,0,7,3,0,10,0,13,0,0,3,0.005,0.003,0.0,0.268,0.151,0.0,0.186,0.012,0.337,0.001,0.007,0.031
6,1,0,0,0,0,1,0,0,1,1,1,0,0.192,0.006,0.011,0.124,0.001,0.054,0.0,0.016,0.36,0.06,0.175,0.0
7,0,1,0,2,0,16,0,0,0,0,10,1,0.003,0.007,0.019,0.105,0.0,0.409,0.001,0.015,0.02,0.0,0.321,0.099
8,6,0,1,0,2,6,0,3,0,0,1,3,0.213,0.0,0.071,0.121,0.009,0.244,0.001,0.021,0.03,0.0,0.078,0.212
9,0,1,1,0,0,0,0,0,0,0,0,5,0.009,0.134,0.13,0.179,0.014,0.008,0.0,0.031,0.011,0.0,0.0,0.483


In [182]:
# Compare estimated and true expected counts for one item: observed counts
# next to n * estimated probabilities (top half) and n * true probabilities
# (bottom half)
i = 8
mu = theta1 * theta2[i, ]
pi = mu / mu.sum(axis=1, keepdims=True)
observed = pd.DataFrame(y[:, :, i])
row_totals = n[i, ][:, np.newaxis]
res1 = pd.concat([observed, pd.DataFrame(row_totals * pi).round(1)], axis=1)
res2 = pd.concat([observed, pd.DataFrame(row_totals * pit[:, :, i]).round(1)], axis=1)
pd.concat([res1, res2], axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1,10.1,11.1
0,0,0,40,2,8,0,12,0,4,0,0,18,0.0,0.1,31.6,2.2,9.4,0.3,15.4,0.0,4.5,0.3,0.2,20.0
1,0,3,0,0,1,0,0,1,4,0,0,0,0.0,1.7,0.0,0.3,0.8,0.4,0.0,2.4,3.2,0.0,0.0,0.1
2,0,0,13,14,0,3,20,118,5,2,0,0,0.1,0.0,20.8,11.4,0.8,10.7,25.6,96.0,3.9,5.3,0.4,0.1
3,0,0,0,2,1,0,10,0,0,0,0,0,0.0,0.8,0.4,1.9,0.1,0.0,9.1,0.0,0.0,0.0,0.5,0.1
4,11,2,0,0,1,0,11,0,0,9,42,0,8.7,2.6,0.0,0.0,0.7,0.5,9.7,0.0,0.1,9.3,44.3,0.1
5,3,1,0,11,7,0,84,6,18,0,0,4,1.5,0.6,0.0,12.2,10.7,0.0,71.8,4.8,26.8,0.0,0.6,4.9
6,2,0,0,1,0,0,0,0,1,1,3,0,3.7,0.1,0.1,0.4,0.0,0.1,0.0,0.4,2.0,0.2,1.0,0.0
7,3,2,2,6,0,24,1,7,4,0,48,36,1.7,2.7,5.4,8.9,0.0,17.2,1.0,11.0,3.0,0.0,52.3,29.8
8,25,0,5,2,0,3,0,2,0,0,2,12,22.8,0.0,4.2,2.1,0.2,2.1,0.1,3.1,0.9,0.0,2.6,12.9
9,1,12,5,4,2,0,0,1,0,0,0,33,1.0,10.3,7.8,3.2,0.4,0.1,0.0,4.8,0.3,0.0,0.0,30.1
