In [1]:
# ライブラリの読み込み
import itertools
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.linalg
import scipy.sparse
import torch
import torch.nn as nn
import torch.optim as optimizers
from numpy.random import *
from scipy import optimize
from scipy.stats import norm

np.random.seed(9837)
torch.manual_seed(9837)
pd.set_option("display.max_rows", 250)
pd.set_option("display.max_columns", 100)

In [2]:
# 多項分布の乱数を生成する関数
def rmnom(pr, n, k, pattern):
    """Draw one categorical sample per row of a probability matrix.

    Inverse-CDF sampling: a single uniform draw per row is compared with the
    row-wise cumulative probabilities, and the first category whose cumulative
    probability reaches the draw is selected.

    Parameters
    ----------
    pr : array-like, shape (n, k)
        Row-wise category probabilities (non-negative).
    n : int
        Number of rows; one uniform random number is drawn per row.
    k : int
        Number of categories; width of the one-hot matrix.
    pattern : int
        If 1, return ``(z_id, Z)``; for any other value return only ``z_id``.

    Returns
    -------
    z_id : ndarray of int, shape (n,)
        Sampled category index of each row.
    Z : ndarray of int, shape (n, k)
        One-hot encoding of ``z_id`` (only when ``pattern == 1``).
    """
    cum_pr = np.cumsum(pr, axis=1)
    u = np.random.uniform(0, 1, n)[:, np.newaxis]

    # For a non-decreasing cumulative row, the number of thresholds strictly
    # below the draw equals the first index with cum_pr >= u.  Clamping to the
    # last category covers the case where rounding leaves the final cumulative
    # value below the draw; a plain argmax over an all-False row would
    # silently return category 0 there.
    z_id = np.minimum(np.sum(cum_pr < u, axis=1), cum_pr.shape[1] - 1).astype("int")

    if pattern==1:
        Z = np.eye(k, dtype="int")[z_id, ]
        return z_id, Z
    return z_id

# ディリクレ分布の乱数を生成する関数
def Dirichlet(alpha, n):
    """Sample ``n`` Dirichlet vectors with concentration ``alpha``.

    The draws come from NumPy's global random state and are returned wrapped
    in a ``torch.Tensor`` with one sample per row.
    """
    draws = np.random.dirichlet(alpha, n)
    return torch.Tensor(draws)

# データの生成

## 入力データの定義

In [3]:
# Data settings
# Model dimensions
s = 2
syntax1 = 7
syntax2 = np.full(syntax1, 7)
k = 15
d = 5000
v1, v2 = 800, 200
v = v1 + v2

# Corpus size statistics: sentences per document (pt), phrases per sentence (ph)
# and words per phrase (w), each a gamma-mixed Poisson count with a lower bound
min_word = 2
max_word = 50
pt = np.maximum(np.random.poisson(np.random.gamma(10.0, 1.0, d), d), 5)
L = pt.sum()
ph = np.maximum(np.random.poisson(np.random.gamma(17.5, 0.5, L), L), 2)
M = ph.sum()
w = np.maximum(np.random.poisson(np.random.gamma(75.0, 0.05, M), M), min_word)
N = w.sum()

# Helper vectors for matrix operations
k_vec = np.ones(k)
index_k = np.arange(k)
index_v1 = np.arange(v1)
index_v2 = np.arange(v1, v1 + v2)

# Syntax layout: row j of syntax_block lists every syntax index except j
syntax_diag = np.eye(syntax1, dtype="int")
syntax_block = np.tile(np.arange(syntax1), (syntax1, 1))[syntax_diag==0].reshape(syntax1, syntax1-1)

In [4]:
# Define ids and indices
# Ids at the phrase level (suffix 1) and at the word level (suffix 2)
d_id1 = np.repeat(np.repeat(np.arange(d), pt), ph)
sentence_id1 = np.repeat(np.arange(L), ph)
pt_id1 = np.hstack(([np.arange(ph[i]) for i in range(L)]))
d_id2 = np.repeat(d_id1, w)
sentence_id2 = np.repeat(sentence_id1, w)
phrase_id2 = np.repeat(np.arange(M), w)
pt_id2 = np.hstack(([np.arange(w[i]) for i in range(M)]))


def _group_positions(sorted_ids, n_groups):
    """Return, for each id 0..n_groups-1, the array of positions holding it.

    ``sorted_ids`` must be non-decreasing (true for the ids above, which are
    built with ``np.repeat`` over ``np.arange``), so every group is one
    contiguous run and a single split replaces a full scan per group.
    """
    counts = np.bincount(sorted_ids, minlength=n_groups)
    positions = np.arange(sorted_ids.shape[0], dtype="int")
    return np.split(positions, np.cumsum(counts)[:-1])


# Positions belonging to each document / sentence
d_list1 = _group_positions(d_id1, d)
d_list2 = _group_positions(d_id2, d)
sentence_list1 = _group_positions(sentence_id1, L)
sentence_list2 = _group_positions(sentence_id2, L)

# Indices by within-sentence phrase order (suffix 1) and within-phrase word order (suffix 2)
max_pt1 = np.max(pt_id1) + 1
max_pt2 = np.max(pt_id2) + 1
pt_list10 = [j for j in range(max_pt1)]
pt_list11 = [j for j in range(max_pt1)]
pt_list12 = [j for j in range(max_pt1)]
pt_list20 = [j for j in range(max_pt2)]
pt_list21 = [j for j in range(max_pt2)]
pt_n1 = np.repeat(0, max_pt1)
pt_n2 = np.repeat(0, max_pt2)

# Sentence ids with a trailing -1 sentinel, so that the out-of-range
# neighbours (index -1 and index M) never match a real sentence id
id1 = np.append(sentence_id1, -1)
for j in range(max_pt1):
    pt_list11[j] = np.array(np.where(pt_id1==j)[0], dtype="int")
    pt_list10[j] = pt_list11[j] - 1
    pt_list12[j] = pt_list11[j] + 1
    # Neighbours that fall in a different sentence are redirected to the dummy index M
    pt_list10[j][id1[pt_list10[j]]!=id1[pt_list11[j]]] = M
    pt_list12[j][id1[pt_list12[j]]!=id1[pt_list11[j]]] = M
    pt_n1[j] = pt_list11[j].shape[0]
for j in range(max_pt2):
    pt_list21[j] = np.array(np.where(pt_id2==j)[0], dtype="int")
    # pt_list20[0] keeps its placeholder value: the first word has no predecessor
    if j > 0:
        pt_list20[j] = pt_list21[j] - 1
    pt_n2[j] = pt_list21[j].shape[0]

In [5]:
# Generate function words
# Word positions of each phrase: consecutive runs of length w[i]
phrase_start = np.cumsum(w) - w
phrase_list = []
for i in range(M):
    phrase_list.append(phrase_start[i] + np.arange(w[i]))

# Choose the function-word position among the last (up to) three words of
# each phrase, with probability proportional to exp(weights * position)
weights = 1.0
function_flag = np.zeros(N, dtype="int")
for i in range(M):
    x = np.arange(w[i] - 1, -1, -1)[:3]
    logit = np.exp(weights * x)
    prob = logit / logit.sum()
    draw = np.random.multinomial(1, prob, 1).reshape(-1)
    index = x[draw==1]
    function_flag[phrase_list[i][index]] = 1
function_index = np.flatnonzero(function_flag==1).astype("int")

## パラメータと応答変数を生成

In [6]:
# Prior settings used for data generation
# Priors of the HMM part
alpha1 = np.full(syntax1, 0.5)
alpha21 = np.array([0.3, 3.0])
alpha22 = np.array([1.75, 1/1.75])
beta1 = [np.full(syntax2[j], 0.5) for j in range(syntax1)]
beta2 = [np.full(syntax2[j], 0.2) for j in range(syntax1)]

# Priors of the switching probability
delta1 = np.array([0.3, 3.0])
delta2 = np.array([1.75, 1/1.75])

# Priors of the word distributions
gamma1 = [np.full(v1, 0.01) for j in range(syntax1)]
gamma2 = [np.full(v2, 0.01) for j in range(syntax1)]

In [7]:
# Generate the response variables.
# The whole draw (parameters and data) is repeated until the empirical
# switching rate stays inside [lower_prob, upper_prob] for the full corpus.
while True:

    # Generate parameters
    # Transition-probability parameters
    theta11 = np.zeros((syntax1, syntax1))
    for j in range(syntax1):
        theta11[j, syntax_block[j, ]] = np.random.gamma(alpha21[0], alpha21[1], syntax1-1)
    theta12 = np.random.gamma(alpha22[0], alpha22[1], v2*syntax1).reshape(v2, syntax1)
    pi1 = np.random.dirichlet(alpha1, 1)
    pi2 = np.zeros((syntax1, syntax1, v2))
    for j in range(v2):
        logit = theta11 * theta12[j, ]
        pi2[:, :, j] = logit / np.sum(logit, axis=1)[:, np.newaxis]
    psi1 = [np.random.dirichlet(beta1[j], 1).reshape(-1) for j in range(syntax1)]
    psi2 = [np.random.dirichlet(beta2[j], syntax2[j]) for j in range(syntax1)]
    thetat11 = theta11.copy(); thetat12 = theta12.copy()
    pit1 = pi1.copy(); pit2 = pi2.copy(); psit1 = psi1.copy(); psit2 = psi2.copy()

    # Switching probabilities: redraw until their mean lies in the target range
    lower_prob = 0.2; upper_prob = 0.4
    while True:
        theta21 = np.random.gamma(delta1[0], delta1[1], s*syntax1).reshape(s, syntax1)
        # Uses the switching prior delta2 (same values as alpha22 in the prior cell)
        theta22 = np.random.gamma(delta2[0], delta2[1], s*syntax1*v2).reshape(s, syntax1, v2)
        kappa = np.zeros((v2, syntax1))
        for j in range(v2):
            logit = theta21 * theta22[:, :, j]
            kappa[j, ] = (logit / np.sum(logit, axis=0))[0, ]
        if (np.mean(kappa) >= lower_prob) & (np.mean(kappa) <= upper_prob):
            break
    thetat21 = theta21.copy(); thetat22 = theta22.copy()
    kappat = kappa.copy()

    # Word-distribution parameters
    phi1 = []; phi2 = []; phi = []
    for j in range(syntax1):
        phi1.append(np.random.dirichlet(gamma1[j], syntax2[j]))
        phi2.append(np.random.dirichlet(gamma2[j], syntax2[j]))
        phi.append(np.hstack((phi1[j], phi2[j])))
    phit = phi.copy(); phit1 = phi1.copy(); phit2 = phi2.copy()


    # Generate latent variables and words
    # Storage for the generated data
    S = np.repeat(0, M)
    Z1 = np.zeros((M, syntax1), dtype="int")
    Z2 = np.zeros((N, np.max(syntax2)), dtype="int")
    z1 = np.repeat(0, M)
    z2 = np.repeat(0, N)
    word_id = np.repeat(0, N)
    function_word = np.repeat(0, M)

    # Explicit completion flag: checking the leaked loop index after the loop
    # cannot distinguish "finished" from "aborted on the last index"
    completed = True

    # Generate latent variables and words phrase by phrase
    for i in range(M):
        if (i != 0) and (i%10000==0):
            switch_rate = np.mean(S[:i])
            print([i, np.round(switch_rate, 3)])

            # Start over if the switching rate has left the target range
            if (switch_rate < lower_prob) or (switch_rate > upper_prob):
                completed = False
                break

        # Upper-layer latent variable
        if pt_id1[i]==0:
            # First phrase of a sentence: draw from the initial distribution
            Z1[i, ] = np.random.multinomial(1, pi1.reshape(-1), 1).reshape(-1)
            z1[i] = np.argmax(Z1[i, ])
        else:
            # Switching variable; a scalar draw avoids assigning a size-1
            # array to a scalar element (deprecated in recent NumPy)
            prob = kappa[function_word[i-1], z1[i-1]]
            S[i] = np.random.binomial(1, prob)

            # S[i]==1 keeps the previous syntax, otherwise a transition is drawn
            if S[i]==1:
                Z1[i, ] = Z1[i-1, ]
                z1[i] = z1[i-1]
            else:
                Z1[i, ] = np.random.multinomial(1, pi2[z1[i-1], :, function_word[i-1]], 1).reshape(-1)
                z1[i] = np.argmax(Z1[i, ])

        # Lower-layer latent variables and words
        # Parameters of the selected upper-layer syntax
        index = phrase_list[i]
        psi01 = psi1[z1[i]]; psi02 = psi2[z1[i]]
        phi01 = phi1[z1[i]]; phi02 = phi2[z1[i]]

        # Word by word: latent state first, then the word itself
        for j in range(w[i]):
            pos = index[j]

            # The first word uses the initial distribution, later words the
            # transition row of the previous word's state
            if j==0:
                state_prob = psi01
            else:
                state_prob = psi02[z2[index[j-1]], ]
            Z2[pos, ] = np.random.multinomial(1, state_prob, 1)
            z2[pos] = np.argmax(Z2[pos, ])

            # Content words come from phi01; the flagged function word comes
            # from phi02 and is offset by v1 in the joint vocabulary
            if function_flag[pos]==0:
                word_id[pos] = np.argmax(np.random.multinomial(1, phi01[z2[pos], ], 1).reshape(-1))
            else:
                function_word[i] = np.argmax(np.random.multinomial(1, phi02[z2[pos], ], 1).reshape(-1))
                word_id[pos] = function_word[i] + v1

    # Finish only when the phrase loop ran to completion
    if completed:
        break

[10000, 0.326]
[20000, 0.319]
[30000, 0.325]
[40000, 0.324]
[50000, 0.322]
[60000, 0.324]
[70000, 0.324]
[80000, 0.325]
[90000, 0.327]
[100000, 0.327]
[110000, 0.325]
[120000, 0.326]
[130000, 0.326]
[140000, 0.325]
[150000, 0.325]
[160000, 0.324]
[170000, 0.325]
[180000, 0.326]
[190000, 0.327]
[200000, 0.326]
[210000, 0.327]
[220000, 0.327]
[230000, 0.327]
[240000, 0.328]
[250000, 0.328]
[260000, 0.329]
[270000, 0.328]
[280000, 0.328]
[290000, 0.328]
[300000, 0.328]
[310000, 0.327]
[320000, 0.327]
[330000, 0.327]
[340000, 0.327]
[350000, 0.327]
[360000, 0.327]
[370000, 0.327]
[380000, 0.327]
[390000, 0.327]
[400000, 0.327]
[410000, 0.326]
[420000, 0.327]
[430000, 0.326]
[440000, 0.327]
[450000, 0.326]


In [8]:
# Attach supervision to part of the data
# Supervise the latent syntax: pick q target syntaxes and label a random
# subset of the phrases that belong to them
q = 3
supervised_prob = 0.4
target_syntax = np.random.choice(np.arange(syntax1), q, replace=False)
index = np.where(np.isin(z1, target_syntax))[0].astype("int")
y = np.repeat(0, M)
y[index] = np.random.binomial(1, supervised_prob, index.shape[0])
index_y0 = np.where(y==0)[0].astype("int")
index_y1 = np.where(y==1)[0].astype("int")
Y1 = Z1[index_y1, ]
Y0 = Z1[index_y0, ]

# Supervise the switching variable.
# x is a length-M flag vector (like y) so that index_x0 / index_x1 are phrase
# positions valid for indexing S; drawing x with length len(index) and using
# its positions directly would select unrelated phrases.
supervised_prob = 0.25
x = np.repeat(0, M)
x[index] = np.random.binomial(1, supervised_prob, index.shape[0])
index_x0 = np.where(x==0)[0].astype("int")
index_x1 = np.where(x==1)[0].astype("int")
X1 = S[index_x1, ]
X0 = S[index_x0, ]

# Hierarchical Switching Unsupervised phrase estimationを推定

In [9]:
# Algorithm settings
R = 1000                      # number of iterations
keep = 2                      # thinning interval
burnin = 500                  # burn-in iterations
skeep = burnin // keep        # burn-in length in units of kept samples
iters = 0
disp = 50
serial_no = np.arange(N)
syntax_vec1 = np.repeat(1, syntax1)
syntax_vec2 = [np.repeat(1, syntax2[i]) for i in range(syntax1)]

# Phrase-by-word indicator matrix (M x N): entry (p, n) is 1 when word n
# belongs to phrase p.  Requires scipy.sparse to be imported explicitly.
phrase_matrix = scipy.sparse.csr_matrix((np.repeat(1, N), (phrase_id2, np.arange(N))), shape=(M, N))

In [10]:
# Index definitions
# Corpus positions of every vocabulary item (content words, then function words)
word_list1 = []; word_vec1 = []
for i in range(v1):
    word_list1.append(np.flatnonzero(word_id==i).astype("int"))
    word_vec1.append(np.repeat(1, word_list1[i].shape[0]))
word_list2 = []; word_vec2 = []
for i in range(v2):
    word_list2.append(np.flatnonzero(word_id==v1+i).astype("int"))
    word_vec2.append(np.repeat(1, word_list2[i].shape[0]))

# Head and tail positions: first phrase of each sentence / first word of each
# phrase, and the last phrase of each sentence / last word of each phrase
index_p11 = np.flatnonzero(pt_id1==0).astype("int")
index_q11 = np.flatnonzero(pt_id2==0).astype("int")
index_p12 = np.zeros(L, dtype="int")
index_q12 = np.zeros(M, dtype="int")
for i in range(L):
    index_p12[i] = sentence_list1[i].max()
for i in range(M):
    index_q12[i] = phrase_list[i].max()

# Interior positions: for every order j >= 1 the positions at order j
# (…22 lists) and their immediate predecessors (…21 lists)
index_list_p21 = []; index_list_p22 = []
for j in range(1, max_pt1):
    index_list_p22.append(np.flatnonzero(pt_id1==j).astype("int"))
    index_list_p21.append(np.flatnonzero(pt_id1==j).astype("int") - 1)
index_list_q21 = []; index_list_q22 = []
for j in range(1, max_pt2):
    index_list_q22.append(np.flatnonzero(pt_id2==j).astype("int"))
    index_list_q21.append(np.flatnonzero(pt_id2==j).astype("int") - 1)
index_p21 = np.sort(np.hstack((index_list_p21)))
index_p22 = np.sort(np.hstack((index_list_p22)))
index_q21 = np.sort(np.hstack((index_list_q21)))
index_q22 = np.sort(np.hstack((index_list_q22)))

# Function-word indices: the last phrase of each sentence is given the
# out-of-vocabulary code v2 so it is excluded from the per-word lists below
new_function = function_word.copy()
new_function[index_p12] = v2
function_list1 = []; function_list2 = []
for j in range(v2):
    function_list1.append(np.flatnonzero(new_function==j).astype("int"))
    function_list2.append(function_list1[j] + 1)

In [11]:
# Prior settings used for estimation
# Priors of the HMM part
alpha1 = np.full(syntax1, 0.5)
alpha21 = np.full(2, 0.5)
alpha22 = np.ones(2)
beta1 = 0.2
beta2 = 0.2

# Priors of the switching probability
delta1 = np.full(2, 0.5)
delta2 = np.ones(2)

# Priors of the word distributions
gamma1 = 0.1
gamma2 = 0.1

In [12]:
# Set the parameters to their true (data-generating) values
# Transition-probability parameters
theta11 = np.copy(thetat11)
theta12 = np.copy(thetat12)
pi1 = np.copy(pit1)
pi2 = np.copy(pit2)
psi1 = list(psit1)
psi2 = list(psit2)

# Switching probabilities
theta21 = np.copy(thetat21)
theta22 = np.copy(thetat22)
kappa = np.copy(kappat)

# Word distributions (content and function vocabularies side by side)
phi1 = list(phit1)
phi2 = list(phit2)
phi = [np.hstack((phi1[j], phi2[j])) for j in range(syntax1)]

# Latent variables; the one-hot matrices are collapsed to integer labels
Si = np.copy(S)
Zi1 = np.copy(Z1)
Zi2 = np.copy(Z2)
z1 = Zi1 @ np.arange(syntax1)
z2 = Zi2 @ np.arange(np.max(syntax2))

In [13]:
# Initial parameter values
# Transition-probability parameters
theta11 = np.zeros((syntax1, syntax1))
for j in range(syntax1):
    theta11[j, syntax_block[j, ]] = np.random.gamma(0.2, 0.2, syntax1-1)
theta12 = np.random.gamma(0.2, 0.2, v2*syntax1).reshape(v2, syntax1)
pi1 = np.random.dirichlet(np.full(syntax1, 100.0), 1).reshape(-1)
pi2 = np.zeros((syntax1, syntax1, v2))
for j in range(v2):
    logit = theta11 * theta12[j, ]
    pi2[:, :, j] = logit / np.sum(logit, axis=1)[:, np.newaxis]
psi1 = [np.random.dirichlet(np.full(syntax2[j], 100.0), 1).reshape(-1) for j in range(syntax1)]
psi2 = [np.random.dirichlet(np.full(syntax2[j], 100.0), syntax2[j]) for j in range(syntax1)]

# Switching probabilities
theta21 = np.random.gamma(0.2, 0.2, s*syntax1).reshape(s, syntax1)
theta22 = np.random.gamma(0.2, 0.2, s*syntax1*v2).reshape(s, syntax1, v2)
kappa = np.zeros((v2, syntax1))
for j in range(v2):
    logit = theta21 * theta22[:, :, j]
    kappa[j, ] = (logit / np.sum(logit, axis=0))[0, ]

# Word distributions
phi1 = []; phi2 = []; phi = []
for j in range(syntax1):
    phi1.append(np.random.dirichlet(np.full(v1, 100.0), syntax2[j]))
    phi2.append(np.random.dirichlet(np.full(v2, 100.0), syntax2[j]))
    phi.append(np.hstack((phi1[j], phi2[j])))

# Latent variables: random one-hot assignments, zero-padded on the right
# when the lower-layer syntax counts differ
Si = np.zeros(M, dtype="int")
Zi1 = np.random.multinomial(1, np.random.dirichlet(np.full(syntax1, 100.0)).reshape(-1), M)
Zi2 = np.random.multinomial(1, np.random.dirichlet(np.full(np.min(syntax2), 100.0)), N)
n_pad = np.max(syntax2) - np.min(syntax2)
if n_pad > 0:
    Zi2 = np.hstack((Zi2, np.zeros((N, n_pad), dtype="int")))
z1 = Zi1 @ np.arange(syntax1)
z2 = Zi2 @ np.arange(np.max(syntax2))

## パラメータの推定

In [13]:
# Sample parameters by Gibbs sampling

# For every upper-layer syntax, sample the lower-layer syntax of all words
# Storage indexed by upper-layer syntax
Syntax = np.zeros((N, np.max(syntax2), syntax1), dtype="int")
Lho = np.zeros((N, syntax1))

# Sample the lower-layer syntax conditional on upper-layer syntax i
for i in range(syntax1):

    # Storage for the latent variables under syntax i
    Zi2 = np.zeros((N, syntax2[i]), dtype="int")
    z2 = np.repeat(0, N)

    # Per-word likelihood of each lower-layer state
    phi_long = (phi[i].T)[word_id, ]

    # Sample in word order so each word can condition on its predecessor
    for j in range(max_pt2):
        index = pt_list21[j]

        # The first word of a phrase uses the initial distribution; later
        # words use the transition row of the previous word's sampled state
        if j==0:
            prior = psi1[i]
        else:
            prior = psi2[i][z2[pt_list20[j]], ]

        # Unnormalised and normalised state probabilities
        Posterior2 = prior * phi_long[index, ]
        Prob2 = Posterior2 / np.dot(Posterior2, syntax_vec2[i])[:, np.newaxis]

        # Draw the states from the multinomial distribution
        res = rmnom(Prob2, pt_n2[j], syntax2[i], 1)
        z2[index] = res[0]
        Zi2[index, :syntax2[i]] = res[1]

        # Upper-layer likelihood: unnormalised probability of the sampled state
        Lho[index, i] = np.dot(res[1] * Posterior2, syntax_vec2[i])

    # Store the one-hot states; slicing keeps this valid when syntax i has
    # fewer lower-layer states than the widest syntax
    Syntax[:, :syntax2[i], i] = Zi2

In [14]:
# Phrase-level log-likelihood used to sample the switching variable:
# sum the per-word log-likelihoods of each phrase for every upper-layer syntax.
# The sparse @ dense product returns a dense (M, syntax1) array directly, so
# no sparse conversion of log(Lho) and no np.dot on sparse matrices is needed.
LL = phrase_matrix @ np.log(Lho)

In [73]:
# Scratch cell: display the LL array.
LL

array([[ -57.49680659,  -49.8241187 ,  -39.33223007, ...,  -35.83202115,
         -52.49124121,  -14.28367089],
       [ -31.30920564,  -24.68290083,  -35.25295289, ...,  -43.47087211,
         -13.42149225,  -42.50706249],
       [ -77.9948751 ,  -89.6841648 ,  -62.46406634, ...,  -84.76253461,
        -115.32132517,  -16.70801202],
       ...,
       [ -35.36332412,   -7.43506843,  -26.19633701, ...,  -16.01784863,
         -35.81241326,  -18.25171396],
       [ -23.02590346,   -9.78023572,  -14.25179287, ...,  -20.95616352,
         -33.8432739 ,  -36.86423418],
       [ -98.27296614,  -16.82165612,  -40.36142672, ...,  -51.27343254,
         -77.47322914,  -57.45162757]])

In [70]:
# Scratch cell: display the index array stored in pt_list11 for j = 1.
# Note that this overwrites the global loop variable j.
j = 1
pt_list11[j]


array([     1,      8,     12, ..., 451395, 451412, 451422])

In [19]:
# Scratch cell: display the rows of kappa selected by function_word.
kappa[function_word, ]

array([[2.01977972e-02, 9.99914826e-01, 4.59419784e-02, ...,
        2.00722759e-01, 2.18878493e-01, 8.00506609e-02],
       [4.17749850e-02, 9.99986626e-01, 1.77324836e-03, ...,
        1.94018500e-01, 4.99849718e-01, 3.22831835e-01],
       [9.63103467e-02, 9.99731430e-01, 9.11052547e-04, ...,
        6.31239039e-01, 2.16905843e-01, 2.60336732e-01],
       ...,
       [3.94534507e-01, 9.99509105e-01, 4.87629730e-03, ...,
        2.73850616e-01, 2.57363875e-01, 4.76156018e-02],
       [1.07750555e-02, 9.99913747e-01, 1.43185777e-03, ...,
        3.31594061e-02, 1.98475816e-01, 2.73604923e-02],
       [3.94534507e-01, 9.99509105e-01, 4.87629730e-03, ...,
        2.73850616e-01, 2.57363875e-01, 4.76156018e-02]])

In [18]:
# Scratch cell: show only the shape of function_word (the recorded output is
# a shape tuple) instead of dumping the whole array.
function_word.shape

(451431,)