In [None]:
# Load libraries
import itertools
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import scipy.linalg
import scipy.sparse
import torch
import torch.nn as nn
import torch.optim as optimizers
from scipy.stats import norm
from numpy.random import *
from scipy import optimize

# Seed the NumPy and PyTorch global random number generators
np.random.seed(9837)
torch.manual_seed(9837)
# Raise the pandas display limits for rows and columns
pd.set_option("display.max_rows", 250)
pd.set_option("display.max_columns", 100)

In [None]:
# 多項分布の乱数を生成する関数
def rmnom(pr, n, k, pattern):
    """Draw one categorical sample per row of a probability matrix (inverse-CDF method).

    One uniform variate is drawn per row from the global NumPy RNG and the
    first category whose cumulative probability reaches it is selected.

    Parameters
    ----------
    pr : array-like, shape (n, k)
        Row-wise category probabilities. Rows are expected to sum to 1; any
        mass missing from a row is assigned to the last category.
    n : int
        Number of rows, i.e. number of draws.
    k : int
        Number of categories; used as the width of the one-hot matrix.
    pattern : int
        If 1, return ``(z_id, Z)``; for any other value return only ``z_id``.

    Returns
    -------
    z_id : ndarray of int, shape (n,)
        Sampled category index of every row.
    Z : ndarray of int, shape (n, k)
        One-hot encoding of ``z_id`` (returned only when ``pattern == 1``).
    """
    u = np.random.uniform(0, 1, n)[:, np.newaxis]
    reached = np.cumsum(pr, axis=1) >= u
    # If rounding leaves the final cumulative sum just below the uniform draw,
    # the whole row is False and argmax would silently return category 0.
    # Let the last category absorb the residual mass instead.
    reached[:, -1] = True
    z_id = np.array(np.argmax(reached, axis=1), dtype="int")
    if pattern == 1:
        Z = np.eye(k, dtype="int")[z_id, ]
        return z_id, Z
    return z_id

# ディリクリ分布の乱数を生成する関数
def Dirichlet(alpha, n):
    """Return ``n`` draws from Dirichlet(``alpha``) as a torch tensor.

    The draws come from the global NumPy RNG and are converted with
    ``torch.Tensor``; the result has one row per draw.
    """
    draws = np.random.dirichlet(alpha, size=n)
    return torch.Tensor(draws)

# データの生成

## 入力データの定義

In [None]:
# Data settings
# Define the model dimensions
types = 2
min_word = 5
max_word = 50   # NOTE(review): reassigned to 30 in the prior-definition cell before any visible use — confirm the intended value
k11 = 5   # number of syntax states for topic words
k12 = 7   # number of syntax states for general words
k1 = k11 + k12   # total number of syntax states
k21 = 15   # number of topics for topic words
k22 = 10   # number of topics for general words
k2 = k21 + k22   # total number of topics
k3 = 15   # number of topics for word classes
d = 5000   # number of documents
v11 = 1000  # vocabulary size of topic words
v12 = 400   # vocabulary size of general words
v1 = v11 + v12   # total vocabulary size
v2 = 100   # number of word classes

# Generate the document statistics
# pt: per-document counts (Poisson with Gamma-distributed rate, floored at 5); used below as sentences per document
pt = np.random.poisson(np.random.gamma(12.5, 1.0, d), d)
pt[pt < 5] = 5
M = np.sum(pt)   # total number of sentences
# w: per-sentence word counts (Poisson with Gamma-distributed rate, floored at min_word)
w = np.random.poisson(np.random.gamma(17.5, 1.5, np.sum(pt)), np.sum(pt))
w[w < min_word] = min_word
N = np.sum(w)   # total number of words

# Helper vectors and index ranges used in matrix operations
k_vec1 = np.repeat(1.0, k1)
k_vec2 = np.repeat(1.0, k2)
k_vec21 = np.repeat(1.0, k21)
k_vec22 = np.repeat(1.0, k22)
index_k11 = np.arange(k11)
index_k12 = np.arange(k12) + k11
index_k21 = np.arange(k21)
index_k22 = np.arange(k22) + k21
index_v11 = np.arange(v11)
index_v12 = np.arange(v12) + v11

In [None]:
# Define IDs and index lists
# Sentences are stored contiguously per document and words contiguously per
# sentence, so every ID vector below is sorted. The per-ID index lists are
# therefore obtained by splitting np.arange at cumulative-count boundaries,
# which is O(N) instead of one full np.where scan per document / sentence.

# ID definitions
doc_id = np.repeat(np.arange(d), pt)   # document of every sentence (length M)
doc_list = np.split(np.arange(M), np.cumsum(pt)[:-1])   # sentence indices of every document
m = np.array([np.sum(w[index]) for index in doc_list], dtype="int")   # number of words per document
d_id = np.repeat(np.arange(d), m)   # document of every word (length N)
sentence_id = np.repeat(np.arange(M), w)   # sentence of every word (length N)
pt_id = np.hstack(([np.arange(w[i]) for i in range(M)]))   # position of every word inside its sentence

# Word indices of every document and of every sentence
d_list = np.split(np.arange(N), np.cumsum(m)[:-1])
sentence_list = np.split(np.arange(N), np.cumsum(w)[:-1])

# Word indices grouped by position inside the sentence
max_pt = np.max(pt_id) + 1
pt_list = [np.array(np.where(pt_id==j)[0], dtype="int") for j in range(max_pt)]
pt_n = np.array([index.shape[0] for index in pt_list], dtype="int")   # number of words at every position

## パラメータと応答変数を生成

In [None]:
# Prior definitions used for data generation
# Dirichlet priors of the HMM: initial states (k1) and transitions (k1 syntax states + 1 extra state)
alpha1 = np.ones(k1)
alpha2 = np.concatenate((np.full(k1, 0.5), [5.0]))

# Dirichlet priors of the topic distributions (topic-word vocabulary / general-word vocabulary)
beta1 = np.concatenate((np.full(k21, 0.1), np.full(k22, 0.075)))
beta2 = np.concatenate((np.full(k21, 0.05), np.full(k22, 0.1)))

# Dirichlet priors of the word distributions
max_word = 30
# Syntax-state word prior: rows index_k11 / index_k12 get a larger weight on their own vocabulary block
gamma11 = np.full((k1, v11), 0.01)
gamma11[index_k11] = 0.05
gamma12 = np.full((k1, v12), 0.005)
gamma12[index_k12] = 0.05
gamma1 = np.concatenate((gamma11, gamma12), axis=1)
# Topic word prior: rows index_k21 / index_k22 get a larger weight on their own vocabulary block
gamma21 = np.full((k2, v11), 0.0025)
gamma21[index_k21] = 0.025
gamma22 = np.full((k2, v12), 0.001)
gamma22[index_k22] = 0.025
gamma2 = np.concatenate((gamma21, gamma22), axis=1)

In [None]:
# Generate the parameters
# Generate the HMM initial and transition probabilities
# pi1 gets a trailing 0.0, so the extra state with index k1 is never drawn for the first word
pi1 = np.append(np.random.dirichlet(alpha1, 1), 0.0).reshape(-1)
# Redraw the transition matrix until the mean probability of moving into state k1 lies in (0.45, 0.6)
while True:
    pi2 = np.random.dirichlet(alpha2, k1+1)
    if (np.mean(pi2[:, k1]) > 0.45) & (np.mean(pi2[:, k1]) < 0.6):
        break
pit1 = pi1.copy(); pit2 = pi2.copy()   # keep copies as the true values

# Generate the attention logits and the per-word topic distributions from Dirichlet distributions
kappa = np.random.normal(0, 0.75, v1)
theta = np.vstack((np.random.dirichlet(beta1, v11), 
                   np.random.dirichlet(beta2, v12)))
kappat = kappa.copy(); thetat = theta.copy()   # keep copies as the true values

# Draw the word distributions from their Dirichlet priors (psi: one row per syntax state, phi: one row per topic)
psi = np.array([np.random.dirichlet(gamma1[j, ], 1).reshape(-1) for j in range(k1)])
phi = np.array([np.random.dirichlet(gamma2[j, ], 1).reshape(-1) for j in range(k2)])
psit = psi.copy(); phit = phi.copy()   # keep copies as the true values

In [None]:
# Generate the response variables
# Arrays that store the generated data
S = np.zeros((N, k1+1), dtype="int")   # one-hot state (k1 syntax states + the attention state k1)
s = np.repeat(0, N)   # state index of every word
Z = np.zeros((N, k2), dtype="int")   # one-hot topic
z = np.repeat(-1, N)   # topic index (-1 where no topic was drawn)
word_id = np.repeat(0, N).astype("int16")
word_long = np.full((N, max_pt), -1, dtype="int16")   # word history of the sentence up to each position (-1 = empty)
attention_id = np.repeat(-1, N).astype("int16")   # attended word (-1 where no attention was drawn)

# Generate topics and words, one sentence position at a time
for j in range(max_pt):

    # Generation depends on the position inside the sentence
    if j==0:
        
        # Generate the first word of every sentence
        # Draw the syntax state from a multinomial distribution
        index = pt_list[j]
        S[index, ] = np.random.multinomial(1, pi1, pt_n[j])
        s[index] = np.dot(S[index, ], np.arange(k1+1))

        # Generate the word
        word_id[index] = rmnom(psi[s[index], ], pt_n[j], v1, 0)
        word_long[index, j] = word_id[index, ]
        
    else:
        
        # Generate the second and later words
        # Draw the state from the transition row of the previous word's state
        index = pt_list[j]
        res = rmnom(pi2[s[index-1], ], pt_n[j], k1+1, 1)
        S[index, ] = res[1]
        s[index] = res[0]

        # Update the word history: copy the previous word's history into the current row
        for q in range(j):
            word_long[index, q] = word_long[index-1, q]
        # Candidate columns: all previous positions, or only the last max_word of them
        if j < max_word:
            index_col = np.arange(j)
        else:
            index_col = np.arange(j-max_word, j)

        # Split the positions into syntax-generated (state < k1) and attention-generated (state == k1)
        index_hmm = index[np.array(np.where(res[1][:, k1]==0)[0], dtype="int")]
        index_attention = index[np.array(np.where(res[1][:, k1]==1)[0], dtype="int")]  
        m1 = index_hmm.shape[0]
        m2 = index_attention.shape[0]
        
        if m2 > 0:
            # Choose the attended word among the candidates with softmax(kappa) probabilities
            candidate_word = word_long[index_attention-1, ][:, index_col]
            logit = kappa[candidate_word, ]
            prob = np.exp(logit) / np.sum(np.exp(logit), axis=1)[:, np.newaxis]
            word = np.sum(candidate_word * rmnom(prob, m2, prob.shape[1], 1)[1], axis=1)
            attention_id[index_attention] = word

            # Draw the topic from the attended word's topic distribution
            res = rmnom(theta[word, ], word.shape[0], k2, 1)
            Z[index_attention, ] = res[1]
            z[index_attention] = res[0]

        # Generate the words: from psi for syntax positions, from phi for attention positions
        word_id[index_hmm] = rmnom(psi[s[index_hmm], ], m1, v1, 0)
        word_id[index_attention] = rmnom(phi[z[index_attention], ], m2, v1, 0)
        word_long[index, j] = word_id[index]

In [None]:
# Attach supervision to part of the data
# Every word generated through the attention state (s == k1) is labelled
# (y = 1) with probability supervised_prob; all other words keep y = 0.
supervised_prob = 0.25
index_topic = np.where(s==k1)[0].astype("int")
y = np.repeat(0, N)
y[index_topic] = np.random.binomial(1, supervised_prob, index_topic.shape[0])
index_y0 = np.where(y==0)[0].astype("int")
index_y1 = np.where(y==1)[0].astype("int")
wd = word_id[index_y1]

# Build the set of candidate attention words of every position
attention_id_ = attention_id.copy()
attention_set = np.full((N, max_word), -1, dtype="int16")
for j in range(1, max_pt):
    index = pt_list[j]
    # Early positions take the first max_word history columns (unused ones stay -1);
    # later positions take the max_word words immediately preceding them.
    if j < max_word:
        index_col = np.arange(max_word)
    else:
        index_col = np.arange(j-max_word, j)
    attention_set[index, ] = word_long[index-1, ][:, index_col]
# Candidate sets used for sampling: rows of supervised positions are blanked out
word_set = attention_set.copy()
word_set[y==1] = -1

# Index lists of the candidate sets.
# Entry 0 is an empty placeholder; entry j+1 holds the rows that have a
# candidate in column j (set_list) and the candidate word ids (set_id).
set_list = [np.array([])]
set_id = [np.array([])]
for j in range(max_word):
    set_list.append(np.array(np.where(word_set[:, j]!=-1)[0], dtype="int"))
    set_id.append(word_set[set_list[j+1], j])
# Positions whose attention must be sampled: not sentence-initial and not supervised
set_index = np.where((pt_id!=0) & (y!=1))[0].astype("int")

In [None]:
# Define the one-hot matrix of the attention latent variable
# For every position that attended to a word, mark the first candidate column
# holding that word. Done with one vectorised comparison instead of a Python
# loop that calls np.where once per position.
attention_index = np.where(attention_id!=-1)[0].astype("int")
D = np.full((N, max_word), 0)
match = attention_set[attention_index, ] == attention_id_[attention_index][:, np.newaxis]
if not np.all(np.any(match, axis=1)):
    # Same failure as indexing an empty match list: the attended word must be among the candidates
    raise IndexError("attended word not found in the candidate attention set")
D[attention_index, np.argmax(match, axis=1)] = 1   # argmax returns the first matching column
d_supervised = D[index_y1, ]
allocation_id = np.sum(D * attention_set, axis=1)[attention_index, ]

# Count how often every word was attended to.
# From here on kappa / kappat hold these counts and replace the logits used for generation.
kappa = np.repeat(0, v1)
freq = np.unique(allocation_id, return_counts=True)
kappa[freq[0]] = freq[1]
kappat = kappa.copy()

# Semi-Supervised Self Attention LDAを推定

In [None]:
# Sampler settings
R = 1000   # number of Gibbs iterations
keep = 2   # thinning interval
burnin = 500   # iterations discarded before results are stored
skeep = burnin // keep   # number of thinned draws falling into the burn-in
iters = 0
disp = 10   # progress is printed every disp iterations
latent_n = k1 + max_word   # syntax states plus candidate attention slots
latent_vec = np.ones(latent_n)
max_vec = np.ones(max_word, dtype="int")

In [None]:
# Index definitions
# Positions of every vocabulary item, plus matching vectors of ones used to sum over them
word_list1 = [np.where(word_id==i)[0].astype("int") for i in range(v11)]
word_vec1 = [np.repeat(1, index.shape[0]) for index in word_list1]
word_list2 = [np.where(word_id==v11+i)[0].astype("int") for i in range(v12)]
word_vec2 = [np.repeat(1, index.shape[0]) for index in word_list2]

# First word of every sentence and last word of every document
max_pt = np.max(pt_id) + 1
index_t11 = np.where(pt_id==0)[0].astype("int")
index_t12 = np.array([np.max(d_list[i]) for i in range(d)], dtype="int")

# Per-position indices of the non-initial words (t22) and of their predecessors (t21)
index_list_t22 = [np.where(pt_id==j)[0].astype("int") for j in range(1, max_pt)]
index_list_t21 = [index - 1 for index in index_list_t22]
# The sorted union over all positions j >= 1 is simply every word with pt_id != 0,
# so it is taken directly instead of chaining the per-position arrays through
# Python lists and sorting them again.
index_t22 = np.where(pt_id!=0)[0].astype("int")
index_t21 = index_t22 - 1

In [None]:
# Priors used by the sampler (these overwrite the priors used for data generation)
# Dirichlet priors of the HMM initial and transition probabilities
alpha1 = np.ones(k1)
alpha2 = np.ones(k1+1)

# Symmetric Dirichlet priors of the topic distributions
beta1 = beta2 = 0.1

# Symmetric Dirichlet priors of the word distributions
gamma1 = gamma2 = 0.05

In [None]:
# Set the parameters to their true values
# Transition probabilities, attention counts and topic distributions
pi1, pi2 = pit1.copy(), pit2.copy()
kappa, theta = kappat.copy(), thetat.copy()

# Word distributions, renormalised separately inside each vocabulary block
psi1 = psit[:, index_v11] / psit[:, index_v11].sum(axis=1, keepdims=True)
psi2 = psit[:, index_v12] / psit[:, index_v12].sum(axis=1, keepdims=True)
psi = np.concatenate((psi1, psi2), axis=1)
phi1 = phit[:, index_v11] / phit[:, index_v11].sum(axis=1, keepdims=True)
phi2 = phit[:, index_v12] / phit[:, index_v12].sum(axis=1, keepdims=True)
phi = np.concatenate((phi1, phi2), axis=1)

# Latent variables: one-hot matrices and the index vectors derived from them
Si, Di, Zi = S.copy(), D.copy(), Z.copy()
s = Si @ np.arange(k1+1)
z = Zi @ np.arange(k2)

In [None]:
# Initial values of the parameters
# Initial values of the transition probabilities and topic distributions
# pi1 gets a trailing 0.0 for the attention state k1
pi1 = np.append(np.random.dirichlet(np.repeat(1.0, k1), 1), 0.0).reshape(-1)
pi2 = np.random.dirichlet(np.repeat(1.0, k1+1), k1+1)
theta = np.vstack((np.random.dirichlet(np.repeat(beta1, k2), v11), 
                   np.random.dirichlet(np.repeat(beta2, k2), v12)))

# Initial per-word attention counts: n counts spread over the vocabulary by a multinomial draw
n = int(np.mean(pi2[:, k1])*N)
prob = np.random.dirichlet(np.repeat(10.0, v1), 1).reshape(-1)
kappa = np.random.multinomial(n, prob, 1).reshape(-1)

# Initial values of the word distributions (drawn separately for each vocabulary block)
psi1 = np.random.dirichlet(np.repeat(1.0, v11), k1)
psi2 = np.random.dirichlet(np.repeat(1.0, v12), k1)
psi = np.hstack((psi1, psi2))
phi1 = np.random.dirichlet(np.repeat(1.0, v11), k2)
phi2 = np.random.dirichlet(np.repeat(1.0, v12), k2)
phi = np.hstack((phi1, phi2))

# Initial values of the latent variables
Si = np.zeros((N, k1+1), dtype="int")
Si[index_y1, k1] = 1   # supervised positions are fixed to the attention state
Si[index_y0, ] = np.random.multinomial(1, np.random.dirichlet(np.repeat(1.0, k1+1), 1).reshape(-1), index_y0.shape[0])
Zi = np.random.multinomial(1, np.random.dirichlet(np.repeat(1.0, k2), 1).reshape(-1), N)
s = np.dot(Si, np.arange(k1+1))
z = np.dot(Zi, np.arange(k2))

In [None]:
# Arrays that store the sampling results
# Indices of the thinned draws kept after the burn-in
RS = np.arange(skeep, R // keep)
rs = len(RS)

# Storage for the transition probabilities and topic distributions
PI1 = np.zeros((rs, k1+1))
PI2 = np.zeros((k1+1, k1+1, rs))
THETA = np.zeros((v1, k2, rs))

# Storage for the word distributions
PSI = np.zeros((k1, v1, rs))
PHI = np.zeros((k2, v1, rs))

# Accumulators for the latent variables (state, attention slot, topic)
SEG1 = np.zeros((N, k1+1))
SEG2 = np.zeros((N, max_word))
SEG3 = np.zeros((N, k2))

In [None]:
# Reference values of the log-likelihood
# Baseline of a unigram model: empirical word frequencies, normalised separately per vocabulary block
freq = np.unique(word_id, return_counts=True)
par1 = np.repeat(0.0, v11)
par2 = np.repeat(0.0, v12)
par1[freq[0][freq[0] < v11]] = freq[1][freq[0] < v11]
par2[freq[0][freq[0] >= v11] - v11] = freq[1][freq[0] >= v11]
LLst1 = np.sum(np.log((par1 / np.sum(par1))[word_id[word_id < v11]]))
LLst2 = np.sum(np.log((par2 / np.sum(par2))[word_id[word_id >= v11] - v11]))
LLst = LLst1 + LLst2
print(np.round([LLst1, LLst2, LLst], 1))

# Log-likelihood at the true values: psit for syntax-generated words, phit for topic-generated words
index_syntax = np.where(S[:, k1]==0)[0].astype("int")
index_topic = np.where(S[:, k1]==1)[0].astype("int")
LLbest1 = np.sum(np.log(np.sum(S[index_syntax, :k1] * (psit.T)[word_id[index_syntax], ], axis=1)))
LLbest2 = np.sum(np.log(np.sum(Z[index_topic, ] * (phit.T)[word_id[index_topic], ], axis=1)))
LLbest = LLbest1 + LLbest2
print(np.round([LLbest1, LLbest2, LLbest], 1))

## パラメータを推定

In [None]:
# Sample the parameters by Gibbs sampling
start_time = time.time()
for rp in range(R):
    
    # ---- Sample the latent variables from their posterior ----
    # Likelihood of every word under each syntax state and each candidate attention word
    phi_long = (phi.T)[word_id, ]
    Lho1 = (psi.T)[word_id, ]   # likelihood per syntax state
    Lho2 = np.full((N, max_word), 0.0)   # likelihood per candidate attention word
    Prior_sum = np.full((N, max_word), 0)   # attention counts of the candidate words
    for j in range(1, max_word+1):
        index = set_list[j]
        Lho2[index, j-1] = np.dot(theta[set_id[j], ] * phi_long[index, ], k_vec2)
        Prior_sum[index, j-1] = kappa[set_id[j]]
    Prior_sum = Prior_sum[set_index, ]

    # Priors from the transition probabilities and the attention counts
    prior_pi1 = np.full((N, k1+1), 1/(k1+1)); prior_pi2 = prior_pi1.copy()
    prior_pi1[index_t11, ] = np.full((M, k1+1), pi1)   # first word of a sentence: initial distribution
    prior_pi1[index_t22, ] = pi2[s[index_t21], ]   # transition from the previous word's state
    prior_pi2[index_t21, ]= (pi2.T)[s[index_t22], ]   # transition into the next word's state
    # Normalise the candidate counts row-wise. When every candidate of a row has
    # a zero count the plain division is 0/0 = NaN, which would turn the whole
    # posterior row into NaN and make rmnom always return state 0. Such rows get
    # zero attention mass instead, so only the syntax states compete.
    kappa_total = np.sum(Prior_sum, axis=1)[:, np.newaxis]
    prior_kappa = np.divide(Prior_sum, kappa_total, out=np.zeros(Prior_sum.shape), where=kappa_total > 0)

    # Combine the priors
    Prior1 = prior_pi1[:, :k1] * prior_pi2[:, :k1]
    Prior2 = np.full((N, max_word), 0.0)
    Prior2[set_index, ] = (prior_pi1[set_index, k1] * prior_pi2[set_index, k1])[:, np.newaxis] * prior_kappa

    # Unnormalised posterior over (syntax states, candidate attention words) for unsupervised positions
    Posterior1 = Prior1 * Lho1
    Posterior2 = Prior2 * Lho2
    Posterior = np.hstack((Posterior1, Posterior2))[index_y0, ]

    # Draw the latent variables from a multinomial distribution
    Prob = Posterior / np.dot(Posterior, latent_vec)[:, np.newaxis]
    res = rmnom(Prob, index_y0.shape[0], latent_n, 1)
    Si = np.full((N, k1+1), 0)
    # The last column of Si flags "any attention slot selected"
    Si[index_y0, ] = np.hstack((res[1][:, :k1], np.dot(res[1][:, k1:], np.repeat(1, max_word))[:, np.newaxis]))
    Si[index_y1, k1] = 1   # supervised positions stay in the attention state
    s = np.dot(Si, np.arange(k1+1))
    Di = np.full((N, max_word), 0)
    Di[index_y0, ] = res[1][:, k1:]
    Di[index_y1, ] = d_supervised   # supervised positions keep their known attention slot


    # ---- Sample the parameters of the priors ----
    # Draw the initial and transition probabilities from Dirichlet distributions
    rf1 = np.sum(Si[index_t11, ], axis=0)[:k1] + alpha1
    rf2 = np.dot(Si[index_t21, ].T, Si[index_t22, ]) + alpha2
    pi1 = np.append(np.random.dirichlet(rf1, 1).reshape(-1), 0.0)
    pi2 = np.zeros((k1+1, k1+1))
    for j in range(k1+1):
        pi2[j, ] = np.random.dirichlet(rf2[j, ], 1).reshape(-1)

    # Recover the attended word of every attention position and recount the attention frequencies
    index_syntax = np.where(s!=k1)[0].astype("int")
    index_topic = np.where(s==k1)[0].astype("int")
    attention_id = np.repeat(-1, N).astype("int16")
    attention_id[index_topic] = np.dot((Di[index_topic, ] * attention_set[index_topic, ]), max_vec)
    freq = np.unique(attention_id[index_topic], return_counts=True)
    kappa = np.repeat(0, v1)
    kappa[freq[0]] = freq[1]


    # ---- Sample the topic of every attention position ----
    # Unnormalised posterior: topic distribution of the attended word times word likelihood
    n = index_topic.shape[0]
    Posterior = theta[attention_id[index_topic], ] * phi_long[index_topic, ]   

    # Draw the topics from a multinomial distribution
    Prob = Posterior / np.dot(Posterior, k_vec2)[:, np.newaxis]
    Zi = np.zeros((N, k2), dtype="int")
    Zi[index_topic, ] = rmnom(Prob, n, k2, 1)[1]

    # Sample the topic distributions: (attended word x topic) counts via a sparse product
    attention_matrix = scipy.sparse.csr_matrix((np.repeat(1, n), (attention_id[index_topic], np.arange(n))), shape=(v1, n))
    Zi_matrix = scipy.sparse.csr_matrix(Zi[index_topic, ])
    wsum = np.dot(attention_matrix, Zi_matrix).toarray() + beta1
    for i in range(v1):
        theta[i, ] = np.random.dirichlet(wsum[i, ], 1).reshape(-1)


    # ---- Sample the word distributions ----
    # Posterior Dirichlet parameters: (state x word) and (topic x word) counts plus the prior
    vf11 = np.zeros((k1, v11)); vf12 = np.zeros((k1, v12))
    vf21 = np.zeros((k2, v11)); vf22 = np.zeros((k2, v12))
    for j in range(v11):
        index = word_list1[j]
        vf11[:, j] = np.dot(Si[index, :k1].T, word_vec1[j]) + gamma1
        vf21[:, j] = np.dot(Zi[index, ].T, word_vec1[j]) + gamma1
    for j in range(v12):
        index = word_list2[j]
        vf12[:, j] = np.dot(Si[index, :k1].T, word_vec2[j]) + gamma2
        vf22[:, j] = np.dot(Zi[index, ].T, word_vec2[j]) + gamma2

    # Draw the word distributions from Dirichlet distributions, separately per vocabulary block
    for j in range(k1):
        psi1[j, ] = np.random.dirichlet(vf11[j, ], 1).reshape(-1) 
        psi2[j, ] = np.random.dirichlet(vf12[j, ], 1).reshape(-1) 
    psi = np.hstack((psi1, psi2))
    for j in range(k2):
        phi1[j, ] = np.random.dirichlet(vf21[j, ], 1).reshape(-1) 
        phi2[j, ] = np.random.dirichlet(vf22[j, ], 1).reshape(-1) 
    phi = np.hstack((phi1, phi2))


    # ---- Store and display the sampling results ----
    # Store every keep-th draw once the burn-in is over
    if (rp%keep==0) and (rp >= burnin):
        mkeep = int(rp/keep) - skeep
        
        # Store the model parameters
        PI1[mkeep, ] = pi1
        PI2[:, :, mkeep] = pi2
        THETA[:, :, mkeep] = theta
        PSI[:, :, mkeep] = psi
        PHI[:, :, mkeep] = phi
        
        # Accumulate the latent variables
        SEG1 += Si
        SEG2 += Di
        SEG3 += Zi
        
    # Update the log-likelihood and print progress
    if rp%disp==0:    
        
        # Elapsed time in minutes
        intermediate_time = time.time()
        elapsed_time = (intermediate_time - start_time) / 60
        
        # Log-likelihood of the current draw
        LL1 = np.sum(np.log(np.sum(Si[index_syntax, :k1] * (psi.T)[word_id[index_syntax], ], axis=1)))
        LL2 = np.sum(np.log(np.sum(Zi[index_topic, ] * (phi.T)[word_id[index_topic], ], axis=1)))
        LL = LL1 + LL2

        # Show the sampling state next to the unigram and true-value baselines
        print(rp)
        print("経過時間: {}".format(elapsed_time))
        print(np.round(np.mean(Si, axis=0), 3))
        print(np.round([LL, LLst, LLbest], 1))