In [1]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import scipy.linalg
import itertools
import time
import torch
import torch.nn as nn
import torch.optim as optimizers
from scipy.stats import norm
from numpy.random import *
from scipy import optimize

# Seed both NumPy's legacy global RNG and PyTorch so that repeated runs draw the same numbers.
np.random.seed(9837)
torch.manual_seed(9837)
# Raise the pandas display limits to 250 rows and 100 columns.
pd.set_option("display.max_rows", 250)
pd.set_option("display.max_columns", 100)

In [2]:
# Draw categorical (single-trial multinomial) random numbers by inverse-CDF sampling
def rmnom(pr, n, k, pattern):
    """Draw one category index per row of a probability matrix.

    Parameters
    ----------
    pr : array-like, shape (n, n_categories)
        Row-wise category probabilities (non-negative; each row is expected
        to sum to 1).
    n : int
        Number of rows of ``pr``; exactly one uniform draw is made per row.
    k : int
        Width of the one-hot encoding. Only used when ``pattern == 1``.
    pattern : int
        ``1`` returns ``(z_id, Z)``; any other value returns ``z_id`` only.

    Returns
    -------
    z_id : ndarray of int, shape (n,)
        Sampled category index for every row.
    Z : ndarray of int, shape (n, k)
        One-hot encoding of ``z_id`` (only when ``pattern == 1``).
    """
    cum_pr = np.asarray(np.cumsum(pr, axis=1))
    u = np.random.uniform(0, 1, n)[:, np.newaxis]

    # For a non-decreasing cumulative row, the first index with cum_pr >= u
    # equals the number of entries strictly below u.  Counting (instead of
    # argmax over a boolean mask) lets us detect the case where rounding left
    # the final cumulative value below u: argmax would silently return
    # category 0 there, whereas we fall back to the last category.
    below = np.sum(cum_pr < u, axis=1)
    z_id = np.minimum(below, cum_pr.shape[1] - 1).astype("int")

    if pattern == 1:
        Z = np.eye(k, dtype="int")[z_id]
        return z_id, Z
    return z_id

# Generate Dirichlet-distributed random numbers
def Dirichlet(alpha, n):
    """Return ``n`` draws from Dirichlet(``alpha``) as a ``torch.Tensor``, one draw per row."""
    samples = np.random.dirichlet(alpha, n)
    return torch.Tensor(samples)

# データの生成

## 入力データの定義

In [3]:
# Data settings
# Define the model sizes
syntax1 = 8
syntax2 = np.full(syntax1, 7)
k = 15
d = 5000          # number of documents
v1, v2 = 500, 100
v = v1 + v2

# Generate the document statistics.
# Each count vector is a gamma-mixed Poisson draw clamped from below; the draw
# order (pt, then ph, then w) is kept so that seeded runs stay reproducible.
min_word, max_word = 2, 50
pt = np.maximum(np.random.poisson(np.random.gamma(10.0, 1.0, d), d), 5)            # per document, at least 5
L = pt.sum()
ph = np.maximum(np.random.poisson(np.random.gamma(17.5, 0.5, L), L), 2)            # per unit of pt, at least 2
M = ph.sum()
w = np.maximum(np.random.poisson(np.random.gamma(75.0, 0.05, M), M), min_word)     # per unit of ph, at least min_word
N = w.sum()

# Helper vectors for matrix operations
k_vec = np.ones(k)
index_k = np.arange(k)
index_v1 = np.arange(v1)
index_v2 = np.arange(v1, v1 + v2)

In [4]:
# Define the ids and the index lists
def _group_positions(sorted_ids, n_groups):
    """Return a list whose g-th entry holds the positions where ``sorted_ids == g``.

    ``sorted_ids`` must be non-decreasing (true for every id vector built with
    ``np.repeat(np.arange(...), counts)`` below), so each group is one
    contiguous slice and a single linear pass replaces one full ``np.where``
    scan per group.
    """
    sizes = np.bincount(sorted_ids, minlength=n_groups)
    positions = np.arange(sorted_ids.shape[0])
    return np.split(positions, np.cumsum(sizes)[:-1])[:n_groups]


# Word-level ids (one entry per word)
d_id1 = np.repeat(np.repeat(np.repeat(np.arange(d), pt), ph), w)
sentence_id1 = np.repeat(np.repeat(np.arange(L), ph), w)
phrase_id1 = np.repeat(np.arange(M), w)
# Position of every word inside its phrase: global position minus the phrase's start offset
pt_id1 = np.arange(phrase_id1.shape[0]) - np.repeat(np.cumsum(w) - w, w)

# Phrase-level ids (one entry per phrase)
d_id2 = np.repeat(np.repeat(np.arange(d), pt), ph)
sentence_id2 = np.repeat(np.arange(L), ph)
# Position of every phrase inside its sentence
pt_id2 = np.arange(sentence_id2.shape[0]) - np.repeat(np.cumsum(ph) - ph, ph)

# Index lists per document and per sentence
d_list1 = _group_positions(d_id1, d)
d_list2 = _group_positions(d_id2, d)
sentence_list1 = _group_positions(sentence_id1, L)
sentence_list2 = _group_positions(sentence_id2, L)

# Index lists per within-group position (word order).
# These ids are not contiguous, so a scan per position is kept; the number of
# distinct positions is small compared with the number of documents/sentences.
max_pt1 = np.max(pt_id1) + 1
max_pt2 = np.max(pt_id2) + 1
pt_list1 = [np.flatnonzero(pt_id1 == j).astype("int") for j in range(max_pt1)]
pt_list2 = [np.flatnonzero(pt_id2 == j).astype("int") for j in range(max_pt2)]
pt_n1 = np.bincount(pt_id1, minlength=max_pt1).astype("int")
pt_n2 = np.bincount(pt_id2, minlength=max_pt2).astype("int")

In [5]:
# Generate the function-word flags
# Index list per phrase: consecutive runs of w[i] word positions covering 0..N-1
phrase_list = np.split(np.arange(N), np.cumsum(w)[:-1])

# Choose the function-word position of every phrase
weights = 1.0
function_flag = np.zeros(N, dtype="int")
for i in range(M):
    # Candidate positions: the last (up to) three words of the phrase, last word first
    x = np.arange(w[i] - 1, -1, -1)[:3]
    # Unnormalised softmax weights (the name `logit` is kept from the original code)
    logit = np.exp(weights * x)
    prob = logit / logit.sum()
    # One multinomial draw per phrase selects exactly one candidate position
    index = x[np.random.multinomial(1, prob, 1)[0] == 1]
    function_flag[phrase_list[i][index]] = 1
function_index = np.flatnonzero(function_flag == 1).astype("int")

## パラメータと応答変数を生成

In [6]:
# Define the prior distributions
# Priors for the HMM part
alpha1 = np.full(syntax1, 0.5)
alpha2 = [np.full(n_sub, 0.5) for n_sub in syntax2]
beta = np.full(syntax1, 0.1)

# Priors for the word distributions (one concentration vector per upper-level state)
gamma1 = [np.full(v1, 0.025) for _ in range(syntax1)]
gamma2 = [np.full(v2, 0.025) for _ in range(syntax1)]

In [7]:
# Generate the parameters
# Generate the transition-probability parameters.
# All draws use NumPy's global RNG, so the statement order determines the seeded values.
# theta: v2 Dirichlet draws over syntax1 categories -> shape (v2, syntax1)
theta = np.random.dirichlet(beta, v2)
# pi1: a single draw -> shape (1, syntax1); pi2: one draw per row -> shape (syntax1, syntax1)
pi1 = np.random.dirichlet(alpha1, 1)
pi2 = np.random.dirichlet(alpha1, syntax1)
# psi1[j]: one draw flattened to a vector of length syntax2[j]
psi1 = [np.random.dirichlet(alpha2[j], 1).reshape(-1) for j in range(syntax1)]
# psi2[j]: syntax2[j] draws of length syntax2[j], flattened to length syntax2[j]**2.
# NOTE(review): reshape(-1) discards the (syntax2[j], syntax2[j]) matrix layout — this looks
# copied from psi1; confirm against the code that consumes psi2 before relying on it.
psi2 = [np.random.dirichlet(alpha2[j], syntax2[j]).reshape(-1) for j in range(syntax1)]

# Generate the word-distribution parameters
phi1 = []; phi2 = []; phi = []
for j in range(syntax1):
    # phi1[j]: shape (syntax2[j], v1); phi2[j]: shape (syntax2[j], v2); each row is one Dirichlet draw
    phi1.append(np.random.dirichlet(gamma1[j], syntax2[j]))
    phi2.append(np.random.dirichlet(gamma2[j], syntax2[j]))
    # phi[j]: the two blocks side by side, transposed -> shape (v1 + v2, syntax2[j]).
    # NOTE(review): each column stacks two separately normalised distributions (the first v1
    # rows and the last v2 rows each sum to 1) — presumably they are used separately; confirm.
    phi.append(np.hstack((phi1[j], phi2[j])).T)

In [8]:
# Display the generated word-distribution arrays: a list of syntax1 arrays, phi[j] of shape (v1 + v2, syntax2[j]).
phi

[array([[1.90484776e-14, 3.31177954e-12, 1.28078606e-60, ...,
         1.70402309e-05, 2.12435128e-43, 3.94985501e-22],
        [5.78107330e-58, 9.19433635e-34, 1.10494280e-43, ...,
         5.06829528e-49, 1.41095545e-18, 1.81485721e-03],
        [1.65643348e-28, 9.29087738e-14, 5.53196175e-22, ...,
         4.64635143e-33, 7.77163807e-29, 8.19239533e-05],
        ...,
        [4.55479734e-15, 3.06056708e-01, 2.96294506e-30, ...,
         1.11589289e-06, 1.01924255e-13, 4.68682832e-08],
        [3.78942857e-10, 4.68765882e-60, 2.85443694e-54, ...,
         6.04730638e-31, 5.45866178e-14, 5.02806746e-09],
        [3.02460666e-11, 8.52252371e-37, 4.48772991e-53, ...,
         5.51218011e-02, 3.28419244e-13, 2.21163062e-05]]),
 array([[2.30185419e-19, 1.93136367e-03, 6.20811993e-20, ...,
         6.69751246e-09, 1.34271201e-07, 4.17867403e-09],
        [1.60135607e-04, 8.89115523e-52, 4.02125826e-06, ...,
         6.46408468e-05, 5.52317741e-22, 2.53563445e-25],
        [3.78593752e-28, 