In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tqdm
import pickle
import os

from rpy2.robjects import numpy2ri
from rpy2.robjects.packages import importr

In [2]:
def calc_dnml(X, Z1, Z2, K=3, L=3):    
    """Accumulate a two-part code length for X under the cluster labels Z1 / Z2.

    NOTE(review): as written this function cannot run on its own. ``n_all`` is
    never assigned inside the function and ``normterm_discrete`` is neither
    defined nor imported in the visible notebook, so the first inner-loop
    iteration raises ``NameError`` unless both names happen to exist as kernel
    globals. The function is also not called by any visible cell. Confirm the
    intended definitions before relying on the result.

    Parameters
    ----------
    X : array-like
        Indexed as ``X[Z1 == k, :][:, Z2 == l]``; presumably a 2-D matrix whose
        rows are labelled by ``Z1`` and columns by ``Z2`` (TODO confirm).
    Z1, Z2 : array-like
        Label vectors compared elementwise against the cluster ids
        ``0..K-1`` and ``0..L-1`` respectively.
    K, L : int
        Number of cluster ids iterated over for ``Z1`` and ``Z2``.

    Returns
    -------
    tuple
        ``(codelen, codelen_x_z, codelen_z)`` where ``codelen`` is the sum of
        the other two terms.
    """
    N = X.shape[0]
    
    codelen_x_z = 0.0
    codelen_z = 0.0

    for k in range(K):
        for l in range(L):
            # Mean of the (k, l) block of X.
            # NOTE(review): `lam` is computed but never used below.
            lam = np.mean(X[Z1 == k, :][:, Z2 == l])

            # NOTE(review): `n_all` is not defined in this function; presumably
            # it was meant to be a per-block count -- confirm.
            codelen_x_z += n_all * np.log(n_all)
            codelen_x_z += np.log(normterm_discrete(n_all, 2))

        # Label term: only non-empty Z1 clusters contribute, which also avoids
        # evaluating log(0).
        n_k = np.sum(Z1 == k)
        if n_k >= 1:
            codelen_z += n_k * (np.log(N) - np.log(n_k))

    # NOTE(review): only Z1 / K enter the label term; Z2 / L do not.
    codelen_z += np.log(normterm_discrete(N, K))
    
    codelen = codelen_x_z + codelen_z

    return codelen, codelen_x_z, codelen_z

In [3]:
# Directory that receives the pickles written by this notebook.
outdir = 'output'
# exist_ok=True makes the cell idempotent and removes the window between the
# existence check and the creation that the check-then-create form had.
os.makedirs(outdir, exist_ok=True)

In [4]:
# Load the R packages through rpy2 (needs an R installation in which the
# `blockmodels` package is available).
blockmodels = importr("blockmodels")
base = importr("base")
# `$` is not a valid Python identifier, so R's `$` accessor is fetched from the
# package object's __dict__ and later called as dollar(obj, "field").
dollar = base.__dict__["$"]

In [5]:
# Double-precision machine epsilon, added to cluster proportions so they stay
# strictly positive. `np.float` was only an alias of the builtin `float` and
# was removed in NumPy 1.24, so the builtin is used directly.
EPS = np.finfo(float).eps

In [6]:
# Read the preprocessed table; the path is relative to the notebook's working
# directory.
d = pd.read_csv('data_processed/FEH_200504_201403.csv')

In [7]:
# Preview the first three rows of the raw table.
d.head(3)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,北海道,青森県,岩手県,宮城県,秋田県,山形県,...,愛媛県,高知県,福岡県,佐賀県,長崎県,熊本県,大分県,宮崎県,鹿児島県,沖縄県
0,2014000303,2014年3月,1000,北海道,-,460,181,493,110,97,...,45,19,264,58,43,60,21,41,46,125
1,2014000303,2014年3月,2000,青森県,500,-,453,833,222,125,...,21,6,53,1,11,7,11,18,30,91
2,2014000303,2014年3月,3000,岩手県,195,426,-,1168,231,108,...,1,0,29,1,13,15,6,7,6,8


In [8]:
# Labels of the contiguous column range from the Hokkaido column through the
# Okinawa column (label slicing with .loc includes both end points).
columns = d.loc[:, '北海道':'沖縄県'].columns

In [9]:
# Display the selected column labels.
columns

Index(['北海道', '青森県', '岩手県', '宮城県', '秋田県', '山形県', '福島県', '茨城県', '栃木県', '群馬県',
       '埼玉県', '千葉県', '東京都', '神奈川県', '新潟県', '富山県', '石川県', '福井県', '山梨県', '長野県',
       '岐阜県', '静岡県', '愛知県', '三重県', '滋賀県', '京都府', '大阪府', '兵庫県', '奈良県', '和歌山県',
       '鳥取県', '島根県', '岡山県', '広島県', '山口県', '徳島県', '香川県', '愛媛県', '高知県', '福岡県',
       '佐賀県', '長崎県', '熊本県', '大分県', '宮崎県', '鹿児島県', '沖縄県'],
      dtype='object')

In [10]:
# Convert the textual cells of every selected column to integers: '-' is
# replaced by '0' and ',' separators are stripped.
#   * astype(str) lets the cell be re-run after the columns are already integer
#     (the .str accessor is unavailable on integer columns).
#   * regex=False pins literal matching regardless of the pandas default.
#   * builtin int replaces the np.int alias, which was removed in NumPy 1.24.
for cn in columns:
    d[cn] = (
        d[cn]
        .astype(str)
        .str.replace('-', '0', regex=False)
        .str.replace(',', '', regex=False)
        .astype(int)
    )

In [11]:
# Sorted unique values of the 'Unnamed: 0' column (presumably one code per
# year-month -- TODO confirm); later cells select one group of rows per value.
YM_list = np.unique(d['Unnamed: 0'].values)

In [12]:
# Persist the list of period codes next to the other outputs.
with open(os.path.join(outdir, 'ym_list.pkl'), 'wb') as f:
    pickle.dump(YM_list, f)

In [13]:
# Build one integer matrix per entry of YM_list.
# The builtin float / int replace np.float / np.int, which were removed in
# NumPy 1.24.
d_list = []
for ym in YM_list:
    # Rows belonging to this period.
    d_ym = d.loc[d['Unnamed: 0'] == ym, :]
    # Columns from position 4 onward are used as the matrix. Going through
    # float allows missing entries to be detected and zeroed before the final
    # integer cast.
    d_ym_values = d_ym.iloc[:, 4:].values
    d_ym_values = d_ym_values.astype(float)
    d_ym_values[np.isnan(d_ym_values)] = 0.0
    d_ym_values = d_ym_values.astype(int)

    d_list.append(d_ym_values)

In [14]:
# Combine the per-period matrices into one array; the estimation loop indexes
# it as d_list_array[t, :, :].
d_list_array = np.array(d_list)

In [15]:
# Save the stacked array. Note this is written to data_processed/, not to
# `outdir` like the other pickles.
with open('data_processed/X.pkl', 'wb') as f:
    pickle.dump(d_list_array, f)

In [16]:
def estimate_poisson(X, K):
    """Fit R ``blockmodels::BM_poisson`` (SBM membership) to one matrix.

    Parameters
    ----------
    X : array-like
        Matrix passed to R as ``adj`` after conversion with ``np.array``.
    K : int
        Passed to ``BM_poisson`` as both ``explore_min`` and ``explore_max``;
        also the number of fitted models read back.

    Returns
    -------
    tuple of three lists, each of length ``K``
        ``pi_list[k]``: column sums of the membership matrix ``Z`` plus
        ``10 * EPS``, normalised to sum to one.
        ``theta_list[k]``: the ``lambda`` model parameter as a NumPy array.
        ``z_posterior_list[k]``: the membership matrix ``Z`` as a NumPy array.
        Entry ``k`` is presumably the model with ``k + 1`` clusters (TODO
        confirm against the blockmodels documentation).

    Notes
    -----
    The numpy <-> R conversion is switched on globally for the duration of the
    call and is always switched off again, even if the R call raises.
    """
    numpy2ri.activate()
    try:
        sbm = blockmodels.BM_poisson(membership_type="SBM", adj=np.array(X),
                                     verbosity=0,
                                     exploration_factor=1.5,
                                     explore_min=K,
                                     explore_max=K)

        # `estimate` is a method on the R object: fetch it with `$`, then call.
        estimate = dollar(sbm, "estimate")
        estimate()

        pi_list = []
        theta_list = []
        z_posterior_list = []

        for k in range(K):
            theta = np.array(dollar(dollar(sbm, "model_parameters")[k], "lambda"))
            z_posterior = np.array(dollar(dollar(sbm, "memberships")[k], "Z"))
            # The small offset keeps every proportion strictly positive before
            # normalising.
            pi = np.sum(z_posterior, axis=0) + 10 * EPS
            pi /= np.sum(pi)

            theta_list.append(theta)
            z_posterior_list.append(z_posterior)
            pi_list.append(pi)
    finally:
        # Without this, a failure inside R would leave the global conversion on.
        numpy2ri.deactivate()

    return pi_list, theta_list, z_posterior_list

In [17]:
# Fit the block models independently for every period.
T = len(YM_list)
h = 2  # NOTE(review): not used by any visible cell; kept in case a later cell needs it
K = 10

theta_list = []
pi_list = []
z_list = []

for t in tqdm.tqdm(range(T)):
    # A dedicated name is used for the period's matrix so the DataFrame `d`
    # loaded earlier is not overwritten; earlier cells stay re-runnable.
    X_t = d_list_array[t, :, :]

    pi, theta, z = estimate_poisson(X_t, K=K)
    pi_list.append(pi)
    theta_list.append(theta)
    z_list.append(z)

  0%|          | 0/108 [00:00<?, ?it/s]




  1%|          | 1/108 [00:04<08:29,  4.76s/it]




  2%|▏         | 2/108 [00:11<09:27,  5.36s/it]




  3%|▎         | 3/108 [00:17<09:34,  5.47s/it]




  4%|▎         | 4/108 [00:21<08:54,  5.14s/it]




  5%|▍         | 5/108 [00:27<09:23,  5.47s/it]




  6%|▌         | 6/108 [00:34<09:39,  5.68s/it]




  6%|▋         | 7/108 [00:40<09:51,  5.85s/it]




  7%|▋         | 8/108 [00:46<09:50,  5.91s/it]




  8%|▊         | 9/108 [00:51<09:20,  5.66s/it]




  9%|▉         | 10/108 [00:57<09:36,  5.88s/it]




 10%|█         | 11/108 [01:02<09:03,  5.61s/it]




 11%|█         | 12/108 [01:08<08:56,  5.59s/it]




 12%|█▏        | 13/108 [01:15<09:36,  6.07s/it]




 13%|█▎        | 14/108 [01:23<10:29,  6.69s/it]




 14%|█▍        | 15/108 [01:29<10:07,  6.53s/it]




 15%|█▍        | 16/108 [01:35<09:33,  6.23s/it]




 16%|█▌        | 17/108 [01:41<09:26,  6.22s/it]




 17%|█▋        | 18/108 [01:47<09:08,  6.10s/it]




 18%|█▊        | 19/108 [01:53<09:05,  6.13s/it]




 19%|█▊        | 20/108 [01:57<07:55,  5.40s/it]




 19%|█▉        | 21/108 [02:00<06:54,  4.76s/it]




 20%|██        | 22/108 [02:03<06:13,  4.34s/it]




 21%|██▏       | 23/108 [02:07<05:55,  4.18s/it]




 22%|██▏       | 24/108 [02:13<06:21,  4.54s/it]




 23%|██▎       | 25/108 [02:19<06:51,  4.96s/it]




 24%|██▍       | 26/108 [02:25<07:12,  5.27s/it]




 25%|██▌       | 27/108 [02:30<07:02,  5.21s/it]




 26%|██▌       | 28/108 [02:35<06:51,  5.14s/it]




 27%|██▋       | 29/108 [02:40<06:45,  5.13s/it]




 28%|██▊       | 30/108 [02:44<06:21,  4.89s/it]




 29%|██▊       | 31/108 [02:48<05:48,  4.53s/it]




 30%|██▉       | 32/108 [02:51<05:09,  4.08s/it]




 31%|███       | 33/108 [02:54<04:42,  3.76s/it]




 31%|███▏      | 34/108 [02:58<04:40,  3.78s/it]




 32%|███▏      | 35/108 [03:01<04:18,  3.54s/it]




 33%|███▎      | 36/108 [03:04<04:18,  3.59s/it]




 34%|███▍      | 37/108 [03:11<05:32,  4.69s/it]




 35%|███▌      | 38/108 [03:16<05:26,  4.66s/it]




 36%|███▌      | 39/108 [03:22<05:41,  4.94s/it]




 37%|███▋      | 40/108 [03:26<05:32,  4.88s/it]




 38%|███▊      | 41/108 [03:30<04:58,  4.45s/it]




 39%|███▉      | 42/108 [03:36<05:17,  4.81s/it]




 40%|███▉      | 43/108 [03:42<05:35,  5.16s/it]




 41%|████      | 44/108 [03:47<05:32,  5.20s/it]




 42%|████▏     | 45/108 [03:50<04:53,  4.66s/it]




 43%|████▎     | 46/108 [03:54<04:40,  4.53s/it]




 44%|████▎     | 47/108 [03:58<04:19,  4.25s/it]




 44%|████▍     | 48/108 [04:02<04:05,  4.10s/it]




 45%|████▌     | 49/108 [04:06<03:55,  4.00s/it]




 46%|████▋     | 50/108 [04:14<05:04,  5.25s/it]




 47%|████▋     | 51/108 [04:19<04:53,  5.15s/it]




 48%|████▊     | 52/108 [04:22<04:23,  4.70s/it]




 49%|████▉     | 53/108 [04:25<03:52,  4.22s/it]




 50%|█████     | 54/108 [04:31<04:11,  4.65s/it]




 51%|█████     | 55/108 [04:36<04:04,  4.62s/it]




 52%|█████▏    | 56/108 [04:40<03:55,  4.53s/it]




 53%|█████▎    | 57/108 [04:46<04:20,  5.10s/it]




 54%|█████▎    | 58/108 [04:53<04:41,  5.63s/it]




 55%|█████▍    | 59/108 [04:58<04:24,  5.39s/it]




 56%|█████▌    | 60/108 [05:02<03:56,  4.93s/it]




 56%|█████▋    | 61/108 [05:06<03:41,  4.72s/it]




 57%|█████▋    | 62/108 [05:11<03:36,  4.70s/it]




 58%|█████▊    | 63/108 [05:15<03:30,  4.68s/it]




 59%|█████▉    | 64/108 [05:22<03:56,  5.37s/it]




 60%|██████    | 65/108 [05:27<03:45,  5.24s/it]




 61%|██████    | 66/108 [05:31<03:26,  4.91s/it]




 62%|██████▏   | 67/108 [05:37<03:34,  5.23s/it]




 63%|██████▎   | 68/108 [05:42<03:20,  5.02s/it]




 64%|██████▍   | 69/108 [05:46<03:07,  4.80s/it]




 65%|██████▍   | 70/108 [05:51<03:00,  4.75s/it]




 66%|██████▌   | 71/108 [05:55<02:43,  4.42s/it]




 67%|██████▋   | 72/108 [05:59<02:36,  4.35s/it]




 68%|██████▊   | 73/108 [06:03<02:30,  4.31s/it]




 69%|██████▊   | 74/108 [06:09<02:46,  4.89s/it]




 69%|██████▉   | 75/108 [06:14<02:36,  4.75s/it]




 70%|███████   | 76/108 [06:18<02:28,  4.63s/it]




 71%|███████▏  | 77/108 [06:25<02:44,  5.31s/it]




 72%|███████▏  | 78/108 [06:30<02:38,  5.28s/it]




 73%|███████▎  | 79/108 [06:36<02:35,  5.36s/it]




 74%|███████▍  | 80/108 [06:41<02:27,  5.27s/it]




 75%|███████▌  | 81/108 [06:45<02:14,  5.00s/it]




 76%|███████▌  | 82/108 [06:51<02:14,  5.17s/it]




 77%|███████▋  | 83/108 [06:56<02:09,  5.19s/it]




 78%|███████▊  | 84/108 [07:02<02:09,  5.38s/it]




 79%|███████▊  | 85/108 [07:06<01:58,  5.16s/it]




 80%|███████▉  | 86/108 [07:14<02:07,  5.78s/it]




 81%|████████  | 87/108 [07:19<01:57,  5.58s/it]




 81%|████████▏ | 88/108 [07:22<01:40,  5.04s/it]




 82%|████████▏ | 89/108 [07:28<01:41,  5.34s/it]




 83%|████████▎ | 90/108 [07:35<01:44,  5.80s/it]




 84%|████████▍ | 91/108 [07:40<01:33,  5.50s/it]




 85%|████████▌ | 92/108 [07:44<01:19,  4.96s/it]




 86%|████████▌ | 93/108 [07:50<01:19,  5.33s/it]




 87%|████████▋ | 94/108 [07:54<01:10,  5.05s/it]




 88%|████████▊ | 95/108 [07:59<01:02,  4.81s/it]




 89%|████████▉ | 96/108 [08:02<00:52,  4.37s/it]




 90%|████████▉ | 97/108 [08:05<00:44,  4.08s/it]




 91%|█████████ | 98/108 [08:10<00:43,  4.33s/it]




 92%|█████████▏| 99/108 [08:14<00:37,  4.22s/it]




 93%|█████████▎| 100/108 [08:20<00:36,  4.59s/it]




 94%|█████████▎| 101/108 [08:24<00:30,  4.37s/it]




 94%|█████████▍| 102/108 [08:29<00:27,  4.56s/it]




 95%|█████████▌| 103/108 [12:51<06:49, 81.95s/it]




 96%|█████████▋| 104/108 [12:56<03:54, 58.68s/it]




 97%|█████████▋| 105/108 [13:00<02:06, 42.33s/it]




 98%|█████████▊| 106/108 [13:06<01:03, 31.61s/it]




 99%|█████████▉| 107/108 [13:11<00:23, 23.41s/it]




100%|██████████| 108/108 [13:15<00:00,  7.36s/it]


In [18]:
# Write each result list to its own pickle inside `outdir`, in the order
# pi -> theta -> z.
for _fname, _payload in (
    ('pi_list.pkl', pi_list),
    ('theta_list.pkl', theta_list),
    ('z_list.pkl', z_list),
):
    with open(os.path.join(outdir, _fname), 'wb') as f:
        pickle.dump(_payload, f)