In [41]:
# Temporal Matching model between Users and Locations
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy.linalg
import itertools
import calendar
from datetime import datetime
from datetime import timedelta
from scipy import sparse
from scipy.stats import norm
from numpy.random import *
from scipy import optimize

In [42]:
# Generator yielding consecutive dates
def daterange(_start, _end):
    """Yield ``_start``, ``_start`` + 1 day, ... for each whole day before ``_end``.

    The number of yielded values is ``(_end - _start).days``; ``_end`` itself is
    never produced, and nothing is yielded when that day count is zero or negative.
    """
    total_days = (_end - _start).days
    offset = 0
    while offset < total_days:
        yield _start + timedelta(offset)
        offset += 1
        
# Sampler for a truncated Poisson distribution
def rtpois(mu, a, b, n):
    """Draw ``n`` Poisson(``mu``) variates restricted by the bounds ``a`` and ``b``.

    Inverse-CDF sampling: uniform draws are rescaled from [0, 1) onto the range
    starting at cdf(a) and ending at cdf(b), then mapped back through the
    Poisson quantile function. The lower end is cdf(a) (not cdf(a - 1)), so the
    value ``a`` is only returned when the rescaled level equals cdf(a) exactly.

    Returns an integer array of length ``n``.
    """
    poisson = scipy.stats.poisson
    lower_mass = poisson.cdf(a, mu)
    upper_mass = poisson.cdf(b, mu)
    u = np.random.uniform(0, 1, n)
    level = u * (upper_mass - lower_mass) + lower_mass
    return np.array(poisson.ppf(level, mu), dtype="int")

# Sampler for multinomial (categorical) draws
def rmnom(pr, n, k, pattern):
    """Draw one category per row of the probability matrix ``pr``.

    Each of the ``n`` rows gets its own uniform draw; the chosen category is the
    first column whose cumulative probability is at least that draw.

    Returns the integer category ids. When ``pattern`` equals 1 the result is a
    tuple ``(z_id, Z)`` where ``Z`` is the matching one-hot matrix with ``k``
    columns; for any other ``pattern`` only ``z_id`` is returned.
    """
    cumulative = np.cumsum(pr, axis=1)
    draws = np.random.uniform(0, 1, n)[:, np.newaxis]
    z_id = np.argmax(cumulative >= draws, axis=1).astype("int")
    if pattern != 1:
        return z_id
    Z = np.eye(k, dtype="int")[z_id]
    return z_id, Z

In [43]:
# Generate the simulation data
# Build the date data
# Collect every date in the target period together with its weekday number
start = datetime.strptime('2014-01-01', '%Y-%m-%d')
end   = datetime.strptime('2019-09-30', '%Y-%m-%d')
# Materialise the dates once; growing the arrays with np.append inside the
# loop copies them on every iteration (quadratic in the number of days).
all_days = list(daterange(start, end))  # daterange stops before `end`
get_date = np.array(all_days, dtype=object)  # datetime objects, one per day
day_of_week = np.array([d.weekday() for d in all_days], dtype="int")  # weekday(): Monday == 0
    
# Define the date, year and month lookup tables
# Keep only the positions whose weekday number is below 5; the other days are dropped
index_week = np.array(np.where(day_of_week < 5)[0], dtype="int")
Timestamp = pd.Series(get_date).iloc[index_week]
# String form of each retained date, joined with one integer id per distinct date string
date = pd.DataFrame({"date": Timestamp.astype("U")})
date_df = pd.merge(date, pd.DataFrame({"date": np.unique(date), "date_id": np.arange(date.shape[0])}), on="date", how="inner")
# The first four characters of the date string are used as the year key
year = pd.DataFrame({"year": Timestamp.astype("U").str[:4]})
unique_year = np.unique(year)
year_df = pd.merge(year, pd.DataFrame({"year": unique_year, "year_id": np.arange(unique_year.shape[0])}), on="year", how="inner")
# Characters 5:7 of the date string are used as the month key
# (assumes a "YYYY-MM-DD..." rendering of the timestamps — TODO confirm)
month = pd.DataFrame({"month": Timestamp.astype("U").str[5:7]})
unique_month = np.unique(month)
month_df = pd.merge(month, pd.DataFrame({"month": unique_month, "month_id": np.arange(unique_month.shape[0])}), on="month", how="inner")

# Define the week ids
new_week = day_of_week[index_week]
# 1 where the retained day has weekday number 4, 0 elsewhere
friday_flag = (new_week == 4).astype("int")
index = np.array(np.where(friday_flag == 1)[0], dtype="int")
week_n = index.shape[0]
# A day's week id is the number of weekday-4 days strictly before it, so every
# week closes on its weekday-4 day. Unlike a loop over the closing days only,
# days after the final closing day get the next id instead of staying at 0.
week = np.cumsum(friday_flag) - friday_flag
        
# Build the calendar data frame
# Column-wise concatenation (axis=1) of the date, year, month and week tables
calendar_df = pd.concat((date_df, year_df, month_df, pd.DataFrame({"week_id": week})), axis=1)
# NOTE: the names date / year / month / week are rebound here to the integer id
# columns of calendar_df, replacing the objects they referred to above
date = np.array(calendar_df["date_id"], dtype="int")
year = np.array(calendar_df["year_id"], dtype="int")
month = np.array(calendar_df["month_id"], dtype="int")
week = np.array(calendar_df["week_id"], dtype="int")

In [44]:
# Data dimensions
# k: length of k_vec; hh: number of Lambda / pt entries; w: used later as the
# size of the last axis of the parameter arrays
k, hh, place = 10, 5000, 3000
category, area, w = 50, 25, 2
# One gamma-distributed rate per hh entry, then one Poisson count per rate
Lambda = np.random.gamma(shape=40.0, scale=1/0.3, size=hh)
pt = np.random.poisson(lam=Lambda, size=hh)
hhpt = pt.sum()       # total count over all hh entries
k_vec = np.ones(k)    # vector of ones of length k

In [45]:
# Define ids and index lists
# ids: d_id repeats each hh index pt[i] times; pt_id counts 0..pt[i]-1 within it
d_id = np.repeat(np.arange(hh), pt)
row_no = np.arange(d_id.shape[0])
first_row = np.cumsum(pt) - pt  # position of the first record of every hh index
pt_id = row_no - np.repeat(first_row, pt)

# Index lists: d_list[i] holds the record positions with d_id == i.
# d_id is sorted by construction, so the positions are consecutive ranges and
# one split replaces a full scan of d_id for every hh index.
d_list = np.split(row_no, np.cumsum(pt)[:-1])

In [46]:
# Select a place for every record
# Generate topics
topic = 25 
theta_topic = np.random.dirichlet(np.repeat(0.2, topic), hh)  # hh rows, one distribution over topics each
phi_place = np.random.dirichlet(np.repeat(0.1, place), topic)  # topic rows, one distribution over places each
# One topic draw per record, using the theta_topic row selected by the record's d_id
z = np.array(rmnom(theta_topic[d_id, ], hhpt, topic, 0), dtype="int16")

# Generate places from the multinomial distribution
place_id = np.repeat(0, hhpt)
for i in range(hh):
    index = d_list[i]
    # Draw a place for each record in d_list[i] from the phi_place row of that record's topic
    place_id[index] = rmnom(phi_place[z[index], ], pt[i], place, 0)
    
# Build index lists: place_list[i] holds the record positions with
# place_id == i (ascending) and place_n[i] is their count.
# A stable sort groups the positions in one pass instead of scanning place_id
# once per place.
place_n = np.bincount(place_id, minlength=place)
place_order = np.argsort(place_id, kind="stable")
place_list = np.split(place_order, np.cumsum(place_n)[:-1])

In [47]:
# Select an area and a category
# Generate topics
# Transpose phi_place and normalise each row, giving every place a distribution over topics
theta_place = (phi_place.T) / np.sum(phi_place.T, axis=1)[:, np.newaxis]
phi_area = np.random.dirichlet(np.repeat(0.1, area), topic)  # topic rows, one distribution over areas each
phi_category = np.random.dirichlet(np.repeat(0.1, category), topic)  # topic rows, one distribution over categories each
s = rmnom(theta_place, place, topic, 0)  # one topic draw per place

# Generate area and category from the multinomial distribution
# One draw per place (from the row of that place's topic), then expanded to record level through place_id
area_id = rmnom(phi_area[s, ], place, area, 0)[place_id]
category_id = rmnom(phi_category[s, ], place, category, 0)[place_id]

# Build index lists
def _group_rows(ids, n_groups):
    """Return ``(rows, counts)`` for the integer labels in ``ids``.

    ``rows[g]`` holds the positions where ``ids == g`` in ascending order and
    ``counts[g]`` is their number, for ``g`` in ``0..n_groups-1``.
    """
    counts = np.bincount(ids, minlength=n_groups)
    # A stable sort keeps the positions of each label in ascending order
    order = np.argsort(ids, kind="stable")
    return np.split(order, np.cumsum(counts)[:-1]), counts

# area_list has exactly `area` entries (it was previously sized by `place`,
# which left stray integers behind the real index arrays)
area_list, area_n = _group_rows(area_id, area)
category_list, category_n = _group_rows(category_id, category)

In [48]:
# Assign a day type and an hour to every record
# Generate topics
weekday = 2  # not referenced again in the visible cells
hours = 24  # number of hour slots
# NOTE(review): both tables below have `place` rows but are indexed further down with z,
# which holds topic draws — confirm whether `topic` rows (or place_id indexing) was intended
phi_weekday = np.random.beta(3.0, 5.0, place)
phi_hours = np.random.dirichlet(np.repeat(0.05, hours), place)
# Draw one topic per record again; this replaces any earlier z
z = np.array(rmnom(theta_topic[d_id, ], hhpt, topic, 0), dtype="int16")

# Generate the day type and the hour from their distributions
day_id = np.repeat(0, hhpt)
hours_id = np.repeat(0, hhpt)
day_list = [i for i in range(w)]
for i in range(hh):
    index = d_list[i]
    # Binary day type and hour slot for every record in d_list[i]
    day_id[index] = np.random.binomial(1, phi_weekday[z[index]], pt[i])
    hours_id[index] = rmnom(phi_hours[z[index], ], pt[i], hours, 0)
# Record positions for each day_id value
for i in range(w):
    day_list[i] = np.array(np.where(day_id==i)[0], dtype="int")
wd_index = day_list[0]
we_index = day_list[1]

In [49]:
# Define the hour-of-day allocation rates
def _hour_allocation(group_id, n_group):
    """Return an (n_group, hours, w) array of hour-of-day shares.

    Slice ``[g, :, d]`` is the share of each hour among the records with
    ``group_id == g`` and ``day_id == d``, so a lookup of the form
    ``rate[group_id, :, day_id]`` reads the shares computed from records of
    the same day type. A (group, day) pair without any record gets an
    all-zero row instead of the NaN that a 0/0 division would produce.
    """
    # Flatten (group, hour, day) into one index and count the records per cell
    flat = (group_id * hours + hours_id) * w + day_id
    counts = np.bincount(flat, minlength=n_group * hours * w).astype(float)
    counts = counts.reshape(n_group, hours, w)
    totals = counts.sum(axis=1, keepdims=True)
    return np.divide(counts, totals, out=np.zeros_like(counts), where=totals > 0)

# Hour allocation per area; *_wd is slice 0 and *_we is slice 1 of the last
# axis, matching wd_index = day_list[0] and we_index = day_list[1]
area_rate = _hour_allocation(area_id, area)
area_wd = area_rate[:, :, 0].copy()
area_we = area_rate[:, :, 1].copy()

# Hour allocation per category
category_rate = _hour_allocation(category_id, category)
category_wd = category_rate[:, :, 0].copy()
category_we = category_rate[:, :, 1].copy()

In [50]:
# Generate the parameters
# Hyper-parameter draws
alpha_u = 2*np.random.dirichlet(np.full(k, 0.25), hh)
alpha_d = np.random.gamma(0.75, 1.0, hh*hours).reshape((hh, hours))
alpha_v = np.random.gamma(1.25, 1.75, place*k).reshape((place, k))

# Model parameters; the last axis of every theta array has length w
pi = np.random.beta(2.5, 3.0, place)
theta_u = np.zeros((hh, k, w))
theta_d = np.zeros((hh, hours, w))
theta_v = np.zeros((place, k, w))
# Row-major walk over (row, slice) pairs: a Dirichlet draw for theta_u followed
# by gamma draws for theta_d, both parameterised by the row's alpha values
for i, j in itertools.product(range(hh), range(w)):
    theta_u[i, :, j] = np.random.dirichlet(alpha_u[i], 1).ravel()
    theta_d[i, :, j] = np.random.gamma(0.5, alpha_d[i], hours)
for i, j in itertools.product(range(place), range(w)):
    theta_v[i, :, j] = np.random.gamma(0.5, alpha_v[i], k)

# Keep copies of the generated ("true") parameter values
pit, alphat_u, alphat_d, alphat_v = (arr.copy() for arr in (pi, alpha_u, alpha_d, alpha_v))
thetat_u, thetat_d, thetat_v = (arr.copy() for arr in (theta_u, theta_d, theta_v))

In [51]:
# Generate the response variable
# Draw the binary area/category switch: one draw per record, with the success
# probability taken from pi at the record's place
Z = np.random.binomial(1, pi[place_id], hhpt)
# Z is already record-level, so it is used as is; indexing it again with
# place_id would pair each record with the draw made for an unrelated record
z_vec = Z[:, np.newaxis]

# Expected value
# Per-record hour weights: the area rates where the switch is 1, the category rates where it is 0
mv_weights = z_vec*area_rate[area_id, :, day_id] + (1-z_vec)*category_rate[category_id, :, day_id]
# Dot products with a vector of ones, i.e. row sums over the hour / k axis
mv = np.dot(theta_d[d_id, :, day_id] * mv_weights, np.repeat(1.0, hours))
uv = np.dot(theta_u[d_id, :, day_id] * theta_v[place_id, :, day_id], k_vec)
mu = mv + uv

# Generate binary data through the Bernoulli-Poisson link:
# y is 1 exactly when the Poisson count with mean mu is positive
freq = np.random.poisson(mu, hhpt)
y = np.array(freq > 0, dtype="int")

In [52]:
# Estimate the parameters of the Temporal Matching model between users and locations
# MCMC settings (how R, keep, burnin and disp are consumed is not visible in
# this part of the notebook — presumably iteration count, thinning interval,
# number of retained burn-in samples and display interval; confirm downstream)
R = 1000
keep = 2
burnin = 500 // keep
iter = 0  # NOTE: shadows the builtin iter(); name kept because later cells may read it
disp = 10

# Prior settings
alpha1, beta1 = 10.0, 1/10.0
alpha2, beta2 = 10.0, 1/10.0
alpha3, beta3 = 5.0, 1/5.0

In [54]:
# Initialise with the true parameter values
# True values of the model parameters
pi = pit.copy()
alpha_u = alphat_u.copy()
alpha_d = alphat_d.copy()
alpha_v = alphat_v.copy()
theta_u = thetat_u.copy()
theta_d = thetat_d.copy()
theta_v = thetat_v.copy()

# Area/category switch: start from the generated record-level draws
Zi = Z.copy()
# Zi holds one value per record, so it is used as is; indexing it again with
# place_id would pair each record with the value of an unrelated record
z_vec = Zi[:, np.newaxis]

# Expected value
# Per-record hour weights: the area rates where the switch is 1, the category rates where it is 0
mv_weights = z_vec*area_rate[area_id, :, day_id] + (1-z_vec)*category_rate[category_id, :, day_id]
# Dot products with a vector of ones, i.e. row sums over the hour / k axis
mv = np.dot(theta_d[d_id, :, day_id] * mv_weights, np.repeat(1.0, hours))
uv = np.dot(theta_u[d_id, :, day_id] * theta_v[place_id, :, day_id], k_vec)
mu = mv + uv