In [1]:
####Mixture Multinomial Logit Model####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy
import scipy.stats as ss
from numpy.random import *
from scipy import optimize
from scipy.stats import norm
from scipy import sparse

In [2]:
####データの発生####
##データの設定
segment = 5
hh = 10000   #サンプル数
pt = numpy.random.poisson(numpy.random.gamma(5.0, 1/0.5, hh))   #購買機会数
pt[pt==0] = 1   #購買機会が0なら1に置き換え
hhpt = np.sum(pt)   #総サンプル数
member = 10   #選択可能数
k = 5   #説明変数数

In [3]:
##idとセグメントの設定
#idの設定
id = np.repeat(range(hhpt), member)
no = np.zeros(hhpt*member, dtype='int')
start = 0
for i in range(hhpt):
    no[start:start+member] = np.array(range(member))
    start += member

user_id = np.repeat(range(hh), pt)
pt_id = np.zeros(hhpt, dtype='int')
start = 0
for i in range(hh):
    pt_id[start:start+pt[i]] = np.array(range(pt[i]))
    start += pt[i]
u_id = np.repeat(user_id, member)
user_dt = sparse.coo_matrix((np.repeat(1, hhpt), (range(hhpt), user_id)), shape=(hhpt, hh)).T

In [4]:
#インデックスを設定
index = np.array(range(hhpt*member))
user_list = [i for i in range(hh)]
id_list = [i for i in range(hhpt)]
no_list = [i for i in range(member)]

for i in range(hh):
    user_list[i] = index[u_id==i]
for i in range(hhpt):
    id_list[i] = index[id==i]
for j in range(member):
    no_list[j] = index[no==j]

In [5]:
#セグメントの設定
theta = thetat = np.random.dirichlet(np.repeat(3.0, segment), 1).reshape(-1)
Z = np.random.multinomial(1, theta, hh)
seg = np.dot(Z, range(segment))

In [6]:
##説明変数の発生
#切片の設定
diag_ind = np.ravel(np.diag(np.ravel(np.matlib.repmat(1, member, 1))))
diag_vec = np.ravel(np.matlib.repmat(diag_ind, hhpt, 1))
intercept = np.reshape(diag_vec, (hhpt*member, member))[:, range(member-1)]

In [7]:
#衣装の割当を設定
c_num = 8
prob = np.random.dirichlet(np.repeat(1.5, c_num), member); m = np.argmin(np.sum(prob, axis=0))
Cloth = np.zeros((hhpt*member, c_num-1))
for j in range(member):
    x = np.random.multinomial(1, prob[j, ], hhpt)
    Cloth[no_list[j], :] = np.delete(x, m, axis=1)

In [8]:
#どのメンバーの勧誘回だったか
prob = np.repeat(1/member, member)
scout = np.zeros((hhpt, member))
for i in range(hhpt):
    while True:
        x = np.random.multinomial(2, prob, 1).reshape(-1)
        if np.max(x) == 1:
            break
    scout[i, :] = x
Scout = scout.reshape(-1)

In [9]:
#レベルの対数
lv_weibull = np.round(scipy.stats.weibull_min.rvs(1.8, scale=250, size=hh*member))
lv = np.log(np.random.choice(lv_weibull[lv_weibull > 80], hh, replace=False))
Lv = np.zeros((hhpt*member, member-1))
for i in range(hh):
    Lv[user_list[i], :] = (lv[i] * np.identity(member))[:, range(member-1)][np.tile(range(member), pt[i]), :]

In [10]:
#スコアの対数
score = np.abs(np.random.normal(0, 0.75, hhpt))
Score = np.zeros((hhpt*member, member-1))
for i in range(hh):
    Score[user_list[i], :] = (score[i] * np.identity(member))[:, range(member-1)][np.tile(range(member), pt[i]), :]

In [12]:
#データの結合
Data = np.hstack((intercept, Cloth, Scout.reshape((hhpt*member, 1)), Lv, Score))
sparse_data = scipy.sparse.csr_matrix(Data)
k1 = intercept.shape[1]; k2 = Cloth.shape[1]; k3 = 1; k4 = Lv.shape[1]; k5 = Score.shape[1]
k = Data.shape[1]
pd.DataFrame(np.round(Data, 3))   #データを確認

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,0.071,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,0.000,0.071,0.000,0.000,0.000,0.000,0.000,0.000,0.000
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.071,0.000,0.000,0.000,0.000,0.000,0.000
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.000,0.071,0.000,0.000,0.000,0.000,0.000
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.000,0.000,0.071,0.000,0.000,0.000,0.000
5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.071,0.000,0.000,0.000
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.071,0.000,0.000
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.071,0.000
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,5.553,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.071
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [13]:
##パラメータの設定
beta1 = beta4 = beta5 = np.zeros((segment, member-1))
beta2 = np.zeros((segment, c_num-1))
beta3 = np.repeat(0.0, segment)

for j in range(segment):
    beta1[j, :] = np.random.uniform(-1.0, 4.0, member-1)   #切片の回帰係数
    beta2[j, ] = np.random.uniform(-2.0, 3.0, c_num-1)   #衣装の回帰係数
    beta3[j] = np.random.uniform(0.6, 4.0, 1)   #勧誘の回帰係数
    beta4[j, :] = np.random.uniform(-0.4, 0.4, member-1)   #レベルの回帰係数
    beta5[j, :] = np.random.uniform(-0.6, 0.6, member-1)   #スコアの回帰係数
beta = betat = np.hstack((beta1, beta2, beta3.reshape((segment, 1)), beta4, beta5)).T   #回帰係数を結合
b = beta.reshape(-1)

In [14]:
##応答変数の発生
#ロジットと確率を発生
U = np.sum(np.dot(Data, beta) * Z[u_id, ], axis=1).reshape(hhpt, member)
Pr = np.exp(U) / np.sum(np.exp(U), axis=1).reshape(hhpt, 1)

In [15]:
#応答変数の発生
y = np.array([np.random.multinomial(1, Pr[i, :], 1) for i in range(hhpt)]).reshape((hhpt, member))
y_vec = y.reshape(-1)
np.sum(y, axis=0)

array([14976,  2217,  2933, 12538, 12343,  3049,  8966, 21513, 18146,
        4169])

In [16]:
####EMアルゴリズムで有限混合ロジットモデルを推定####
##完全データのロジットモデルの対数尤度
def cll(b, y, y_vec, Data, zpt, hhpt, member, k, segment, u_id):
    #パラメータの設定
    beta = b.reshape(k, segment)

    #潜在変数での重み付き確率
    U = np.exp(np.sum(np.dot(Data, beta) * zpt[u_id, ], axis=1).reshape(hhpt, member))
    Pr = U / np.sum(U, axis=1).reshape(hhpt, 1)

    #完全データの対数尤度の和
    LL = -np.sum(np.dot((y * np.log(Pr)), np.repeat(1, member)))
    return(LL)

In [17]:
##完全データのロジットモデルの対数微分関数
def dll(b, y, y_vec, Data, zpt, hhpt, member, k, segment, u_id):
    #パラメータの設定
    beta = b.reshape(k, segment)

    #潜在変数での重み付き確率
    U = np.exp(np.dot(Data, beta))
    dlogit = np.zeros((segment, k))

    for j in range(segment):
        #効用と確率を設定
        u = U[:, j].reshape(hhpt, member)
        Pr = u / np.sum(u, axis=1).reshape(hhpt, 1)

        #勾配ベクトルを定義
        Pr_vec = Pr.reshape(-1)
        dlogit[j, :] = np.sum(zpt[u_id, j].reshape(hhpt*member, 1) * (y_vec - Pr_vec).reshape(hhpt*member, 1) * Data, axis=0)

    LLd = -dlogit.T.reshape(-1)
    return(LLd)

In [18]:
##観測データでの尤度と潜在変数zを計算する関数
def ollz(beta, y, Data, zpt, hhpt, member, k, segment, user_dt):
    #潜在変数ごとの確率と尤度を設定
    U = np.exp(np.dot(Data, beta))
    LLho = np.zeros((hh, segment))

    for j in range(segment):
        #効用と確率を設定
        u = U[:, j].reshape(hhpt, member)
        Pr = u / np.sum(u, axis=1).reshape(hhpt, 1)

        #ユーザーごとの尤度を取る
        LLho[:, j] = np.exp((scipy.dot(user_dt, sparse.csr_matrix(np.dot(y * np.log(Pr), np.repeat(1, member))).T).todense())).reshape(-1)

    #観測データの対数尤度
    r = np.tile(theta, hh).reshape(hh, segment)
    LLobz = np.sum(np.log(np.dot(r * LLho, np.repeat(1, segment))))

    #潜在変数zの推定
    LLi = r * LLho
    z = LLi / np.dot(LLi, np.repeat(1, segment)).reshape(hh, 1)
    return LLobz, z

In [19]:
##EMアルゴリズムの設定
iter = 0
rp = 200   #最大繰り返し数
LL = -1000000000   #対数尤度の初期値
dl = 100   #EMアルゴリズムの対数尤度の差の初期値
tol = 0.1
maxit = 20   #準ニュートン法のステップ数

In [20]:
##EMアルゴリズムの初期値を設定
#混合率と潜在変数の初期値
theta = np.repeat(1/segment, segment)
zpt = np.tile(theta, hh).reshape(hh, segment)

#パラメータの初期値
b = np.random.uniform(-0.25, 0.25, segment*k)
beta = b.reshape(k, segment)

In [21]:
##観測データの対数尤度と潜在変数
oll = ollz(beta, y, Data, zpt, hhpt, member, k, segment, user_dt)
zpt = oll[1]
LL1 = oll[0]

In [22]:
##EMアルゴリズムでパラメータを推定
while abs(dl) >= tol:
    
    #完全データのロジットモデルの推定(Mステップ)
    res = optimize.minimize(cll, b, jac=dll, method='BFGS', args=(y, y_vec, Data, zpt, hhpt, member, k, segment, u_id),
                            options={'gtol': 0.1})
    b = res.x; beta = b.reshape(k, segment)   #パラメータの更新
    theta = np.sum(zpt, axis=0) / hh   #混合率を更新

    #Eステップで対数尤度の期待値と潜在変数を更新(Eステップ)
    obsllz = ollz(beta, y, Data, zpt, hhpt, member, k, segment, user_dt)
    LL = obsllz[0]
    zpt = obsllz[1]

    #EMアルゴリズムのパラメータを更新
    iter = iter + 1
    dl = LL - LL1
    LL1 = LL
    print(LL)

-145513.74626343872
-106042.86151843413
-95426.83284287868
-95422.21686442403
-95421.84492144879
-95421.8448490604


In [23]:
####推定結果と要約####
##推定されたパラメータと真のパラメータの比較
#推定されたパラメータ
pd.DataFrame(np.hstack((beta, betat)))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.632521,-0.280775,-0.430263,1.042541,-0.462964,0.243983,-0.020031,0.13003,0.541275,-0.355612
1,0.820386,-0.583856,-0.373077,-0.660224,0.618965,-0.018949,-0.231109,0.015584,-0.162727,-0.486841
2,0.032392,0.000386,0.569986,1.62105,1.657978,0.24085,-0.06115,-0.257906,-0.55496,-0.301514
3,0.428932,0.57567,0.089246,0.606981,-0.321978,-0.265412,0.454271,0.129994,0.392862,-0.263761
4,-0.017902,-1.334788,0.162524,-0.363643,-0.04511,-0.192138,-0.588346,0.063889,-0.429696,0.450185
5,-0.119372,1.231616,-1.171234,1.078668,0.702971,-0.119695,-0.408354,0.277716,-0.420301,-0.174562
6,-0.462712,0.563882,-0.93485,-0.312731,0.469752,0.101735,0.179773,0.414619,-0.184004,-0.591251
7,0.563941,0.458369,-0.546259,1.121089,-1.85856,0.559097,-0.273188,-0.425051,0.491914,-0.450087
8,-1.440981,-0.325524,-0.107901,-0.134045,0.600746,-0.43285,-0.252671,0.583334,-0.359359,0.063406
9,-0.850885,-1.263371,-1.525491,-0.106825,1.170567,-0.99461,-1.205025,1.198489,-0.217889,-1.639797


In [24]:
##混合率とセグメントへの所属確率
print(np.round(np.vstack((theta, thetat)), 3))   #推定された混合率
np.argmax(zpt, axis=1)   #セグメントへの所属
pd.DataFrame(np.hstack((np.round(zpt, 3), Z)))   #潜在変数の割当確率と真のセグメントを比較

[[0.222 0.13  0.145 0.239 0.264]
 [0.22  0.126 0.266 0.24  0.148]]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.000,1.000,0.000,0.000,0.000,0.0,1.0,0.0,0.0,0.0
1,0.000,0.000,0.000,0.000,1.000,0.0,0.0,1.0,0.0,0.0
2,1.000,0.000,0.000,0.000,0.000,1.0,0.0,0.0,0.0,0.0
3,0.000,1.000,0.000,0.000,0.000,0.0,1.0,0.0,0.0,0.0
4,0.000,0.000,0.000,1.000,0.000,0.0,0.0,0.0,1.0,0.0
5,0.000,0.000,0.000,1.000,0.000,0.0,0.0,0.0,1.0,0.0
6,0.000,0.000,0.000,0.000,1.000,0.0,0.0,1.0,0.0,0.0
7,0.000,1.000,0.000,0.000,0.000,0.0,1.0,0.0,0.0,0.0
8,0.000,0.000,0.000,0.000,1.000,0.0,0.0,1.0,0.0,0.0
9,0.000,0.000,1.000,0.000,0.000,0.0,0.0,0.0,0.0,1.0


In [330]:
##AICとBICの計算
print(np.round(LL, 3))   #最大化された観測データの対数尤度
print(np.round(-2*LL + 2*(k*segment-1), 3))   #AIC
print(np.round(-2*LL + np.log(hhpt)*(k*segment-1), 3))   #BIC

-114552.95
229453.9
231108.846
