# Preparing data

In [844]:
stoplist = set('for a of the and to rt'.split())

## A small corpus

In [503]:
corpus = [
    'Human machine interface for Lab ABC computer applications',
    'A survey of user opinion of computer system response time',
    'The EPS user interface management system',
    'System and human system engineering testing of EPS',
    'Relation of user-perceived response time to error measurement',
    'The generation of random, binary, unordered trees',
    'The intersection graph of paths in trees',
    'Graph minors IV: Width of trees and well-quasi-ordering',
    'Graph minors: A survey'
]

In [504]:
# creating generator object for streaming tweets
class Tweets:
    def __iter__(self):
        for tweet in corpus:
            yield tweet

In [505]:
# streaming corpus and storing documents in bow representation
import re
from collections import defaultdict

tweets = Tweets()
token2id = {}
# token2id : dict of (token(str), tokenId(int))
idf = defaultdict(int)
# idf: dict of (tokenId, frequency of tokenId in corpus)
docs2bow = []
# docs2bow: list of [doc2bow]
# doc2bow: list of (tokenId, frequency of tokenId in document)
for docno, tweet in enumerate(tweets):
    # lowering tweets and removing punctuations from it, then splitting
    document = re.sub(r'[-,:.]', ' ', tweet.lower()).split()
    # checking, adding and counting tokens to creat doc2bow and docs2bow
    counter = defaultdict(int)
    # counter: dict of (tokenIds in doc, frequency of them)
    for word in document:
        if word in stoplist: continue   # check word by stoplist
        if word not in token2id: token2id[word] = len(token2id) # add word as a token if seen for the first time
        counter[word] += 1
        idf[token2id[word]] += 1
    # creating doc2bow for this doc
    doc2bow = [(token2id[word], freq) for word, freq in counter.items()]
    print(docno, doc2bow)
    # append doc2bow to docs2bow
    docs2bow.append(doc2bow)

0 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
1 [(7, 1), (8, 1), (9, 1), (5, 1), (10, 1), (11, 1), (12, 1)]
2 [(13, 1), (8, 1), (2, 1), (14, 1), (10, 1)]
3 [(10, 2), (0, 1), (15, 1), (16, 1), (13, 1)]
4 [(17, 1), (8, 1), (18, 1), (11, 1), (12, 1), (19, 1), (20, 1)]
5 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)]
6 [(26, 1), (27, 1), (28, 1), (29, 1), (25, 1)]
7 [(27, 1), (30, 1), (31, 1), (32, 1), (25, 1), (33, 1), (34, 1), (35, 1)]
8 [(27, 1), (30, 1), (7, 1)]


## A user timeline corpus

In [840]:
import tweepy as tw

# define keys
consumer_key= 'gEJhQtgiIvxzNB50u4JPic8f4'
consumer_secret= 'iEfTG65lFX8cAzKJ4QIhJklvuh3tfWdaRAAWO3b17082dZaSiu'
access_token= '1369691334051852293-IGWGrIUKFY6rTwmrA5WD3YLkJrlUk5'
access_token_secret= '7ftmxYiYnso7PNkPOWOWKCkNrguFFMhwwPTHhQ6bFVvgG'
# authenticate and create api object
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

In [838]:
class Tweets():
    def __init__(self, pagination_num=60):
        self.pagination_num = pagination_num
        self.cursor = tw.Cursor(api.user_timeline, id="unicef", # id = "indykaila"
                              exclude_replies=True,
                              include_rts=True,
                              tweet_mode='extended').pages(self.pagination_num)
    def __iter__(self):
        for page in self.cursor:
            for status in page:
                yield status.full_text

In [853]:
# streaming corpus and storing documents in bow representation
import re
from collections import defaultdict

tweets = Tweets()
token2id = {}
idf = defaultdict(int)
docs2bow = []
for docno, tweet in enumerate(tweets):
    tweet = re.sub(r'\bhttps:\S+', '', tweet.lower())
    # print(tweet)
    document = re.sub(r'[-,:.!?"]', ' ', tweet).split()
    # print(document)
    counter = defaultdict(int)
    for word in document:
        if word in stoplist: continue
        if word not in token2id: token2id[word] = len(token2id)
        counter[word] += 1
        idf[token2id[word]] += 1
    doc2bow = [(token2id[word], freq) for word, freq in counter.items()]
    # print(docno, doc2bow)
    print(docno)
    docs2bow.append(doc2bow)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

## corpus read

In [854]:
len(token2id)

4533

In [506]:
token2id

{'human': 0,
 'machine': 1,
 'interface': 2,
 'lab': 3,
 'abc': 4,
 'computer': 5,
 'applications': 6,
 'survey': 7,
 'user': 8,
 'opinion': 9,
 'system': 10,
 'response': 11,
 'time': 12,
 'eps': 13,
 'management': 14,
 'engineering': 15,
 'testing': 16,
 'relation': 17,
 'perceived': 18,
 'error': 19,
 'measurement': 20,
 'generation': 21,
 'random': 22,
 'binary': 23,
 'unordered': 24,
 'trees': 25,
 'intersection': 26,
 'graph': 27,
 'paths': 28,
 'in': 29,
 'minors': 30,
 'iv': 31,
 'width': 32,
 'well': 33,
 'quasi': 34,
 'ordering': 35}

In [855]:
len(idf)

4533

In [509]:
idf

defaultdict(int,
            {0: 2,
             1: 1,
             2: 2,
             3: 1,
             4: 1,
             5: 2,
             6: 1,
             7: 2,
             8: 3,
             9: 1,
             10: 4,
             11: 2,
             12: 2,
             13: 2,
             14: 1,
             15: 1,
             16: 1,
             17: 1,
             18: 1,
             19: 1,
             20: 1,
             21: 1,
             22: 1,
             23: 1,
             24: 1,
             25: 3,
             26: 1,
             27: 3,
             28: 1,
             29: 1,
             30: 2,
             31: 1,
             32: 1,
             33: 1,
             34: 1,
             35: 1})

In [856]:
len(docs2bow)

1164

In [511]:
docs2bow

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (5, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1), (8, 1), (2, 1), (14, 1), (10, 1)],
 [(10, 2), (0, 1), (15, 1), (16, 1), (13, 1)],
 [(17, 1), (8, 1), (18, 1), (11, 1), (12, 1), (19, 1), (20, 1)],
 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1), (28, 1), (29, 1), (25, 1)],
 [(27, 1), (30, 1), (31, 1), (32, 1), (25, 1), (33, 1), (34, 1), (35, 1)],
 [(27, 1), (30, 1), (7, 1)]]

In [860]:
# filter once words
bad_ids = set(tokenid for tokenid, freq in idf.items() if freq == 1)
# creat new token2id and idf which filtered once words
token2id = {token: tokenid for token, tokenid in token2id.items() if idf[tokenid] > 1}
idf = {tokenid: freq for tokenid, freq in idf.items() if freq > 1}

In [861]:
len(bad_ids)

2372

In [514]:
bad_ids

{1,
 3,
 4,
 6,
 9,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 26,
 28,
 29,
 31,
 32,
 33,
 34,
 35}

In [862]:
len(token2id)

2161

In [516]:
token2id

{'human': 0,
 'interface': 2,
 'computer': 5,
 'survey': 7,
 'user': 8,
 'system': 10,
 'response': 11,
 'time': 12,
 'eps': 13,
 'trees': 25,
 'graph': 27,
 'minors': 30}

In [517]:
idf

{0: 2, 2: 2, 5: 2, 7: 2, 8: 3, 10: 4, 11: 2, 12: 2, 13: 2, 25: 3, 27: 3, 30: 2}

In [863]:
# compactify token2id and idf with help of idmap: maps old tokenIds to new ordered tokenIds
idmap = dict(zip(sorted(token2id.values()), range(len(token2id))))

In [864]:
len(idmap)

2161

In [None]:
idmap

In [865]:
# note this cell is one time run
# compactify token2id and idf
token2id = {token: idmap[tokenid] for token, tokenid in token2id.items()}
idf = {idmap[tokenid]: freq for tokenid, freq in idf.items()}

In [521]:
token2id

{'human': 0,
 'interface': 1,
 'computer': 2,
 'survey': 3,
 'user': 4,
 'system': 5,
 'response': 6,
 'time': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}

In [522]:
idf

{0: 2, 1: 2, 2: 2, 3: 2, 4: 3, 5: 4, 6: 2, 7: 2, 8: 2, 9: 3, 10: 3, 11: 2}

In [523]:
docs2bow

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (5, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1), (8, 1), (2, 1), (14, 1), (10, 1)],
 [(10, 2), (0, 1), (15, 1), (16, 1), (13, 1)],
 [(17, 1), (8, 1), (18, 1), (11, 1), (12, 1), (19, 1), (20, 1)],
 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1), (28, 1), (29, 1), (25, 1)],
 [(27, 1), (30, 1), (31, 1), (32, 1), (25, 1), (33, 1), (34, 1), (35, 1)],
 [(27, 1), (30, 1), (7, 1)]]

In [866]:
# rebuild docs2bow based on new token2id
docs2bow = [
    [(idmap[tokenid], docfreq) for tokenid, docfreq in doc2bow if tokenid not in bad_ids]
    for doc2bow in docs2bow
]

In [525]:
docs2bow

[[(0, 1), (1, 1), (2, 1)],
 [(3, 1), (4, 1), (2, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (4, 1), (1, 1), (5, 1)],
 [(5, 2), (0, 1), (8, 1)],
 [(4, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(10, 1), (9, 1)],
 [(10, 1), (11, 1), (9, 1)],
 [(10, 1), (11, 1), (3, 1)]]

In [867]:
# not necessary
# creating words2bod by docs2bow -> n -> n.T -> words2bod
# words2bod shows each word appeared in which docs
import numpy as np
# n: numpy array of n[d][w] = n(d,w)
# d = document number, w = word's tokenId
n = np.zeros((len(docs2bow), len(idf)))
for docno, doc2bow in enumerate(docs2bow):
    for tokenid, docfreq in doc2bow:
        n[docno, tokenid] += docfreq
n

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [868]:
n.shape

(1164, 2161)

In [302]:
n.T

array([[1., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 1., 0., 1., 0., 0., 0., 0.],
       [0., 1., 1., 2., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1.]])

In [303]:
words2bod = [
    [(tokenid, int(docfreq)) for tokenid, docfreq in enumerate(rows) if docfreq != 0]
    for rows in n.T
]

In [304]:
words2bod

[[(0, 1), (3, 1)],
 [(0, 1), (2, 1)],
 [(0, 1), (1, 1)],
 [(1, 1), (8, 1)],
 [(1, 1), (2, 1), (4, 1)],
 [(1, 1), (2, 1), (3, 2)],
 [(1, 1), (4, 1)],
 [(1, 1), (4, 1)],
 [(2, 1), (3, 1)],
 [(5, 1), (6, 1), (7, 1)],
 [(6, 1), (7, 1), (8, 1)],
 [(7, 1), (8, 1)]]

# PLSI model

## Parameters and likelihood

In [527]:
from numpy.random import rand

def random_init_pars(K, nshape):
    N, M = nshape   # N = number of documents, M = number of tokens
    Pz = rand(K); Pz /= sum(Pz) # P(z)
    Pd_z = rand(N, K); Pd_z /= Pd_z.sum(axis=0) # P(d|z)
    Pw_z = rand(M, K); Pw_z /= Pw_z.sum(axis=0) # P(w|z)
    pars = Pz, Pd_z, Pw_z   # pack parameters in a variable called pars
    return pars

In [763]:
n.shape

(595, 1340)

In [531]:
random_init_pars(2, n.shape)

(array([0.24354204, 0.75645796]),
 array([[0.13145524, 0.16549059],
        [0.06062532, 0.09934932],
        [0.10888767, 0.10808775],
        [0.15583634, 0.15645107],
        [0.00953156, 0.07490212],
        [0.14447679, 0.02209781],
        [0.14275294, 0.17391863],
        [0.14079858, 0.05178905],
        [0.10563556, 0.14791364]]),
 array([[0.02726565, 0.13109268],
        [0.01255203, 0.07435627],
        [0.15698346, 0.0675854 ],
        [0.12545428, 0.16088747],
        [0.12436121, 0.03136782],
        [0.0421001 , 0.12645216],
        [0.08076678, 0.0591329 ],
        [0.09404127, 0.03963929],
        [0.12487559, 0.03398142],
        [0.0751404 , 0.08672398],
        [0.06580309, 0.03995258],
        [0.07065612, 0.14882804]]))

In [560]:
def likelihood(pars, docs2bow):
    Pz, Pd_z, Pw_z = pars   # unpack parameters
    L = 0
    # iterate through datas in docs2bow and calculate prob of co-occur based on pars
    for d, doc2bow in enumerate(docs2bow):
        for w, ndw in doc2bow:
            Pcocur = sum(Pz[:] * Pd_z[d,:] * Pw_z[w, :])
            # add log-likelihood for each data
            L += ndw * np.log(Pcocur)
    return L

In [878]:
likelihood(random_init_pars(10, n.shape), docs2bow)

-364308.95092838595

## EM

In [630]:
def Estep(pars, docs2bow):  # no necessity to pass docs2bow (datas) to Estep, but it could decrease computations
    Pz, Pd_z, Pw_z = pars
    poster = np.zeros((len(Pz), len(Pd_z), len(Pw_z))) 
    # poster could be an attribute and no need to reset to zeros because it's not accumulative
    for z in range(len(Pz)):
        for d, doc2bow in enumerate(docs2bow):
            for w, ndw in doc2bow:
                poster[z, d, w] = Pz[z] * Pd_z[d, z] * Pw_z[w, z]
    # normalization
    poster /= poster.sum(axis=0) + 1e-16
    return poster

In [881]:
poster = Estep(random_init_pars(10, n.shape), docs2bow)
poster

array([[[0.05637757, 0.05015883, 0.11700925, ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.08651542, 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ]],

       [[0.23469508, 0.16303089, 0.09791355, ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0. 

In [882]:
poster.shape

(10, 1164, 2161)

In [883]:
poster.sum(axis=0)

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [542]:
def Mstep(poster, docs2bow):
    K, N, M = poster.shape  # K, N, M could be in an attribute self.archit
    rePz, rePd_z, rePw_z = np.zeros(K), np.zeros((N, K)), np.zeros((M, K))  # should reset to zeros because they're accumulative
    for z in range(K):
        for d, doc2bow in enumerate(docs2bow):
            for w, ndw in doc2bow:
                rePz[z] += ndw * poster[z, d, w]
                rePd_z[d, z] += ndw * poster[z, d, w]
                rePw_z[w, z] += ndw * poster[z, d, w]
    # normalization
    rePz /= sum(rePz)
    rePd_z /= rePd_z.sum(axis=0)
    rePw_z /= rePw_z.sum(axis=0)
    repars = rePz, rePd_z, rePw_z
    return repars

In [885]:
pars = random_init_pars(10, n.shape)
print(pars)
print(likelihood(pars, docs2bow))
poster = Estep(pars, docs2bow)
repars = Mstep(poster, docs2bow)
print(repars)
print(likelihood(repars, docs2bow))

(array([0.05614871, 0.05365472, 0.15433872, 0.12308663, 0.07215241,
       0.05176493, 0.05068781, 0.15686634, 0.14855696, 0.13274278]), array([[1.31645888e-03, 3.53017781e-04, 6.84228490e-04, ...,
        1.58699129e-03, 1.44983498e-03, 1.31261735e-03],
       [5.45360989e-04, 1.12386436e-03, 3.20305077e-05, ...,
        1.13169338e-03, 1.12546039e-03, 1.39253585e-03],
       [1.13630079e-03, 1.55473061e-03, 4.00399153e-04, ...,
        9.08037206e-04, 1.50063187e-03, 5.55911021e-04],
       ...,
       [8.54501351e-04, 3.87464253e-04, 1.07309439e-03, ...,
        8.54520642e-05, 1.36830631e-04, 9.09613499e-04],
       [8.16101482e-04, 1.64713692e-03, 1.66197342e-03, ...,
        1.17533954e-03, 1.36720113e-03, 3.54908159e-04],
       [1.79325222e-04, 1.10381537e-03, 6.20611821e-04, ...,
        7.77293815e-04, 8.91266467e-04, 2.83870689e-04]]), array([[6.52089055e-04, 5.62816198e-04, 1.71238041e-04, ...,
        5.52011596e-04, 4.34971404e-04, 4.36840325e-04],
       [6.00298617e-04,

In [545]:
def EMsteps(runtimes, pars, docs2bow):
    print(pars)
    print(likelihood(pars, docs2bow))
    for runtime in range(runtimes):
        poster = Estep(pars, docs2bow)
        pars = Mstep(poster, docs2bow)
        print(likelihood(pars, docs2bow))
    print(pars)

In [886]:
pars = random_init_pars(10, n.shape)
EMsteps(20, pars, docs2bow)

(array([0.09109596, 0.08042649, 0.16186366, 0.26510841, 0.04003123,
       0.06493685, 0.0321282 , 0.06887484, 0.02669741, 0.16883694]), array([[1.05532712e-04, 8.49120266e-04, 1.05843595e-03, ...,
        1.63579796e-03, 4.61050489e-04, 1.54058138e-03],
       [5.34024085e-04, 1.41549825e-03, 3.76231980e-04, ...,
        1.25339061e-04, 1.90293490e-04, 6.09975997e-04],
       [5.28808711e-04, 1.70442211e-03, 1.64743460e-03, ...,
        6.39163442e-04, 1.13967558e-03, 1.21157270e-03],
       ...,
       [1.64790796e-03, 9.26288957e-04, 5.76982771e-04, ...,
        1.21640824e-03, 5.08892325e-04, 5.01616932e-04],
       [1.52309219e-03, 1.24822972e-03, 2.31808670e-06, ...,
        2.18301545e-05, 1.47313741e-03, 2.27448870e-04],
       [1.37220932e-03, 1.15318736e-03, 6.78644344e-04, ...,
        7.63239496e-04, 6.01456995e-04, 3.18833099e-06]]), array([[3.16032844e-04, 3.46379129e-04, 7.88537133e-04, ...,
        6.64196521e-04, 5.97277530e-04, 8.06608468e-04],
       [1.39709721e-04,

In [548]:
token2id

{'human': 0,
 'interface': 1,
 'computer': 2,
 'survey': 3,
 'user': 4,
 'system': 5,
 'response': 6,
 'time': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}

In [549]:
corpus

['Human machine interface for Lab ABC computer applications',
 'A survey of user opinion of computer system response time',
 'The EPS user interface management system',
 'System and human system engineering testing of EPS',
 'Relation of user-perceived response time to error measurement',
 'The generation of random, binary, unordered trees',
 'The intersection graph of paths in trees',
 'Graph minors IV: Width of trees and well-quasi-ordering',
 'Graph minors: A survey']

## TEM

In [887]:
def TEstep(beta, pars, docs2bow):
    Pz, Pd_z, Pw_z = pars
    poster = np.zeros((len(Pz), len(Pd_z), len(Pw_z)))
    for z in range(len(Pz)):
        for d, doc2bow in enumerate(docs2bow):
            for w, ndw in doc2bow:
                poster[z, d, w] = (Pz[z] * Pd_z[d, z] * Pw_z[w, z])**beta
    poster /= poster.sum(axis=0) + 1e-16
    return poster

In [888]:
def TMstep(poster, docs2bow):
    K, N, M = poster.shape
    rePz, rePd_z, rePw_z = np.zeros(K), np.zeros((N, K)), np.zeros((M, K))
    for z in range(K):
        for d, doc2bow in enumerate(docs2bow):
            for w, ndw in doc2bow:
                rePz[z] += ndw * poster[z, d, w]
                rePd_z[d, z] += ndw * poster[z, d, w]
                rePw_z[w, z] += ndw * poster[z, d, w]
    rePz /= sum(rePz)
    rePd_z /= rePd_z.sum(axis=0)
    rePw_z /= rePw_z.sum(axis=0)
    repars = rePz, rePd_z, rePw_z
    return repars

## Split data to train and held-out

In [553]:
docs2bow

[[(0, 1), (1, 1), (2, 1)],
 [(3, 1), (4, 1), (2, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (4, 1), (1, 1), (5, 1)],
 [(5, 2), (0, 1), (8, 1)],
 [(4, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(10, 1), (9, 1)],
 [(10, 1), (11, 1), (9, 1)],
 [(10, 1), (11, 1), (3, 1)]]

In [889]:
from numpy.random import randint

docs2bow_train, docs2bow_heldout = list(), list()
for doc2bow in docs2bow:
    doc2bow_train, doc2bow_heldout = list(), list()
    for w, ndw in doc2bow:
        ndw_train = randint(ndw+1)
        if ndw_train > 0:
            doc2bow_train += [(w, ndw_train)]
        if ndw - ndw_train > 0:
            doc2bow_heldout += [(w, ndw - ndw_train)]
    docs2bow_train += [(doc2bow_train)]
    docs2bow_heldout += [(doc2bow_heldout)]

In [893]:
len(docs2bow_heldout)

1164

In [558]:
docs2bow_train

[[],
 [(3, 1), (2, 1), (7, 1)],
 [(8, 1), (4, 1), (1, 1)],
 [(5, 2), (0, 1), (8, 1)],
 [(7, 1)],
 [],
 [(9, 1)],
 [(11, 1), (9, 1)],
 [(11, 1)]]

In [559]:
docs2bow_heldout

[[(0, 1), (1, 1), (2, 1)],
 [(4, 1), (5, 1), (6, 1)],
 [(5, 1)],
 [],
 [(4, 1), (6, 1)],
 [(9, 1)],
 [(10, 1)],
 [(10, 1)],
 [(10, 1), (3, 1)]]

## TEM for train and heldout

In [923]:
# modify likelihood to solve omitted words or docs from train corpus 
def likelihood(pars, docs2bow):
    Pz, Pd_z, Pw_z = pars
    L = 0
    for d, doc2bow in enumerate(docs2bow):
        for w, ndw in doc2bow:
            Pcocur = sum(Pz[:] * Pd_z[d,:] * Pw_z[w, :])
            # in splitting, may some words or docs be omitted entirely in train corpus, therefore their condit probs remain zero
            # and it makes Pcocur zero, and log-likelihood diverges
            # here is how to avoid this; exclude omitted words or docs from calculating likelihood in heldout
            if Pcocur == 0: continue
            L += ndw * np.log(Pcocur)
    return L

In [895]:
n.shape

(1164, 2161)

In [924]:
def TEMsteps(beta_runtimes, eta, beta, docs2bow_train, docs2bow_heldout):
    # beta = 1
    pars = random_init_pars(10, n.shape)
    for i in range(beta_runtimes):
        print(beta)
        # pars = random_init_pars(2, n.shape)
        # print(pars)
        new_likeli = likelihood(pars, docs2bow_heldout)
        likeli = -np.inf
        while round(new_likeli, 0) > round(likeli, 0):
            likeli = new_likeli
            print(likeli)
            prepars = pars
            poster = TEstep(beta, pars, docs2bow_train)
            pars = TMstep(poster, docs2bow_train)
            # print(pars)
            new_likeli = likelihood(pars, docs2bow_heldout)
        print(new_likeli)
        pars = prepars
        beta *= eta
    return pars

In [925]:
TEMsteps(10, 0.9, 1, docs2bow_train, docs2bow_heldout)

1
-180701.725325079
-158616.34866103966
-158701.26649426567
0.9
-158616.34866103966
-158646.8522618926
0.81
-158616.34866103966
-158606.34840897867
-158606.12166563192
0.7290000000000001
-158606.34840897867
-158569.64561671118
-158536.3681406097
-158505.41987801428
-158478.31193024523
-158457.15474063146
-158444.28998913136
-158442.09397469216
-158452.84850629052
0.6561000000000001
-158442.09397469216
-158365.24114850126
-158295.1525558181
-158228.8267456164
-158164.6263619829
-158101.68576823093
-158039.61171843202
-157978.24908012166
-157917.46207888558
-157857.3411503045
-157798.3590229032
-157740.9308454236
-157685.35423286824
-157631.86974890123
-157580.64410300768
-157531.74716533296
-157485.15158381348
-157440.7508878349
-157398.3931166362
-157357.91896009617
-157319.18577529784
-157282.07583425305
-157246.50388267147
-157212.4308009132
-157179.87289896054
-157148.89570367325
-157119.59297710238
-157092.05883101883
-157066.36061768423
-157042.51847969732
-157020.4961040286
-1570

(array([0.0954901 , 0.10019214, 0.10233699, 0.10207831, 0.10124439,
        0.1019751 , 0.09978392, 0.09317902, 0.10264651, 0.10107352]),
 array([[1.96787798e-05, 3.49494040e-03, 2.36003328e-04, ...,
         1.55258912e-04, 7.31601059e-05, 5.09040393e-04],
        [1.37824393e-04, 2.00678304e-04, 1.98623529e-04, ...,
         1.53492926e-04, 2.53002192e-04, 2.31631695e-04],
        [7.34467530e-05, 3.65689987e-03, 2.90160612e-05, ...,
         1.71023353e-05, 2.77675562e-05, 2.84871251e-04],
        ...,
        [1.58436876e-04, 3.36816917e-04, 5.64757530e-04, ...,
         3.77585066e-04, 3.35866084e-04, 6.88769067e-04],
        [2.25641667e-05, 7.96638114e-05, 6.30312550e-04, ...,
         7.78044623e-05, 1.67211201e-04, 1.16593944e-03],
        [3.44541777e-03, 9.12277924e-04, 1.74017757e-03, ...,
         5.68116037e-04, 4.59805041e-03, 1.12697921e-03]]),
 array([[3.42895205e-03, 1.41079443e-02, 4.43390993e-03, ...,
         1.68001713e-02, 5.12766921e-03, 5.65510295e-03],
       

## Final iterations by approp beta

In [932]:
tem_learned_pars = TEMsteps(1, 0.00, 0.7, docs2bow, docs2bow)

0.7
-364311.0558585023
-333463.4648863414
-333290.21368711063
-333238.93263499596
-333210.73044274317
-333175.30692585255
-333120.7599401818
-333040.69432994333
-332929.9514365614
-332782.98664246016
-332593.1818101252
-332352.5641643942
-332051.8668093064
-331681.0983311855
-331230.92582707957
-330695.15034898586
-330074.0920424113
-329377.64487843093
-328625.8108260613
-327845.323089861
-327063.73796847917
-326304.2095723479
-325582.92695746693
-324908.9780645353
-324285.6349671199
-323712.2037987785
-323185.7910522309
-322702.61863894033
-322258.8295138825
-321850.85116034874
-321475.4202688055
-321129.4811009381
-320810.1440937515
-320514.7135454629
-320240.71013285656
-319985.86029233027
-319748.07963549456
-319525.4967946273
-319316.55327884166
-319120.1557637001
-318935.7634171221
-318763.2826958655
-318602.79214357457
-318454.2676670161
-318317.4442131976
-318191.81160318974
-318076.67304267373
-317971.2152096048
-317874.57297345967
-317785.8838692493
-317704.327822576
-317629.

In [933]:
em_learned_pars = TEMsteps(1, 0.00, 1, docs2bow, docs2bow)

1
-364443.58292915934
-333180.56998512655
-332232.9448119868
-331028.57612672786
-329488.8526294753
-327722.42685964453
-325948.2494200028
-324366.6428614991
-323065.253639357
-322026.7087130883
-321201.1475464667
-320540.32526305
-319997.44813748274
-319534.48410853
-319125.85564006394
-318763.22356472485
-318446.557618491
-318163.8540288744
-317909.2699732542
-317681.76863884466
-317478.99082378834
-317295.9588323624
-317128.71470651176
-316977.37237383355
-316841.8862682555
-316718.9128524525
-316605.2676830774
-316500.4088910738
-316402.23612466257
-316309.33699240163
-316222.49246972357
-316142.42802405823
-316073.10718644055
-316010.1304667774
-315948.451390342
-315888.9153079633
-315833.0899713948
-315780.40361493465
-315731.3442701863
-315685.21885977493
-315641.8101392509
-315601.812619464
-315564.56281933497
-315528.80489432067
-315494.6918040141
-315462.62265816826
-315431.1529058219
-315399.7484729998
-315368.51452647155
-315336.5956652251
-315304.5327719453
-315273.5353057

In [934]:
# likelihood by ndw * log(ndw/N)
def likelihood(pars, docs2bow):
    Pz, Pd_z, Pw_z = pars
    L = 0
    # N: total number of co-occurrences, means words seen in documents
    N = sum(idf)
    for d, doc2bow in enumerate(docs2bow):
        for w, ndw in doc2bow:
            # Pcocur = sum(Pz[:] * Pd_z[d,:] * Pw_z[w, :])
            # if Pcocur == 0: continue
            L += ndw * np.log(ndw/N)
    return L

In [935]:
pars = 0, 0, 0
likelihood(pars, docs2bow)

-359345.2805439831