# Preparing data

In [44]:
stoplist = set('for a of the and to rt'.split())

In [45]:
stoplist = set('for a of the and to rt is are in on from an'.split())

In [43]:
stoplist

{'a',
 'an',
 'and',
 'are',
 'for',
 'from',
 'in',
 'is',
 'of',
 'on',
 'rt',
 'the',
 'to'}

In [1]:
import pickle

## A small corpus

In [6]:
corpus = [
    'Human machine interface for Lab ABC computer applications',
    'A survey of user opinion of computer system response time',
    'The EPS user interface management system',
    'System and human system engineering testing of EPS',
    'Relation of user-perceived response time to error measurement',
    'The generation of random, binary, unordered trees',
    'The intersection graph of paths in trees',
    'Graph minors IV: Width of trees and well-quasi-ordering',
    'Graph minors: A survey'
]

In [7]:
with open('deerwester.txt', 'wb') as f:
    pickle.dump(corpus, f)

In [8]:
# creating generator object for streaming tweets
class Tweets:
    """Re-iterable stream over the documents stored in a pickled sequence.

    Args:
        path: pickle file holding an iterable of document strings. Defaults
            to 'deerwester.txt', the file this notebook writes for the small
            demo corpus.
    """

    def __init__(self, path='deerwester.txt'):
        self.path = path

    def __iter__(self):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file, so only point this at files the notebook produced itself.
        # The context manager closes the handle; the previous inline open()
        # left it to the garbage collector.
        with open(self.path, 'rb') as f:
            documents = pickle.load(f)
        yield from documents

In [9]:
# Stream the corpus once and store every document as a bag of words.
import re
from collections import defaultdict

tweets = Tweets()
# token2id: token (str) -> tokenId (int), ids handed out in order of first appearance
token2id = {}
# idf: tokenId -> total number of occurrences of that token in the whole corpus
# (a corpus-wide term count, despite the name)
idf = defaultdict(int)
# docs2bow: one doc2bow per document; doc2bow is a list of (tokenId, count in that document)
docs2bow = []
for docno, tweet in enumerate(tweets):
    # lower-case, turn punctuation into spaces, then split on whitespace
    document = re.sub(r'[-–,:;|.!?*()+&/~<>="]', ' ', tweet.lower()).split()
    # counter: token (str) -> count inside the current document
    counter = defaultdict(int)
    for word in document:
        if word not in stoplist:
            # setdefault registers a new id only the first time a token is seen
            tokenid = token2id.setdefault(word, len(token2id))
            counter[word] += 1
            idf[tokenid] += 1
    # translate the per-document counts from tokens to token ids
    doc2bow = [(token2id[word], docfreq) for word, docfreq in counter.items()]
    print(docno, doc2bow)
    docs2bow.append(doc2bow)

0 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
1 [(7, 1), (8, 1), (9, 1), (5, 1), (10, 1), (11, 1), (12, 1)]
2 [(13, 1), (8, 1), (2, 1), (14, 1), (10, 1)]
3 [(10, 2), (0, 1), (15, 1), (16, 1), (13, 1)]
4 [(17, 1), (8, 1), (18, 1), (11, 1), (12, 1), (19, 1), (20, 1)]
5 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)]
6 [(26, 1), (27, 1), (28, 1), (29, 1), (25, 1)]
7 [(27, 1), (30, 1), (31, 1), (32, 1), (25, 1), (33, 1), (34, 1), (35, 1)]
8 [(27, 1), (30, 1), (7, 1)]


## A user timeline corpus

In [3]:
import tweepy as tw

# load keys
# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# keys.txt must be a trusted, locally created file. The code below expects it
# to unpickle to a mapping with the four keys read here.
with open('keys.txt', 'rb') as f:
    keys = pickle.load(f)
# define keys
consumer_key = keys['consumer_key']
consumer_secret = keys['consumer_secret']
access_token = keys['access_token']
access_token_secret = keys['access_token_secret']
# authenticate and create api object
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# presumably wait_on_rate_limit=True makes the client wait instead of failing
# when the rate limit is hit - confirm against the installed tweepy version
api = tw.API(auth, wait_on_rate_limit=True)

In [4]:
# generator object for iterating through user timeline
class Tweets():
    """Re-iterable stream of the full text of one account's timeline tweets.

    Args:
        pagination_num: maximum number of timeline pages requested per pass.
        user: account handed to ``api.user_timeline`` as ``id``
            (default "unicef"; e.g. "indykaila").
    """

    def __init__(self, pagination_num=3, user="unicef"):
        self.pagination_num = pagination_num
        self.user = user

    def __iter__(self):
        # Build a fresh cursor for every pass. A page iterator created once
        # in __init__ is consumed by the first loop, so a second
        # `for tweet in tweets` would silently yield nothing.
        cursor = tw.Cursor(api.user_timeline, id=self.user,
                           exclude_replies=True,
                           include_rts=True,
                           tweet_mode='extended').pages(self.pagination_num)
        for page in cursor:
            for status in page:
                yield status.full_text

In [1]:
# Stream the timeline once and store every tweet as a bag of words.
import re
from collections import defaultdict
from IPython.display import clear_output

tweets = Tweets(100)
token2id = {}               # token (str) -> tokenId (int)
idf = defaultdict(int)      # tokenId -> total occurrences in the corpus
docs2bow = []               # one list of (tokenId, count) per tweet
for docno, tweet in enumerate(tweets):
    # drop https links before tokenising
    tweet = re.sub(r'\bhttps:\S+', '', tweet.lower())
    # punctuation becomes whitespace, then split into tokens
    document = re.sub(r'[-–,:;|.!?*()+&/~<>="]', ' ', tweet).split()
    counter = defaultdict(int)  # token (str) -> count inside this tweet
    for word in document:
        if word not in stoplist:
            # a token gets the next free id the first time it is seen
            tokenid = token2id.setdefault(word, len(token2id))
            counter[word] += 1
            idf[tokenid] += 1
    doc2bow = [(token2id[word], docfreq) for word, docfreq in counter.items()]
    # progress indicator; wiped by clear_output() once the loop is done
    print(docno)
    docs2bow.append(doc2bow)
clear_output()
# index of the last processed tweet
print(docno)

## Health news dataset

In [59]:
stoplist = set('for a of the and to rt is are in on from an video'.split())

In [62]:
class HealthTweets():
    """Re-iterable stream of the text field of a '|'-separated tweet file.

    Each yielded item is everything after the second '|' on a line (the
    trailing newline is kept, as before).

    Args:
        path: file to read; defaults to 'bbchealth.txt'.
    """

    def __init__(self, path='bbchealth.txt'):
        self.path = path

    def __iter__(self):
        # Iterate the handle lazily inside a context manager: the file is
        # closed when the pass ends and never loaded into memory at once.
        with open(self.path, 'r') as f:
            for line in f:
                # maxsplit=2 keeps any '|' that occurs inside the tweet text;
                # a plain split('|')[2] cut the text at its first '|'.
                fields = line.split('|', 2)
                if len(fields) < 3:
                    # blank or malformed line: no text field to yield
                    continue
                yield fields[2]

In [64]:
# Stream the health-news file once and store every tweet as a bag of words.
import re
from collections import defaultdict
from IPython.display import clear_output

tweets = HealthTweets()
token2id = {}               # token (str) -> tokenId (int)
idf = defaultdict(int)      # tokenId -> total occurrences in the corpus
docs2bow = []               # one list of (tokenId, count) per tweet
for docno, tweet in enumerate(tweets):
    # drop http links before tokenising
    tweet = re.sub(r'\bhttp:\S+', '', tweet.lower())
    # punctuation becomes whitespace, then split into tokens
    document = re.sub(r"[-–,:;|.!?*()+&/~<>=']", ' ', tweet).split()
    counter = defaultdict(int)  # token (str) -> count inside this tweet
    for word in document:
        if word not in stoplist:
            # a token gets the next free id the first time it is seen
            tokenid = token2id.setdefault(word, len(token2id))
            counter[word] += 1
            idf[tokenid] += 1
    doc2bow = [(token2id[word], docfreq) for word, docfreq in counter.items()]
    # progress indicator; wiped by clear_output() once the loop is done
    print(docno)
    docs2bow.append(doc2bow)
clear_output()
# index of the last processed tweet
print(docno)

3928


## Corpus read

In [65]:
len(token2id)

4509

In [8]:
token2id

{'human': 0,
 'machine': 1,
 'interface': 2,
 'lab': 3,
 'abc': 4,
 'computer': 5,
 'applications': 6,
 'survey': 7,
 'user': 8,
 'opinion': 9,
 'system': 10,
 'response': 11,
 'time': 12,
 'eps': 13,
 'management': 14,
 'engineering': 15,
 'testing': 16,
 'relation': 17,
 'perceived': 18,
 'error': 19,
 'measurement': 20,
 'generation': 21,
 'random': 22,
 'binary': 23,
 'unordered': 24,
 'trees': 25,
 'intersection': 26,
 'graph': 27,
 'paths': 28,
 'in': 29,
 'minors': 30,
 'iv': 31,
 'width': 32,
 'well': 33,
 'quasi': 34,
 'ordering': 35}

In [66]:
len(idf)

4509

In [10]:
idf

defaultdict(int,
            {0: 2,
             1: 1,
             2: 2,
             3: 1,
             4: 1,
             5: 2,
             6: 1,
             7: 2,
             8: 3,
             9: 1,
             10: 4,
             11: 2,
             12: 2,
             13: 2,
             14: 1,
             15: 1,
             16: 1,
             17: 1,
             18: 1,
             19: 1,
             20: 1,
             21: 1,
             22: 1,
             23: 1,
             24: 1,
             25: 3,
             26: 1,
             27: 3,
             28: 1,
             29: 1,
             30: 2,
             31: 1,
             32: 1,
             33: 1,
             34: 1,
             35: 1})

In [67]:
len(docs2bow)

3929

In [12]:
docs2bow

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (5, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1), (8, 1), (2, 1), (14, 1), (10, 1)],
 [(10, 2), (0, 1), (15, 1), (16, 1), (13, 1)],
 [(17, 1), (8, 1), (18, 1), (11, 1), (12, 1), (19, 1), (20, 1)],
 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1), (28, 1), (29, 1), (25, 1)],
 [(27, 1), (30, 1), (31, 1), (32, 1), (25, 1), (33, 1), (34, 1), (35, 1)],
 [(27, 1), (30, 1), (7, 1)]]

## Filter out words that occur only once

In [68]:
# ids of tokens whose corpus-wide count is exactly one
bad_ids = {tokenid for tokenid, freq in idf.items() if freq == 1}

In [69]:
len(bad_ids)

2301

In [15]:
bad_ids

{1,
 3,
 4,
 6,
 9,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 26,
 28,
 29,
 31,
 32,
 33,
 34,
 35}

In [70]:
# keep only tokens seen more than once, in both token2id and idf
# (token2id must be filtered first: its test reads the still-unfiltered idf)
token2id = dict(
    (token, tokenid) for token, tokenid in token2id.items() if idf[tokenid] > 1
)
idf = dict((tokenid, freq) for tokenid, freq in idf.items() if freq > 1)

In [71]:
len(token2id)

2208

In [18]:
token2id

{'human': 0,
 'interface': 2,
 'computer': 5,
 'survey': 7,
 'user': 8,
 'system': 10,
 'response': 11,
 'time': 12,
 'eps': 13,
 'trees': 25,
 'graph': 27,
 'minors': 30}

In [19]:
idf

{0: 2, 2: 2, 5: 2, 7: 2, 8: 3, 10: 4, 11: 2, 12: 2, 13: 2, 25: 3, 27: 3, 30: 2}

In [72]:
# idmap: old (gappy) tokenId -> new consecutive tokenId, preserving id order
idmap = {old_id: new_id for new_id, old_id in enumerate(sorted(token2id.values()))}

In [73]:
len(idmap)

2208

In [22]:
idmap

{0: 0,
 2: 1,
 5: 2,
 7: 3,
 8: 4,
 10: 5,
 11: 6,
 12: 7,
 13: 8,
 25: 9,
 27: 10,
 30: 11}

In [74]:
# Renumber token2id and idf with the compact ids from idmap.
# Run this cell exactly once: after it has run, the ids stored in token2id and
# idf are already new ids, so looking them up in idmap again is wrong.
token2id = dict((token, idmap[tokenid]) for token, tokenid in token2id.items())
idf = dict((idmap[tokenid], freq) for tokenid, freq in idf.items())

In [24]:
token2id

{'human': 0,
 'interface': 1,
 'computer': 2,
 'survey': 3,
 'user': 4,
 'system': 5,
 'response': 6,
 'time': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}

In [25]:
idf

{0: 2, 1: 2, 2: 2, 3: 2, 4: 3, 5: 4, 6: 2, 7: 2, 8: 2, 9: 3, 10: 3, 11: 2}

In [26]:
# token2id changed, but docs2bow still has the same old tokenIds
docs2bow

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (5, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1), (8, 1), (2, 1), (14, 1), (10, 1)],
 [(10, 2), (0, 1), (15, 1), (16, 1), (13, 1)],
 [(17, 1), (8, 1), (18, 1), (11, 1), (12, 1), (19, 1), (20, 1)],
 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1), (28, 1), (29, 1), (25, 1)],
 [(27, 1), (30, 1), (31, 1), (32, 1), (25, 1), (33, 1), (34, 1), (35, 1)],
 [(27, 1), (30, 1), (7, 1)]]

In [75]:
# Rewrite every document with the compact ids, dropping the filtered tokens.
# Like the compactify step, this overwrites docs2bow and must run only once.
docs2bow = [
    [(idmap[old_id], count) for old_id, count in bow if old_id not in bad_ids]
    for bow in docs2bow
]

## save/load the corpus

In [28]:
docs2bow

[[(0, 1), (1, 1), (2, 1)],
 [(3, 1), (4, 1), (2, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (4, 1), (1, 1), (5, 1)],
 [(5, 2), (0, 1), (8, 1)],
 [(4, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(10, 1), (9, 1)],
 [(10, 1), (11, 1), (9, 1)],
 [(10, 1), (11, 1), (3, 1)]]

### `docs2bow`

In [17]:
# save the corpus in bow representation
with open('unicef_corpus2bow.txt', 'wb') as f:
    pickle.dump(docs2bow, f)

In [3]:
# load the corpus in bow representation
with open('unicef_corpus2bow.txt', 'rb') as f:
    docs2bow = pickle.load(f)

### `token2id`

In [20]:
# save token2id
with open('unicef_token2id.txt', 'wb') as f:
    pickle.dump(token2id, f)

In [4]:
# load token2id
with open('unicef_token2id.txt', 'rb') as f:
    token2id = pickle.load(f)

## Additional

In [77]:
# Optional cell: later cells only need n.shape.
# The model code never reads the dense matrix n(d, w); it takes the counts
# straight from docs2bow (as ndw). n is built here only to derive words2bod
# (docs2bow -> n -> n.T -> words2bod), which lists the documents each word
# appears in.
import numpy as np
# n[d][w] = n(d, w), with d = document number and w = the word's tokenId
corpus_shape = (len(docs2bow), len(token2id))
n = np.zeros(corpus_shape)
for docno, doc2bow in enumerate(docs2bow):
    for tokenid, docfreq in doc2bow:
        n[docno, tokenid] = n[docno, tokenid] + docfreq
n

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [78]:
n.shape

(3929, 2208)

In [32]:
n.T

array([[1., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 1., 0., 1., 0., 0., 0., 0.],
       [0., 1., 1., 2., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1.]])

In [33]:
# words2bod[w]: list of (document number, count) for every document containing
# token w. Each row of n.T holds one token's counts across all documents.
words2bod = [
    [(docno, int(count)) for docno, count in enumerate(token_row) if count != 0]
    for token_row in n.T
]

In [34]:
words2bod

[[(0, 1), (3, 1)],
 [(0, 1), (2, 1)],
 [(0, 1), (1, 1)],
 [(1, 1), (8, 1)],
 [(1, 1), (2, 1), (4, 1)],
 [(1, 1), (2, 1), (3, 2)],
 [(1, 1), (4, 1)],
 [(1, 1), (4, 1)],
 [(2, 1), (3, 1)],
 [(5, 1), (6, 1), (7, 1)],
 [(6, 1), (7, 1), (8, 1)],
 [(7, 1), (8, 1)]]

# PLSI model

## Parameters and likelihood

In [79]:
K = 20   # K: number of topics considered, namely the size of latent semantic set Z

In [80]:
from numpy.random import rand

def random_init_pars(K, nshape, rng=None):
    """Draw random, normalised starting parameters for the PLSI model.

    Args:
        K: number of topics.
        nshape: (N, M) = (number of documents, number of tokens).
        rng: optional generator exposing ``random(shape)`` (for example
            ``numpy.random.default_rng(seed)``) for reproducible draws.
            When None, the global ``numpy.random`` state is used as before.

    Returns:
        Tuple ``(Pz, Pd_z, Pw_z)``: P(z) with shape (K,), P(d|z) with shape
        (N, K) and P(w|z) with shape (M, K). Pz sums to 1 and every column
        of Pd_z and Pw_z sums to 1.
    """
    N, M = nshape   # N = number of documents, M = number of tokens
    if rng is None:
        draw = rand
    else:
        def draw(*shape):
            return rng.random(shape)
    Pz = draw(K)            # P(z)
    Pz /= Pz.sum()
    Pd_z = draw(N, K)       # P(d|z): normalise over documents for each topic
    Pd_z /= Pd_z.sum(axis=0)
    Pw_z = draw(M, K)       # P(w|z): normalise over tokens for each topic
    Pw_z /= Pw_z.sum(axis=0)
    return Pz, Pd_z, Pw_z

In [10]:
n.shape

(1440, 2481)

In [40]:
random_init_pars(K, n.shape)

(array([0.4001385, 0.5998615]),
 array([[0.11300014, 0.13423582],
        [0.15208975, 0.14145356],
        [0.11296352, 0.14415165],
        [0.10676998, 0.05131162],
        [0.14655433, 0.0403815 ],
        [0.02229351, 0.17183628],
        [0.11544259, 0.166549  ],
        [0.14183071, 0.10509881],
        [0.08905547, 0.04498175]]),
 array([[0.01186479, 0.08050633],
        [0.12389336, 0.03654783],
        [0.03002695, 0.18772072],
        [0.08509484, 0.13924188],
        [0.17923298, 0.00066867],
        [0.12707695, 0.13626144],
        [0.17833826, 0.14977082],
        [0.0883424 , 0.14072519],
        [0.03751731, 0.06700934],
        [0.09520133, 0.02886329],
        [0.0124025 , 0.02764159],
        [0.03100834, 0.0050429 ]]))

In [81]:
def likelihood(pars, docs2bow):
    """Log-likelihood of a bag-of-words corpus under PLSI parameters.

    Sums n(d,w) * log P(d,w) over every (w, n(d,w)) pair stored in docs2bow,
    with P(d,w) = sum_z P(z) P(d|z) P(w|z).

    Args:
        pars: tuple (Pz, Pd_z, Pw_z).
        docs2bow: list of documents, each a list of (tokenId, count).

    Returns:
        The accumulated log-likelihood (0 for an empty corpus).

    Note: a later cell redefines `likelihood` so that pairs with zero
    probability are skipped; this version passes them to np.log.
    """
    p_topic, p_doc_given_topic, p_word_given_topic = pars   # unpack parameters
    log_lik = 0
    for doc_idx, bow in enumerate(docs2bow):
        for word_idx, count in bow:
            # per-topic contributions to the co-occurrence probability P(d,w)
            per_topic = p_topic[:] * p_doc_given_topic[doc_idx, :] * p_word_given_topic[word_idx, :]
            joint = sum(per_topic)
            log_lik = log_lik + count * np.log(joint)
    return log_lik

In [83]:
likelihood(random_init_pars(K, n.shape), docs2bow)

-284886.5492200772

## EM

In [18]:
# Expectation step
def Estep(pars, docs2bow):
    """Compute the posteriors P(z|d,w) for every (d, w) pair seen in the data.

    Passing the data is not required mathematically, but it restricts the
    work to observed pairs; unseen pairs keep a posterior of zero.

    Args:
        pars: tuple (Pz, Pd_z, Pw_z) with shapes (K,), (N, K) and (M, K).
        docs2bow: list of documents, each a list of (tokenId, count).

    Returns:
        Array `posters` of shape (K, N, M) where posters[z, d, w] = P(z|d,w).

    Note: the result is a dense K*N*M float array, which is large for a
    real corpus.
    """
    Pz, Pd_z, Pw_z = pars
    posters = np.zeros((len(Pz), len(Pd_z), len(Pw_z)))
    for d, doc2bow in enumerate(docs2bow):
        for w, ndw in doc2bow:
            # all K topics at once; replaces an outer Python loop over z that
            # walked the whole corpus K times for the same products
            posters[:, d, w] = Pz * Pd_z[d] * Pw_z[w]
    # normalise over topics; the tiny constant avoids 0/0 for unseen (d, w) pairs
    posters /= posters.sum(axis=0) + 1e-16
    return posters

In [19]:
posters = Estep(random_init_pars(K, n.shape), docs2bow)
posters

array([[[0.32303151, 0.54699039, 0.49943317, ..., 0.        ,
         0.        , 0.        ],
        [0.45525299, 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.02473211, 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.48931705]],

       [[0.14450733, 0.11202042, 0.21154905, ..., 0.        ,
         0.        , 0.        ],
        [0.18117445, 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.54418821, 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0. 

In [20]:
posters.shape

(5, 1440, 2481)

In [21]:
# is posters normalized?
posters.sum(axis=0)

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [22]:
# Maximization step
def Mstep(posters, docs2bow):
    """Re-estimate the parameters from the E-step posteriors.

    Every observed pair adds n(d,w) * P(z|d,w) to P(z), to row d of P(d|z)
    and to row w of P(w|z); the accumulators are then normalised.

    Args:
        posters: array of shape (K, N, M) as returned by the E-step.
        docs2bow: list of documents, each a list of (tokenId, count).

    Returns:
        Tuple (rePz, rePd_z, rePw_z) with shapes (K,), (N, K) and (M, K).
    """
    K, N, M = posters.shape
    # accumulators start from zero on every call because they are built up additively
    rePz, rePd_z, rePw_z = np.zeros(K), np.zeros((N, K)), np.zeros((M, K))
    for d, doc2bow in enumerate(docs2bow):
        for w, ndw in doc2bow:
            # length-K vector of weighted posteriors; handling all topics at
            # once replaces an outer loop over z that re-walked the corpus
            weighted = ndw * posters[:, d, w]
            rePz += weighted
            rePd_z[d] += weighted
            rePw_z[w] += weighted
    # normalization
    rePz /= rePz.sum()
    rePd_z /= rePd_z.sum(axis=0)
    rePw_z /= rePw_z.sum(axis=0)
    return rePz, rePd_z, rePw_z

In [23]:
# just one EM-step
pars = random_init_pars(K, n.shape)
print(pars)
print(likelihood(pars, docs2bow))
# EM
posters = Estep(pars, docs2bow)
repars = Mstep(posters, docs2bow)
print(repars)
print(likelihood(repars, docs2bow))

(array([0.11050252, 0.49422261, 0.04013839, 0.02990187, 0.32523461]), array([[1.77758908e-05, 1.36297978e-03, 1.85012743e-04, 2.80483553e-04,
        6.69807474e-04],
       [8.46433053e-04, 7.95912790e-04, 6.46290209e-05, 4.49213506e-04,
        5.32544229e-04],
       [3.08975295e-04, 5.33615596e-05, 2.38025249e-05, 4.60703674e-04,
        6.08415677e-04],
       ...,
       [1.23670783e-03, 9.65226472e-04, 9.69355921e-04, 1.97738257e-04,
        1.35466950e-03],
       [4.34613088e-04, 1.19494630e-03, 2.15061623e-04, 3.16974529e-04,
        1.02119677e-03],
       [1.38109975e-03, 8.80540174e-04, 1.01815017e-04, 4.32503093e-06,
        9.90454460e-04]]), array([[4.37660960e-04, 6.82640865e-05, 7.63038848e-04, 6.16164874e-04,
        7.03154309e-04],
       [6.47898319e-04, 2.00939033e-04, 9.62942829e-05, 5.06505313e-04,
        2.64907051e-04],
       [8.03226433e-04, 6.02588360e-04, 1.15622529e-04, 1.96341919e-04,
        4.76100360e-04],
       ...,
       [3.58202328e-04, 5.13095

In [24]:
# check whether parameters remain normalized
Pz, Pd_z, Pw_z = repars
print(Pz.sum(axis=0))
print(Pd_z.sum(axis=0))
print(Pw_z.sum(axis=0))

1.0
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]


In [25]:
# Expectation Maximization steps
def EMsteps(runtimes, pars, docs2bow):
    """Run a fixed number of EM iterations, printing progress.

    Prints the initial parameters and log-likelihood, the log-likelihood
    after each iteration, and the final parameters.

    Args:
        runtimes: number of EM iterations to perform.
        pars: starting parameters (Pz, Pd_z, Pw_z).
        docs2bow: list of documents, each a list of (tokenId, count).

    Returns:
        The parameters after the last iteration. They used to be printed
        only, so the fitted model could not be used by the caller.
    """
    print(pars)
    print(likelihood(pars, docs2bow))
    for _ in range(runtimes):
        posters = Estep(pars, docs2bow)
        pars = Mstep(posters, docs2bow)
        print(likelihood(pars, docs2bow))
    print(pars)
    return pars

In [47]:
# EM-step several times
pars = random_init_pars(K, n.shape)
EMsteps(20, pars, docs2bow)

(array([0.17723933, 0.2468415 , 0.27925253, 0.24298058, 0.05368606]), array([[1.23898491e-03, 4.07932364e-04, 7.72285110e-04, 4.91432597e-04,
        5.77671454e-04],
       [1.00782306e-04, 7.95301050e-04, 8.53073129e-04, 1.27229277e-03,
        7.45132328e-04],
       [1.04888451e-03, 4.67003607e-04, 2.48203817e-05, 7.77393756e-04,
        7.99954250e-04],
       ...,
       [1.14191162e-03, 1.03176459e-03, 4.33461703e-04, 2.83262170e-04,
        3.04485912e-04],
       [9.53813890e-04, 8.58497212e-04, 9.14528343e-04, 6.86652804e-04,
        1.05175926e-03],
       [8.35314174e-04, 2.39860872e-04, 2.09181507e-04, 1.91365792e-04,
        5.37394701e-04]]), array([[4.72962477e-04, 3.62062127e-04, 4.77349895e-04, 6.82735154e-04,
        3.79659738e-04],
       [5.24745073e-05, 2.00939004e-04, 4.75926570e-04, 2.88202758e-04,
        6.74201723e-04],
       [1.17489047e-04, 1.82402609e-04, 4.21371333e-04, 5.41033939e-04,
        7.08283859e-04],
       ...,
       [2.32574206e-04, 7.17213

In [63]:
# now you can judge the result. We'll do it formally later
token2id

{'human': 0,
 'interface': 1,
 'computer': 2,
 'survey': 3,
 'user': 4,
 'system': 5,
 'response': 6,
 'time': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}

In [64]:
corpus

['Human machine interface for Lab ABC computer applications',
 'A survey of user opinion of computer system response time',
 'The EPS user interface management system',
 'System and human system engineering testing of EPS',
 'Relation of user-perceived response time to error measurement',
 'The generation of random, binary, unordered trees',
 'The intersection graph of paths in trees',
 'Graph minors IV: Width of trees and well-quasi-ordering',
 'Graph minors: A survey']

## TEM

In [84]:
# Tempered Expectation step with control parameter beta
def TEstep(beta, pars, docs2bow):
    """Compute tempered posteriors for every (d, w) pair seen in the data.

    Same as the plain E-step except that each unnormalised term
    P(z) P(d|z) P(w|z) is raised to the power beta before normalising over
    topics; beta = 1 gives the ordinary E-step.

    Args:
        beta: tempering exponent.
        pars: tuple (Pz, Pd_z, Pw_z) with shapes (K,), (N, K) and (M, K).
        docs2bow: list of documents, each a list of (tokenId, count).

    Returns:
        Array of shape (K, N, M); entries for unseen (d, w) pairs stay zero.

    Note: the result is a dense K*N*M float array, which is large for a
    real corpus.
    """
    Pz, Pd_z, Pw_z = pars
    posters = np.zeros((len(Pz), len(Pd_z), len(Pw_z)))
    for d, doc2bow in enumerate(docs2bow):
        for w, ndw in doc2bow:
            # all K topics at once instead of an outer Python loop over z
            posters[:, d, w] = (Pz * Pd_z[d] * Pw_z[w]) ** beta
    # normalise over topics; the tiny constant avoids 0/0 for unseen (d, w) pairs
    posters /= posters.sum(axis=0) + 1e-16
    return posters

In [85]:
# Tempered Maximization step
def TMstep(posters, docs2bow):
    """Re-estimate the parameters from (tempered) posteriors.

    beta plays no role here; it has already acted in the tempered E-step.
    Every observed pair adds n(d,w) * posterior to P(z), to row d of P(d|z)
    and to row w of P(w|z); the accumulators are then normalised.

    Args:
        posters: array of shape (K, N, M) from the tempered E-step.
        docs2bow: list of documents, each a list of (tokenId, count).

    Returns:
        Tuple (rePz, rePd_z, rePw_z) with shapes (K,), (N, K) and (M, K).
    """
    K, N, M = posters.shape
    rePz, rePd_z, rePw_z = np.zeros(K), np.zeros((N, K)), np.zeros((M, K))
    for d, doc2bow in enumerate(docs2bow):
        for w, ndw in doc2bow:
            # length-K vector of weighted posteriors; handling all topics at
            # once replaces an outer loop over z that re-walked the corpus
            weighted = ndw * posters[:, d, w]
            rePz += weighted
            rePd_z[d] += weighted
            rePw_z[w] += weighted
    rePz /= rePz.sum()
    rePd_z /= rePd_z.sum(axis=0)
    rePw_z /= rePw_z.sum(axis=0)
    return rePz, rePd_z, rePw_z

## Split data to train and held-out

In [13]:
len(docs2bow)

1440

In [68]:
docs2bow

[[(0, 1), (1, 1), (2, 1)],
 [(3, 1), (4, 1), (2, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (4, 1), (1, 1), (5, 1)],
 [(5, 2), (0, 1), (8, 1)],
 [(4, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(10, 1), (9, 1)],
 [(10, 1), (11, 1), (9, 1)],
 [(10, 1), (11, 1), (3, 1)]]

In [86]:
# Walk through the corpus and randomly split every token count in two:
# the erased part is written to the held-out corpus,
# the remaining part stays behind as the training corpus.
from numpy.random import randint

docs2bow_train, docs2bow_heldout = [], []
for doc2bow in docs2bow:
    doc2bow_train, doc2bow_heldout = [], []
    for w, ndw in doc2bow:
        # number of the ndw occurrences kept for training: uniform on 0..ndw
        ndw_train = randint(ndw+1)
        ndw_heldout = ndw - ndw_train
        if ndw_train > 0:
            doc2bow_train.append((w, ndw_train))
        if ndw_heldout > 0:
            doc2bow_heldout.append((w, ndw_heldout))
    # both corpora keep one (possibly empty) entry per original document
    docs2bow_train.append(doc2bow_train)
    docs2bow_heldout.append(doc2bow_heldout)

In [16]:
len(docs2bow_heldout)

1440

In [71]:
docs2bow_train

[[(0, 1)],
 [(4, 1), (5, 1), (6, 1)],
 [(8, 1), (4, 1), (5, 1)],
 [(5, 1), (0, 1), (8, 1)],
 [(4, 1), (6, 1), (7, 1)],
 [],
 [],
 [(10, 1), (11, 1)],
 [(11, 1)]]

In [18]:
len(docs2bow_train)

1440

In [73]:
docs2bow_heldout

[[(1, 1), (2, 1)],
 [(3, 1), (2, 1), (7, 1)],
 [(1, 1)],
 [(5, 1)],
 [],
 [(9, 1)],
 [(10, 1), (9, 1)],
 [(9, 1)],
 [(10, 1), (3, 1)]]

## TEM for train and heldout

In [87]:
# Redefine likelihood to cope with words (or documents) that the split removed.
# Splitting can erase a word from the training corpus entirely. The EM steps
# only ever see the training corpus, and the M-step accumulates its estimates
# from zero over seen data, so P(w|z) of such a word stays zero throughout
# training. Evaluating likelihood(pars, heldout) would then hit P(d,w) = 0 for
# that word and the log-likelihood would diverge. To avoid this, pairs whose
# co-occurrence probability is zero are left out of the sum. The same problem
# can arise for documents that were erased entirely.
def likelihood(pars, docs2bow):
    """Log-likelihood of docs2bow under pars, ignoring zero-probability pairs.

    Args:
        pars: tuple (Pz, Pd_z, Pw_z).
        docs2bow: list of documents, each a list of (tokenId, count).

    Returns:
        Sum of n(d,w) * log P(d,w) over the stored pairs with P(d,w) != 0
        (0 when nothing contributes).
    """
    p_topic, p_doc_given_topic, p_word_given_topic = pars
    log_lik = 0
    for doc_idx, bow in enumerate(docs2bow):
        for word_idx, count in bow:
            per_topic = p_topic[:] * p_doc_given_topic[doc_idx, :] * p_word_given_topic[word_idx, :]
            joint = sum(per_topic)
            # only pairs with non-zero probability contribute
            if joint != 0:
                log_lik = log_lik + count * np.log(joint)
    return log_lik

In [88]:
# TEM-steps on training corpus and evaluating performance by likelihood on held-out corpus
def TEMsteps(beta_runtimes, first_beta, eta, docs2bow_train, docs2bow_heldout, rel_tol=0.0001):
    """Tempered EM on the training corpus, monitored on the held-out corpus.

    For each beta in first_beta, first_beta*eta, first_beta*eta**2, ... the
    tempered EM step is repeated while the held-out log-likelihood keeps
    improving by more than `rel_tol` in relative terms. The step that failed
    to improve is undone before moving on to the next beta. Progress (beta
    and the likelihood values) is printed.

    Args:
        beta_runtimes: number of beta values to try.
        first_beta: starting value of beta.
        eta: factor applied to beta after each round.
        docs2bow_train: corpus the parameters are fitted on.
        docs2bow_heldout: corpus the likelihood is evaluated on.
        rel_tol: minimum relative improvement needed to continue with the
            current beta (default 0.0001, the previously hard-coded value).

    Returns:
        The last accepted parameters (Pz, Pd_z, Pw_z).
    """
    beta = first_beta
    # Size the parameters from the corpus actually passed in; the global
    # docs2bow was used before, which only worked because the lengths match.
    pars = random_init_pars(K, (len(docs2bow_train), len(token2id)))
    # Defined up front so the undo below is valid even if the while loop never
    # runs (for example when the first likelihood is 0 and the test is 0/0).
    prepars = pars
    for i in range(beta_runtimes):
        print(beta)
        new_likeli = likelihood(pars, docs2bow_heldout) # first likelihood for the new beta
        likeli = 2 * new_likeli    # guarantees the first pass through the while loop
        # Likelihoods are negative, so an improvement makes this ratio negative.
        while (new_likeli - likeli) / likeli < -rel_tol:
            likeli = new_likeli
            print(likeli)   # print the accepted (first or improved) likelihood
            # one TEM step; keep the previous pars so the step can be undone
            prepars = pars
            posters = TEstep(beta, pars, docs2bow_train)    # TE-step
            pars = TMstep(posters, docs2bow_train)  # TM-step
            new_likeli = likelihood(pars, docs2bow_heldout) # likelihood of the re-estimated pars
        print(new_likeli)   # print the rejected likelihood
        pars = prepars  # undo the rejected step
        beta *= eta # next beta
    return pars

In [89]:
# to find the approp beta, try TEM for various betas, on training and held-out corpus
TEMsteps(20, 1.00, 0.90, docs2bow_train, docs2bow_heldout)

1.0
-142034.84801177596
-114597.9118182885
-114744.60556501515
0.9
-114597.9118182885
-114681.07421818357
0.81
-114597.9118182885
-114635.51834282668
0.7290000000000001
-114597.9118182885
-114603.31951495273
0.6561000000000001
-114597.9118182885
-114580.85690436112
-114571.01725802285
0.5904900000000002
-114580.85690436112
-114554.34638887564
-114537.22356496235
-114526.09642314336
0.5314410000000002
-114537.22356496235
-114517.25843671068


KeyboardInterrupt: 

## Final TEM by appropriate beta

In [90]:
# to obtain the learned pars, one TEM by approp beta, on the whole corpus
tem_learned_pars = TEMsteps(1, 0.6, 0.00, docs2bow, docs2bow)

0.6
-284971.88829544524
-268684.7852031237
-268614.0707105244
-268652.8475304686


In [91]:
# one EM (TEM by beta = 1), on the whole corpus
em_learned_pars = TEMsteps(1, 1.00, 0.00, docs2bow, docs2bow)

1.0
-285288.4394184393
-267813.03912732063
-265702.0525638421
-262431.01265061455
-258045.55771821545
-253556.00927200477
-249984.59537374915
-247477.68256215408
-245704.39277598343
-244398.41420856086
-243414.15568376673
-242662.37876061452
-242071.4543976675
-241592.91907326685
-241206.6328143457
-240907.64427584526
-240676.65556688447
-240486.09137127933
-240319.65002794177
-240172.4842573757
-240040.83141388625
-239924.7322200468
-239818.5380009141
-239720.330684687
-239628.9580664988
-239542.05817770082
-239469.77303903268
-239408.1277675109
-239351.10913901663
-239296.27930801772
-239241.8001750799
-239193.54131252322
-239150.59123493123
-239108.61459593775
-239069.07533886583
-239033.01205629032
-238997.55811592986
-238959.68848476443
-238921.90771125632
-238886.6331557354
-238855.2269356264
-238826.43392605553
-238797.813693915
-238770.08775136797
-238742.50516290125
-238716.34175785256
-238693.9971708892


## Looking at parameters

In [92]:
id2token = {tokenid: token for token, tokenid in token2id.items()}

In [108]:
id2token

{0: 'human',
 1: 'interface',
 2: 'computer',
 3: 'survey',
 4: 'user',
 5: 'system',
 6: 'response',
 7: 'time',
 8: 'eps',
 9: 'trees',
 10: 'graph',
 11: 'minors'}

### P(w|z)

In [93]:
# import pprint

def token2Pw_z(pars, id2token, first_probs=None):
    """Print, for every topic z, the tokens ranked by P(w|z), highest first.

    Args:
        pars: tuple (Pz, Pd_z, Pw_z); Pw_z[w, z] is P(w|z).
        id2token: mapping tokenId -> token string.
        first_probs: how many top tokens to print per topic; None prints all.
    """
    topic_probs, _doc_probs, word_probs = pars
    for topic in range(len(topic_probs)):
        # P(w|z=topic) keyed by the readable token instead of its id
        prob_by_token = {}
        for tokenid, prob in enumerate(word_probs[:, topic]):
            prob_by_token[id2token[tokenid]] = prob
        # the negated key gives a stable descending sort by probability
        ranked = sorted(prob_by_token.items(), key=lambda pair: -pair[1])
        top_tokens = dict(ranked[:first_probs])
        print(f'P(w|z={topic}) = {top_tokens}\n')

In [94]:
token2Pw_z(tem_learned_pars, id2token, first_probs=20)

P(w|z=0) = {'ebola': 0.027648390464278125, 'nhs': 0.022792457862969993, 'care': 0.012495959350300212, 'audio': 0.010957717665466441, 'health': 0.010206425415155884, 'hospital': 0.009480355076725224, 's': 0.008716434315693094, 'over': 0.00861416786580953, 'patients': 0.007894588007376904, 'amp': 0.006880905874243768, 'e': 0.006530517360499943, 'be': 0.0064264357580104845, 'new': 0.006406982748547574, 'risk': 0.00633922889990938, 'uk': 0.006096585924608676, 'call': 0.005702676330427416, 'child': 0.005591937530943784, 'mental': 0.0050070047400813675, 'help': 0.0049528568887827785, 'may': 0.004927010574704252}

P(w|z=1) = {'ebola': 0.022687556086690056, 'nhs': 0.022553664685959523, 'cancer': 0.01875419828543511, 'health': 0.012882196768281118, 'new': 0.011185796077352825, 'over': 0.008421344890084071, 'audio': 0.008076736108589467, 's': 0.00806834783743448, 'e': 0.00707670620214765, 'patients': 0.006962451551150965, 'amp': 0.006941114003582712, 'mental': 0.006234011338789937, 'be': 0.00555

In [32]:
token2Pw_z(em_learned_pars, id2token, first_probs=20)

P(w|z=0) = {'vaccines': 0.03247460585596786, 'in': 0.03170834945392084, 'covid': 0.02855633930916494, '19': 0.027949705922424323, 'is': 0.015350104962010456, 'doses': 0.014929734914714329, 'first': 0.013026888964876386, 'have': 0.011130983926283833, 'vaccine': 0.010408665479738401, '#covax': 0.010092831511789885, 'unicef': 0.009943940401509244, 'with': 0.009400984150527168, 'through': 0.009339046359508217, 'covax': 0.009133484885749813, 'countries': 0.008779356187299407, 'this': 0.00875529302862728, 'are': 0.008734285537327227, '#covid19': 0.008360411248748234, 'today': 0.008251810107146491, 'has': 0.007948913906560807}

P(w|z=1) = {'in': 0.03137692864558328, 'year': 0.021847861682374643, 'her': 0.020524772888915393, 'is': 0.016656840376821865, 'old': 0.016008465075770438, 'with': 0.014949047723987629, 'water': 0.013281882248494383, 'must': 0.01293388765432907, 'from': 0.012294802532501177, 'are': 0.012027752831292522, 'we': 0.011575922857492607, 'that': 0.011538450841227366, 'schools'

### P(w|d)

In [33]:
def doc2token(docno, id2token, docs2bow):
    """Return document `docno` as a dict mapping token string -> count.

    Args:
        docno: index of the document in docs2bow.
        id2token: mapping tokenId -> token string.
        docs2bow: list of documents, each a list of (tokenId, count).
    """
    readable_doc = {}
    for tokenid, docfreq in docs2bow[docno]:
        readable_doc[id2token[tokenid]] = docfreq
    return readable_doc

In [36]:
doc2token(1, id2token, docs2bow)

{'covid': 1,
 '19': 1,
 'disrupted': 1,
 'education': 2,
 'around': 1,
 'world': 1,
 'but': 1,
 'there’s': 1,
 'always': 1,
 'way': 1,
 'keep': 1,
 'children': 3,
 'learning': 2,
 'in': 3,
 '2020': 1,
 'unicef': 1,
 'reached': 1,
 '48': 1,
 'million': 1,
 'out': 1,
 'school': 2,
 'with': 1,
 'early': 1,
 'primary': 1,
 'amp': 1,
 'secondary': 1,
 'now': 1,
 '2021': 1,
 'we’re': 1,
 'working': 1,
 'get': 1,
 'safely': 1,
 'back': 1}

In [37]:
def doc2Pw_d(docno, pars, id2token, docs2bow):
    """Print document ``docno`` and the most probable tokens under P(w|d).

    pars: the triple ``(Pz, Pd_z, Pw_z)``.
    id2token: mapping from token id to token string.
    docs2bow: list of bag-of-words documents.

    Two lines are printed: the document translated by ``doc2token``, then a
    dict of the highest-probability tokens, twice as many entries as the
    document's bag-of-words has. Nothing is returned.
    """
    Pz, Pd_z, Pw_z = pars
    joint = Pz * Pd_z
    row_totals = joint.sum(axis=1)
    Pz_d = (joint.T / row_totals).T  # P(z|d) = Pz_d[d, z]
    # mix the per-topic word distributions with this document's topic weights
    Pw_docno = (Pz_d[docno] * Pw_z).sum(axis=1)
    # translate token ids into token strings
    token2Pw_docno = {}
    for tokenid, prob in enumerate(Pw_docno):
        token2Pw_docno[id2token[tokenid]] = prob
    # rank by decreasing probability and truncate
    ranked = sorted(token2Pw_docno.items(), key=lambda pair: -pair[1])
    keep = 2 * len(docs2bow[docno])
    print(doc2token(docno, id2token, docs2bow))
    print(dict(ranked[:keep]))

In [38]:
doc2Pw_d(10, em_learned_pars, id2token, docs2bow)

{'@unicefindia': 1, '#unicef': 1, 'supports': 1, 'india’s': 1, '#covid19': 1, 'emergency': 1, 'response': 1, 'critical': 1, 'lifesaving': 1, 'supplies': 1, 'will': 1, 'help': 1, 'fight': 1, 'against': 1}
{'vaccines': 0.03247460584737785, 'in': 0.031708349461713574, 'covid': 0.02855633930141877, '19': 0.027949705915046728, 'is': 0.01535010496442578, 'doses': 0.014929734910028398, 'first': 0.013026888960787693, 'have': 0.01113098392505983, 'vaccine': 0.010408665476693859, '#covax': 0.010092831508622092, 'unicef': 0.009943940401563534, 'with': 0.009400984148924412, 'through': 0.009339046357002237, 'covax': 0.009133484882883127, 'countries': 0.00877935618563042, 'this': 0.008755293030000636, 'are': 0.00873428553850697, '#covid19': 0.008360411246128989, 'today': 0.008251810105222262, 'has': 0.007948913906076969, 'now': 0.007413839305961486, 'health': 0.0072169582039948625, '000': 0.007136728428247235, 'work': 0.0068405383705121454, 'will': 0.006581083396570647, 'more': 0.006300083062627015,

## Filter out more tokens from corpus

In [20]:
# extra high-frequency function words to remove from the corpus
bad_tokens = {'is', 'are', 'in', 'on', 'from', 'an'}

In [21]:
bad_tokens

{'an', 'are', 'from', 'in', 'is', 'on'}

In [22]:
# Ids of the tokens to drop. A token that is not in the vocabulary (e.g. the
# stoplist already kept it out of token2id, or this section has already been
# run once) is skipped instead of raising KeyError.
bad_ids = {token2id[bad_token] for bad_token in bad_tokens if bad_token in token2id}

In [23]:
bad_ids

{24, 45, 68, 75, 150, 295}

In [53]:
n[:, 24].sum()

874.0

In [55]:
# 1) drop every token whose id is listed in bad_ids
token2id = dict(filter(lambda pair: pair[1] not in bad_ids, token2id.items()))
# 2) idmap: surviving old id -> new consecutive id (0 .. len(token2id) - 1)
idmap = {old_id: new_id for new_id, old_id in enumerate(sorted(token2id.values()))}
# 3) renumber token2id so that the ids have no gaps
token2id = {token: idmap[old_id] for token, old_id in token2id.items()}
# 4) inverse lookup: id -> token
id2token = {new_id: token for token, new_id in token2id.items()}

In [59]:
# re-express every document with the compacted ids from idmap,
# leaving out the entries whose (old) id is in bad_ids
docs2bow = [
    [
        (idmap[old_id], count)
        for old_id, count in bow
        if old_id not in bad_ids
    ]
    for bow in docs2bow
]

### Train again on the new filtered corpus

In [64]:
tem_learned_pars = TEMsteps(1, 0.75, 0.00, docs2bow, docs2bow)

0.75
-425431.66268389224
-391439.11257212405
-391138.267289122
-390969.0717964464
-390818.6538914301
-390650.4862082611
-390448.5779116147
-390204.582284762
-389913.7764248532
-389573.5008412661
-389182.69413731724
-388742.18318297184
-388255.2727892071
-387728.0546009698
-387169.10366643424
-386588.63671553467
-385997.4402678283
-385405.90921703994
-384823.41192367853
-384258.0258708974
-383716.63024695887
-383205.12450906227
-382728.21987988666
-382288.62353162246
-381886.39245234616
-381519.243740481
-381183.6785522993
-380876.30958475795
-380594.8119565753
-380337.8152701607
-380103.8778292938
-379890.8424754466
-379696.19332616864
-379517.6557088851
-379353.25884685153
-379201.12307417206
-379059.4064489057
-378926.4659374031
-378801.1888155188
-378683.1974101788
-378572.5376312022
-378469.1387275922
-378372.6534928521
-378282.6427066944
-378198.6882268026
-378120.3101748087
-378046.91194053355
-377977.83030696196
-377912.3938625884
-377849.9373187074
-377789.78956861724
-377731.2

In [65]:
token2Pw_z(tem_learned_pars, id2token, first_probs=15)

P(w|z=0) = {'covid': 0.02468356647913886, '19': 0.024253387009451437, 'health': 0.022261930699514716, 'children': 0.01969548855770016, 'as': 0.014908684959769932, 'their': 0.014148953752200037, 'we': 0.014108597642450538, 'need': 0.0134506872765165, 'with': 0.01271699538036316, 'support': 0.010442469174984214, 'our': 0.009198552431727314, 'unicef': 0.008986724080209932, 'protect': 0.008824943871514635, 'this': 0.008752493028241057, 'pandemic': 0.00806233140197669}

P(w|z=1) = {'we': 0.02369161019191525, 'children': 0.018073847248061034, 'water': 0.016281902600475736, 'education': 0.015550203128473423, 'must': 0.013947298068170222, 'world': 0.0127433691651005, 'schools': 0.010981692987583705, 'year': 0.01096883433121957, 'learning': 0.01049214327022325, 'child': 0.0101754646051107, '@unicefchief': 0.009806186236152769, 'school': 0.009777940250866196, 'can': 0.009587456818836222, 'her': 0.009584724517253985, 'they': 0.009276802224399649}

P(w|z=2) = {'our': 0.01532605410362877, 'you': 0.

In [66]:
em_learned_pars = TEMsteps(1, 1.00, 0.00, docs2bow, docs2bow)

1.0
-425819.4926255317
-391217.5947833386
-390335.4293423235
-389370.9419203345
-388284.9127037604
-387163.9989295611
-386108.68397401087
-385159.5892519591
-384316.224334903
-383565.135951779
-382893.17425778176
-382297.7264387191
-381780.42531736754
-381339.1619803692
-380965.4924255965
-380649.1401449335
-380380.75739436585
-380146.92629415356
-379938.0748204201
-379751.28196120146
-379586.8616816973
-379443.37589151104
-379317.22124338406
-379203.49519475654
-379098.2239139739
-379000.81620901683
-378910.9957664596
-378828.7140838424
-378754.35189902125
-378687.1250606837
-378624.75672718964
-378566.1002667818
-378511.0950549714
-378458.853139742
-378408.70581985585
-378359.372272804
-378312.0420342798
-378267.78088753414
-378225.22952131595
-378184.3021556253
-378145.1990986536
-378105.3688898736
-378063.0785134567
-378019.60331470566
-377980.2499396444
-377947.5022322393


In [67]:
token2Pw_z(em_learned_pars, id2token, first_probs=15)

P(w|z=0) = {'covid': 0.024270215995917794, '19': 0.02365216072391313, 'you': 0.02009835596304525, 'school': 0.013256782456743239, 'year': 0.012749361797425744, 'your': 0.012628073123806665, 'what': 0.012374361802135176, 'children': 0.011592369310568526, 'our': 0.010876614159713508, 'out': 0.010561089965584426, 'as': 0.010139672062222448, 'schools': 0.00998182832315023, 'can': 0.009600616384674462, 'health': 0.009061027215229986, 'be': 0.008949571306068636}

P(w|z=1) = {'vaccines': 0.03896814669276154, 'covid': 0.03165215591574358, '19': 0.031203825858483112, 'this': 0.015478098727342984, 'doses': 0.013917198266272917, 'vaccine': 0.013462840697844262, 'first': 0.013196162462808731, 'countries': 0.012771046363391015, 'as': 0.012584793086886856, 'have': 0.012359710933277228, 'unicef': 0.010901370446547537, '#covax': 0.009987636402766863, 'with': 0.009953558324920111, 'today': 0.009660247537424197, 'against': 0.009005049440376079}

P(w|z=2) = {'children': 0.025916549719365144, 'unicef': 0.

In [79]:
doc2Pw_d(4, tem_learned_pars, id2token, docs2bow)

{'deadly': 1, 'wave': 1, 'covid': 1, '19': 1, 'surging': 1, 'nepal': 1, 'much': 2, 'needed': 2, 'medical': 1, 'supplies': 1, 'unicef': 1, 'others': 1, 'have': 1, 'arrived': 1, 'recent': 1, 'weeks': 1, 'but': 1, 'more': 1, 'support': 1, 'save': 1, 'lives': 1, '👉': 1}
{'covid': 0.025798001562756627, '19': 0.025501475349848597, 'children': 0.01726045979981774, 'health': 0.016424699514085876, 'as': 0.012124974051717212, 'we': 0.011910178823188133, 'with': 0.011366823740050396, 'their': 0.01057672682376312, 'this': 0.010257751148089975, 'vaccines': 0.009571095883265887, 'need': 0.00951070985231948, 'unicef': 0.009073333589515405, 'our': 0.007567600826086653, 'support': 0.007565807968559286, 'pandemic': 0.0069677644904859145, 'how': 0.006492333725106847, 'has': 0.006401788749532125, 'protect': 0.006291225801196093, 'more': 0.006072502725161493, 'they': 0.00598152630680896, 'must': 0.005794469516482443, 'people': 0.005715443027971143, 'families': 0.005670245464871051, 'have': 0.00564600841108

In [80]:
doc2Pw_d(4, em_learned_pars, id2token, docs2bow)

{'deadly': 1, 'wave': 1, 'covid': 1, '19': 1, 'surging': 1, 'nepal': 1, 'much': 2, 'needed': 2, 'medical': 1, 'supplies': 1, 'unicef': 1, 'others': 1, 'have': 1, 'arrived': 1, 'recent': 1, 'weeks': 1, 'but': 1, 'more': 1, 'support': 1, 'save': 1, 'lives': 1, '👉': 1}
{'children': 0.022066839648354698, '19': 0.013985919398151653, 'covid': 0.013655859800531574, 'with': 0.012511277091520511, 'unicef': 0.012385279806845884, 'health': 0.011090333422539393, 'this': 0.010330779815727033, 'by': 0.00976739772130558, 'water': 0.009521062385345813, 'how': 0.009270792905703514, 'child': 0.008226983666709244, 'their': 0.007100656997252054, 'at': 0.006831066725838957, 'families': 0.006667384163975103, 'people': 0.006605730057359597, 'working': 0.006149019789198803, 'young': 0.005972979057476755, 'climate': 0.005661515042242542, 'vaccines': 0.005482015420961938, 'have': 0.005342986250172782, 'has': 0.00518666133428618, 'lives': 0.005173801195348305, 'safe': 0.005168448814537085, 'every': 0.00509726843

## Change the number of topics K

In [84]:
len(token2id)

2475

In [81]:
# Redefine TEMsteps so that the number of topics can be varied:
# the only change to the inputs is the new leading argument K.
def TEMsteps(K, beta_runtimes, first_beta, eta, docs2bow_train, docs2bow_heldout):
    """Run tempered EM and return the learned parameters.

    Starting from ``random_init_pars``, TE/TM steps are applied on
    ``docs2bow_train`` at the current ``beta`` while the value of
    ``likelihood(pars, docs2bow_heldout)`` keeps changing by a relative amount
    ``(new - old) / old < -0.0001`` (for the negative values this notebook
    prints, that is an improvement of more than 0.01 %). When the loop stops,
    the parameters from before the last step are restored, ``beta`` is
    multiplied by ``eta`` and the whole procedure is repeated.

    K: number of topics, forwarded to ``random_init_pars``.
    beta_runtimes: how many successive beta values are tried.
    first_beta: the beta used in the first round.
    eta: factor applied to beta after every round.
    docs2bow_train: bag-of-words documents used by the TE and TM steps.
    docs2bow_heldout: bag-of-words documents used to evaluate ``likelihood``.

    Prints beta at the start of each round and the likelihood after each step.
    The vocabulary size is read from the notebook-level ``token2id``.
    Returns the parameters in the format produced by ``TMstep``.
    """
    beta = first_beta
    # Take the document count from the training corpus that was passed in,
    # not from the notebook-global docs2bow.
    pars = random_init_pars(K, (len(docs2bow_train), len(token2id)))
    for _ in range(beta_runtimes):
        print(beta)
        new_likeli = likelihood(pars, docs2bow_heldout)
        # Doubling makes the first relative change equal to -0.5, so the loop
        # body runs at least once for any finite non-zero likelihood.
        likeli = 2 * new_likeli
        # Bind prepars up front so the "undo" below is valid even if the loop
        # body never runs; otherwise this would be a NameError.
        prepars = pars
        while (new_likeli - likeli) / likeli < -0.0001:
            likeli = new_likeli
            print(likeli)
            # one TEM step
            prepars = pars
            posters = TEstep(beta, pars, docs2bow_train)
            pars = TMstep(posters, docs2bow_train)
            new_likeli = likelihood(pars, docs2bow_heldout)
        print(new_likeli)
        pars = prepars  # undo the last step, which did not pass the threshold
        beta *= eta  # new beta
    return pars

In [107]:
em_learned_pars = TEMsteps(2, 1, 1.00, 0.00, docs2bow, docs2bow)

1.0
-430513.12830310495
-391763.5390693877
-391284.4763126484
-390905.75406712235
-390543.5970663452
-390199.77735845326
-389892.4450339308
-389629.84841684834
-389409.2997886119
-389226.0748744146
-389073.9291497022
-388944.7253454876
-388831.84204608266
-388732.2642768752
-388644.2559939053
-388565.8610729162
-388495.0246026755
-388430.2726783615
-388370.8803717051
-388315.82688669226
-388263.4012173367
-388211.4414686427
-388157.9246248506
-388102.51112295716
-388046.0089056616
-387988.70039470855
-387932.02301489457
-387878.39951969957
-387828.5114933154
-387780.68568455626
-387733.29003234074
-387686.41234263324
-387640.51409769314
-387595.5714967665
-387551.20442028233
-387508.95022844756
-387472.018944851


In [108]:
token2Pw_z(em_learned_pars, id2token, first_probs=20)

P(w|z=0) = {'covid': 0.021758770844347353, '19': 0.02153851286123545, 'we': 0.013392025450965812, 'vaccines': 0.013178057790070892, '@unicefchief': 0.011251972598036016, 'with': 0.011048746448163262, 'health': 0.010569961046098466, 'how': 0.010317035244652405, 'this': 0.010172914675159675, 'can': 0.009531869855657139, 'as': 0.00915957737799965, 'our': 0.00913525268087066, 'people': 0.009134438220804704, 'children': 0.0090451011069225, 'young': 0.008173632157243156, 'unicef': 0.0073523078355777795, 'have': 0.006890015556415201, 'out': 0.006601635478875142, 'pandemic': 0.006216327861884523, '#covid19': 0.006147663585431794}

P(w|z=1) = {'children': 0.020218434627579017, 'covid': 0.012628672163225196, '19': 0.012614058389276489, 'we': 0.011552539656240297, 'year': 0.009143000308086468, 'their': 0.008839372658277693, 'have': 0.008501061728781024, 'education': 0.007677880295079404, 'this': 0.007376809714590481, 'at': 0.007224233738027378, 'be': 0.007197963780074489, 'they': 0.00701267604229