In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer, treebank
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm

In [2]:
tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(stopwords.words('english'))
p_stemmer = PorterStemmer()

In [3]:
with open("./LDA_test.txt", "r") as f:
    f1 = f.read()

In [4]:
sents = f1.split("\n")

In [5]:
texts = []

In [6]:
for i in sents:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)

In [7]:
texts[:2]

[['thai', 'polit', 'countri', 'young', 'men'],
 ['cricket',
  'religi',
  'follow',
  'india',
  'sometim',
  'even',
  'religion',
  'take',
  'back',
  'seat',
  'come',
  'unit',
  'peopl',
  'cricket']]

In [72]:
NUM_TOPICS = 3
topic_allocation = []

In [73]:
for text in texts:
    topic_allocation.append([(i, np.random.randint(NUM_TOPICS)) for i in text])

In [74]:
topic_allocation

[[('thai', 2), ('polit', 2), ('countri', 1), ('young', 0), ('men', 1)],
 [('cricket', 1),
  ('religi', 2),
  ('follow', 0),
  ('india', 0),
  ('sometim', 1),
  ('even', 2),
  ('religion', 2),
  ('take', 0),
  ('back', 2),
  ('seat', 2),
  ('come', 0),
  ('unit', 1),
  ('peopl', 0),
  ('cricket', 0)],
 [('cricket', 1),
  ('nation', 2),
  ('sport', 1),
  ('play', 2),
  ('two', 2),
  ('team', 1),
  ('eleven', 0),
  ('player', 1),
  ('score', 0),
  ('run', 2),
  ('point', 2),
  ('run', 0),
  ('two', 1),
  ('set', 2),
  ('three', 1),
  ('small', 1),
  ('wooden', 2),
  ('post', 0),
  ('call', 1),
  ('wicket', 2)],
 [('england', 0),
  ('dismiss', 0),
  ('record', 1),
  ('low', 1),
  ('score', 2),
  ('new', 1),
  ('zealand', 1),
  ('home', 1),
  ('team', 0),
  ('took', 1),
  ('cautiou', 2),
  ('approach', 0),
  ('led', 0),
  ('captain', 1),
  ('kane', 0),
  ('williamson', 1),
  ('post', 2),
  ('18th', 0),
  ('test', 2),
  ('centuri', 1),
  ('new', 0),
  ('zealand', 2),
  ('record', 1),
  ('102

In [75]:
def df_generator(topic_allocation):    
    token_list = []
    topic_list = []
    doc_list = []
    for i, text in enumerate(topic_allocation):
        for token in text:
            topic_list.append(token[1])
            token_list.append(token[0])
            doc_list.append(i)
    return pd.DataFrame({"token": token_list, "topic_no": topic_list, "doc_no": doc_list})

In [76]:
def token_dist_maker(df):   
    token_dist = df.pivot_table(values=["topic_no"], index=["token"], columns=["topic_no"], aggfunc="count")
    token_dist.reset_index(inplace=True)
    token_dist.columns = ['token', 'grp_1', 'grp_2', 'grp_3']
    token_dist.fillna(0.01 , inplace=True)
    return token_dist

In [77]:
def doc_dist_maker(df):    
    doc_dist = df.pivot_table(values=["topic_no"], index=["doc_no"], columns=["topic_no"], aggfunc="count")
    doc_dist.reset_index(inplace=True)
    doc_dist.columns = ['doc_no', 'grp_1', 'grp_2', 'grp_3']
    doc_dist.fillna(0.01 , inplace=True)
    return doc_dist

In [78]:
df_generator(topic_allocation).head()

Unnamed: 0,doc_no,token,topic_no
0,0,thai,2
1,0,polit,2
2,0,countri,1
3,0,young,0
4,0,men,1


In [79]:
token_dist_0 = token_dist_maker(df_generator(topic_allocation))
token_dist_0.head(10)

Unnamed: 0,token,grp_1,grp_2,grp_3
0,102,0.01,0.01,1.0
1,18th,1.0,0.01,0.01
2,1st,0.01,1.0,0.01
3,2019,0.01,2.0,1.0
4,6,0.01,0.01,1.0
5,addit,1.0,0.01,0.01
6,allianc,1.0,0.01,1.0
7,also,2.0,0.01,1.0
8,alway,0.01,1.0,1.0
9,approach,1.0,0.01,0.01


In [80]:
doc_dist_0 = doc_dist_maker(df_generator(topic_allocation))
doc_dist_0.head(10)

Unnamed: 0,doc_no,grp_1,grp_2,grp_3
0,0,1.0,2.0,2.0
1,1,6.0,3.0,5.0
2,2,4.0,8.0,8.0
3,3,8.0,10.0,6.0
4,4,6.0,8.0,8.0
5,5,2.0,0.01,2.0
6,6,3.0,1.0,1.0
7,7,4.0,2.0,3.0
8,8,3.0,4.0,4.0
9,9,2.0,1.0,2.0


In [19]:
ITER = 10
doc_dist = doc_dist_0.copy()
token_dist = token_dist_0.copy()

In [34]:
for num in tqdm(range(ITER)):
    for doc_number, sent in enumerate(topic_allocation):
        t1_freq = doc_dist.loc[(doc_dist.doc_no == doc_number), "grp_1"].values[0]
        t2_freq = doc_dist.loc[(doc_dist.doc_no == doc_number), "grp_2"].values[0]
        t3_freq = doc_dist.loc[(doc_dist.doc_no == doc_number), "grp_3"].values[0]
        for i, (token, topic) in enumerate(sent):
            p1_freq = token_dist.loc[token_dist.token == token, "grp_1"].values[0]
            p2_freq = token_dist.loc[token_dist.token == token, "grp_2"].values[0]
            p3_freq = token_dist.loc[token_dist.token == token, "grp_3"].values[0]
            
            p1t1 = t1_freq * p1_freq
            p2t2 = t2_freq * p2_freq
            p3t3 = t3_freq * p3_freq  
            total = p1t1 + p2t2 + p3t3
            
            p_p1t1 = p1t1/total
            p_p2t2 = p2t2/total
            p_p3t3 = p3t3/total
            
            topic = np.random.choice([0, 1, 2], p=[p_p1t1, p_p2t2, p_p3t3])
            topic_allocation[doc_number][i] = (token, topic)
            
            token_dist = token_dist_maker(df_generator(topic_allocation))
            doc_dist = doc_dist_maker(df_generator(topic_allocation))

100%|██████████| 10/10 [01:55<00:00, 11.59s/it]


In [35]:
# token_dist = token_dist_maker(df_generator(topic_allocation))
token_dist.head(10)

Unnamed: 0,token,grp_1,grp_2,grp_3
0,102,0.01,1.0,0.01
1,18th,0.01,0.01,1.0
2,1st,0.01,1.0,0.01
3,2019,0.01,3.0,0.01
4,6,1.0,0.01,0.01
5,addit,1.0,0.01,0.01
6,allianc,2.0,0.01,0.01
7,also,0.01,3.0,0.01
8,alway,0.01,0.01,2.0
9,approach,1.0,0.01,0.01


In [43]:
# doc_dist = doc_dist_maker(df_generator(topic_allocation))
doc_dist.head(20)

Unnamed: 0,doc_no,grp_1,grp_2,grp_3
0,0,2.0,0.01,3.0
1,1,3.0,10.0,1.0
2,2,6.0,5.0,9.0
3,3,9.0,7.0,8.0
4,4,9.0,13.0,0.01
5,5,1.0,3.0,0.01
6,6,2.0,3.0,0.01
7,7,2.0,7.0,0.01
8,8,4.0,7.0,0.01
9,9,0.01,5.0,0.01


In [63]:
sents[19]

'The theory of political religion concerns governmental ideologies whose cultural and political backing is so strong that they are said to attain power equivalent to those of a state religion, with which they often exhibit significant similarities in both theory and practice.[6] In addition to basic forms of politics, like parliament and elections, it also holds an aspect of sacralization related to the institutions contained within the regime and also provides the inner measures traditionally considered to be religious territory, such as ethics, values, symbols, myths, rituals and for example a national liturgical calendar.Politics is in good phase today. '

In [24]:
doc_dist_0.head(10)

Unnamed: 0,doc_no,grp_1,grp_2,grp_3
0,0,2.0,2.0,1.0
1,1,5.0,5.0,4.0
2,2,7.0,6.0,7.0
3,3,10.0,5.0,9.0
4,4,13.0,6.0,3.0
5,5,1.0,2.0,1.0
6,6,1.0,2.0,2.0
7,7,2.0,6.0,1.0
8,8,2.0,4.0,5.0
9,9,2.0,2.0,1.0
