In [5]:
import numpy as np
import time
import codecs
import re
import json
import nltk
from nltk.corpus import stopwords

In [10]:
def preprocessing(json_file_path):
    """Turn the "summary" field of each JSON record into a list of word ids.

    The file at ``json_file_path`` is read as UTF-8 JSON and iterated record by
    record (each record must support ``.get`` -- presumably a list of dicts;
    verify against the dataset). Records whose "summary" is missing or empty
    are skipped entirely. Each remaining summary is tokenized with
    ``nltk.word_tokenize``; a token is kept only if, after lower-casing and
    stripping, it is longer than one character, purely alphabetic and not an
    English stopword. Ids are handed out in order of first appearance,
    starting at 0.

    Returns:
        (docs, word2id, id2word) where ``docs`` is a list with one list of
        word ids per kept record (possibly empty if every token was filtered),
        ``word2id`` maps word -> id and ``id2word`` is the inverse mapping.
    """
    english_stopwords = set(stopwords.words('english'))

    with codecs.open(json_file_path, 'r', 'utf-8') as handle:
        records = json.load(handle)

    word2id, id2word = {}, {}
    docs = []

    for record in records:
        text = record.get("summary", "")
        if not text:
            continue

        token_ids = []
        for raw_token in nltk.word_tokenize(text):
            token = raw_token.lower().strip()
            # Drop one-character tokens, anything non-alphabetic, and stopwords.
            if len(token) <= 1 or not token.isalpha() or token in english_stopwords:
                continue
            if token not in word2id:
                # Ids are dense, so the vocabulary size is the next free id.
                new_id = len(word2id)
                word2id[token] = new_id
                id2word[new_id] = token
            token_ids.append(word2id[token])
        docs.append(token_ids)

    return docs, word2id, id2word

In [11]:
def randomInitialize():
    """Assign an initial topic to every token and record it in the count tables.

    Works entirely on module-level state: reads ``docs``; updates ``ndz``
    (indexed [document, topic]), ``nzw`` (indexed [topic, word]) and ``nz``
    (indexed [topic]) in place; and appends one list of topic assignments per
    document to ``Z``. ``Z`` is not cleared here, so the caller must start
    with an empty list.

    Each token's topic is a single multinomial draw whose weights are
    proportional to ``ndz[d, :] * nzw[:, w] / nz`` given the counts
    accumulated so far.
    """
    for doc_idx, tokens in enumerate(docs):
        assignments = []
        for word_id in tokens:
            weights = ndz[doc_idx] * nzw[:, word_id] / nz
            # One-hot draw; argmax recovers the index of the sampled topic.
            topic = np.random.multinomial(1, weights / weights.sum()).argmax()
            ndz[doc_idx, topic] += 1
            nzw[topic, word_id] += 1
            nz[topic] += 1
            assignments.append(topic)
        Z.append(assignments)

In [12]:
def gibbsSampling():
    """Run one full resampling sweep over every token of every document.

    Operates on module-level state: reads ``docs``; mutates ``Z``, ``ndz``
    (indexed [document, topic]), ``nzw`` (indexed [topic, word]) and ``nz``
    (indexed [topic]) in place. Returns nothing.

    For each token the current assignment is removed from the three count
    tables, a new topic is drawn with weights proportional to
    ``ndz[d, :] * nzw[:, w] / nz``, and the counts are restored for the
    newly drawn topic.
    """
    for doc_idx, tokens in enumerate(docs):
        for pos, word_id in enumerate(tokens):
            # 1) Take the token's current topic out of the statistics.
            old_topic = Z[doc_idx][pos]
            ndz[doc_idx, old_topic] -= 1
            nzw[old_topic, word_id] -= 1
            nz[old_topic] -= 1

            # 2) Draw a replacement topic from the conditional distribution.
            weights = ndz[doc_idx] * nzw[:, word_id] / nz
            new_topic = np.random.multinomial(1, weights / weights.sum()).argmax()

            # 3) Store the draw and put the token back into the statistics.
            Z[doc_idx][pos] = new_topic
            ndz[doc_idx, new_topic] += 1
            nzw[new_topic, word_id] += 1
            nz[new_topic] += 1

In [13]:
def perplexity():
    """Return the per-token perplexity of ``docs`` under the current counts.

    Reads module-level state only (nothing is modified): ``docs`` (lists of
    word ids), ``ndz`` (indexed [document, topic]), ``nzw`` (indexed
    [topic, word]) and ``nz`` (indexed [topic]).

    For a token ``w`` in document ``d`` the probability is
    ``sum_k (nzw[k, w] / nz[k]) * (ndz[d, k] / sum_k ndz[d, k])``; the result
    is ``exp(-mean(log p))`` over all tokens.

    Returns:
        The perplexity as a float, or NaN when the corpus contains no tokens
        (the mean log-likelihood is undefined in that case; previously this
        raised ZeroDivisionError).
    """
    # Both normalised tables are loop-invariant, so build them once.
    topic_word = nzw / nz[:, np.newaxis]
    doc_topic = ndz / ndz.sum(axis=1, keepdims=True)

    log_likelihood = 0.0
    token_count = 0
    for d, doc in enumerate(docs):
        if not doc:
            continue
        # Mix the topic-word columns of this document's tokens with its
        # topic proportions: one probability per token.
        token_probs = doc_topic[d] @ topic_word[:, doc]
        log_likelihood += np.log(token_probs).sum()
        token_count += len(doc)

    if token_count == 0:
        return float("nan")
    return np.exp(-log_likelihood / token_count)

In [14]:
# --- Run configuration -------------------------------------------------------
alpha = 5               # pseudo-count added to every cell of ndz (document-topic)
beta = 0.1              # pseudo-count added to every cell of nzw (topic-word)
iterationNum = 50       # number of full Gibbs sweeps
K = 3                   # number of topics
maxTopicWordsNum = 10   # how many top words to report per topic
SEED = 42               # fixed so that Restart & Run All reproduces the same run

# The sampler draws from NumPy's global RNG; seed it before any sampling.
np.random.seed(SEED)

# --- Build the corpus and the count tables -----------------------------------
Z = []
docs, word2id, id2word = preprocessing("combined_summary.json")
N = len(docs)
M = len(word2id)
ndz = np.zeros([N, K]) + alpha
nzw = np.zeros([K, M]) + beta
nz = np.zeros([K]) + M * beta

# --- Train -------------------------------------------------------------------
randomInitialize()
for i in range(0, iterationNum):
    gibbsSampling()
    print(time.strftime('%X'), "Iteration: ", i, " Completed", " Perplexity: ", perplexity())

# --- Extract the highest-weighted words of each topic ------------------------
topicwords = []
for z in range(0, K):
    # argsort is ascending; reverse it and keep only the leading entries
    # instead of materialising the whole vocabulary with repeated insert(0, ...).
    ids = nzw[z, :].argsort()[::-1][:maxTopicWordsNum]
    topicword = [id2word[j] for j in ids]
    topicwords.append(topicword)

23:39:58 Iteration:  0  Completed  Perplexity:  1601.4458812967484
23:39:58 Iteration:  1  Completed  Perplexity:  1564.8555620861898
23:39:59 Iteration:  2  Completed  Perplexity:  1525.4060103569052
23:39:59 Iteration:  3  Completed  Perplexity:  1483.049077948842
23:39:59 Iteration:  4  Completed  Perplexity:  1440.6696384791242
23:40:00 Iteration:  5  Completed  Perplexity:  1400.1632282464623
23:40:00 Iteration:  6  Completed  Perplexity:  1363.7493807685983
23:40:00 Iteration:  7  Completed  Perplexity:  1330.5314447940789
23:40:01 Iteration:  8  Completed  Perplexity:  1307.2173167075662
23:40:01 Iteration:  9  Completed  Perplexity:  1286.9411632313015
23:40:02 Iteration:  10  Completed  Perplexity:  1272.0184361807794
23:40:02 Iteration:  11  Completed  Perplexity:  1259.5159278123035
23:40:02 Iteration:  12  Completed  Perplexity:  1248.9676010688079
23:40:03 Iteration:  13  Completed  Perplexity:  1238.830571996407
23:40:03 Iteration:  14  Completed  Perplexity:  1228.639669

In [15]:
topicwords

[['seats',
  'bjp',
  'india',
  'israel',
  'gaza',
  'israeli',
  'congress',
  'modi',
  'nda',
  'sabha'],
 ['china',
  'sea',
  'philippines',
  'south',
  'chinese',
  'president',
  'philippine',
  'ukraine',
  'military',
  'dialogue'],
 ['minister',
  'national',
  'president',
  'coalition',
  'article',
  'meeting',
  'despite',
  'new',
  'june',
  'including']]