# On Finding the Natural Number of Topics with Latent Dirichlet Allocation

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plotter
import feather
import multiprocessing

from scipy import stats, linalg
from gensim import corpora, models, similarities, matutils
from collections import defaultdict



In [2]:
INFILE = r'.\output\notes_clean_compact.feather'
INFILE_STOPWORDS = r'.\data\stopwords.csv'
MINIMUM_TERM_N = 25 #if the word frequency is < N it is not used
MAXIMUM_DOC_PROPORTION = 0.3 #if the word appears in more than x% of documents it is not used
NUM_WORKERS = multiprocessing.cpu_count() - 1

In [3]:
df_compact = feather.read_dataframe(INFILE)
documents = df_compact.notes_standard_transfrom.tolist()
TOTAL_DOCUMENTS = len(documents)

In [4]:
# Remove common words and tokenize
stoplist = set(pd.read_csv(r'.\data\stopwords.csv').STOPWORDS_CUSTOM.tolist())
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]

Document pruning logic is implemented here. We only keep tokens that appear at a minimum `MINIMUM_TERM_N` times and are in not above `MAXIMUM_DOC_PROPORTION` percentage of the document

In [5]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > MINIMUM_TERM_N]
         for text in texts]

dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_above=MAXIMUM_DOC_PROPORTION)
dictionary.compactify()

Converting documents to bag-of-words representation. We can hold everything in memory as it is a relatiely small corpus

In [6]:
bm_corpus = [dictionary.doc2bow(text) for text in texts]

Let us now define Arun et al metric. From the paper:

We view LDA as a matrix factorization method which factorizes a document-word frequency matrix $M$ into two matrices $M1$ and $M2$ of order $T∗W$ and $D∗T$ respectively where $T$ is the number of topics and $W$ is the size of the vocabulary of the corpus. We propose a new measure that computes the symmetric Kullback-Leibler divergence of the Singluar value distributions of matrix $M1$ and the distribution of the vector $L∗M2$ where $L$ is a $1 ∗ D$ vector containing the lengths of each document in the corpus.

$CM1$ - distribution of topics in the corpus $C$ got from the matrix $M1$

$CM2$ - distribution over singular values of matrix $M1$


In [7]:
%load_ext line_profiler

In [8]:
def arun(corpus, dictionary, max_topics, min_topics=1, step=1):
    score = []
    for i in range(min_topics,max_topics,step):
        
        #Fitting LDA model. Using multi-core for performance boost
        lda = models.ldamulticore.LdaMulticore(corpus=corpus
                                               ,id2word=dictionary
                                               ,num_topics=i
                                               ,workers=NUM_WORKERS)
        
        m1 = lda.expElogbeta # gives the topic-word distribution for each topic, i.e. p(word|topic)
        u,cm1,v = np.linalg.svd(m1, full_matrices=False) #lets use this; fast and we do not use full matrix anyway
        
        #Document-topic matrix
        lda_topics = lda[bmc_corpus]
        m2 = matutils.corpus2dense(lda_topics, lda.num_topics, TOTAL_DOCUMENTS)
        m2 = m2.transpose()
        cm2 = l.dot(m2)
        cm2 = cm2 + 0.0001
        cm2norm = np.linalg.norm(l)
        cm2 = cm2/cm2norm
        
        #Calculate divergence measure
        score_iteration = sym_kl(cm1,cm2)
        
        #Append score for this run
        score.append(score_iteration)
        
        print('Finished running topic {0:3d}; Divergence measure:{1:.3f}'.format(i, score_iteration))
    return score

Defining the symmetric KL divergence function
$ProposedMeasure(M1,M2) = KL(CM1||CM2) + KL(CM2||CM1)$

In [9]:
# Define symmetric KL divergence function
def sym_kl(p,q):
    return np.sum([stats.entropy(p,q),stats.entropy(q,p)])

Now, we need the $L$ vector which, as mentioned above, is a vector containing the lengths of each document in the corpus.

In [10]:
l = np.array([sum(cnt for _, cnt in doc) for doc in bmc_corpus])

In [11]:
%lprun -f arun arun(bmc_corpus,dictionary,max_topics=2)

Finished running topic   1; Divergence measure:0.000


In [20]:
#Fitting LDA model. Using multi-core for performance boost
lda = models.ldamulticore.LdaMulticore(corpus=bmc_corpus
                                       ,id2word=dictionary
                                       ,num_topics=2
                                       ,workers=NUM_WORKERS)

In [23]:
%time matutils.corpus2dense(lda[bmc_corpus], lda.num_topics, TOTAL_DOCUMENTS)

Wall time: 1min 26s


array([[ 0.35267299,  0.9405632 ,  0.95922583, ...,  0.90538317,
         0.40841088,  0.10225472],
       [ 0.64732701,  0.05943678,  0.04077417, ...,  0.09461683,
         0.59158915,  0.89774531]], dtype=float32)

In [24]:
%time matutils.corpus2csc(lda[bmc_corpus], num_terms=lda.num_topics, num_docs=TOTAL_DOCUMENTS).todense()

Wall time: 1min 21s


matrix([[ 0.35269361,  0.94057698,  0.95922504, ...,  0.90539204,
          0.40837986,  0.10225515],
        [ 0.64730639,  0.05942302,  0.04077496, ...,  0.09460796,
          0.59162014,  0.89774485]])

In [1]:
kl

NameError: name 'kl' is not defined

In [None]:

# Plot kl divergence against number of topics
plt.plot(kl)
plt.ylabel('Symmetric KL Divergence')
plt.xlabel('Number of Topics')
plt.savefig('kldiv.png', bbox_inches='tight')