Spark LDA for NASA ADS v1.0 
8/2/2016
@author: Tarun Ruchandani

To Do:
x Generate document term matrix for LDA
x Get LDA Results on small Corpus

o Generate JS Matrix: figure out the right implementation - also try ifa library
o Build Mapequation with JS Matrix
o Extend to a larger corpus
o Build Docker Container
o Deploy to adsqb

In [28]:
#All 'em libraries

import numpy as np
import textmining
import pyspark
import lda
import lda.datasets
import ads
import pandas as pd
from textblob import TextBlob
from numpy  import array
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [29]:
# Extract Cosmology papers
ads.config.token = 'TkEI7jQxScyoCtzHfhcpCWVelRqySmH5XBksQCFA'
papers = list(ads.SearchQuery(q='cosmology'))

paper_abstracts=list()

for paper in papers:
    paper_abstracts.append(paper.abstract)

# Select first 25 papers which have abstracts. Some later ones don't. Revisit this when scaling.
paper_abstracts = paper_abstracts[0:25]




In [76]:
# Build Term-Document Matrix

tdm = textmining.TermDocumentMatrix()
for paper in paper_abstracts:
    tdm.add_doc(paper)
    
paper_abstracts_tdm = list()

for row in tdm.rows(cutoff=0):
    paper_abstracts_tdm.append(row)

paper_abstracts_tdm_df = pd.DataFrame(paper_abstracts_tdm)
paper_abstracts_tdm_df.head()

# Vocab of terms in abstracts
vocab = paper_abstracts_tdm_df._slice(slice(1))
vocab

d_t_freq = paper_abstracts_tdm_df._slice(slice(1,24))

b=d_t_freq.values

b=b.astype(int)


# d_t_freq_m = d_t_freq.as_matrix
# d_t_freq_np = np.ndarray(b, dtype=np.int64)
# d_t_freq_np


In [79]:
# running LDA on TDM
# Parameters:	
# rdd – RDD of documents, which are tuples of document IDs and term (word) count vectors. 
# The term count vectors are “bags of words” with a fixed-size vocabulary 
# (where the vocabulary size is the length of the vector). Document IDs must be unique and >= 0.

# k – Number of topics to infer, i.e., the number of soft cluster centers. (default: 10)

# maxIterations – Maximum number of iterations allowed. (default: 20)

# docConcentration – Concentration parameter (commonly named “alpha”) 
# for the prior placed on documents’ distributions over topics (“theta”). (default: -1.0)

# topicConcentration – Concentration parameter (commonly named “beta” or “eta”) 
# for the prior placed on topics’ distributions over terms. (default: -1.0)

# seed – Random seed for cluster initialization. Set as None to generate seed based on system time. (default: None)

# checkpointInterval – Period (in iterations) between checkpoints. (default: 10)

# optimizer – LDAOptimizer used to perform the actual calculation. Currently “em”, “online” are supported. 
# (default: “em”)

model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(b)

topic_word = model.topic_word_  # model.components_ also works


for i, topic_dist in enumerate(topic_word):
     topic_words = np.array(vocab)


In [82]:
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

type(doc_topic): <class 'numpy.ndarray'>
shape: (23, 20)


In [81]:
# Document-topic Probab

doc_topic = model.doc_topic_

for n in range(5):
    sum_pr = sum(doc_topic[n,:])
    print("document: {} sum: {}".format(n, sum_pr))

for n in range(10):
    topic_most_pr = doc_topic[n].argmax()
    print("doc: {} topic: {}\n...".format(n,
                                            topic_most_pr
                                            ))


document: 0 sum: 0.9999999999999998
document: 1 sum: 1.0
document: 2 sum: 1.0000000000000002
document: 3 sum: 0.9999999999999999
document: 4 sum: 0.9999999999999998
doc: 0 topic: 12
...
doc: 1 topic: 14
...
doc: 2 topic: 10
...
doc: 3 topic: 16
...
doc: 4 topic: 14
...
doc: 5 topic: 16
...
doc: 6 topic: 14
...
doc: 7 topic: 17
...
doc: 8 topic: 14
...
doc: 9 topic: 14
...


In [97]:
# Building JS Matrix


def multi_js(p, q):
    """Jensen-Shannon divergence (symmetric) between two multinomials,
    expressed in nats."""
    if (len(q.shape) == 2):
        axis = 1
    else:
        axis = 0
    # D_{JS}(P\|Q) = (D_{KL}(P\|Q) + D_{KL}(Q\|P)) / 2
    return 0.5 * ((q * (np.log(q.clip(1e-10,1))
                        - np.log(p.clip(1e-10,1)))).sum(axis)
                      + (p * (np.log(p.clip(1e-10,1))
                              - np.log(q.clip(1e-10,1)))).sum(axis))


JS_D1D2 = multi_js(doc_topic[0],doc_topic[1])
JS_D1D2

2.87007848208287

In [94]:
# Alternative JS implementation

def jsd(x,y): #Jensen-shannon divergence
    import warnings
    warnings.filterwarnings("ignore", category = RuntimeWarning)
    x = np.array(x)
    y = np.array(y)
    d1 = x*np.log2(2*x/(x+y))
    d2 = y*np.log2(2*y/(x+y))
    d1[np.isnan(d1)] = 0
    d2[np.isnan(d2)] = 0
    d = 0.5*np.sum(d1+d2)    
    return d

jsd(doc_topic[0],doc_topic[1])

0.58639318010578223

In [176]:
# Constructing JS Matrix

JSM = list()
for index1,item1 in enumerate(doc_topic):
    for index2,item2 in enumerate(doc_topic):
        result = jsd(doc_topic[index1],doc_topic[index2])
        JSM.append(result)
    js_df = pd.DataFrame(JSM)
    js_df = js_df.transpose()
    js_df[index1] = js_df

print(js_df)


# for index1,item1 in enumerate(doc_topic):
#     for index2,item2 in enumerate(doc_topic):
#         result = jsd(doc_topic[index1],doc_topic[index2])
#         js_df[index1,index2] = pd.DataFrame(result)
        
# print(js_df)



   0         1         2         3         4         5         6         7    \
0  0.0  0.586393  0.571829  0.651409  0.493998  0.518462  0.475741  0.455646   

        8         9   ...        519       520       521       522       523  \
0  0.561006  0.349864 ...   0.407603  0.434972  0.656174  0.599095  0.500193   

        524       525       526       527  528  
0  0.422044  0.407688  0.571135  0.562574  0.0  

[1 rows x 529 columns]
