In [1]:
import numpy as np
import os
from __future__ import division, print_function
import lda
os.chdir("E:\\UVA\\Capstone\\DataSet\\UCI")
from scipy import linalg,dot

In [2]:
def extract_matrix(data_file,vocab_file,max_doc,word_sparsity,word_ubiquity):
    word_dict = {}
    vocab_dict = {}
    vocab_list = []
    doc = ""
    last_doc_id = 1
    doc_dict = {}
    col_index = 0
    
    #Extract from Vocab file
    
    vocab_file.seek(0)
    line = vocab_file.readline()
    
    #Create Word vector along with its position index
    while line:
        vocab_list.append(line.strip())
        vocab_dict[line.strip()] = col_index
        col_index += 1
        line = vocab_file.readline()
    
    #Extract from data file
    
    data_file.seek(0)
    line = data_file.readline()
    doc_vec = [0] * len(vocab_dict)
    #doc_vec = np.zeros(len(vocab_dict))
    #tdm = np.empty((0,len(vocab_dict)), int)
    tdm = []
    
    count = 0
    while line:
        col = line.strip().split(' ')
        if len(col) == 3:
            doc_id = int(col[0])
            word_id = int(col[1]) - 1
            word_count = int(col[2])
                  
            doc_vec[word_id] += word_count
            
            if doc_id != last_doc_id:
                #tdm = np.append(tdm,doc_vec, axis=0)
                tdm.append(doc_vec)
                doc_vec = [0] * len(vocab_dict)
            if doc_id==max_doc+1:
                break
            
            last_doc_id = doc_id
        line = data_file.readline()
    
    tdm = np.asarray(tdm)
    print("Original Matrix: dim = " + str(tdm.shape))
    nrow,ncol = tdm.shape
    tdm = tdm[:,tdm.sum(axis=0)>10] #if the word appears in more than 10 docs, then its considered valid
    print("Deleted all zero columns: dim: " + str(tdm.shape))
    tdm = np.delete(tdm, np.nonzero((tdm==0).sum(axis=0) > nrow*(1-word_sparsity)), axis=1)
    print("After removing sparse words : dim = " + str(tdm.shape))
    tdm = np.delete(tdm, np.nonzero((tdm==0).sum(axis=0) < nrow*(1-word_ubiquity)), axis=1)
    print("After removing obiquitous words: dim = " +str(tdm.shape))
    vocab_file.close()
    data_file.close()
    return tdm

In [3]:
data_file = open("docword.nytimes.txt\\docword.nytimes.txt", 'r')
vocab_file = open("vocab.nytimes.txt", 'r')

num_doc = 35000 #Number of documents (Matrix row length)
word_sparsity = 0 #0.01 means remove words that appear in less than 1% of documents (sparse words)
word_ubiquity = 0.8 #remove words that appear in more than  80% of documents (stopwords)
tdm = extract_matrix(data_file,vocab_file,num_doc,word_sparsity,word_ubiquity)
tdm.shape

Original Matrix: dim = (34982, 102660)
Deleted all zero columns: dim: (34982, 46135)
After removing sparse words : dim = (34982, 46135)
After removing obiquitous words: dim = (34982, 46135)


(34982, 46135)

In [4]:
binfile = open("matrix_35000_docs.bin","wb")
np.save(binfile,tdm)
binfile.close()

In [3]:
mat = np.load("matrix_35000_docs.bin")
type(mat),mat.shape,mat[:1]

(numpy.ndarray, (34982, 46135), array([[0, 0, 0, ..., 0, 0, 0]]))

In [8]:
np.savetxt("C:\\Users\\Savi\\Downloads\\BIDMach_1.0.3-win-x86_64\\mat25000.txt", mat, delimiter=' ')

In [None]:
val a = loadFMat("mat1000.txt")
val k = 50
val (nn,opts) = LDA.learner(a,k) #https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf
#val (nn, opts) = LDA.learnPar(a) 
#val (nn,opts) = LDA.learnBatch(a,k)
#val (nn,opts) = LDAgibbs.learner(a,k)
opts.what
opts.nthreads=number of gpus
opts.npasses = 1
nn.train
nn.modelmats(0) # the topic-word matrix, one topic per row. (looks like topic document matrix)
nn.datamats(1) # the factorized topic-document matrix, one document per column. 

In [4]:
# document-term matrix
X = mat
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))

# vocab file
vocab = [line for line in open("E:\\UVA\\Capstone\\DataSet\\UCI\\vocab.nytimes.txt")]
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}\n".format(len(vocab)))

type(X): <class 'numpy.ndarray'>
shape: (34982, 46135)

type(vocab): <class 'list'>
len(vocab): 102660



In [5]:
model = lda.LDA(n_topics=50, n_iter=1, random_state=1)
%timeit -n1 -r1 model.fit(X)

1 loops, best of 1: 2min 23s per loop


In [123]:
# topic-word probabilities
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))

type(topic_word): <class 'numpy.ndarray'>
shape: (20, 7455)


In [124]:
#the probabilities of the words should be normalized.
for n in range(5):
    sum_pr = sum(topic_word[n,:])
    print("topic: {} sum: {}".format(n, sum_pr))

topic: 0 sum: 1.000000000000148
topic: 1 sum: 0.9999999999999072
topic: 2 sum: 0.9999999999999583
topic: 3 sum: 0.999999999999847
topic: 4 sum: 0.9999999999999427


In [125]:
#get the top 5 words for each topic (by probability):
n = 5
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))

*Topic 0
- chastened
 barefooted
 colonization
 blowup
 combed

*Topic 1
- aftershock
 adventurous
 bask
 bookshelf
 bdaley

*Topic 2
- collapsing
 commander
 clarity
 chimney
 arranger

*Topic 3
- chinned
 abolishing
 abduction
 bleaker
 advantaged

*Topic 4
- bulgarian
 carbonated
 blueberry
 berries
 bluefin

*Topic 5
- comforted
 blurting
 cockfighting
 birdseed
 backgammon

*Topic 6
- bitching
 checkout
 accelerates
 classically
 cetacean

*Topic 7
- berries
 agordon
 choreographing
 capstone
 burst

*Topic 8
- arrayed
 cheered
 biomechanic
 berries
 charging

*Topic 9
- backspin
 billing
 aspartame
 calla
 changeable

*Topic 10
- allow
 antler
 attractively
 atypically
 batsman

*Topic 11
- agencies
 aftertaste
 coagulant
 brian
 agencias

*Topic 12
- audio
 bluebell
 alibi
 carbonated
 cloture

*Topic 13
- ancestry
 bleating
 burlesque
 bafflement
 arresting

*Topic 14
- chilean
 biked
 cardinal
 admission
 backer

*Topic 15
- amplia
 clarity
 commander
 arranger
 bayonet

*Topi

In [126]:
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

type(doc_topic): <class 'numpy.ndarray'>
shape: (100, 20)
