# Parsing, similarity and LDA

In [1]:
"""
Load dependencies
""" 

try:
    from nltk.tokenize import wordpunct_tokenize
except Exception:
    pass

import numpy as np
import codecs
import nltk
import re
import csv
import json
import sys
import operator
from nltk import PorterStemmer
from math import log
from collections import Counter
import matplotlib
import matplotlib.pyplot as plt
import pickle
import logging
import numpy as np
from ptm import AuthorTopicModel
from ptm.utils import convert_cnt_to_list, get_top_words


In [2]:
class Book():    
    """
    The Doc class represents a class of individual documents
    """    
    def __init__(self, author, title, text):
        
        content = []
        for chap in text:
            join_par = ''.join(chap)
            content.append(join_par)
        
        self.author = author
        self.title = title
        self.text = ''.join(content).lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
    
    def friendly_string(self):
        """ 
        Description: generate a friendly string to describe the document
        """
        return "{0} {1} {2}".format(self.author, self.title, self.text[1:20])
        
    def token_clean(self,length):
        """ 
        Description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """
        self.tokens = np.array([t for t in self.tokens if
                                (t.isalpha() and len(t) > length)])

    def stopword_remove(self, stopwords):
        """
        Description: remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])

    def stem(self):
        """
        Description: stem tokens with Porter Stemmer.
        """
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])

    def term_vector(self, corpus_token_list):
        """
        Description: generate a term-vector for this document.  The result
                     corresponds with a single row of the document-term-matrix
                     of the corpus
        input: corpus_token_list: a list of tokens from the corpus, a subset
                                  of which will be found in this document.
        """
        vector = [None] * len(corpus_token_list)
        counter = Counter(self.tokens)
        for i in range(len(corpus_token_list)):
            count = counter[corpus_token_list[i]]
            vector[i] = count

        return vector

In [3]:
class Corpus():
    """
    The Corpus class represents a document collection.
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked everytime an object of the
        class is instantiated.
        """
        # Initialise documents by invoking the appropriate class
        self.docs = [Book(doc[1], doc[0], doc[2]) for doc in doc_data]         
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        # Get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        # Stopword removal, token cleaning and stemming to docs
        self.clean_docs(5)
        
        # Create vocabulary
        self.corpus_tokens()
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs.
        """
        for doc in self.docs:
            #doc.stopword_remove(self.stopwords)
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        Description: parses a file of stopwords, removes words of length
        'length' and  stems it.
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """        
        with codecs.open(stopword_file, 'r', 'utf-8') as f: raw = f.read()        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
             
    def corpus_tokens(self):
        """
        Description: create a set of all all tokens or in other words a
        vocabulary
        """        
        # Initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 
    
    def document_term_matrix(self):
        """
        Description: generate the document-term matrix for the corpus
        """        
        result = []
        for doc in self.docs:
            vector = doc.term_vector(list(self.token_set))
            result.append(vector)        
        
        return result

    def tf_idf(self):
        """
        Description: generate the TF-IDF matrix for this corpus
        """

        # Generate a copy of the document-term matrix to work with in this
        # function and initialize other local variables.
        dt_matrix = self.document_term_matrix()
        tf_matrix = []
        idf_matrix = []
        tf_idf_matrix = []

        # Build a term frequency matrix from the document term matrix.
        # tf(d,v) = { 0 if x(d,v) = 0, 1 + log(x(d), v) otherwise }
        for dt_doc_vector in dt_matrix:
            tf_doc_vector = [(0 if x == 0 else 1 + log(x)) for x in dt_doc_vector]
            tf_matrix.append(tf_doc_vector)

        # Build a document frequency matrix for each term.
        # Initialize with zeros.
        df_vector = np.zeros(len(self.token_set))
        for dt_doc_vector in dt_matrix:
            # Increment the counters based on an indicator function which
            # is 1 if there is at least one instance of the term in the doc.
            df_vector = np.add(df_vector, [int(x > 0) for x in dt_doc_vector])

        # Build an inverse document frequency vector.
        idf_doc_vector = [log(len(self.docs) / x) for x in df_vector]

        # Build the TF-IDF weighting matrix.
        for tf_doc_vector in tf_matrix:
            tf_idf_vector = np.multiply(tf_doc_vector, idf_doc_vector)
            tf_idf_matrix.append(tf_idf_vector)

        return tf_idf_matrix

    def dict_rank(self, dictionary, use_tf_idf, n):        
        """
        Description: rank the documents in this corpus against the provided
        dictionary.  Return the top n documents.
        input: dictionary: the dictionary against which to rank the documents
               use_tf_idf: True if the TF-IDF matrix is to be used; False if
                           the document-term matrix is to be used.
               n: the number of top-ranked documents to return
        """
        if (use_tf_idf):
            dtm = self.tf_idf()
        else:
            dtm = self.document_term_matrix()
            
        # Get rid of words in the document term matrix not in the dictionary
        dict_tokens_set = set(item for item in dictionary)
        intersection = dict_tokens_set & self.token_set
        vec_positions = [int(token in intersection) for token in self.token_set] 

        # Get the score of each document
        sums = np.zeros(len(dtm))
        for j in range(len(dtm)):
            sums[j] = sum([a * b for a, b in zip(dtm[j], vec_positions)])

        # Order them and return the n top documents
        order = sorted(range(len(sums)), key = lambda k: sums[k], reverse=True)
        ordered_doc_data_n = [None] * len(dtm)
        ordered_sums = np.zeros(len(dtm))

        counter = 0        
        for num in order:
            ordered_doc_data_n[counter] = self.docs[num]
            ordered_sums[counter] = sums[num]
            counter += 1

        return zip(ordered_doc_data_n[0:n], ordered_sums[0:n])


# Data

In [4]:
file_handle = open("bookshelf.json")
file_content = file_handle.read()
bookshelf = json.loads(file_content)

print len(bookshelf)

8


In [5]:
# read the bookshelf.json file
file_handle = open("bookshelf.json")
file_content = file_handle.read()
bookshelf = json.loads(file_content)

print len(bookshelf)

# Instantiate the corpus class
#corpus = Corpus(pres_speech_list, './../data/stopwords/stopwords.txt', 2)

#print "The corpus has been loaded with {0} documents.".format(len(corpus.docs))


corpus = Corpus(bookshelf, 'stopwords.txt', 5)

print "The corpus has been loaded with {0} documents.".format(len(corpus.docs))


8
The corpus has been loaded with 8 documents.


# Similarity

In [6]:
# print len(corpus.token_set)
# print len(corpus.document_term_matrix()[0])

def cosine_similarity(v1, v2):
    """
    Calculate the cosine similarity of two vectors (vectors of terms
    in a document).
    """
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    return np.dot(v1, v2) / (norm1 * norm2)

simils = np.full((8, 8), 0)

for i in range(8):
    for j in range(8):

        simils[i][j] = round(cosine_similarity(corpus.document_term_matrix()[i], corpus.document_term_matrix()[j]),2)

# print np.array(corpus.document_term_matrix()).shape



In [7]:
# np.full((8, 8), 0)
import pandas

pandas.DataFrame(simils, ["Smith", "Marx", "Bastiat", "Mises", "Ricardo", "Friedman", "Krugman", "Mankiw"], 
                      ["Smith", "Marx", "Bastiat", "Mises", "Ricardo", "Friedman", "Krugman", "Mankiw"])


Unnamed: 0,Smith,Marx,Bastiat,Mises,Ricardo,Friedman,Krugman,Mankiw
Smith,1.0,0.53,0.37,0.35,0.69,0.17,0.23,0.17
Marx,0.53,1.0,0.58,0.5,0.79,0.28,0.26,0.23
Bastiat,0.37,0.58,1.0,0.59,0.52,0.36,0.38,0.39
Mises,0.35,0.5,0.59,1.0,0.5,0.5,0.37,0.44
Ricardo,0.69,0.79,0.52,0.5,1.0,0.26,0.28,0.19
Friedman,0.17,0.28,0.36,0.5,0.26,1.0,0.34,0.46
Krugman,0.23,0.26,0.38,0.37,0.28,0.34,1.0,0.4
Mankiw,0.17,0.23,0.39,0.44,0.19,0.46,0.4,1.0


# LDA: Topic

In [9]:
import lda

X = np.array(corpus.document_term_matrix())
vocab = corpus.token_set
titles = ["Smith", "Marx", "Bastiat", "Mises", "Ricardo", "Friedman", "Krugman", "Mankiw"]

#X.shape

#X.sum()

model = lda.LDA(n_topics=10, n_iter=1500, random_state=1)
model.fit(X)  

<lda.lda.LDA instance at 0x1196b2f38>

In [10]:
vocab = list(corpus.token_set)
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: produc commod profit quantiti increas capit employ countri
Topic 1: exchang econom demand credit price commod monetari interest
Topic 2: gener peopl anoth carri public accord befor increas
Topic 3: labour countri greater employ differ produc silver therefor
Topic 4: polici monetari incom region economi nextgraph econom interest
Topic 5: labour commod capit capitalist surplu produc circul machineri
Topic 6: differ exchang increas consequ system capit chang amount
Topic 7: natur foreign interest countri suppos littl govern maintain
Topic 8: product therefor process social work industri factori becom
Topic 9: franc produc peopl becaus principl nation protect result


In [11]:
doc_topic = model.doc_topic_
for i in range(8):    
    print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))

Smith (top topic: 5)
Marx (top topic: 3)
Bastiat (top topic: 9)
Mises (top topic: 1)
Ricardo (top topic: 0)
Friedman (top topic: 4)
Krugman (top topic: 4)
Mankiw (top topic: 4)


# LDA: Author and Topic

In [12]:
len(corpus.token_set)

10337

In [13]:
X = np.array(corpus.document_term_matrix())
vocab = list(corpus.token_set)

titles = ["Smith", "Marx", "Bastiat", "Mises", "Ricardo", "Friedman", "Krugman", "Mankiw"]

sums = X.sum(axis=0)
include = []

for i in range(len(sums)):
    if sums[i] > 1:
        include.append(i)

sX = X[:,include]
#print(list(vocab)[0,2])

svocab = []

for i in include:
    word = vocab[i]
    #print word
    svocab.append(word)


In [14]:
# translate every document into list of word indices with respect to svocab (subsetted vocabulary)

new_X = []
for i in range(len(X)):
    row = sX[i]
    new_row = []
    
    for j in range(len(row)):
        if row[j] == 0:
            next
        else:
            new_row = new_row + [j]*row[j]
    
    new_X.append(new_row)

# print np.array(new_X)



In [15]:
logger = logging.getLogger('AuthorTopicModel')
logger.propagate=False

In [16]:
import locale
locale.getdefaultlocale() # should give ('en_US', 'UTF-8')

('en_US', 'UTF-8')

In [31]:
##################### WITH NEW DATA
doc_author = np.array([[0],[1],[2],[3],[4],[5],[6],[7]])
author_name = ["Marx", "Smith", "Bastiat", "Mises", "Ricardo", "Friedman", "Krugman", "Mankiw"]
voca = svocab

corpus = new_X
n_doc = len(corpus)
n_topic = 10
n_author = 8
n_voca = len(voca)
max_iter = 100

print(n_doc)

######################

8


In [20]:
model = AuthorTopicModel(n_doc, n_voca, n_topic, n_author)
model.fit(corpus, doc_author, max_iter=max_iter)

2016-06-25 15:05:15 INFO:AuthorTopicModel:[INIT] 0	elapsed_time:12.77	log_likelihood:-3457166.54
2016-06-25 15:05:28 INFO:AuthorTopicModel:[INIT] 1	elapsed_time:13.38	log_likelihood:-3402450.39
2016-06-25 15:05:41 INFO:AuthorTopicModel:[INIT] 2	elapsed_time:13.24	log_likelihood:-3359059.32
2016-06-25 15:05:54 INFO:AuthorTopicModel:[INIT] 3	elapsed_time:13.20	log_likelihood:-3310747.61
2016-06-25 15:06:06 INFO:AuthorTopicModel:[INIT] 4	elapsed_time:11.85	log_likelihood:-3233483.41
2016-06-25 15:06:18 INFO:AuthorTopicModel:[INIT] 5	elapsed_time:11.79	log_likelihood:-3133701.17
2016-06-25 15:06:30 INFO:AuthorTopicModel:[INIT] 6	elapsed_time:11.69	log_likelihood:-3047030.51
2016-06-25 15:06:41 INFO:AuthorTopicModel:[INIT] 7	elapsed_time:11.54	log_likelihood:-2987115.59
2016-06-25 15:06:53 INFO:AuthorTopicModel:[INIT] 8	elapsed_time:11.35	log_likelihood:-2945092.49
2016-06-25 15:07:04 INFO:AuthorTopicModel:[INIT] 9	elapsed_time:11.29	log_likelihood:-2914733.25
2016-06-25 15:07:15 INFO:Autho

In [39]:
for k in range(n_topic):
    top_words = get_top_words(model.TW, voca, k, 10)
    print('topic ', k , ','.join(top_words))

('topic ', 0, u'franc,industri,produc,result,foreign,countri,protect,peopl,servic,principl')
('topic ', 1, u'polici,incom,region,economi,interest,nextgraph,howev,manufactur,effect,consumpt')
('topic ', 2, u'product,number,differ,therefor,exist,increas,necessari,consequ,properti,societi')
('topic ', 3, u'econom,monetari,theori,market,fiduciari,possibl,problem,commod,price,polici')
('topic ', 4, u'labour,product,commod,capit,capitalist,surplu,process,work,social,produc')
('topic ', 5, u'exchang,demand,credit,interest,object,quantiti,countri,increas,differ,circul')
('topic ', 6, u'labour,produc,countri,quantiti,profit,employ,capit,manufactur,proport,commod')
('topic ', 7, u'commod,product,increas,capit,produc,natur,employ,becaus,market,consum')
('topic ', 8, u'greater,silver,market,differ,countri,natur,howev,expenc,present,perhap')
('topic ', 9, u'peopl,nation,anoth,product,becom,accord,possibl,noth,becaus,twenti')


In [40]:
author_id = 0
fig = plt.figure(figsize=(12,6))
plt.bar(range(n_topic), model.AT[author_id]/np.sum(model.AT[author_id]))
plt.title(author_name[author_id])
plt.xticks(np.arange(n_topic)+0.5, ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)])
# plt.show()
plt.gcf().subplots_adjust(bottom=0.4)
fig.savefig('marx.png')

In [41]:
author_id = 1
fig = plt.figure(figsize=(12,6))
plt.bar(range(n_topic), model.AT[author_id]/np.sum(model.AT[author_id]))
plt.title(author_name[author_id])
plt.xticks(np.arange(n_topic)+0.5, ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)])
# plt.show()
plt.gcf().subplots_adjust(bottom=0.4)
fig.savefig('smith.png')

In [42]:
author_id = 4
fig = plt.figure(figsize=(12,6))
plt.bar(range(n_topic), model.AT[author_id]/np.sum(model.AT[author_id]))
plt.title(author_name[author_id])
plt.xticks(np.arange(n_topic)+0.5, ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)])
# plt.show()
plt.gcf().subplots_adjust(bottom=0.4)
fig.savefig('ricardo.png')

In [43]:
author_id = 5
fig = plt.figure(figsize=(12,6))
plt.bar(range(n_topic), model.AT[author_id]/np.sum(model.AT[author_id]))
plt.title(author_name[author_id])
plt.xticks(np.arange(n_topic)+0.5, ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)])
# plt.show()
plt.gcf().subplots_adjust(bottom=0.4)
fig.savefig('friedman.png')

In [44]:
author_id = 6
fig = plt.figure(figsize=(12,6))
plt.bar(range(n_topic), model.AT[author_id]/np.sum(model.AT[author_id]))
plt.title(author_name[author_id])
plt.xticks(np.arange(n_topic)+0.5, ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)])
# plt.show()
plt.gcf().subplots_adjust(bottom=0.4)
fig.savefig('krugman.png')