##HW2  Question 1

### Class of Documents

In [41]:
import numpy as np
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

class Doc():
    
    """ The Doc class rpresents a class of individula documents
    
    """
    
    def __init__(self, docid, doc, author, year):
        self.docid = docid
        self.text = doc.lower()
        self.text = re.sub(u'[\u2019\']', '', self.text)
        self.tokens = np.array(wordpunct_tokenize(self.text))
        self.stem = None
        self.author = author
        self.year = year
        
    def tf(self, wordlist):
        
        """
        Returns ARRAY with wordlist frequency
        """
        
        count = np.zeros(len(wordlist))
        
        for wid, word in np.ndenumerate(wordlist):
            count[wid] = (self.tokens == word).sum()
        return count
        
    
    def word_exists(self, wordlist):
        
        """
        Returns ARRAY of binary value where 1 inidicates presence of a word
        """
        
        is_word = np.zeros(len(wordlist))
        
        for wid, word in np.ndenumerate(wordlist):
            if word in self.tokens:
                is_word[wid] = 1
        return is_word
            
    def token_clean(self,length):

        """ 
        strip out non-alpha tokens and length one tokens
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        Remove stopwords from tokens.
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        Stem tokens with Porter Stemmer.
        """
        
        self.stems = n.array([PorterStemmer().stem(t) for t in self.tokens])

###Class of Document Collections

In [162]:
import numpy as np
import codecs

class RawDocs():
    """ The RawDocs class rpresents a class of document collections
     
    """
    def __init__(self, doc_data, stopword_file):

        self.docs = [Doc(docid, doc[2], doc[1], doc[0]) for docid, doc in enumerate(doc_data)]

        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        self.stopwords = set(raw.splitlines())

        self.N = len(self.docs)
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            
    def count(self, dictionary):
        """ 
        word count frequency of dictionary in document collection
        """
        
        return ({(doc.docid, doc.year, doc.author) : \
                 doc.tf(dictionary) for doc in self.docs})
    
    def idf(self, dictionary):
        """ 
        returns array of inverted document frequency for given dictionary 
        over collection of docs
        """
        
        is_word_docs = np.array([doc.word_exists(dictionary) for doc in self.docs])
        
        return(np.log(self.N / sum([is_word for is_word in is_word_docs])))
    
    def tf_idf(self, dictionary):
        """ 
        returns tf-idf score of given dictionary of words for every document 
        """
        
        tf = self.count(dictionary)
        idf = self.idf(dictionary)
        
        tf_idf_docs = dict()
        #convert counts t log counts
        for doc in self.docs:
            tf_idf_docs[(doc.docid, doc.year, doc.author) ] = \
            np.log(tf[(doc.docid, doc.year, doc.author)] + 1) * idf
            
        return(tf_idf_docs)
    
    def rank_tfidf(self, dictionary):
        
        """
        Calculates document rank based on tfidf
        """
        
        docs_tfidf = self.tf_idf(dictionary)
        doc_rank = []
        
        for key in docs_tfidf.keys():
            docs_tfidf[key] = sum(docs_tfidf[key])
            doc_rank.append((key, docs_tfidf[key]))
            
        return(np.sort(np.array(doc_rank), axis=0)[::-1])
    
    def rank_count(self, dictionary):
        
        """
        Calculates document rank based on word frequency
        """
        
        docs_count = self.count(dictionary)
        doc_rank = []
        
        for key in docs_count.keys():
            docs_count[key] = sum(docs_count[key])
            doc_rank.append((key, docs_count[key]))
            
        return(np.sort(np.array(doc_rank), axis=0)[::-1])

### Demo

####import documents for consumption

In [157]:
# import documents for consumption
import json
fd = open('./data/pres_speech.json', 'r')
text = fd.read()
fd.close()

pres_speech = json.loads(text)

####Instantiate RawDocs class and preprocess documents preprocessing

In [163]:
speech_docs = RawDocs(pres_speech[0:20], './data/stopwords.txt')
speech_docs.clean_docs(2)

#### Method outputs

In [160]:
sample_list = ['danger', 'bless', 'race', 'human', 'fear', 'believe', 'influence']

speech_docs.count(sample_list)

{(0, '2013', 'Obama'): array([ 0.,  2.,  3.,  2.,  1.,  4.,  2.]),
 (1, '2014', 'Obama'): array([  1.,   3.,   5.,   1.,   3.,  10.,   0.]),
 (2, '2009', 'Obama'): array([ 0.,  2.,  0.,  1.,  1.,  2.,  0.]),
 (3, '2010', 'Obama'): array([ 1.,  2.,  0.,  2.,  2.,  3.,  1.]),
 (4, '2011', 'Obama'): array([ 0.,  2.,  9.,  0.,  0.,  8.,  0.]),
 (5, '2012', 'Obama'): array([ 1.,  2.,  1.,  2.,  0.,  3.,  2.]),
 (6, '2005', 'Bush'): array([ 1.,  1.,  0.,  5.,  2.,  2.,  0.]),
 (7, '2006', 'Bush'): array([ 1.,  1.,  0.,  7.,  4.,  2.,  1.]),
 (8, '2007', 'Bush'): array([ 3.,  1.,  0.,  2.,  0.,  2.,  0.]),
 (9, '2008', 'Bush'): array([ 1.,  1.,  0.,  2.,  0.,  2.,  1.]),
 (10, '2001', 'Bush'): array([ 0.,  1.,  0.,  0.,  0.,  1.,  0.]),
 (11, '2002', 'Bush'): array([ 4.,  1.,  1.,  2.,  1.,  1.,  0.]),
 (12, '2003', 'Bush'): array([ 3.,  1.,  0.,  5.,  2.,  1.,  0.]),
 (13, '2004', 'Bush'): array([ 5.,  1.,  1.,  1.,  1.,  3.,  1.]),
 (14, '1997', 'Clinton'): array([ 0.,  2.,  0.,  3.,  0.,  

In [150]:
speech_docs.tf_idf(sample_list)

{(0,
  '2013',
  'Obama'): array([ 0.        ,  0.05635144,  1.10696672,  0.17854529,  0.41438903,
         0.        ,  0.87725037]),
 (1,
  '2014',
  'Obama'): array([ 0.41438903,  0.0711076 ,  1.43073373,  0.11264954,  0.82877806,
         0.        ,  0.        ]),
 (2,
  '2009',
  'Obama'): array([ 0.        ,  0.05635144,  0.        ,  0.11264954,  0.41438903,
         0.        ,  0.        ]),
 (3,
  '2010',
  'Obama'): array([ 0.41438903,  0.05635144,  0.        ,  0.17854529,  0.65679108,
         0.        ,  0.55348336]),
 (4,
  '2011',
  'Obama'): array([ 0.        ,  0.05635144,  1.83863192,  0.        ,  0.        ,
         0.        ,  0.        ]),
 (5,
  '2012',
  'Obama'): array([ 0.41438903,  0.05635144,  0.55348336,  0.17854529,  0.        ,
         0.        ,  0.87725037]),
 (6,
  '2005',
  'Bush'): array([ 0.41438903,  0.0355538 ,  0.        ,  0.29119483,  0.65679108,
         0.        ,  0.        ]),
 (7,
  '2006',
  'Bush'): array([ 0.41438903,  0.0355538

In [164]:
speech_docs.rank_tfidf(sample_list)

array([[(19, '1994', 'Clinton'), 2.8576579630159151],
       [(18, '1993', 'Clinton'), 2.7407391953998514],
       [(17, '2000', 'Clinton'), 2.6335028523744848],
       [(16, '1999', 'Clinton'), 2.3035563399622987],
       [(15, '1998', 'Clinton'), 2.1441530197427303],
       [(14, '1997', 'Clinton'), 2.0800194940857195],
       [(13, '2004', 'Bush'), 1.9811881102499056],
       [(12, '2003', 'Bush'), 1.9421005939447376],
       [(11, '2002', 'Bush'), 1.8949833614924929],
       [(10, '2001', 'Bush'), 1.8595602020754241],
       [(9, '2008', 'Bush'), 1.8123177719097618],
       [(8, '2007', 'Bush'), 1.7689381799393922],
       [(7, '2006', 'Bush'), 1.3979287404015897],
       [(6, '2005', 'Bush'), 1.3271932015554397],
       [(5, '2012', 'Obama'), 1.1819714852707415],
       [(4, '2011', 'Obama'), 1.0428771584901479],
       [(3, '2010', 'Obama'), 0.98989990542990902],
       [(2, '2009', 'Obama'), 0.58339001281760938],
       [(1, '2014', 'Obama'), 0.28165051907844041],
       [(0, '2

In [165]:
speech_docs.rank_count(sample_list)

array([[(19, '1994', 'Clinton'), 23.0],
       [(18, '1993', 'Clinton'), 19.0],
       [(17, '2000', 'Clinton'), 18.0],
       [(16, '1999', 'Clinton'), 17.0],
       [(15, '1998', 'Clinton'), 16.0],
       [(14, '1997', 'Clinton'), 14.0],
       [(13, '2004', 'Bush'), 13.0],
       [(12, '2003', 'Bush'), 12.0],
       [(11, '2002', 'Bush'), 12.0],
       [(10, '2001', 'Bush'), 12.0],
       [(9, '2008', 'Bush'), 12.0],
       [(8, '2007', 'Bush'), 11.0],
       [(7, '2006', 'Bush'), 11.0],
       [(6, '2005', 'Bush'), 11.0],
       [(5, '2012', 'Obama'), 10.0],
       [(4, '2011', 'Obama'), 8.0],
       [(3, '2010', 'Obama'), 7.0],
       [(2, '2009', 'Obama'), 7.0],
       [(1, '2014', 'Obama'), 6.0],
       [(0, '2013', 'Obama'), 2.0]], dtype=object)