##HW2  Question 1

### Class of Documents

In [236]:
import numpy as np
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

class Doc():
    
    """ The Doc class rpresents a class of individula documents
    
    """
    
    def __init__(self, docid, doc, author, year):
        self.docid = docid
        self.text = doc.lower()
        self.text = re.sub(u'[\u2019\']', '', self.text)
        self.tokens = np.array(wordpunct_tokenize(self.text))
        self.stem = None
        self.author = author
        self.year = year
        
    def tf(self, wordlist):
        
        """
        Returns ARRAY with wordlist frequency
        """
        
        count = np.zeros(len(wordlist))
        
        for wid, word in np.ndenumerate(wordlist):
            count[wid] = (self.tokens == word).sum()
        return count
        
    
    def word_exists(self, wordlist):
        
        """
        Returns ARRAY of binary value where 1 inidicates presence of a word
        """
        
        is_word = np.zeros(len(wordlist))
        
        for wid, word in np.ndenumerate(wordlist):
            if word in self.tokens:
                is_word[wid] = 1
        return is_word
            
    def token_clean(self,length):

        """ 
        strip out non-alpha tokens and length one tokens
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        Remove stopwords from tokens.
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        Stem tokens with Porter Stemmer.
        """
        
        self.stems = n.array([PorterStemmer().stem(t) for t in self.tokens])

###Class of Document Collections

In [237]:
import numpy as np
import codecs

class RawDocs():
    """ The RawDocs class rpresents a class of document collections
     
    """
    def __init__(self, doc_data, stopword_file):

        self.docs = [Doc(docid, doc[2], doc[1], doc[0]) for docid, doc in enumerate(doc_data)]

        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        self.stopwords = set(raw.splitlines())

        self.N = len(self.docs)
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            
    def count(self, dictionary):
        """ 
        word count frequency of dictionary in document collection
        """
        
        return ({(doc.docid, doc.year, doc.author) : \
                 doc.tf(dictionary) for doc in self.docs})
    
    def idf(self, dictionary):
        """ 
        returns array of inverted document frequency for given dictionary 
        over collection of docs
        """
        
        is_word_docs = np.array([doc.word_exists(dictionary) for doc in self.docs])
        
        return(np.log(self.N / sum([is_word for is_word in is_word_docs])))
    
    def tf_idf(self, dictionary):
        """ 
        returns tf-idf score of given dictionary of words for every document 
        """
        
        tf = self.count(dictionary)
        idf = self.idf(dictionary)
        
        tf_idf_docs = dict()
        #convert counts t log counts
        for doc in self.docs:
            tf_idf_docs[(doc.docid, doc.year, doc.author) ] = \
            np.log(tf[(doc.docid, doc.year, doc.author)] + 1) * idf
            
        return(tf_idf_docs)
    
    def rank_tfidf(self, dictionary):
        
        """
        Calculates document rank based on tfidf
        """
        
        docs_tfidf = self.tf_idf(dictionary)
        
        doc_rank = [[key, sum(docs_tfidf[key])] for key in docs_tfidf.keys()]
            
            
        return(np.sort(np.array(doc_rank), axis=0)[::-1])
    
    def rank_count(self, dictionary):
        
        """
        Calculates document rank based on word frequency
        """
        
        docs_count = self.count(dictionary)
        
        doc_rank = [[key, sum(docs_count[key])] for key in docs_count.keys()]
                    
        return(np.sort(np.array(doc_rank), axis=0)[::-1])

### Demo

####import documents for consumption

In [238]:
# import documents for consumption
import json
fd = open('./data/pres_speech.json', 'r')
text = fd.read()
fd.close()

pres_speech = json.loads(text)

####Instantiate RawDocs class and preprocess documents preprocessing

In [239]:
speech_docs = RawDocs(pres_speech[0:20], './data/stopwords.txt')
speech_docs.clean_docs(2)

#### Method outputs

In [240]:
sample_list = ['danger', 'race', 'human', 'fear', 'influence', 'peace', 'love', 'war', 'tyranny']

speech_docs.count(sample_list)

{(0, '2013', 'Obama'): array([ 0.,  3.,  2.,  1.,  2.,  1.,  1.,  3.,  0.]),
 (1, '2014', 'Obama'): array([ 1.,  5.,  1.,  3.,  0.,  3.,  1.,  7.,  1.]),
 (2, '2009', 'Obama'): array([ 0.,  0.,  1.,  1.,  0.,  1.,  0.,  5.,  0.]),
 (3, '2010', 'Obama'): array([ 1.,  0.,  2.,  2.,  1.,  2.,  2.,  4.,  0.]),
 (4, '2011', 'Obama'): array([ 0.,  9.,  0.,  0.,  0.,  1.,  3.,  4.,  0.]),
 (5, '2012', 'Obama'): array([ 1.,  1.,  2.,  0.,  2.,  0.,  0.,  6.,  1.]),
 (6,
  '2005',
  'Bush'): array([  1.,   0.,   5.,   2.,   0.,  11.,   0.,   5.,   2.]),
 (7, '2006', 'Bush'): array([ 1.,  0.,  7.,  4.,  1.,  6.,  3.,  2.,  1.]),
 (8,
  '2007',
  'Bush'): array([  3.,   0.,   2.,   0.,   0.,   3.,   2.,  10.,   0.]),
 (9, '2008', 'Bush'): array([ 1.,  0.,  2.,  0.,  1.,  8.,  0.,  3.,  2.]),
 (10, '2001', 'Bush'): array([ 0.,  0.,  0.,  0.,  0.,  6.,  1.,  1.,  0.]),
 (11,
  '2002',
  'Bush'): array([  4.,   1.,   2.,   1.,   0.,   4.,   2.,  12.,   2.]),
 (12,
  '2003',
  'Bush'): array([  3.,  

In [241]:
speech_docs.tf_idf(sample_list)

{(0,
  '2013',
  'Obama'): array([ 0.        ,  1.10696672,  0.17854529,  0.41438903,  0.87725037,
         0.0355538 ,  0.41438903,  0.        ,  0.        ]),
 (1,
  '2014',
  'Obama'): array([ 0.41438903,  1.43073373,  0.11264954,  0.82877806,  0.        ,
         0.0711076 ,  0.41438903,  0.        ,  0.72768125]),
 (2,
  '2009',
  'Obama'): array([ 0.        ,  0.        ,  0.11264954,  0.41438903,  0.        ,
         0.0355538 ,  0.        ,  0.        ,  0.        ]),
 (3,
  '2010',
  'Obama'): array([ 0.41438903,  0.        ,  0.17854529,  0.65679108,  0.55348336,
         0.05635144,  0.65679108,  0.        ,  0.        ]),
 (4,
  '2011',
  'Obama'): array([ 0.        ,  1.83863192,  0.        ,  0.        ,  0.        ,
         0.0355538 ,  0.82877806,  0.        ,  0.        ]),
 (5,
  '2012',
  'Obama'): array([ 0.41438903,  0.55348336,  0.17854529,  0.        ,  0.87725037,
         0.        ,  0.        ,  0.        ,  0.72768125]),
 (6,
  '2005',
  'Bush'): array([ 

In [242]:
speech_docs.rank_tfidf(sample_list)

array([[(19, '1994', 'Clinton'), 4.5978773281949756],
       [(18, '1993', 'Clinton'), 4.0012911525378563],
       [(17, '2000', 'Clinton'), 3.999728240209798],
       [(16, '1999', 'Clinton'), 3.9242739884052078],
       [(15, '1998', 'Clinton'), 3.0270942427285847],
       [(14, '1997', 'Clinton'), 2.7513492962309956],
       [(13, '2004', 'Bush'), 2.7029637833547655],
       [(12, '2003', 'Bush'), 2.649201633214],
       [(11, '2002', 'Bush'), 2.643181473218295],
       [(10, '2001', 'Bush'), 2.5163512777260344],
       [(9, '2008', 'Bush'), 2.4124680568551571],
       [(8, '2007', 'Bush'), 2.0375395537903396],
       [(7, '2006', 'Bush'), 1.8948711445507287],
       [(6, '2005', 'Bush'), 1.8044919823257539],
       [(5, '2012', 'Obama'), 1.7352220365271203],
       [(4, '2011', 'Obama'), 1.5169918240189491],
       [(3, '2010', 'Obama'), 1.3627470039418017],
       [(2, '2009', 'Obama'), 0.56259237166353726],
       [(1, '2014', 'Obama'), 0.51420117363538864],
       [(0, '2013', '

In [243]:
speech_docs.rank_count(sample_list)

array([[(19, '1994', 'Clinton'), 30.0],
       [(18, '1993', 'Clinton'), 29.0],
       [(17, '2000', 'Clinton'), 28.0],
       [(16, '1999', 'Clinton'), 26.0],
       [(15, '1998', 'Clinton'), 25.0],
       [(14, '1997', 'Clinton'), 22.0],
       [(13, '2004', 'Bush'), 20.0],
       [(12, '2003', 'Bush'), 19.0],
       [(11, '2002', 'Bush'), 19.0],
       [(10, '2001', 'Bush'), 19.0],
       [(9, '2008', 'Bush'), 17.0],
       [(8, '2007', 'Bush'), 17.0],
       [(7, '2006', 'Bush'), 17.0],
       [(6, '2005', 'Bush'), 14.0],
       [(5, '2012', 'Obama'), 13.0],
       [(4, '2011', 'Obama'), 13.0],
       [(3, '2010', 'Obama'), 13.0],
       [(2, '2009', 'Obama'), 8.0],
       [(1, '2014', 'Obama'), 8.0],
       [(0, '2013', 'Obama'), 7.0]], dtype=object)

In [244]:
sum(np.array([ 0,  0,  0.17854529,  0.65679108,  0.87725037,  0.09190525,  0,  0,  0]))

1.8044919899999998