## HW2 Question 1

### Class of Documents

In [4]:
import numpy as np
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

class Doc():

    """
    A single document together with its metadata and token array.

    Attributes
    ----------
    docid  : identifier passed in by the caller
    text   : lower-cased text with apostrophes (ASCII ' and U+2019) removed
    tokens : numpy array of tokens produced by wordpunct_tokenize(text);
             rewritten in place by token_clean() and stopword_remove()
    stems  : None until stem() is called, then a numpy array of stemmed tokens
    author : author label passed in by the caller
    year   : year label passed in by the caller
    """

    def __init__(self, docid, doc, author, year):
        self.docid = docid
        self.text = doc.lower()
        # Drop apostrophes so contractions collapse into one token ("it's" -> "its").
        self.text = re.sub(u'[\u2019\']', '', self.text)
        self.tokens = np.array(wordpunct_tokenize(self.text))
        # Stemmed tokens live in `stems`. Naming this attribute `stem` would
        # shadow the stem() method and make it uncallable.
        self.stems = None
        self.author = author
        self.year = year

    def tf(self, wordlist):

        """
        Returns ARRAY (float, same length as wordlist) holding the number of
        times each word of wordlist occurs in self.tokens.
        """

        count = np.zeros(len(wordlist))

        for wid, word in enumerate(wordlist):
            # count_nonzero also copes with the scalar False that some numpy
            # versions return when an empty token array is compared to a string.
            count[wid] = np.count_nonzero(self.tokens == word)
        return count

    def word_exists(self, wordlist):

        """
        Returns ARRAY of binary values (float) where 1 indicates that the
        corresponding word of wordlist is present in self.tokens.
        """

        is_word = np.zeros(len(wordlist))

        for wid, word in enumerate(wordlist):
            if np.any(self.tokens == word):
                is_word[wid] = 1
        return is_word

    def token_clean(self, length):

        """
        Keep only purely alphabetic tokens that are strictly longer than
        `length` characters; self.tokens is replaced in place.
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])

    def stopword_remove(self, stopwords):

        """
        Remove every token contained in `stopwords` from self.tokens.
        """

        self.tokens = np.array([t for t in self.tokens if t not in stopwords])

    def stem(self):

        """
        Stem the current tokens with the Porter stemmer and store the result
        in self.stems (self.tokens is left unchanged).
        """

        # One stemmer instance is enough for the whole document.
        stemmer = PorterStemmer()
        self.stems = np.array([stemmer.stem(t) for t in self.tokens])

### Class of Document Collections

In [5]:
import numpy as np
import codecs

class RawDocs():

    """
    A collection of Doc objects plus a stopword set, with term-frequency and
    tf-idf scoring and ranking over the collection.

    Attributes
    ----------
    docs      : list of Doc objects, numbered 0..N-1 in input order
    stopwords : set of lines read from the stopword file
    N         : number of documents in the collection
    """

    # Retained only so existing references to RawDocs.docid keep working.
    # It is no longer used as a mutable counter: a constructor that failed
    # half-way used to leave it non-zero and corrupt the next collection's ids.
    docid = 0

    def __init__(self, doc_data, stopword_file):
        """
        doc_data      : iterable of records; record[0] is passed to Doc as the
                        year, record[1] as the author and record[2] as the text
        stopword_file : path of a UTF-8 text file with one stopword per line
        """
        self.docs = [Doc(docid, doc[2], doc[1], doc[0])
                     for docid, doc in enumerate(doc_data)]

        with codecs.open(stopword_file, 'r', 'utf-8') as f:
            self.stopwords = set(f.read().splitlines())

        self.N = len(self.docs)

    @staticmethod
    def _doc_key(doc):
        """Key identifying a document in every result: (docid, year, author)."""
        return (doc.docid, doc.year, doc.author)

    @staticmethod
    def _rank(scores):
        """Sum each score vector and return [[key, total], ...], highest total first."""
        doc_rank = [[key, np.sum(vec)] for key, vec in scores.items()]
        doc_rank.sort(key=lambda x: x[1], reverse=True)
        return doc_rank

    def clean_docs(self, length):
        """
        Applies token cleaning (see Doc.token_clean) and then stopword removal
        to every document, modifying the documents in place.
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)

    def count(self, dictionary):
        """
        Word count frequency of dictionary in the document collection.

        Returns a dict mapping (docid, year, author) to the array returned by
        Doc.tf(dictionary) for that document.
        """
        return {self._doc_key(doc): doc.tf(dictionary) for doc in self.docs}

    def idf(self, dictionary):
        """
        Returns an array of inverse document frequencies, log(N / df), for the
        given dictionary over the collection of docs.

        A term that occurs in no document (df == 0) gets idf 0 instead of
        log(N / 0) = inf, so it contributes nothing to a tf-idf score rather
        than turning it into nan. An empty collection yields all zeros.
        """
        doc_freq = np.zeros(len(dictionary))
        for doc in self.docs:
            doc_freq += doc.word_exists(dictionary)

        idf = np.zeros(len(dictionary))
        seen = doc_freq > 0
        idf[seen] = np.log(self.N / doc_freq[seen])
        return idf

    def tf_idf(self, dictionary):
        """
        Returns the tf-idf score of the given dictionary of words for every
        document, as a dict mapping (docid, year, author) to the array
        log(tf + 1) * idf.
        """
        tf = self.count(dictionary)
        idf = self.idf(dictionary)

        return {key: np.log(freq + 1) * idf for key, freq in tf.items()}

    def rank_tfidf(self, dictionary):
        """
        Calculates document rank based on tf-idf: a list of
        [(docid, year, author), summed tf-idf score], highest score first.
        """
        return self._rank(self.tf_idf(dictionary))

    def rank_count(self, dictionary):
        """
        Calculates document rank based on word frequency: a list of
        [(docid, year, author), summed word count], highest count first.
        """
        return self._rank(self.count(dictionary))

### Demo

#### Import documents for consumption

In [6]:
# import documents for consumption
import json
# Read the raw JSON text; the context manager closes the file even if
# reading fails part-way.
with open('./data/pres_speech.json', 'r') as fd:
    text = fd.read()

# Parsed speech records, sliced and handed to RawDocs below.
pres_speech = json.loads(text)

#### Instantiate the RawDocs class and preprocess the documents

In [7]:
# Build the collection from the first 20 speech records, then drop
# non-alphabetic tokens, tokens of at most 2 characters, and stopwords.
speech_docs = RawDocs(
    doc_data=pres_speech[:20],
    stopword_file='./data/stopwords.txt',
)
speech_docs.clean_docs(length=2)

#### Method outputs

In [8]:
# Sample query terms used by all the method demonstrations below
sample_list = [
    'danger', 'race', 'human', 'fear', 'influence',
    'peace', 'love', 'war', 'tyranny',
]
# count function: per-document frequency of each sample term
speech_docs.count(sample_list)

{(0, u'2013', u'Obama'): array([ 0.,  3.,  2.,  1.,  2.,  1.,  1.,  3.,  0.]),
 (1, u'2014', u'Obama'): array([ 1.,  5.,  1.,  3.,  0.,  3.,  1.,  7.,  1.]),
 (2, u'2009', u'Obama'): array([ 0.,  0.,  1.,  1.,  0.,  1.,  0.,  5.,  0.]),
 (3, u'2010', u'Obama'): array([ 1.,  0.,  2.,  2.,  1.,  2.,  2.,  4.,  0.]),
 (4, u'2011', u'Obama'): array([ 0.,  9.,  0.,  0.,  0.,  1.,  3.,  4.,  0.]),
 (5, u'2012', u'Obama'): array([ 1.,  1.,  2.,  0.,  2.,  0.,  0.,  6.,  1.]),
 (6,
  u'2005',
  u'Bush'): array([  1.,   0.,   5.,   2.,   0.,  11.,   0.,   5.,   2.]),
 (7, u'2006', u'Bush'): array([ 1.,  0.,  7.,  4.,  1.,  6.,  3.,  2.,  1.]),
 (8,
  u'2007',
  u'Bush'): array([  3.,   0.,   2.,   0.,   0.,   3.,   2.,  10.,   0.]),
 (9, u'2008', u'Bush'): array([ 1.,  0.,  2.,  0.,  1.,  8.,  0.,  3.,  2.]),
 (10, u'2001', u'Bush'): array([ 0.,  0.,  0.,  0.,  0.,  6.,  1.,  1.,  0.]),
 (11,
  u'2002',
  u'Bush'): array([  4.,   1.,   2.,   1.,   0.,   4.,   2.,  12.,   2.]),
 (12,
  u'2003',


In [9]:
# tf-idf function: score of each sample term for every document,
# keyed by (docid, year, author)
speech_docs.tf_idf(sample_list)

{(0,
  u'2013',
  u'Obama'): array([ 0.        ,  1.10696672,  0.17854529,  0.41438903,  0.87725037,
         0.0355538 ,  0.41438903,  0.        ,  0.        ]),
 (1,
  u'2014',
  u'Obama'): array([ 0.41438903,  1.43073373,  0.11264954,  0.82877806,  0.        ,
         0.0711076 ,  0.41438903,  0.        ,  0.72768125]),
 (2,
  u'2009',
  u'Obama'): array([ 0.        ,  0.        ,  0.11264954,  0.41438903,  0.        ,
         0.0355538 ,  0.        ,  0.        ,  0.        ]),
 (3,
  u'2010',
  u'Obama'): array([ 0.41438903,  0.        ,  0.17854529,  0.65679108,  0.55348336,
         0.05635144,  0.65679108,  0.        ,  0.        ]),
 (4,
  u'2011',
  u'Obama'): array([ 0.        ,  1.83863192,  0.        ,  0.        ,  0.        ,
         0.0355538 ,  0.82877806,  0.        ,  0.        ]),
 (5,
  u'2012',
  u'Obama'): array([ 0.41438903,  0.55348336,  0.17854529,  0.        ,  0.87725037,
         0.        ,  0.        ,  0.        ,  0.72768125]),
 (6,
  u'2005',
  u'Bu

In [10]:
# Rank the documents by their tf-idf score summed over the sample terms
# (highest score first)
speech_docs.rank_tfidf(sample_list)

[[(13, u'2004', u'Bush'), 4.5978773281949756],
 [(11, u'2002', u'Bush'), 4.0012911525378563],
 [(1, u'2014', u'Obama'), 3.999728240209798],
 [(7, u'2006', u'Bush'), 3.9242739884052078],
 [(0, u'2013', u'Obama'), 3.0270942427285847],
 [(5, u'2012', u'Obama'), 2.7513492962309956],
 [(4, u'2011', u'Obama'), 2.7029637833547655],
 [(15, u'1998', u'Clinton'), 2.649201633214],
 [(6, u'2005', u'Bush'), 2.643181473218295],
 [(3, u'2010', u'Obama'), 2.5163512777260344],
 [(9, u'2008', u'Bush'), 2.4124680568551571],
 [(17, u'2000', u'Clinton'), 2.0375395537903396],
 [(12, u'2003', u'Bush'), 1.8948711445507287],
 [(19, u'1994', u'Clinton'), 1.8044919823257539],
 [(8, u'2007', u'Bush'), 1.7352220365271203],
 [(16, u'1999', u'Clinton'), 1.5169918240189491],
 [(18, u'1993', u'Clinton'), 1.3627470039418017],
 [(2, u'2009', u'Obama'), 0.56259237166353726],
 [(10, u'2001', u'Bush'), 0.51420117363538864],
 [(14, u'1997', u'Clinton'), 0.34340625056533536]]

In [11]:
# Rank the documents by their raw word count summed over the sample terms
# (highest count first)
speech_docs.rank_count(sample_list)

[[(12, u'2003', u'Bush'), 30.0],
 [(13, u'2004', u'Bush'), 29.0],
 [(11, u'2002', u'Bush'), 28.0],
 [(6, u'2005', u'Bush'), 26.0],
 [(7, u'2006', u'Bush'), 25.0],
 [(1, u'2014', u'Obama'), 22.0],
 [(8, u'2007', u'Bush'), 20.0],
 [(17, u'2000', u'Clinton'), 19.0],
 [(15, u'1998', u'Clinton'), 19.0],
 [(14, u'1997', u'Clinton'), 19.0],
 [(4, u'2011', u'Obama'), 17.0],
 [(9, u'2008', u'Bush'), 17.0],
 [(16, u'1999', u'Clinton'), 17.0],
 [(3, u'2010', u'Obama'), 14.0],
 [(0, u'2013', u'Obama'), 13.0],
 [(19, u'1994', u'Clinton'), 13.0],
 [(5, u'2012', u'Obama'), 13.0],
 [(10, u'2001', u'Bush'), 8.0],
 [(2, u'2009', u'Obama'), 8.0],
 [(18, u'1993', u'Clinton'), 7.0]]

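As we can see, the tf-idf and raw term-frequency (tf) methods rank the documents differently, as expected. For example, "war" has a non-zero count in the speeches shown above but a tf-idf score of 0 in each of them: a term that occurs in all 20 speeches has idf = log(20/20) = 0, so it adds nothing to the tf-idf ranking even though it dominates the raw counts. For a more detailed explanation of why we expect this result, please refer to the Python notebook for question 2, week 2.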