In [6]:
import json

In [7]:
# Load the project metadata JSON file into `pmeta`.
# The context manager closes the file even if reading raises.
with open('./data/project_successful_meta.json', 'r') as fd:
    text = fd.read()

pmeta = json.loads(text)

In [14]:
from math import log

class Doc():

    """
    A single document: its metadata, its normalized text and its tokens.

    On construction the text is lower-cased, apostrophes (ASCII ' and
    U+2019) are deleted, and the result is split with wordpunct_tokenize.
    """

    def __init__(self, docid, doc, author, year):

        """
        docid  -- identifier, stored unchanged
        doc    -- raw text of the document (must support .lower())
        author -- stored unchanged
        year   -- stored unchanged
        """

        self.docid = docid
        self.text = doc.lower()
        # Delete apostrophes before tokenizing -- presumably so contractions
        # such as "don't" are not split at the apostrophe (confirm against
        # the tokenizer's behaviour).
        self.text = re.sub(u'[\u2019\']', '', self.text)
        self.tokens = wordpunct_tokenize(self.text)
        # Populated by stem(). This attribute must NOT be named `stem`:
        # an instance attribute of that name shadows the stem() method and
        # makes doc.stem() fail with "'NoneType' object is not callable".
        self.stems = None
        self.author = author
        self.year = year

    def count(self, wordlist):

        """
        Returns dict with the frequency of every word of wordlist among
        this document's tokens (0 for words that never occur).
        """

        count = {word: 0 for word in wordlist}

        for token in self.tokens:
            # Look the token up in the dict rather than in wordlist:
            # constant-time, and still correct if wordlist is an iterator
            # that the comprehension above has already consumed.
            if token in count:
                count[token] += 1

        return count

    def log_count(self, wordlist):

        """
        Return dict with log(1 + frequency) for every word of wordlist.
        """

        return {word: log(1 + freq)
                for word, freq in self.count(wordlist).items()}

    def token_clean(self, length):

        """
        Keep only purely alphabetic tokens that are strictly longer than
        `length` characters. Replaces self.tokens in place.
        """

        self.tokens = [t for t in self.tokens if (t.isalpha() and len(t) > length)]

    def stopword_remove(self, stopwords):

        """
        Remove stopwords from tokens. Replaces self.tokens in place.
        """

        self.tokens = [t for t in self.tokens if t not in stopwords]

    def stem(self):

        """
        Stem tokens with Porter Stemmer.

        The stems are stored in self.stems; self.tokens is left unchanged.
        """

        # One stemmer instance is enough for the whole token list.
        stemmer = PorterStemmer()
        self.stems = [stemmer.stem(t) for t in self.tokens]

In [10]:
import codecs,re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

In [25]:
class DocsCollection():

    """
    A corpus of Doc objects that share one stopword list.
    """

    def __init__(self, doc_data, stopword_file):

        """
        doc_data      -- sequence of records; record[2] is handed to Doc as
                         the text, record[1] as the author and record[0] as
                         the year. Each document's id is its position in
                         doc_data.
        stopword_file -- path of a UTF-8 text file; every line becomes one
                         entry of self.stopwords.
        """

        self.docs = [Doc(docid, doc[2], doc[1], doc[0]) for docid, doc in enumerate(doc_data)]

        with codecs.open(stopword_file, 'r', 'utf-8') as f:
            raw = f.read()
        self.stopwords = set(raw.splitlines())

        self.N = len(self.docs)

    def clean_docs(self, length):

        """
        For every document, drop non-alphabetic tokens and tokens of at
        most `length` characters, then drop stopwords.
        """

        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)

    def count(self, dictionary):

        """
        Return {docid: {word: frequency}} for the words in dictionary.
        """

        return {doc.docid: doc.count(dictionary) for doc in self.docs}

    def log_count(self, dictionary):

        """
        Return {docid: {word: log(1 + frequency)}} for the words in dictionary.
        """

        return {doc.docid: doc.log_count(dictionary) for doc in self.docs}

    def idf(self, dictionary):

        """
        Return {word: inverse document frequency} for the words in dictionary.

        idf(word) = log(N / df(word)), where N is the number of documents in
        the collection and df(word) is the number of documents whose tokens
        contain the word. A word that occurs in no document gets 0.0 instead
        of raising a division-by-zero error.
        """

        # Build each document's vocabulary once so membership tests are cheap.
        vocabularies = [set(doc.tokens) for doc in self.docs]

        weights = {}
        for word in dictionary:
            doc_freq = sum(1 for vocab in vocabularies if word in vocab)
            # float() keeps the ratio exact under Python 2 integer division.
            weights[word] = log(float(self.N) / doc_freq) if doc_freq else 0.0
        return weights

In [16]:
import json
# Load the presidential speeches JSON file into `pres_speech`.
# The context manager closes the file even if reading raises.
with open('./data/pres_speech.json', 'r') as fd:
    text = fd.read()

pres_speech = json.loads(text)

In [26]:
# Build a collection from the first ten speech records, then drop
# non-alphabetic tokens, tokens of two characters or fewer, and stopwords.
speech_docs = DocsCollection(
    doc_data=pres_speech[:10],
    stopword_file='./data/stopwords.txt',
)
speech_docs.clean_docs(length=2)

In [23]:
# Peek at the first ten tokens of the first document in the collection.
speech_docs.docs[0].tokens[:10]

[u'please',
 u'everybody',
 u'seat',
 u'speaker',
 u'vice',
 u'president',
 u'members',
 u'congress',
 u'fellow',
 u'americans']

In [30]:
# log(1 + frequency) of three probe words in every document, keyed by docid.
speech_docs.log_count(dictionary=['america', 'sex', 'courage'])

{0: {'america': 3.2188758248682006, 'courage': 1.3862943611198906, 'sex': 0.0},
 1: {'america': 3.5263605246161616, 'courage': 0.6931471805599453, 'sex': 0.0},
 2: {'america': 2.9444389791664403, 'courage': 0.0, 'sex': 0.0},
 3: {'america': 2.9444389791664403, 'courage': 0.6931471805599453, 'sex': 0.0},
 4: {'america': 2.9444389791664403, 'courage': 0.0, 'sex': 0.0},
 5: {'america': 3.4339872044851463, 'courage': 0.6931471805599453, 'sex': 0.0},
 6: {'america': 2.772588722239781, 'courage': 1.0986122886681098, 'sex': 0.0},
 7: {'america': 3.6635616461296463, 'courage': 1.6094379124341003, 'sex': 0.0},
 8: {'america': 3.2188758248682006, 'courage': 1.6094379124341003, 'sex': 0.0},
 9: {'america': 3.4339872044851463, 'courage': 1.3862943611198906, 'sex': 0.0}}