In [1]:
'''
Speech Transcript processing / cleaning to feed the NLP pipeline
'''

import nltk
import string
import os
import ftfy

import unidecode

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

path = '/Users/smuddu/galvanize/capstone/prototyping/Speeches/Obama'
token_dict = {}
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))

    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

for subdir, dirs, files in os.walk(path):
    for file in files:
        if (file != 'links'):
            print "-- processing: ", file
            file_path = subdir + os.path.sep + file
            shakes = open(file_path, 'r')
            _raw_input = shakes.read()
            #text = _raw_input.decode("utf-8","replace").encode("utf-8")
            #print ftfy.guess_bytes(_raw_input)
            #text = ftfy.fix_text(_raw_input)

            text = unidecode.unidecode_expect_nonascii(_raw_input)
            lowers = text.lower()
            _tmp1 = lowers.replace('\n',' ').replace('\r',' ')
            while "  " in _tmp1:
                _tmp1 = _tmp1.replace('  ',' ')

            final = _tmp1
            no_punctuation = final.translate(None, string.punctuation)
            token_dict[file] = no_punctuation


#this can take some time
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())




-- processing:  20040727.txt
-- processing:  20050106.txt
-- processing:  20050604.txt
-- processing:  20051215.txt
-- processing:  20060131.txt
-- processing:  20060720.txt
-- processing:  20070116.txt
-- processing:  20070210.txt
-- processing:  20070304.txt
-- processing:  20070313.txt
-- processing:  20070321.txt
-- processing:  20071110.txt
-- processing:  20080103.txt
-- processing:  20080108.txt
-- processing:  20080120.txt
-- processing:  20080126.txt
-- processing:  20080318.txt
-- processing:  20080603.txt
-- processing:  20080715.txt
-- processing:  20080724.txt
-- processing:  20080828.txt
-- processing:  20081001.txt
-- processing:  20081104.txt
-- processing:  20081107.txt
-- processing:  20081118.txt
-- processing:  20081124.txt
-- processing:  20081125.txt


  _warn_if_not_unicode(string)


-- processing:  20081201.txt
-- processing:  20081203.txt
-- processing:  20090108.txt
-- processing:  20090118.txt
-- processing:  20090120.txt
-- processing:  20090124.txt
-- processing:  20090126.txt
-- processing:  20090204.txt
-- processing:  20090209.txt
-- processing:  20090224.txt
-- processing:  20090227.txt
-- processing:  20090402.txt
-- processing:  20090403.txt
-- processing:  20090405.txt
-- processing:  20090406.txt
-- processing:  20090407.txt
-- processing:  20090414.txt
-- processing:  20090423.txt
-- processing:  20090509.txt
-- processing:  20090518.txt
-- processing:  20090521.txt
-- processing:  20090529.txt
-- processing:  20090604.txt
-- processing:  20090615.txt
-- processing:  20090623.txt
-- processing:  20090707.txt
-- processing:  20090713.txt
-- processing:  20090714.txt
-- processing:  20090829.txt
-- processing:  20090908.txt
-- processing:  20090909.txt
-- processing:  20090911.txt
-- processing:  20090923.txt
-- processing:  20091110.txt
-- processing:

In [2]:
from sklearn.decomposition import NMF

In [3]:
model = NMF(n_components=2, init='random', random_state=0)

In [4]:
type(tfs)

scipy.sparse.csr.csr_matrix

In [5]:
model.fit(tfs)

NMF(alpha=0.0, beta=1, eta=0.1, init='random', l1_ratio=0.0, max_iter=200,
  n_components=2, nls_max_iter=2000, random_state=0, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [8]:
model.components_.shape

(2, 10788)

In [18]:
id2word = {}
for k in tfidf.vocabulary_.keys():
    id2word[tfidf.vocabulary_[k]] = k

In [32]:
topic_importance = dict(zip(id2word.values(),list(model.components_[0])))

In [33]:
topic_importance

{u'woodl': 0.0017972654688997714,
 u'foul': 0.00010537533682778691,
 u'suzann': 0.0018549269109445008,
 u'thensen': 0.0,
 u'resignationfir': 0.00010537533682778691,
 u'wondrou': 0.0032210639616865088,
 u'payoff': 0.0,
 u'shura': 0.0051340550547891188,
 u'fouryear': 0.0,
 u'starso': 0.011224098629401224,
 u'sputter': 0.0,
 u'lord': 0.06278643763488885,
 u'consideredo': 0.003310745593717634,
 u'skylin': 0.0020044619148753165,
 u'bhinneka': 0.007966331015555014,
 u'publicpriv': 0.005475618590893171,
 u'honeywel': 0.0,
 u'yellow': 0.0,
 u'politician': 0.020326602703588437,
 u'disturb': 0.014436944992421905,
 u'prize': 0.054735955398270475,
 u'showcas': 0.0,
 u'wednesday': 0.0079637762994431757,
 u'path': 0.25781795587309048,
 u'habea': 0.0158290296924898,
 u'guardsmen': 0.04215625185334896,
 u'charter': 0.073985535261380853,
 u'lemieux': 0.0053228234842576383,
 u'americanisra': 0.015782156236485111,
 u'second': 0.13720235841554973,
 u'sooth': 0.0016144789683508302,
 u'275': 0.0,
 u'gorman'

In [36]:
import operator

In [39]:
sorted_topic_imp = sorted(topic_importance.items(), key=operator.itemgetter(1),reverse=True)

In [40]:
sorted_topic_imp

[(u'thi', 2.928887293881115),
 (u'peopl', 1.9566230325983132),
 (u'nation', 1.8429473750194783),
 (u'war', 1.6119034128415382),
 (u'world', 1.5930355090163431),
 (u'secur', 1.5744217085883456),
 (u'american', 1.5656467929915652),
 (u'unit', 1.3872685041320123),
 (u'iraq', 1.3755016508411448),
 (u'ha', 1.3347482803035606),
 (u'state', 1.3335133358151747),
 (u'peac', 1.3024653588283523),
 (u'america', 1.2959312796980655),
 (u'wa', 1.1845202430169746),
 (u'countri', 1.0506786511634345),
 (u'new', 0.97157703924211491),
 (u'afghanistan', 0.9528916142120688),
 (u'time', 0.95001754493297907),
 (u'militari', 0.9450448263924025),
 (u'audio', 0.94101445334891587),
 (u'hi', 0.92170232834400689),
 (u'nuclear', 0.89698545146521724),
 (u'work', 0.87849746184271416),
 (u'troop', 0.8739399559357558),
 (u'live', 0.83879423028090194),
 (u'right', 0.83134618517635861),
 (u'come', 0.8093773360055676),
 (u'human', 0.80462804058504933),
 (u'forc', 0.80203098643326154),
 (u'know', 0.78562062728344717),
 (u't

In [41]:
topic_importance2 = dict(zip(id2word.values(),list(model.components_[1])))
sorted_topic_imp2 = sorted(topic_importance2.items(), key=operator.itemgetter(1),reverse=True)

In [42]:
sorted_topic_imp2

[(u'thi', 2.2176064045455179),
 (u'american', 0.96001330361074433),
 (u'job', 0.8819440894669498),
 (u'health', 0.87417632320177396),
 (u'make', 0.86373726815079466),
 (u'insur', 0.81306230783367028),
 (u'care', 0.77955766246786562),
 (u'tax', 0.77158066876142628),
 (u'busi', 0.757442849451614),
 (u'wa', 0.74153755477287231),
 (u'work', 0.73689950683988825),
 (u'peopl', 0.70651236623997282),
 (u'year', 0.70616726334018121),
 (u'just', 0.69981165180226168),
 (u'becaus', 0.68767606143028226),
 (u'economi', 0.6783925350630966),
 (u'thato', 0.66513810483315805),
 (u'reform', 0.64174724809441308),
 (u'ito', 0.63925909057253039),
 (u'know', 0.63380525906854979),
 (u'think', 0.61203218441114371),
 (u'new', 0.59306843883192217),
 (u'need', 0.59069592370422408),
 (u'cut', 0.57781252148998186),
 (u'deficit', 0.5681661716567612),
 (u'compani', 0.56685963134266604),
 (u'ha', 0.55743257842458682),
 (u'time', 0.55691311988646519),
 (u'want', 0.55319864200504487),
 (u'got', 0.54297484034527299),
 (u'