In [26]:
import feedparser
import re

In [27]:
feedlist=['http://today.reuters.com/rss/topNews',
          'http://today.reuters.com/rss/domesticNews',
          'http://today.reuters.com/rss/worldNews',
          'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
          'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
          'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
          'http://news.google.com/?output=rss',
          'http://feeds.salon.com/salon/news',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
          'http://rss.cnn.com/rss/edition.rss',
          'http://rss.cnn.com/rss/edition_world.rss',
          'http://rss.cnn.com/rss/edition_us.rss']

In [28]:
def stripHTML(h):
    p=''
    s=0
    for c in h:
        if c=='<': s=1
        elif c=='>':
            s=0
            p+=' '
        elif s==0: p+=c
    return p

In [29]:
def separatewords(text):
    splitter = re.compile('\\W*')
    return [s.lower() for s in splitter.split(text) if len(s)>3]

In [41]:
def getarticlewords():
    mincount = 0.000000001
    allwords = {}
    articlewords = []
    articletitles = []
    ec = 0
    # Loop over every feed
    for feed in feedlist:
        f = feedparser.parse(feed)
        # Loop over every article
        for e in f.entries:
            # Ignore identical articles
            if e.title in articletitles: continue

            # Extract the words
            txt = e.title.encode('utf8') + stripHTML(e.description.encode('utf8'))
            words = separatewords(txt)
            articlewords.append({})
            articletitles.append(e.title)

            # Increase the counts for this word in allwords and in articlewords
            for word in words:
                #allwords.setdefault(word,0)
                allwords.setdefault(word,mincount)
                allwords[word]+=1
                #articlewords[ec].setdefault(word,0)
                articlewords[ec].setdefault(word,mincount)
                articlewords[ec][word]+=1
            #
            ec+=1
    return allwords,articlewords,articletitles

In [42]:
def makematrix(allw,articlew):
    mincount = 0.000000001
    
    wordvec = []

    # Only take words that are common but not too common
    for w,c in allw.items():
        if c>1 and c<len(articlew)*0.6:
            wordvec.append(w) 

    # Create the word matrix
    #l1=[[(word in f and f[word] or 0) for word in wordvec] for f in articlew]
    l1=[[(word in f and f[word] or mincount) for word in wordvec] for f in articlew]
    return l1,wordvec

In [43]:
allwords,articlewords,articletitles = getarticlewords()

In [None]:
articletitles

In [44]:
wordmatrix,wordvec = makematrix(allwords,articlewords)

In [45]:
wordvec[0:10]

['demo',
 'ncis',
 'ceres',
 'bomb',
 'garagiola',
 'gitmo',
 'ending',
 'foot',
 'debris',
 'kill']

In [46]:
articletitles[1]

u'TRAGIC ERRORS: EU reportedly warned Belgium about security holes weeks before Brussels bomb attacks'

In [47]:
wordmatrix[1][0:10]

[1e-09, 1e-09, 1e-09, 1.000000001, 1e-09, 1e-09, 1e-09, 1e-09, 1e-09, 1e-09]

In [48]:
len(allwords)

270

# NumPy

In [1]:
from numpy import *

In [2]:
ll = [[1,2,3],[4,5,6]]

In [3]:
ll

[[1, 2, 3], [4, 5, 6]]

In [6]:
m1 = matrix(ll)

In [7]:
m1

matrix([[1, 2, 3],
        [4, 5, 6]])

In [9]:
m2 = matrix([[1,2],[3,4],[5,6]])

In [10]:
m2

matrix([[1, 2],
        [3, 4],
        [5, 6]])

In [11]:
m3 = m1 * m2

In [12]:
m3

matrix([[22, 28],
        [49, 64]])

In [13]:
shape(m1)

(2, 3)

In [14]:
shape(m2)

(3, 2)

In [15]:
shape(m3)

(2, 2)

In [16]:
a1 = m1.A

In [17]:
a1

array([[1, 2, 3],
       [4, 5, 6]])

In [18]:
a2 = array([[1,2,3],[1,2,3]])

In [19]:
a3 = a1*a2

In [20]:
a3

array([[ 1,  4,  9],
       [ 4, 10, 18]])

In [49]:
import nnmf

In [24]:
w,h = nnmf.factorize(m3,pc=3,iter=100)

7404.66924047
5.81347612459
2.88969958203
1.77007942151
1.22043407884
0.906871287119
0.708936204475
0.574475710165
0.477862374676
0.405275870018


In [25]:
w*h

matrix([[ 21.57002216,  28.32739263],
        [ 49.18855122,  63.85475405]])

In [50]:
v = matrix(wordmatrix)

In [51]:
shape(v)

(41, 270)

In [53]:
weights,feat = nnmf.factorize(v,pc=20,iter=100)

288300.697677
93.3008778803
82.4351637254
82.1201854643
82.0686780271
82.0531831437
82.0443314221
82.0424818897
82.041918377
82.041587767


In [54]:
def showfeatures(w,h,titles,wordvec,out='features.txt'): 
    outfile = file(out,'w')  
    pc,wc = shape(h)
    toppatterns = [[] for i in range(len(titles))]
    patternnames = []
  
    # Loop over all the features
    for i in range(pc):
        slist = []
        # Create a list of words and their weights
        for j in range(wc):
            slist.append((h[i,j],wordvec[j]))
        # Reverse sort the word list
        slist.sort()
        slist.reverse()

        # Print the first six elements
        n = [s[1] for s in slist[0:6]]
        outfile.write(str(n) + '\n')
        patternnames.append(n)

        # Create a list of articles for this feature
        flist=[]
        for j in range(len(titles)):
            # Add the article with its weight
            flist.append((w[j,i],titles[j]))
            toppatterns[j].append((w[j,i],i,titles[j]))

        # Reverse sort the list
        flist.sort()
        flist.reverse()

        # Show the top 3 articles
        for f in flist[0:3]:
            outfile.write(str(f)+'\n')
        outfile.write('\n')

    outfile.close()
    # Return the pattern names for later use
    return toppatterns,patternnames

In [55]:
topp,pn = showfeatures(weights,feat,articletitles,wordvec)

In [56]:
def showarticles(titles,toppatterns,patternnames,out='articles.txt'):
    outfile = file(out,'w')  
  
    # Loop over all the articles
    for j in range(len(titles)):
        outfile.write(titles[j].encode('utf8')+'\n')

        # Get the top features for this article and
        # reverse sort them
        toppatterns[j].sort()
        toppatterns[j].reverse()

        # Print the top three patterns
        for i in range(3):
            outfile.write(str(toppatterns[j][i][0]) + ' ' + str(patternnames[toppatterns[j][i][1]])+'\n')
        outfile.write('\n')

    outfile.close()

In [57]:
showarticles(articletitles,topp,pn)

# Yahoo

In [58]:
import nnmf
import urllib2
from numpy import *

In [59]:
tickers=['YHOO','AVP','BIIB','BP','CL','CVX',
         'DNA','EXPE','GOOG','PG','XOM','AMGN']

In [60]:
shortest=300
prices={}
dates=None

In [62]:
for t in tickers:
    # Open the URL
    rows = urllib2.urlopen('http://ichart.finance.yahoo.com/table.csv?s=%s&d=11&e=26&f=2006&g=d&a=3&b=12&c=1996' %t + \
                           '&ignore=.csv').readlines()


    # Extract the volume field from every line
    prices[t]=[float(r.split(',')[5]) for r in rows[1:] if r.strip()!='']
    if len(prices[t])<shortest: shortest=len(prices[t])

    if not dates:
        dates=[r.split(',')[0] for r in rows[1:] if r.strip()!='']

error: [Errno 10054] 

In [None]:
l1 = [[prices[tickers[i]][j] for i in range(len(tickers))] for j in range(shortest)]

w,h = nnmf.factorize(matrix(l1),pc=5)

In [None]:
print h
print w

In [None]:
# Loop over all the features
for i in range(shape(h)[0]):
    print "Feature %d" % i

    # Get the top stocks for this feature
    ol = [(h[i,j],tickers[j]) for j in range(shape(h)[1])]
    ol.sort()
    ol.reverse()
    for j in range(12):
        print ol[j]
    print
  
    # Show the top dates for this feature
    porder = [(w[d,i],d) for d in range(300)]
    porder.sort()
    porder.reverse()
    print [(p[0],dates[p[1]]) for p in porder[0:3]]
    print