In [63]:
from numpy import zeros
from scipy.linalg import svd
import numpy as np
import math

In [64]:
titles =["The Neatest Little Guide to Stock Market Investing", 
"Investing For Dummies, 4th Edition", 
"The Little Book of Common Sense Investing: The Only Way to Guarantee Your Fair Share of Stock Market Returns", 
"The Little Book of Value Investing", 
"Value Investing: From Graham to Buffett and Beyond", 
"Rich Dad's Guide to Investing: What the Rich Invest in, That the Poor and the Middle Class Do Not!", 
"Investing in Real Estate, 5th Edition", 
"Stock Investing For Dummies", 
"Rich Dad's Advisors: The ABC's of Real Estate Investing: The Secrets of Finding Hidden Profits Most Investors Miss" ]
stopwords = ['and','edition','for','in','little','of','the','to']
ignorechars = ''',:'!'''

In [65]:
class LSA(object):
    def __init__(self, stopwords, ignorechars):
        self.stopwords = stopwords
        self.ignorechars = ignorechars
        self.wdict = {}
        self.dcount = 0        
    def parse(self, doc):
        words = doc.split();
        for w in words:
            w = w.lower()
            if w in self.stopwords:
                continue
            elif w in self.wdict:
                self.wdict[w].append(self.dcount)
            else:
                self.wdict[w] = [self.dcount]
        self.dcount += 1      
    def build(self):
        self.keys = [k for k in self.wdict.keys() if len(self.wdict[k]) > 1]
        self.keys.sort()
        self.A = zeros([len(self.keys), self.dcount])
        for i, k in enumerate(self.keys):
            for d in self.wdict[k]:
                self.A[i,d] += 1
    def calc(self):
        self.U, self.S, self.Vt = svd(self.A)
    def TFIDF(self):
        WordsPerDoc = np.sum(self.A, axis=0)        
        DocsPerWord = np.sum(np.asarray(self.A > 0, 'i'), axis=1)
        rows, cols = self.A.shape
        for i in range(rows):
            for j in range(cols):
                self.A[i,j] = (self.A[i,j] / WordsPerDoc[j]) * math.log(float(cols) / DocsPerWord[i])
    def printA(self):
        print 'Here is the count matrix'
        print self.A
    def printSVD(self):
        print 'Here are the singular values'
        print self.S
        print 'Here are the first 3 columns of the U matrix'
        print -1*self.U[:, 0:3]
        print 'Here are the first 3 rows of the Vt matrix'
        print -1*self.Vt[0:3, :]

In [66]:
mylsa = LSA(stopwords, ignorechars)

In [67]:
for t in titles:
    mylsa.parse(t)


In [68]:
#mylsa.printA()

In [69]:
mylsa.build()




In [70]:
mylsa.TFIDF()

In [71]:
mylsa.calc()

In [72]:
mylsa.printSVD()

Here are the singular values
[ 1.10050818  0.97111081  0.88818237  0.8151959   0.57681318  0.50386107
  0.41739399  0.29469036  0.12254768]
Here are the first 3 columns of the U matrix
[[ 0.30368189  0.26258271  0.12201155]
 [ 0.20451719 -0.30548707 -0.26615503]
 [ 0.15883567 -0.1483611   0.0134995 ]
 [ 0.35643125 -0.1077943   0.54458916]
 [ 0.34909864  0.06158502 -0.24937918]
 [ 0.1715291   0.02117527  0.25793345]
 [ 0.34741532 -0.52026179  0.0822702 ]
 [ 0.28889919 -0.42798281 -0.40301943]
 [ 0.23013547 -0.00439076  0.4855784 ]
 [ 0.55250651  0.58821369 -0.28950995]]
Here are the first 3 rows of the Vt matrix
[[ 0.21790601  0.19037163  0.28411384  0.45351144  0.50617851  0.30870447
   0.3325941   0.21005532  0.35160361]
 [-0.06680008 -0.06524492  0.12148752  0.41749563  0.48123244 -0.39544821
  -0.43551881 -0.03510608 -0.47259497]
 [ 0.35516941  0.36040149  0.25408632  0.02558472 -0.35897723 -0.40410337
   0.24986029  0.48051204 -0.30539295]]
