9 changes: 6 additions & 3 deletions chapter2/recommendations.py
@@ -47,8 +47,11 @@
'Superman Returns': 5.0,
'You, Me and Dupree': 3.5,
},
'Toby': {'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0,
'Superman Returns': 4.0},
'Toby': {
'Snakes on a Plane': 4.5,
'You, Me and Dupree': 1.0,
'Superman Returns': 4.0
},
}


@@ -187,7 +190,7 @@ def calculateSimilarItems(prefs, n=10):
# Status updates for large datasets
c += 1
if c % 100 == 0:
print '%d / %d' % (c, len(itemPrefs))
print('%d / %d' % (c, len(itemPrefs)))
# Find the most similar items to this one
scores = topMatches(itemPrefs, item, n=n, similarity=sim_distance)
result[item] = scores
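A quick way to exercise the reformatted module from a Python 3 shell, assuming it is importable as recommendations and defines the critics preference dictionary alongside sim_distance, topMatches, and calculateSimilarItems as in the book's chapter 2 code:

# Minimal usage sketch; names follow the book's chapter 2 API.
import recommendations

# Build the item-similarity table from the small critics dataset.
itemsim = recommendations.calculateSimilarItems(recommendations.critics, n=10)

# The items most similar to 'Superman Returns', as (score, item) pairs.
print(itemsim['Superman Returns'])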
14 changes: 7 additions & 7 deletions chapter3/clusters.py
@@ -5,7 +5,7 @@
import random

def readfile(filename):
lines = [line for line in file(filename)]
lines = [line for line in open(filename)]

# First line is the column titles
colnames = lines[0].strip().split('\t')[1:]
@@ -105,16 +105,16 @@ def hcluster(rows, distance=pearson):
def printclust(clust, labels=None, n=0):
# indent to make a hierarchy layout
for i in range(n):
print ' ',
print(' ', end='')
if clust.id < 0:
# negative id means that this is branch
print '-'
print ('-')
else:
# positive id means that this is an endpoint
if labels == None:
print clust.id
print (clust.id)
else:
print labels[clust.id]
print (labels[clust.id])

# now print the right and left branches
if clust.left != None:
@@ -236,7 +236,7 @@ def kcluster(rows, distance=pearson, k=4):

lastmatches = None
for t in range(100):
print 'Iteration %d' % t
print ('Iteration %d' % t)
bestmatches = [[] for i in range(k)]

# Find which centroid is the closest for each row
@@ -321,7 +321,7 @@ def scaledown(data, distance=pearson, rate=0.01):

# Keep track of the total error
totalerror += abs(errorterm)
print totalerror
print (totalerror)

# If the answer got worse by moving the points, we are done
if lasterror and lasterror < totalerror:
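The ported chapter 3 functions can be smoke-tested the same way. This sketch assumes readfile returns (rownames, colnames, data) and that a tab-separated data file such as the blogdata1.txt written by generatefeedvector.py exists; both assumptions come from the book's code rather than from the hunks above:

# Hypothetical usage sketch for the chapter 3 clustering module.
import clusters

blognames, words, data = clusters.readfile('chapter3/blogdata1.txt')

# Hierarchical clustering with the default Pearson distance,
# printed as an indented tree labelled with blog names.
clust = clusters.hcluster(data)
clusters.printclust(clust, labels=blognames)

# K-means clustering into four groups of row indices.
kclust = clusters.kcluster(data, k=4)
print([blognames[i] for i in kclust[0]])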
92 changes: 46 additions & 46 deletions chapter3/feedlist.txt
@@ -1,97 +1,97 @@
http://feeds.feedburner.com/37signals/beMH
http://feeds.feedburner.com/blogspot/bRuz
http://battellemedia.com/index.xml
http://blog.guykawasaki.com/index.rdf
#http://battellemedia.com/index.xml
#http://blog.guykawasaki.com/index.rdf
http://blog.outer-court.com/rss.xml
http://feeds.searchenginewatch.com/sewblog
http://blog.topix.net/index.rdf
http://blogs.abcnews.com/theblotter/index.rdf
#http://feeds.searchenginewatch.com/sewblog
#http://blog.topix.net/index.rdf
#http://blogs.abcnews.com/theblotter/index.rdf
http://feeds.feedburner.com/ConsumingExperienceFull
http://flagrantdisregard.com/index.php/feed/
http://featured.gigaom.com/feed/
#http://flagrantdisregard.com/index.php/feed/
#http://featured.gigaom.com/feed/
http://gizmodo.com/index.xml
http://gofugyourself.typepad.com/go_fug_yourself/index.rdf
http://googleblog.blogspot.com/rss.xml
http://feeds.feedburner.com/GoogleOperatingSystem
http://headrush.typepad.com/creating_passionate_users/index.rdf
http://feeds.feedburner.com/instapundit/main
http://jeremy.zawodny.com/blog/rss2.xml
#http://feeds.feedburner.com/instapundit/main
#http://jeremy.zawodny.com/blog/rss2.xml
http://joi.ito.com/index.rdf
http://feeds.feedburner.com/Mashable
http://michellemalkin.com/index.rdf
#http://michellemalkin.com/index.rdf
http://moblogsmoproblems.blogspot.com/rss.xml
http://newsbusters.org/node/feed
http://beta.blogger.com/feeds/27154654/posts/full?alt=rss
#http://beta.blogger.com/feeds/27154654/posts/full?alt=rss
http://feeds.feedburner.com/paulstamatiou
http://powerlineblog.com/index.rdf
http://feeds.feedburner.com/Publishing20
#http://powerlineblog.com/index.rdf
#http://feeds.feedburner.com/Publishing20
http://radar.oreilly.com/index.rdf
http://scienceblogs.com/pharyngula/index.xml
http://scobleizer.wordpress.com/feed/
#http://scienceblogs.com/pharyngula/index.xml
#http://scobleizer.wordpress.com/feed/
http://sethgodin.typepad.com/seths_blog/index.rdf
http://rss.slashdot.org/Slashdot/slashdot
#http://rss.slashdot.org/Slashdot/slashdot
http://thinkprogress.org/feed/
http://feeds.feedburner.com/andrewsullivan/rApM
http://wilwheaton.typepad.com/wwdnbackup/index.rdf
#http://wilwheaton.typepad.com/wwdnbackup/index.rdf
http://www.43folders.com/feed/
http://www.456bereastreet.com/feed.xml
http://www.autoblog.com/rss.xml
http://www.bloggersblog.com/rss.xml
http://www.bloglines.com/rss/about/news
#http://www.bloglines.com/rss/about/news
http://www.blogmaverick.com/rss.xml
http://www.boingboing.net/index.rdf
http://www.buzzmachine.com/index.xml
#http://www.buzzmachine.com/index.xml
http://www.captainsquartersblog.com/mt/index.rdf
http://www.coolhunting.com/index.rdf
#http://www.coolhunting.com/index.rdf
http://feeds.copyblogger.com/Copyblogger
http://feeds.feedburner.com/crooksandliars/YaCP
http://feeds.dailykos.com/dailykos/index.xml
http://www.deadspin.com/index.xml
http://www.downloadsquad.com/rss.xml
http://www.engadget.com/rss.xml
http://www.gapingvoid.com/index.rdf
http://www.gawker.com/index.xml
http://www.gothamist.com/index.rdf
http://www.huffingtonpost.com/raw_feed_index.rdf
http://www.hyperorg.com/blogger/index.rdf
#http://www.downloadsquad.com/rss.xml
https://www.engadget.com/rss.xml
#http://www.gapingvoid.com/index.rdf
#http://www.gawker.com/index.xml
http://feeds.gothamistllc.com/gothamist05
#http://www.huffingtonpost.com/raw_feed_index.rdf
#http://www.hyperorg.com/blogger/index.rdf
http://www.joelonsoftware.com/rss.xml
http://www.joystiq.com/rss.xml
http://www.kotaku.com/index.xml
#http://www.joystiq.com/rss.xml
#http://www.kotaku.com/index.xml
http://feeds.kottke.org/main
http://www.lifehack.org/feed/
http://www.lifehacker.com/index.xml
http://littlegreenfootballs.com/weblog/lgf-rss.php
http://www.makezine.com/blog/index.xml
#http://littlegreenfootballs.com/weblog/lgf-rss.php
#http://www.makezine.com/blog/index.xml
http://www.mattcutts.com/blog/feed/
http://xml.metafilter.com/rss.xml
#http://xml.metafilter.com/rss.xml
http://www.mezzoblue.com/rss/index.xml
http://www.micropersuasion.com/index.rdf
#http://www.micropersuasion.com/index.rdf
http://www.neilgaiman.com/journal/feed/rss.xml
http://www.oilman.ca/feed/
#http://www.oilman.ca/feed/
http://www.perezhilton.com/index.xml
http://www.plasticbag.org/index.rdf
http://www.powazek.com/rss.xml
http://www.problogger.net/feed/
http://feeds.feedburner.com/QuickOnlineTips
http://www.readwriteweb.com/rss.xml
#http://www.readwriteweb.com/rss.xml
http://www.schneier.com/blog/index.rdf
http://scienceblogs.com/sample/combined.xml
#http://scienceblogs.com/sample/combined.xml
http://www.seroundtable.com/index.rdf
http://www.shoemoney.com/feed/
http://www.sifry.com/alerts/index.rdf
http://www.simplebits.com/xml/rss.xml
#http://www.sifry.com/alerts/index.rdf
#http://www.simplebits.com/xml/rss.xml
http://feeds.feedburner.com/Spikedhumor
http://www.stevepavlina.com/blog/feed
http://www.talkingpointsmemo.com/index.xml
#http://www.talkingpointsmemo.com/index.xml
http://www.tbray.org/ongoing/ongoing.rss
http://feeds.feedburner.com/TechCrunch
http://www.techdirt.com/techdirt_rss.xml
http://www.techeblog.com/index.php/feed/
http://www.thesuperficial.com/index.xml
#http://www.techeblog.com/index.php/feed/
#http://www.thesuperficial.com/index.xml
http://www.tmz.com/rss.xml
http://www.treehugger.com/index.rdf
#http://www.treehugger.com/index.rdf
http://www.tuaw.com/rss.xml
http://www.valleywag.com/index.xml
http://www.we-make-money-not-art.com/index.rdf
http://www.wired.com/rss/index.xml
http://www.wonkette.com/index.xml
#http://www.valleywag.com/index.xml
#http://www.we-make-money-not-art.com/index.rdf
#http://www.wired.com/rss/index.xml
#http://www.wonkette.com/index.xml
9 changes: 5 additions & 4 deletions chapter3/generatefeedvector.py
@@ -41,8 +41,9 @@ def getwords(html):

apcount = {}
wordcounts = {}
feedlist = [line for line in file('feedlist.txt')]
feedlist = [line for line in open('chapter3/feedlist.txt')]
for feedurl in feedlist:
if feedurl.startswith('#'): continue
try:
(title, wc) = getwordcounts(feedurl)
wordcounts[title] = wc
@@ -51,21 +52,21 @@ def getwords(html):
if count > 1:
apcount[word] += 1
except:
print 'Failed to parse feed %s' % feedurl
print ('Failed to parse feed %s' % feedurl)

wordlist = []
for (w, bc) in apcount.items():
frac = float(bc) / len(feedlist)
if frac > 0.1 and frac < 0.5:
wordlist.append(w)

out = file('blogdata1.txt', 'w')
out = open('chapter3/blogdata1.txt', 'w')
out.write('Blog')
for word in wordlist:
out.write('\t%s' % word)
out.write('\n')
for (blog, wc) in wordcounts.items():
print blog
print (blog)
out.write(blog)
for word in wordlist:
if word in wc:
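The startswith('#') guard added above is what makes the commented-out feeds in feedlist.txt harmless. A slightly more defensive variant, offered only as a sketch, also strips whitespace and drops blank lines before fetching:

# Sketch of a more defensive feed-list filter (not part of this change).
with open('chapter3/feedlist.txt') as f:
    feedlist = [line.strip() for line in f]
feedlist = [url for url in feedlist if url and not url.startswith('#')]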
16 changes: 8 additions & 8 deletions chapter4/searchengine.py
@@ -1,5 +1,5 @@
import urllib2
from BeautifulSoup import *
import urllib.request as urllib2
from bs4 import *
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite
import nn
@@ -37,7 +37,7 @@ def getentryid(self,table,field,value,createnew=True):
# Index an individual page
def addtoindex(self,url,soup):
if self.isindexed(url): return
print 'Indexing '+url
print ('Indexing '+url)

# Get the individual words
text=self.gettextonly(soup)
@@ -101,7 +101,7 @@ def crawl(self,pages,depth=2):
try:
c=urllib2.urlopen(page)
except:
print "Could not open %s" % page
print ("Could not open %s" % page)
continue
try:
soup=BeautifulSoup(c.read())
@@ -120,7 +120,7 @@ def crawl(self,pages,depth=2):

self.dbcommit()
except:
print "Could not parse page %s" % page
print ("Could not parse page %s" % page)

pages=newpages

@@ -150,7 +150,7 @@ def calculatepagerank(self,iterations=20):
self.dbcommit()

for i in range(iterations):
print "Iteration %d" % (i)
print ("Iteration %d" % (i))
for (urlid,) in self.con.execute('select rowid from urllist'):
pr=0.15

@@ -205,7 +205,7 @@ def getmatchrows(self,q):

# Create the query from the separate parts
fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
print fullquery
print (fullquery)
cur=self.con.execute(fullquery)
rows=[row for row in cur]

@@ -237,7 +237,7 @@ def query(self,q):
rankedscores.sort()
rankedscores.reverse()
for (score,urlid) in rankedscores[0:10]:
print '%f\t%s' % (score,self.geturlname(urlid))
print ('%f\t%s' % (score,self.geturlname(urlid)))
return wordids,[r[1] for r in rankedscores[0:10]]

def normalizescores(self,scores,smallIsBetter=0):
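The hunks above only replace two of the Python 2 imports in searchengine.py; the urlparse and pysqlite2 imports visible as context are Python 2 only as well. A sketch of what a fully Python 3 import block might look like, as an assumption about the rest of the port rather than something this diff contains (nn is the book's own neural-network module):

# Hypothetical Python 3 import block for searchengine.py.
import urllib.request as urllib2   # keeps the existing urllib2.urlopen() calls working
from bs4 import *                   # BeautifulSoup 4
from urllib.parse import urljoin    # replaces: from urlparse import urljoin
import sqlite3 as sqlite            # replaces: from pysqlite2 import dbapi2 as sqlite
import nn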