From aeda146f36bca897fca342fd4e30853bcab6d959 Mon Sep 17 00:00:00 2001
From: jezhang
Date: Tue, 10 Jan 2017 17:18:15 +0800
Subject: [PATCH] 20170110

---
 chapter2/recommendations.py    |  9 ++--
 chapter3/clusters.py           | 14 +++---
 chapter3/feedlist.txt          | 92 +++++++++++++++++-----------------
 chapter3/generatefeedvector.py |  9 ++--
 chapter4/searchengine.py       | 16 +++---
 5 files changed, 72 insertions(+), 68 deletions(-)

diff --git a/chapter2/recommendations.py b/chapter2/recommendations.py
index 31da0ea..40b54b7 100644
--- a/chapter2/recommendations.py
+++ b/chapter2/recommendations.py
@@ -47,8 +47,11 @@
         'Superman Returns': 5.0,
         'You, Me and Dupree': 3.5,
     },
-    'Toby': {'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0,
-             'Superman Returns': 4.0},
+    'Toby': {
+        'Snakes on a Plane': 4.5,
+        'You, Me and Dupree': 1.0,
+        'Superman Returns': 4.0
+    },
 }


@@ -187,7 +190,7 @@ def calculateSimilarItems(prefs, n=10):
         # Status updates for large datasets
         c += 1
         if c % 100 == 0:
-            print '%d / %d' % (c, len(itemPrefs))
+            print('%d / %d' % (c, len(itemPrefs)))
         # Find the most similar items to this one
         scores = topMatches(itemPrefs, item, n=n, similarity=sim_distance)
         result[item] = scores
diff --git a/chapter3/clusters.py b/chapter3/clusters.py
index 2bc9807..be6dd4b 100644
--- a/chapter3/clusters.py
+++ b/chapter3/clusters.py
@@ -5,7 +5,7 @@ import random


 def readfile(filename):
-    lines = [line for line in file(filename)]
+    lines = [line for line in open(filename)]

     # First line is the column titles
     colnames = lines[0].strip().split('\t')[1:]
@@ -105,16 +105,16 @@ def hcluster(rows, distance=pearson):
 def printclust(clust, labels=None, n=0):
     # indent to make a hierarchy layout
     for i in range(n):
-        print ' ',
+        print(' ', end='')
     if clust.id < 0:
         # negative id means that this is branch
-        print '-'
+        print('-')
     else:
         # positive id means that this is an endpoint
         if labels == None:
-            print clust.id
+            print(clust.id)
         else:
-            print labels[clust.id]
+            print(labels[clust.id])

     # now print the right and left branches
     if clust.left != None:
@@ -236,7 +236,7 @@ def kcluster(rows, distance=pearson, k=4):

     lastmatches = None
     for t in range(100):
-        print 'Iteration %d' % t
+        print('Iteration %d' % t)
         bestmatches = [[] for i in range(k)]

         # Find which centroid is the closest for each row
@@ -321,7 +321,7 @@ def scaledown(data, distance=pearson, rate=0.01):

                 # Keep track of the total error
                 totalerror += abs(errorterm)
-        print totalerror
+        print(totalerror)

         # If the answer got worse by moving the points, we are done
         if lasterror and lasterror < totalerror:
diff --git a/chapter3/feedlist.txt b/chapter3/feedlist.txt
index 0e1c711..f31120b 100644
--- a/chapter3/feedlist.txt
+++ b/chapter3/feedlist.txt
@@ -1,97 +1,97 @@
 http://feeds.feedburner.com/37signals/beMH
 http://feeds.feedburner.com/blogspot/bRuz
-http://battellemedia.com/index.xml
-http://blog.guykawasaki.com/index.rdf
+#http://battellemedia.com/index.xml
+#http://blog.guykawasaki.com/index.rdf
 http://blog.outer-court.com/rss.xml
-http://feeds.searchenginewatch.com/sewblog
-http://blog.topix.net/index.rdf
-http://blogs.abcnews.com/theblotter/index.rdf
+#http://feeds.searchenginewatch.com/sewblog
+#http://blog.topix.net/index.rdf
+#http://blogs.abcnews.com/theblotter/index.rdf
 http://feeds.feedburner.com/ConsumingExperienceFull
-http://flagrantdisregard.com/index.php/feed/
-http://featured.gigaom.com/feed/
+#http://flagrantdisregard.com/index.php/feed/
+#http://featured.gigaom.com/feed/
 http://gizmodo.com/index.xml
 http://gofugyourself.typepad.com/go_fug_yourself/index.rdf
 http://googleblog.blogspot.com/rss.xml
 http://feeds.feedburner.com/GoogleOperatingSystem
 http://headrush.typepad.com/creating_passionate_users/index.rdf
-http://feeds.feedburner.com/instapundit/main
-http://jeremy.zawodny.com/blog/rss2.xml
+#http://feeds.feedburner.com/instapundit/main
+#http://jeremy.zawodny.com/blog/rss2.xml
 http://joi.ito.com/index.rdf
 http://feeds.feedburner.com/Mashable
-http://michellemalkin.com/index.rdf
+#http://michellemalkin.com/index.rdf
 http://moblogsmoproblems.blogspot.com/rss.xml
 http://newsbusters.org/node/feed
-http://beta.blogger.com/feeds/27154654/posts/full?alt=rss
+#http://beta.blogger.com/feeds/27154654/posts/full?alt=rss
 http://feeds.feedburner.com/paulstamatiou
-http://powerlineblog.com/index.rdf
-http://feeds.feedburner.com/Publishing20
+#http://powerlineblog.com/index.rdf
+#http://feeds.feedburner.com/Publishing20
 http://radar.oreilly.com/index.rdf
-http://scienceblogs.com/pharyngula/index.xml
-http://scobleizer.wordpress.com/feed/
+#http://scienceblogs.com/pharyngula/index.xml
+#http://scobleizer.wordpress.com/feed/
 http://sethgodin.typepad.com/seths_blog/index.rdf
-http://rss.slashdot.org/Slashdot/slashdot
+#http://rss.slashdot.org/Slashdot/slashdot
 http://thinkprogress.org/feed/
 http://feeds.feedburner.com/andrewsullivan/rApM
-http://wilwheaton.typepad.com/wwdnbackup/index.rdf
+#http://wilwheaton.typepad.com/wwdnbackup/index.rdf
 http://www.43folders.com/feed/
 http://www.456bereastreet.com/feed.xml
 http://www.autoblog.com/rss.xml
 http://www.bloggersblog.com/rss.xml
-http://www.bloglines.com/rss/about/news
+#http://www.bloglines.com/rss/about/news
 http://www.blogmaverick.com/rss.xml
 http://www.boingboing.net/index.rdf
-http://www.buzzmachine.com/index.xml
+#http://www.buzzmachine.com/index.xml
 http://www.captainsquartersblog.com/mt/index.rdf
-http://www.coolhunting.com/index.rdf
+#http://www.coolhunting.com/index.rdf
 http://feeds.copyblogger.com/Copyblogger
 http://feeds.feedburner.com/crooksandliars/YaCP
 http://feeds.dailykos.com/dailykos/index.xml
 http://www.deadspin.com/index.xml
-http://www.downloadsquad.com/rss.xml
-http://www.engadget.com/rss.xml
-http://www.gapingvoid.com/index.rdf
-http://www.gawker.com/index.xml
-http://www.gothamist.com/index.rdf
-http://www.huffingtonpost.com/raw_feed_index.rdf
-http://www.hyperorg.com/blogger/index.rdf
+#http://www.downloadsquad.com/rss.xml
+https://www.engadget.com/rss.xml
+#http://www.gapingvoid.com/index.rdf
+#http://www.gawker.com/index.xml
+http://feeds.gothamistllc.com/gothamist05
+#http://www.huffingtonpost.com/raw_feed_index.rdf
+#http://www.hyperorg.com/blogger/index.rdf
 http://www.joelonsoftware.com/rss.xml
-http://www.joystiq.com/rss.xml
-http://www.kotaku.com/index.xml
+#http://www.joystiq.com/rss.xml
+#http://www.kotaku.com/index.xml
 http://feeds.kottke.org/main
 http://www.lifehack.org/feed/
 http://www.lifehacker.com/index.xml
-http://littlegreenfootballs.com/weblog/lgf-rss.php
-http://www.makezine.com/blog/index.xml
+#http://littlegreenfootballs.com/weblog/lgf-rss.php
+#http://www.makezine.com/blog/index.xml
 http://www.mattcutts.com/blog/feed/
-http://xml.metafilter.com/rss.xml
+#http://xml.metafilter.com/rss.xml
 http://www.mezzoblue.com/rss/index.xml
-http://www.micropersuasion.com/index.rdf
+#http://www.micropersuasion.com/index.rdf
 http://www.neilgaiman.com/journal/feed/rss.xml
-http://www.oilman.ca/feed/
+#http://www.oilman.ca/feed/
 http://www.perezhilton.com/index.xml
 http://www.plasticbag.org/index.rdf
 http://www.powazek.com/rss.xml
 http://www.problogger.net/feed/
 http://feeds.feedburner.com/QuickOnlineTips
-http://www.readwriteweb.com/rss.xml
+#http://www.readwriteweb.com/rss.xml
 http://www.schneier.com/blog/index.rdf
-http://scienceblogs.com/sample/combined.xml
+#http://scienceblogs.com/sample/combined.xml
 http://www.seroundtable.com/index.rdf
 http://www.shoemoney.com/feed/
-http://www.sifry.com/alerts/index.rdf
-http://www.simplebits.com/xml/rss.xml
+#http://www.sifry.com/alerts/index.rdf
+#http://www.simplebits.com/xml/rss.xml
 http://feeds.feedburner.com/Spikedhumor
 http://www.stevepavlina.com/blog/feed
-http://www.talkingpointsmemo.com/index.xml
+#http://www.talkingpointsmemo.com/index.xml
 http://www.tbray.org/ongoing/ongoing.rss
 http://feeds.feedburner.com/TechCrunch
 http://www.techdirt.com/techdirt_rss.xml
-http://www.techeblog.com/index.php/feed/
-http://www.thesuperficial.com/index.xml
+#http://www.techeblog.com/index.php/feed/
+#http://www.thesuperficial.com/index.xml
 http://www.tmz.com/rss.xml
-http://www.treehugger.com/index.rdf
+#http://www.treehugger.com/index.rdf
 http://www.tuaw.com/rss.xml
-http://www.valleywag.com/index.xml
-http://www.we-make-money-not-art.com/index.rdf
-http://www.wired.com/rss/index.xml
-http://www.wonkette.com/index.xml
+#http://www.valleywag.com/index.xml
+#http://www.we-make-money-not-art.com/index.rdf
+#http://www.wired.com/rss/index.xml
+#http://www.wonkette.com/index.xml
diff --git a/chapter3/generatefeedvector.py b/chapter3/generatefeedvector.py
index e8fe92f..b1c45e0 100644
--- a/chapter3/generatefeedvector.py
+++ b/chapter3/generatefeedvector.py
@@ -41,8 +41,9 @@ def getwords(html):

 apcount = {}
 wordcounts = {}
-feedlist = [line for line in file('feedlist.txt')]
+feedlist = [line for line in open('chapter3/feedlist.txt')]
 for feedurl in feedlist:
+    if feedurl.startswith('#'): continue
     try:
         (title, wc) = getwordcounts(feedurl)
         wordcounts[title] = wc
@@ -51,7 +52,7 @@ def getwords(html):
             if count > 1:
                 apcount[word] += 1
     except:
-        print 'Failed to parse feed %s' % feedurl
+        print('Failed to parse feed %s' % feedurl)

 wordlist = []
 for (w, bc) in apcount.items():
@@ -59,13 +60,13 @@ def getwords(html):
     if frac > 0.1 and frac < 0.5:
         wordlist.append(w)

-out = file('blogdata1.txt', 'w')
+out = open('chapter3/blogdata1.txt', 'w')
 out.write('Blog')
 for word in wordlist:
     out.write('\t%s' % word)
 out.write('\n')
 for (blog, wc) in wordcounts.items():
-    print blog
+    print(blog)
     out.write(blog)
     for word in wordlist:
         if word in wc:
diff --git a/chapter4/searchengine.py b/chapter4/searchengine.py
index 1b99b62..97c1a89 100644
--- a/chapter4/searchengine.py
+++ b/chapter4/searchengine.py
@@ -1,5 +1,5 @@
 import urllib2
-from BeautifulSoup import *
+from bs4 import *
 from urlparse import urljoin
 from pysqlite2 import dbapi2 as sqlite
 import nn
@@ -37,7 +37,7 @@ def getentryid(self,table,field,value,createnew=True):
   # Index an individual page
   def addtoindex(self,url,soup):
     if self.isindexed(url): return
-    print 'Indexing '+url
+    print('Indexing '+url)

     # Get the individual words
     text=self.gettextonly(soup)
@@ -101,7 +101,7 @@ def crawl(self,pages,depth=2):
         try:
           c=urllib2.urlopen(page)
         except:
-          print "Could not open %s" % page
+          print("Could not open %s" % page)
           continue
         try:
           soup=BeautifulSoup(c.read())
@@ -120,7 +120,7 @@ def crawl(self,pages,depth=2):
           self.dbcommit()
         except:
-          print "Could not parse page %s" % page
+          print("Could not parse page %s" % page)

       pages=newpages

@@ -150,7 +150,7 @@ def calculatepagerank(self,iterations=20):
     self.dbcommit()

     for i in range(iterations):
-      print "Iteration %d" % (i)
+      print("Iteration %d" % (i))
       for (urlid,) in self.con.execute('select rowid from urllist'):
         pr=0.15
@@ -205,7 +205,7 @@ def getmatchrows(self,q):

     # Create the query from the separate parts
     fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
-    print fullquery
+    print(fullquery)
     cur=self.con.execute(fullquery)
     rows=[row for row in cur]

@@ -237,7 +237,7 @@ def query(self,q):
     rankedscores.sort()
     rankedscores.reverse()
     for (score,urlid) in rankedscores[0:10]:
-      print '%f\t%s' % (score,self.geturlname(urlid))
+      print('%f\t%s' % (score,self.geturlname(urlid)))
     return wordids,[r[1] for r in rankedscores[0:10]]

   def normalizescores(self,scores,smallIsBetter=0):