From aeda146f36bca897fca342fd4e30853bcab6d959 Mon Sep 17 00:00:00 2001
From: jezhang
Date: Tue, 10 Jan 2017 17:18:15 +0800
Subject: [PATCH] 20170110

---
 chapter2/recommendations.py    |  9 ++--
 chapter3/clusters.py           | 14 +++---
 chapter3/feedlist.txt          | 92 +++++++++++++++++-----------------
 chapter3/generatefeedvector.py |  9 ++--
 chapter4/searchengine.py       | 16 +++---
 5 files changed, 72 insertions(+), 68 deletions(-)

diff --git a/chapter2/recommendations.py b/chapter2/recommendations.py
index 31da0ea..40b54b7 100644
--- a/chapter2/recommendations.py
+++ b/chapter2/recommendations.py
@@ -47,8 +47,11 @@
         'Superman Returns': 5.0,
         'You, Me and Dupree': 3.5,
     },
-    'Toby': {'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0,
-             'Superman Returns': 4.0},
+    'Toby': {
+        'Snakes on a Plane': 4.5,
+        'You, Me and Dupree': 1.0,
+        'Superman Returns': 4.0
+    },
 }


@@ -187,7 +190,7 @@ def calculateSimilarItems(prefs, n=10):
         # Status updates for large datasets
         c += 1
         if c % 100 == 0:
-            print '%d / %d' % (c, len(itemPrefs))
+            print('%d / %d' % (c, len(itemPrefs)))
         # Find the most similar items to this one
         scores = topMatches(itemPrefs, item, n=n, similarity=sim_distance)
         result[item] = scores
diff --git a/chapter3/clusters.py b/chapter3/clusters.py
index 2bc9807..be6dd4b 100644
--- a/chapter3/clusters.py
+++ b/chapter3/clusters.py
@@ -5,7 +5,7 @@ import random


 def readfile(filename):
-    lines = [line for line in file(filename)]
+    lines = [line for line in open(filename)]

     # First line is the column titles
     colnames = lines[0].strip().split('\t')[1:]
@@ -105,16 +105,16 @@ def hcluster(rows, distance=pearson):
 def printclust(clust, labels=None, n=0):
     # indent to make a hierarchy layout
     for i in range(n):
-        print ' ',
+        print(' ', end='')
     if clust.id < 0:
         # negative id means that this is branch
-        print '-'
+        print('-')
     else:
         # positive id means that this is an endpoint
         if labels == None:
-            print clust.id
+            print(clust.id)
         else:
-            print labels[clust.id]
+            print(labels[clust.id])

     # now print the right and left branches
     if clust.left != None:
@@ -236,7 +236,7 @@ def kcluster(rows, distance=pearson, k=4):

     lastmatches = None
     for t in range(100):
-        print 'Iteration %d' % t
+        print('Iteration %d' % t)
         bestmatches = [[] for i in range(k)]

         # Find which centroid is the closest for each row
@@ -321,7 +321,7 @@ def scaledown(data, distance=pearson, rate=0.01):

                 # Keep track of the total error
                 totalerror += abs(errorterm)
-        print totalerror
+        print(totalerror)

         # If the answer got worse by moving the points, we are done
         if lasterror and lasterror < totalerror:
diff --git a/chapter3/feedlist.txt b/chapter3/feedlist.txt
index 0e1c711..f31120b 100644
--- a/chapter3/feedlist.txt
+++ b/chapter3/feedlist.txt
@@ -1,97 +1,97 @@
 http://feeds.feedburner.com/37signals/beMH
 http://feeds.feedburner.com/blogspot/bRuz
-http://battellemedia.com/index.xml
-http://blog.guykawasaki.com/index.rdf
+#http://battellemedia.com/index.xml
+#http://blog.guykawasaki.com/index.rdf
 http://blog.outer-court.com/rss.xml
-http://feeds.searchenginewatch.com/sewblog
-http://blog.topix.net/index.rdf
-http://blogs.abcnews.com/theblotter/index.rdf
+#http://feeds.searchenginewatch.com/sewblog
+#http://blog.topix.net/index.rdf
+#http://blogs.abcnews.com/theblotter/index.rdf
 http://feeds.feedburner.com/ConsumingExperienceFull
-http://flagrantdisregard.com/index.php/feed/
-http://featured.gigaom.com/feed/
+#http://flagrantdisregard.com/index.php/feed/
+#http://featured.gigaom.com/feed/
 http://gizmodo.com/index.xml
 http://gofugyourself.typepad.com/go_fug_yourself/index.rdf
 http://googleblog.blogspot.com/rss.xml
 http://feeds.feedburner.com/GoogleOperatingSystem
 http://headrush.typepad.com/creating_passionate_users/index.rdf
-http://feeds.feedburner.com/instapundit/main
-http://jeremy.zawodny.com/blog/rss2.xml
+#http://feeds.feedburner.com/instapundit/main
+#http://jeremy.zawodny.com/blog/rss2.xml
 http://joi.ito.com/index.rdf
 http://feeds.feedburner.com/Mashable
-http://michellemalkin.com/index.rdf
+#http://michellemalkin.com/index.rdf
 http://moblogsmoproblems.blogspot.com/rss.xml
 http://newsbusters.org/node/feed
-http://beta.blogger.com/feeds/27154654/posts/full?alt=rss
+#http://beta.blogger.com/feeds/27154654/posts/full?alt=rss
 http://feeds.feedburner.com/paulstamatiou
-http://powerlineblog.com/index.rdf
-http://feeds.feedburner.com/Publishing20
+#http://powerlineblog.com/index.rdf
+#http://feeds.feedburner.com/Publishing20
 http://radar.oreilly.com/index.rdf
-http://scienceblogs.com/pharyngula/index.xml
-http://scobleizer.wordpress.com/feed/
+#http://scienceblogs.com/pharyngula/index.xml
+#http://scobleizer.wordpress.com/feed/
 http://sethgodin.typepad.com/seths_blog/index.rdf
-http://rss.slashdot.org/Slashdot/slashdot
+#http://rss.slashdot.org/Slashdot/slashdot
 http://thinkprogress.org/feed/
 http://feeds.feedburner.com/andrewsullivan/rApM
-http://wilwheaton.typepad.com/wwdnbackup/index.rdf
+#http://wilwheaton.typepad.com/wwdnbackup/index.rdf
 http://www.43folders.com/feed/
 http://www.456bereastreet.com/feed.xml
 http://www.autoblog.com/rss.xml
 http://www.bloggersblog.com/rss.xml
-http://www.bloglines.com/rss/about/news
+#http://www.bloglines.com/rss/about/news
 http://www.blogmaverick.com/rss.xml
 http://www.boingboing.net/index.rdf
-http://www.buzzmachine.com/index.xml
+#http://www.buzzmachine.com/index.xml
 http://www.captainsquartersblog.com/mt/index.rdf
-http://www.coolhunting.com/index.rdf
+#http://www.coolhunting.com/index.rdf
 http://feeds.copyblogger.com/Copyblogger
 http://feeds.feedburner.com/crooksandliars/YaCP
 http://feeds.dailykos.com/dailykos/index.xml
 http://www.deadspin.com/index.xml
-http://www.downloadsquad.com/rss.xml
-http://www.engadget.com/rss.xml
-http://www.gapingvoid.com/index.rdf
-http://www.gawker.com/index.xml
-http://www.gothamist.com/index.rdf
-http://www.huffingtonpost.com/raw_feed_index.rdf
-http://www.hyperorg.com/blogger/index.rdf
+#http://www.downloadsquad.com/rss.xml
+https://www.engadget.com/rss.xml
+#http://www.gapingvoid.com/index.rdf
+#http://www.gawker.com/index.xml
+http://feeds.gothamistllc.com/gothamist05
+#http://www.huffingtonpost.com/raw_feed_index.rdf
+#http://www.hyperorg.com/blogger/index.rdf
 http://www.joelonsoftware.com/rss.xml
-http://www.joystiq.com/rss.xml
-http://www.kotaku.com/index.xml
+#http://www.joystiq.com/rss.xml
+#http://www.kotaku.com/index.xml
 http://feeds.kottke.org/main
 http://www.lifehack.org/feed/
 http://www.lifehacker.com/index.xml
-http://littlegreenfootballs.com/weblog/lgf-rss.php
-http://www.makezine.com/blog/index.xml
+#http://littlegreenfootballs.com/weblog/lgf-rss.php
+#http://www.makezine.com/blog/index.xml
 http://www.mattcutts.com/blog/feed/
-http://xml.metafilter.com/rss.xml
+#http://xml.metafilter.com/rss.xml
 http://www.mezzoblue.com/rss/index.xml
-http://www.micropersuasion.com/index.rdf
+#http://www.micropersuasion.com/index.rdf
 http://www.neilgaiman.com/journal/feed/rss.xml
-http://www.oilman.ca/feed/
+#http://www.oilman.ca/feed/
 http://www.perezhilton.com/index.xml
 http://www.plasticbag.org/index.rdf
 http://www.powazek.com/rss.xml
 http://www.problogger.net/feed/
 http://feeds.feedburner.com/QuickOnlineTips
-http://www.readwriteweb.com/rss.xml
+#http://www.readwriteweb.com/rss.xml
 http://www.schneier.com/blog/index.rdf
-http://scienceblogs.com/sample/combined.xml
+#http://scienceblogs.com/sample/combined.xml
 http://www.seroundtable.com/index.rdf
 http://www.shoemoney.com/feed/
-http://www.sifry.com/alerts/index.rdf
-http://www.simplebits.com/xml/rss.xml
+#http://www.sifry.com/alerts/index.rdf
+#http://www.simplebits.com/xml/rss.xml
 http://feeds.feedburner.com/Spikedhumor
 http://www.stevepavlina.com/blog/feed
-http://www.talkingpointsmemo.com/index.xml
+#http://www.talkingpointsmemo.com/index.xml
 http://www.tbray.org/ongoing/ongoing.rss
 http://feeds.feedburner.com/TechCrunch
 http://www.techdirt.com/techdirt_rss.xml
-http://www.techeblog.com/index.php/feed/
-http://www.thesuperficial.com/index.xml
+#http://www.techeblog.com/index.php/feed/
+#http://www.thesuperficial.com/index.xml
 http://www.tmz.com/rss.xml
-http://www.treehugger.com/index.rdf
+#http://www.treehugger.com/index.rdf
 http://www.tuaw.com/rss.xml
-http://www.valleywag.com/index.xml
-http://www.we-make-money-not-art.com/index.rdf
-http://www.wired.com/rss/index.xml
-http://www.wonkette.com/index.xml
+#http://www.valleywag.com/index.xml
+#http://www.we-make-money-not-art.com/index.rdf
+#http://www.wired.com/rss/index.xml
+#http://www.wonkette.com/index.xml
diff --git a/chapter3/generatefeedvector.py b/chapter3/generatefeedvector.py
index e8fe92f..b1c45e0 100644
--- a/chapter3/generatefeedvector.py
+++ b/chapter3/generatefeedvector.py
@@ -41,8 +41,9 @@ def getwords(html):

 apcount = {}
 wordcounts = {}
-feedlist = [line for line in file('feedlist.txt')]
+feedlist = [line for line in open('chapter3/feedlist.txt')]
 for feedurl in feedlist:
+    if feedurl.startswith('#'): continue
     try:
         (title, wc) = getwordcounts(feedurl)
         wordcounts[title] = wc
@@ -51,7 +52,7 @@ def getwords(html):
             if count > 1:
                 apcount[word] += 1
     except:
-        print 'Failed to parse feed %s' % feedurl
+        print('Failed to parse feed %s' % feedurl)

 wordlist = []
 for (w, bc) in apcount.items():
@@ -59,13 +60,13 @@ def getwords(html):
     if frac > 0.1 and frac < 0.5:
         wordlist.append(w)

-out = file('blogdata1.txt', 'w')
+out = open('chapter3/blogdata1.txt', 'w')
 out.write('Blog')
 for word in wordlist:
     out.write('\t%s' % word)
 out.write('\n')
 for (blog, wc) in wordcounts.items():
-    print blog
+    print(blog)
     out.write(blog)
     for word in wordlist:
         if word in wc:
diff --git a/chapter4/searchengine.py b/chapter4/searchengine.py
index 1b99b62..97c1a89 100644
--- a/chapter4/searchengine.py
+++ b/chapter4/searchengine.py
@@ -1,5 +1,5 @@
 import urllib2
-from BeautifulSoup import *
+from bs4 import *
 from urlparse import urljoin
 from pysqlite2 import dbapi2 as sqlite
 import nn
@@ -37,7 +37,7 @@ def getentryid(self,table,field,value,createnew=True):
   # Index an individual page
   def addtoindex(self,url,soup):
     if self.isindexed(url): return
-    print 'Indexing '+url
+    print('Indexing '+url)

     # Get the individual words
     text=self.gettextonly(soup)
@@ -101,7 +101,7 @@ def crawl(self,pages,depth=2):
         try:
           c=urllib2.urlopen(page)
         except:
-          print "Could not open %s" % page
+          print("Could not open %s" % page)
           continue
         try:
           soup=BeautifulSoup(c.read())
@@ -120,7 +120,7 @@ def crawl(self,pages,depth=2):
           self.dbcommit()
         except:
-          print "Could not parse page %s" % page
+          print("Could not parse page %s" % page)

       pages=newpages

@@ -150,7 +150,7 @@ def calculatepagerank(self,iterations=20):
     self.dbcommit()

     for i in range(iterations):
-      print "Iteration %d" % (i)
+      print("Iteration %d" % (i))
       for (urlid,) in self.con.execute('select rowid from urllist'):
         pr=0.15
@@ -205,7 +205,7 @@ def getmatchrows(self,q):

     # Create the query from the separate parts
     fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
-    print fullquery
+    print(fullquery)
     cur=self.con.execute(fullquery)
     rows=[row for row in cur]

@@ -237,7 +237,7 @@ def query(self,q):
     rankedscores.sort()
     rankedscores.reverse()
     for (score,urlid) in rankedscores[0:10]:
-      print '%f\t%s' % (score,self.geturlname(urlid))
+      print('%f\t%s' % (score,self.geturlname(urlid)))
     return wordids,[r[1] for r in rankedscores[0:10]]

   def normalizescores(self,scores,smallIsBetter=0):