Few minor fixes to reflect errata fixes and final testing of the Google Buzz => Google+ code changes. Everything appears to be in good order now.
Matthew A. Russell committed Feb 9, 2012
1 parent c3a4663 commit c6c9fc5
Showing 6 changed files with 107 additions and 32 deletions.
86 changes: 74 additions & 12 deletions python_code/plus__cosine_similarity.py
@@ -4,21 +4,83 @@
 import json
 import nltk
 
-# Load in human readable text from wherever you've saved it
+# Ensure that if output is piped to standard out, it
+# is encoded as utf-8 (versus ascii, which is the default)
+sys.stdout=codecs.getwriter('utf-8')(sys.stdout)
+
+# Load in textual data from wherever you've saved it
 
 DATA = sys.argv[1]
-N = 25
 data = json.loads(open(DATA).read())
 
-all_tokens = [token for activity in data for token in activity['object']['content'
-].lower().split()]
+all_posts = [post['object']['content'].lower().split()
+             for post in data
+             if post['object']['content'] != '']
+
+# Provides tf/idf/tf_idf abstractions for scoring
+
+tc = nltk.TextCollection(all_posts)
+
+# Compute a term-document matrix such that td_matrix[doc_title][term]
+# returns a tf-idf score for the term in the document
+
+td_matrix = {}
+for idx in range(len(all_posts)):
+    post = all_posts[idx]
+    fdist = nltk.FreqDist(post)
+
+    doc_title = data[idx]['title']
+    url = data[idx]['url']
+    td_matrix[(doc_title, url)] = {}
+
+    for term in fdist.iterkeys():
+        td_matrix[(doc_title, url)][term] = tc.tf_idf(term, post)
+
+# Build vectors such that term scores are in the same positions...
+
+distances = {}
+for (title1, url1) in td_matrix.keys():
+
+    distances[(title1, url1)] = {}
+    (max_score, most_similar) = (0.0, ('', ''))
+
+    for (title2, url2) in td_matrix.keys():
+
+        # Take care not to mutate the original data structures
+        # since we're in a loop and need the originals multiple times
+
+        terms1 = td_matrix[(title1, url1)].copy()
+        terms2 = td_matrix[(title2, url2)].copy()
+
+        # Fill in "gaps" in each map so vectors of the same length can be computed
+
+        for term1 in terms1:
+            if term1 not in terms2:
+                terms2[term1] = 0
+
+        for term2 in terms2:
+            if term2 not in terms1:
+                terms1[term2] = 0
+
+        # Create vectors from term maps
+
+        v1 = [score for (term, score) in sorted(terms1.items())]
+        v2 = [score for (term, score) in sorted(terms2.items())]
+
+        # Compute similarity amongst documents
+
+        distances[(title1, url1)][(title2, url2)] = \
+            nltk.cluster.util.cosine_distance(v1, v2)
+
+        if url1 == url2:
+            continue
 
-finder = nltk.BigramCollocationFinder.from_words(all_tokens)
-finder.apply_freq_filter(2)
-finder.apply_word_filter(lambda w: w in nltk.corpus.stopwords.words('english'))
-scorer = nltk.metrics.BigramAssocMeasures.jaccard
-collocations = finder.nbest(scorer, N)
+        if distances[(title1, url1)][(title2, url2)] > max_score:
+            (max_score, most_similar) = (distances[(title1, url1)][(title2,
+                                          url2)], (title2, url2))
 
-for collocation in collocations:
-    c = ' '.join(collocation)
-    print c
+    print '''Most similar to %s (%s)
+\t%s (%s)
+\tscore %d
+''' % (title1, url1,
+       most_similar[0], most_similar[1], max_score)
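The heart of the new similarity code is the gap-filling step: two sparse term maps are padded with zero scores until they share the same keyspace, so that sorting by term yields vectors whose positions line up. A minimal standalone sketch of that technique, using made-up term maps rather than data from the script:

from nltk.cluster.util import cosine_distance

# Toy tf-idf maps for two hypothetical posts (scores are made up)
terms1 = {'google': 0.4, 'buzz': 0.3}
terms2 = {'google': 0.2, 'plus': 0.5}

# Pad each map with zero scores for the terms it lacks
for t in set(terms1.keys() + terms2.keys()):
    terms1.setdefault(t, 0)
    terms2.setdefault(t, 0)

# Sorting by term lines the scores up position by position
v1 = [score for (term, score) in sorted(terms1.items())]
v2 = [score for (term, score) in sorted(terms2.items())]

# cosine_distance returns 1 - cosine similarity: 0.0 means identical
print cosine_distance(v1, v2)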
33 changes: 20 additions & 13 deletions python_code/plus__cosine_similarity_protovis_output.py
@@ -4,6 +4,7 @@
 import sys
 import shutil
 import webbrowser
+from random import shuffle
 import json
 from operator import itemgetter
 import nltk
@@ -14,7 +15,13 @@
 DATA = sys.argv[1]
 data = json.loads(open(DATA).read())
 
+# Take a random sample so that a meaningful visualization can be displayed
+
+shuffle(data)
+data = data[:25]
+
+# HTML templmates that we'll inject Protovis consumable data into
 
 HTML_TEMPLATES = ['../web_code/protovis/matrix_diagram.html',
                   '../web_code/protovis/arc_diagram.html']
 
@@ -33,29 +40,29 @@
     fdist = nltk.FreqDist(activity)
 
     doc_title = data[idx]['title']
-    link = data[idx]['link']
-    td_matrix[(doc_title, link)] = {}
+    url = data[idx]['url']
+    td_matrix[(doc_title, url)] = {}
 
     for term in fdist.iterkeys():
-        td_matrix[(doc_title, link)][term] = tc.tf_idf(term, activity)
+        td_matrix[(doc_title, url)][term] = tc.tf_idf(term, activity)
 
 # Build vectors such that term scores are in the same positions...
 
 distances = {}
-for (title1, link1) in td_matrix.keys():
+for (title1, url1) in td_matrix.keys():
 
-    distances[(title1, link1)] = {}
+    distances[(title1, url1)] = {}
 
-    for (title2, link2) in td_matrix.keys():
+    for (title2, url2) in td_matrix.keys():
 
-        if link1 == link2:
+        if url1 == url2:
             continue
 
        # Take care not to mutate the original data structures
        # since we're in a loop and need the originals multiple times
 
-        terms1 = td_matrix[(title1, link1)].copy()
-        terms2 = td_matrix[(title2, link2)].copy()
+        terms1 = td_matrix[(title1, url1)].copy()
+        terms2 = td_matrix[(title2, url2)].copy()
 
         # Fill in "gaps" in each map so vectors of the same length can be computed
 
@@ -74,7 +81,7 @@
 
         # Compute similarity amongst documents
 
-        distances[(title1, link1)][(title2, link2)] = \
+        distances[(title1, url1)][(title2, url2)] = \
            nltk.cluster.util.cosine_distance(v1, v2)
 
 # Compute the standard deviation for the distances as a basis of automated thresholding
@@ -91,8 +98,8 @@
 
         d = distances[k1][k2]
         if d < std / 2 and d > 0.000001: # call them similar
-            (title1, link1) = k1
-            (title2, link2) = k2
+            (title1, url1) = k1
+            (title2, url2) = k2
            similar.append((k1, k2, distances[k1][k2]))
 
 # Emit output expected by Protovis.
@@ -113,7 +120,7 @@
 
    node1 = nodes[s[1]]
 
-    edges.append({'source': node0, 'target': node1, 'value': s[2] * 1000})
+    edges.append({'source': node0, 'target': node1, 'value': s[2] * 100})
 
 nodes = [{'nodeName': title, 'nodeUrl': url} for ((title, url), idx) in
          sorted(nodes.items(), key=itemgetter(1))]
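For context, the nodes and edges assembled above are what get injected into the Protovis templates. A sketch of the shape of that data, with made-up titles and URLs; the exact wrapper object the templates consume isn't shown in this diff, so the combined dict below is an assumption:

import json

# Made-up sample of the structures the script assembles
nodes = [{'nodeName': 'Post about Strata', 'nodeUrl': 'http://example.com/1'},
         {'nodeName': 'Post about Hadoop', 'nodeUrl': 'http://example.com/2'}]

# 'source' and 'target' index into nodes; 'value' is the scaled distance
edges = [{'source': 0, 'target': 1, 'value': 0.42 * 100}]

# Assumed wrapper: a single object holding both lists
print json.dumps({'nodes': nodes, 'edges': edges}, indent=2)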
8 changes: 6 additions & 2 deletions python_code/plus__get_activities.py
@@ -20,7 +20,7 @@
 
 USER_ID=sys.argv[1] # Tim O'Reilly's Google+ id is '107033731246200681024'
 
-API_KEY=""
+API_KEY=None # Supply your own API key value.
 
 MAX_RESULTS = 200 # May actually get slightly more
 
@@ -49,8 +49,12 @@ def cleanHtml(html):
     activities_document = request.execute()
 
     if 'items' in activities_document:
+
         for activity in activities_document['items']:
-            if activity['object']['objectType'] == 'note':
+
+            if activity['object']['objectType'] == 'note' and \
+               activity['object']['content'] != '':
+
                 activity['title'] = cleanHtml(activity['title'])
                 activity['object']['content'] = cleanHtml(activity['object']['content'])
                 activities.append(activity)
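The new two-part test above guards the downstream scripts against empty documents. A quick illustration of the filter with a fabricated activity fragment:

# Fabricated Google+ activity fragment for illustration
activity = {'title': 'Some post',
            'object': {'objectType': 'note', 'content': ''}}

# Notes with empty content are now skipped, so the scripts that
# tokenize 'content' never see an empty document
if activity['object']['objectType'] == 'note' and \
   activity['object']['content'] != '':
    print 'keep'
else:
    print 'skip'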
2 changes: 1 addition & 1 deletion python_code/plus__tf_idf.py
@@ -49,7 +49,7 @@ def tf_idf(term, doc, corpus):
     print
 
     for doc in sorted(corpus):
-        score = tf_idf(term, corpus[doc], corpus)
+        score = tf_idf(term, corpus[doc], corpus.values())
         print 'TF-IDF(%s): %s' % (doc, term), score
         query_scores[doc] += score
     print
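The corpus.values() fix matters because tf_idf iterates its third argument as a collection of documents; iterating the corpus dict itself yields its keys (the titles), so idf was being computed against titles instead of document text. A sketch with assumed tf/idf definitions in the spirit of the script (the real definitions sit above this hunk and may differ in detail):

from math import log

# Assumed definitions, not copied from plus__tf_idf.py
def tf(term, doc):
    return doc.count(term) / float(len(doc))

def idf(term, corpus):
    num_texts_with_term = len([1 for text in corpus if term in text])
    try:
        return 1.0 + log(float(len(corpus)) / num_texts_with_term)
    except ZeroDivisionError:
        return 1.0

def tf_idf(term, doc, corpus):
    return tf(term, doc) * idf(term, corpus)

# A toy corpus keyed by document title
corpus = {'a': ['mr', 'green', 'killed', 'colonel', 'mustard'],
          'b': ['professor', 'plum', 'has', 'a', 'green', 'plant'],
          'c': ['miss', 'scarlett', 'watered', 'the', 'plant']}

# Iterating a dict yields its keys (the titles), which is why the
# errata fix passes corpus.values() -- the documents themselves
print tf_idf('green', corpus['a'], corpus.values())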
8 changes: 5 additions & 3 deletions python_code/plus__tf_idf_nltk.py
@@ -11,7 +11,9 @@
 
 QUERY_TERMS = sys.argv[2:]
 
-activities = [activity['object']['content'].lower().split() for activity in data]
+activities = [activity['object']['content'].lower().split() \
+              for activity in data \
+              if activity['object']['content'] != ""]
 
 # Provides tf/idf/tf_idf abstractions
 
@@ -25,12 +27,12 @@
         score += tc.tf_idf(term, activities[idx])
     if score > 0:
         relevant_activities.append({'score': score, 'title': data[idx]['title'],
-                                    'link': data[idx]['link']})
+                                    'url': data[idx]['url']})
 
 # Sort by score and display results
 
 relevant_activities = sorted(relevant_activities, key=lambda p: p['score'], reverse=True)
 for activity in relevant_activities:
     print activity['title']
-    print '\tLink: %s' % (activity['link'], )
+    print '\tLink: %s' % (activity['url'], )
     print '\tScore: %s' % (activity['score'], )
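For reference, the tc.tf_idf scoring used above comes straight from nltk.TextCollection. A minimal sketch with made-up tokenized posts:

import nltk

# Made-up tokenized posts standing in for the activity content
posts = [['google', 'plus', 'launched'],
         ['google', 'buzz', 'retired'],
         ['strata', 'conference', 'announced']]

tc = nltk.TextCollection(posts)

# Score each post against a single query term, as the script does
for idx in range(len(posts)):
    print idx, tc.tf_idf('google', posts[idx])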
@@ -21,7 +21,7 @@
 
 for friend_id in friend_ids:
     try:
         friend_screen_names.append(json.loads(r.get(getRedisIdByUserId(friend_id,
-            'info.json')))['screen_name'])
+            'info.json')))['screen_name'].lower())
     except TypeError, e:
         continue # not locally available in Redis - look it up or skip it
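The .lower() added above normalizes case before screen names are compared elsewhere; Twitter screen names are case-insensitive, so unnormalized comparisons can miss matches. A tiny illustration with made-up names:

friend_screen_names = ['SocialWebMining', 'ptwobrussell']
follower_screen_names = ['socialwebmining']

# Without lowercasing both sides this intersection would come up empty
friends = set(name.lower() for name in friend_screen_names)
followers = set(name.lower() for name in follower_screen_names)
print friends & followers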
