Few minor fixes to reflect errata fixes and final testing of the Google Buzz => Google+ code changes. Everything appears to be in good order now.
Matthew A. Russell committed Feb 9, 2012
1 parent c3a4663 commit c6c9fc5
Showing 6 changed files with 107 additions and 32 deletions.
86 changes: 74 additions & 12 deletions python_code/plus__cosine_similarity.py
@@ -4,21 +4,83 @@
 import json
 import nltk
 
-# Load in human readable text from wherever you've saved it
+# Ensure that if output is piped to standard out, it
+# is encoded as utf-8 (versus ascii, which is the default)
+sys.stdout=codecs.getwriter('utf-8')(sys.stdout)
+
+# Load in textual data from wherever you've saved it
 
 DATA = sys.argv[1]
-N = 25
 data = json.loads(open(DATA).read())
 
-all_tokens = [token for activity in data for token in activity['object']['content'
-].lower().split()]
+all_posts = [post['object']['content'].lower().split()
+             for post in data
+             if post['object']['content'] != '']
+
+# Provides tf/idf/tf_idf abstractions for scoring
+
+tc = nltk.TextCollection(all_posts)
+
+# Compute a term-document matrix such that td_matrix[doc_title][term]
+# returns a tf-idf score for the term in the document
+
+td_matrix = {}
+for idx in range(len(all_posts)):
+    post = all_posts[idx]
+    fdist = nltk.FreqDist(post)
+
+    doc_title = data[idx]['title']
+    url = data[idx]['url']
+    td_matrix[(doc_title, url)] = {}
+
+    for term in fdist.iterkeys():
+        td_matrix[(doc_title, url)][term] = tc.tf_idf(term, post)
+
+# Build vectors such that term scores are in the same positions...
+
+distances = {}
+for (title1, url1) in td_matrix.keys():
+
+    distances[(title1, url1)] = {}
+    (max_score, most_similar) = (0.0, ('', ''))
+
+    for (title2, url2) in td_matrix.keys():
+
+        # Take care not to mutate the original data structures
+        # since we're in a loop and need the originals multiple times
+
+        terms1 = td_matrix[(title1, url1)].copy()
+        terms2 = td_matrix[(title2, url2)].copy()
+
+        # Fill in "gaps" in each map so vectors of the same length can be computed
+
+        for term1 in terms1:
+            if term1 not in terms2:
+                terms2[term1] = 0
+
+        for term2 in terms2:
+            if term2 not in terms1:
+                terms1[term2] = 0
+
+        # Create vectors from term maps
+
+        v1 = [score for (term, score) in sorted(terms1.items())]
+        v2 = [score for (term, score) in sorted(terms2.items())]
+
+        # Compute similarity amongst documents
+
+        distances[(title1, url1)][(title2, url2)] = \
+            nltk.cluster.util.cosine_distance(v1, v2)
+
+        if url1 == url2:
+            continue
 
-finder = nltk.BigramCollocationFinder.from_words(all_tokens)
-finder.apply_freq_filter(2)
-finder.apply_word_filter(lambda w: w in nltk.corpus.stopwords.words('english'))
-scorer = nltk.metrics.BigramAssocMeasures.jaccard
-collocations = finder.nbest(scorer, N)
+        if distances[(title1, url1)][(title2, url2)] > max_score:
+            (max_score, most_similar) = (distances[(title1, url1)][(title2,
+                                          url2)], (title2, url2))
 
-for collocation in collocations:
-    c = ' '.join(collocation)
-    print c
+    print '''Most similar to %s (%s)
+\t%s (%s)
+\tscore %d
+''' % (title1, url1,
+       most_similar[0], most_similar[1], max_score)
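The heart of the new similarity code is the gap-filling step: two sparse term maps are padded with zero scores until they share the same keyspace, so that sorting by term yields vectors whose positions line up. A minimal standalone sketch of that technique, using made-up term maps rather than data from the script:

from nltk.cluster.util import cosine_distance

# Toy tf-idf maps for two hypothetical posts (scores are made up)
terms1 = {'google': 0.4, 'buzz': 0.3}
terms2 = {'google': 0.2, 'plus': 0.5}

# Pad each map with zero scores for the terms it lacks
for t in set(terms1.keys() + terms2.keys()):
    terms1.setdefault(t, 0)
    terms2.setdefault(t, 0)

# Sorting by term lines the scores up position by position
v1 = [score for (term, score) in sorted(terms1.items())]
v2 = [score for (term, score) in sorted(terms2.items())]

# cosine_distance returns 1 - cosine similarity: 0.0 means identical
print cosine_distance(v1, v2)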
33 changes: 20 additions & 13 deletions python_code/plus__cosine_similarity_protovis_output.py
@@ -4,6 +4,7 @@
 import sys
 import shutil
 import webbrowser
+from random import shuffle
 import json
 from operator import itemgetter
 import nltk
@@ -14,7 +15,13 @@
 DATA = sys.argv[1]
 data = json.loads(open(DATA).read())
 
+# Take a random sample so that a meaningful visualization can be displayed
+
+shuffle(data)
+data = data[:25]
+
+# HTML templmates that we'll inject Protovis consumable data into
 
 HTML_TEMPLATES = ['../web_code/protovis/matrix_diagram.html',
                   '../web_code/protovis/arc_diagram.html']
 
@@ -33,29 +40,29 @@
     fdist = nltk.FreqDist(activity)
 
     doc_title = data[idx]['title']
-    link = data[idx]['link']
-    td_matrix[(doc_title, link)] = {}
+    url = data[idx]['url']
+    td_matrix[(doc_title, url)] = {}
 
     for term in fdist.iterkeys():
-        td_matrix[(doc_title, link)][term] = tc.tf_idf(term, activity)
+        td_matrix[(doc_title, url)][term] = tc.tf_idf(term, activity)
 
 # Build vectors such that term scores are in the same positions...
 
 distances = {}
-for (title1, link1) in td_matrix.keys():
+for (title1, url1) in td_matrix.keys():
 
-    distances[(title1, link1)] = {}
+    distances[(title1, url1)] = {}
 
-    for (title2, link2) in td_matrix.keys():
+    for (title2, url2) in td_matrix.keys():
 
-        if link1 == link2:
+        if url1 == url2:
             continue
 
        # Take care not to mutate the original data structures
        # since we're in a loop and need the originals multiple times
 
-        terms1 = td_matrix[(title1, link1)].copy()
-        terms2 = td_matrix[(title2, link2)].copy()
+        terms1 = td_matrix[(title1, url1)].copy()
+        terms2 = td_matrix[(title2, url2)].copy()
 
         # Fill in "gaps" in each map so vectors of the same length can be computed
 
@@ -74,7 +81,7 @@
 
         # Compute similarity amongst documents
 
-        distances[(title1, link1)][(title2, link2)] = \
+        distances[(title1, url1)][(title2, url2)] = \
            nltk.cluster.util.cosine_distance(v1, v2)
 
 # Compute the standard deviation for the distances as a basis of automated thresholding
@@ -91,8 +98,8 @@
 
         d = distances[k1][k2]
         if d < std / 2 and d > 0.000001: # call them similar
-            (title1, link1) = k1
-            (title2, link2) = k2
+            (title1, url1) = k1
+            (title2, url2) = k2
            similar.append((k1, k2, distances[k1][k2]))
 
 # Emit output expected by Protovis.
@@ -113,7 +120,7 @@
 
    node1 = nodes[s[1]]
 
-    edges.append({'source': node0, 'target': node1, 'value': s[2] * 1000})
+    edges.append({'source': node0, 'target': node1, 'value': s[2] * 100})
 
 nodes = [{'nodeName': title, 'nodeUrl': url} for ((title, url), idx) in
          sorted(nodes.items(), key=itemgetter(1))]
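For context, the nodes and edges assembled above are what get injected into the Protovis templates. A sketch of the shape of that data, with made-up titles and URLs; the exact wrapper object the templates consume isn't shown in this diff, so the combined dict below is an assumption:

import json

# Made-up sample of the structures the script assembles
nodes = [{'nodeName': 'Post about Strata', 'nodeUrl': 'http://example.com/1'},
         {'nodeName': 'Post about Hadoop', 'nodeUrl': 'http://example.com/2'}]

# 'source' and 'target' index into nodes; 'value' is the scaled distance
edges = [{'source': 0, 'target': 1, 'value': 0.42 * 100}]

# Assumed wrapper: a single object holding both lists
print json.dumps({'nodes': nodes, 'edges': edges}, indent=2)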
8 changes: 6 additions & 2 deletions python_code/plus__get_activities.py
@@ -20,7 +20,7 @@
 
 USER_ID=sys.argv[1] # Tim O'Reilly's Google+ id is '107033731246200681024'
 
-API_KEY=""
+API_KEY=None # Supply your own API key value.
 
 MAX_RESULTS = 200 # May actually get slightly more
 
@@ -49,8 +49,12 @@ def cleanHtml(html):
     activities_document = request.execute()
 
     if 'items' in activities_document:
+
         for activity in activities_document['items']:
-            if activity['object']['objectType'] == 'note':
+
+            if activity['object']['objectType'] == 'note' and \
+               activity['object']['content'] != '':
+
                 activity['title'] = cleanHtml(activity['title'])
                 activity['object']['content'] = cleanHtml(activity['object']['content'])
                 activities.append(activity)
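The new two-part test above guards the downstream scripts against empty documents. A quick illustration of the filter with a fabricated activity fragment:

# Fabricated Google+ activity fragment for illustration
activity = {'title': 'Some post',
            'object': {'objectType': 'note', 'content': ''}}

# Notes with empty content are now skipped, so the scripts that
# tokenize 'content' never see an empty document
if activity['object']['objectType'] == 'note' and \
   activity['object']['content'] != '':
    print 'keep'
else:
    print 'skip'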
2 changes: 1 addition & 1 deletion python_code/plus__tf_idf.py
@@ -49,7 +49,7 @@ def tf_idf(term, doc, corpus):
     print
 
     for doc in sorted(corpus):
-        score = tf_idf(term, corpus[doc], corpus)
+        score = tf_idf(term, corpus[doc], corpus.values())
         print 'TF-IDF(%s): %s' % (doc, term), score
         query_scores[doc] += score
     print
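The corpus.values() fix matters because tf_idf iterates its third argument as a collection of documents; iterating the corpus dict itself yields its keys (the titles), so idf was being computed against titles instead of document text. A sketch with assumed tf/idf definitions in the spirit of the script (the real definitions sit above this hunk and may differ in detail):

from math import log

# Assumed definitions, not copied from plus__tf_idf.py
def tf(term, doc):
    return doc.count(term) / float(len(doc))

def idf(term, corpus):
    num_texts_with_term = len([1 for text in corpus if term in text])
    try:
        return 1.0 + log(float(len(corpus)) / num_texts_with_term)
    except ZeroDivisionError:
        return 1.0

def tf_idf(term, doc, corpus):
    return tf(term, doc) * idf(term, corpus)

# A toy corpus keyed by document title
corpus = {'a': ['mr', 'green', 'killed', 'colonel', 'mustard'],
          'b': ['professor', 'plum', 'has', 'a', 'green', 'plant'],
          'c': ['miss', 'scarlett', 'watered', 'the', 'plant']}

# Iterating a dict yields its keys (the titles), which is why the
# errata fix passes corpus.values() -- the documents themselves
print tf_idf('green', corpus['a'], corpus.values())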
8 changes: 5 additions & 3 deletions python_code/plus__tf_idf_nltk.py
@@ -11,7 +11,9 @@
 
 QUERY_TERMS = sys.argv[2:]
 
-activities = [activity['object']['content'].lower().split() for activity in data]
+activities = [activity['object']['content'].lower().split() \
+              for activity in data \
+              if activity['object']['content'] != ""]
 
 # Provides tf/idf/tf_idf abstractions
 
@@ -25,12 +27,12 @@
         score += tc.tf_idf(term, activities[idx])
     if score > 0:
         relevant_activities.append({'score': score, 'title': data[idx]['title'],
-                                    'link': data[idx]['link']})
+                                    'url': data[idx]['url']})
 
 # Sort by score and display results
 
 relevant_activities = sorted(relevant_activities, key=lambda p: p['score'], reverse=True)
 for activity in relevant_activities:
     print activity['title']
-    print '\tLink: %s' % (activity['link'], )
+    print '\tLink: %s' % (activity['url'], )
     print '\tScore: %s' % (activity['score'], )
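For reference, the tc.tf_idf scoring used above comes straight from nltk.TextCollection. A minimal sketch with made-up tokenized posts:

import nltk

# Made-up tokenized posts standing in for the activity content
posts = [['google', 'plus', 'launched'],
         ['google', 'buzz', 'retired'],
         ['strata', 'conference', 'announced']]

tc = nltk.TextCollection(posts)

# Score each post against a single query term, as the script does
for idx in range(len(posts)):
    print idx, tc.tf_idf('google', posts[idx])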
@@ -21,7 +21,7 @@
 
 for friend_id in friend_ids:
     try:
         friend_screen_names.append(json.loads(r.get(getRedisIdByUserId(friend_id,
-            'info.json')))['screen_name'])
+            'info.json')))['screen_name'].lower())
     except TypeError, e:
         continue # not locally available in Redis - look it up or skip it
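The .lower() added above normalizes case before screen names are compared elsewhere; Twitter screen names are case-insensitive, so unnormalized comparisons can miss matches. A tiny illustration with made-up names:

friend_screen_names = ['SocialWebMining', 'ptwobrussell']
follower_screen_names = ['socialwebmining']

# Without lowercasing both sides this intersection would come up empty
friends = set(name.lower() for name in friend_screen_names)
followers = set(name.lower() for name in follower_screen_names)
print friends & followers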
