diff --git a/python_code/plus__cosine_similarity.py b/python_code/plus__cosine_similarity.py
index 19bdbce..1700904 100644
--- a/python_code/plus__cosine_similarity.py
+++ b/python_code/plus__cosine_similarity.py
@@ -4,21 +4,83 @@
 import json
 import nltk
 
-# Load in human readable text from wherever you've saved it
+# Ensure that if output is piped to standard out, it
+# is encoded as utf-8 (versus ascii, which is the default)
+sys.stdout=codecs.getwriter('utf-8')(sys.stdout)
+
+# Load in textual data from wherever you've saved it
 
 DATA = sys.argv[1]
-N = 25
 
 data = json.loads(open(DATA).read())
 
-all_tokens = [token for activity in data for token in activity['object']['content'
-              ].lower().split()]
+all_posts = [post['object']['content'].lower().split()
+             for post in data
+             if post['object']['content'] != '']
+
+# Provides tf/idf/tf_idf abstractions for scoring
+
+tc = nltk.TextCollection(all_posts)
+
+# Compute a term-document matrix such that td_matrix[doc_title][term]
+# returns a tf-idf score for the term in the document
+
+td_matrix = {}
+for idx in range(len(all_posts)):
+    post = all_posts[idx]
+    fdist = nltk.FreqDist(post)
+
+    doc_title = data[idx]['title']
+    url = data[idx]['url']
+    td_matrix[(doc_title, url)] = {}
+
+    for term in fdist.iterkeys():
+        td_matrix[(doc_title, url)][term] = tc.tf_idf(term, post)
+
+# Build vectors such that term scores are in the same positions...
+
+distances = {}
+for (title1, url1) in td_matrix.keys():
+
+    distances[(title1, url1)] = {}
+    (max_score, most_similar) = (0.0, ('', ''))
+
+    for (title2, url2) in td_matrix.keys():
+
+        # Take care not to mutate the original data structures
+        # since we're in a loop and need the originals multiple times
+
+        terms1 = td_matrix[(title1, url1)].copy()
+        terms2 = td_matrix[(title2, url2)].copy()
+
+        # Fill in "gaps" in each map so vectors of the same length can be computed
+
+        for term1 in terms1:
+            if term1 not in terms2:
+                terms2[term1] = 0
+
+        for term2 in terms2:
+            if term2 not in terms1:
+                terms1[term2] = 0
+
+        # Create vectors from term maps
+
+        v1 = [score for (term, score) in sorted(terms1.items())]
+        v2 = [score for (term, score) in sorted(terms2.items())]
+
+        # Compute similarity amongst documents
+
+        distances[(title1, url1)][(title2, url2)] = \
+            nltk.cluster.util.cosine_distance(v1, v2)
+
+        if url1 == url2:
+            continue
 
-finder = nltk.BigramCollocationFinder.from_words(all_tokens)
-finder.apply_freq_filter(2)
-finder.apply_word_filter(lambda w: w in nltk.corpus.stopwords.words('english'))
-scorer = nltk.metrics.BigramAssocMeasures.jaccard
-collocations = finder.nbest(scorer, N)
+        if distances[(title1, url1)][(title2, url2)] > max_score:
+            (max_score, most_similar) = (distances[(title1, url1)][(title2,
+                                         url2)], (title2, url2))
 
-for collocation in collocations:
-    c = ' '.join(collocation)
-    print c
+    print '''Most similar to %s (%s)
+\t%s (%s)
+\tscore %d
+''' % (title1, url1,
+       most_similar[0], most_similar[1], max_score)
diff --git a/python_code/plus__cosine_similarity_protovis_output.py b/python_code/plus__cosine_similarity_protovis_output.py
index f4aa59e..a834dbd 100644
--- a/python_code/plus__cosine_similarity_protovis_output.py
+++ b/python_code/plus__cosine_similarity_protovis_output.py
@@ -4,6 +4,7 @@
 import sys
 import shutil
 import webbrowser
+from random import shuffle
 import json
 from operator import itemgetter
 import nltk
@@ -14,7 +15,13 @@
 DATA = sys.argv[1]
 
 data = json.loads(open(DATA).read())
 
+# Take a random sample so that a meaningful visualization can be displayed
+
+shuffle(data)
+data = data[:25]
+
 # HTML templmates that we'll inject Protovis consumable data into
+
 HTML_TEMPLATES = ['../web_code/protovis/matrix_diagram.html',
                   '../web_code/protovis/arc_diagram.html']
@@ -33,29 +40,29 @@
     fdist = nltk.FreqDist(activity)
 
     doc_title = data[idx]['title']
-    link = data[idx]['link']
-    td_matrix[(doc_title, link)] = {}
+    url = data[idx]['url']
+    td_matrix[(doc_title, url)] = {}
 
     for term in fdist.iterkeys():
-        td_matrix[(doc_title, link)][term] = tc.tf_idf(term, activity)
+        td_matrix[(doc_title, url)][term] = tc.tf_idf(term, activity)
 
 # Build vectors such that term scores are in the same positions...
 
 distances = {}
-for (title1, link1) in td_matrix.keys():
+for (title1, url1) in td_matrix.keys():
 
-    distances[(title1, link1)] = {}
+    distances[(title1, url1)] = {}
 
-    for (title2, link2) in td_matrix.keys():
+    for (title2, url2) in td_matrix.keys():
 
-        if link1 == link2:
+        if url1 == url2:
             continue
 
         # Take care not to mutate the original data structures
         # since we're in a loop and need the originals multiple times
 
-        terms1 = td_matrix[(title1, link1)].copy()
-        terms2 = td_matrix[(title2, link2)].copy()
+        terms1 = td_matrix[(title1, url1)].copy()
+        terms2 = td_matrix[(title2, url2)].copy()
 
         # Fill in "gaps" in each map so vectors of the same length can be computed
@@ -74,7 +81,7 @@
 
         # Compute similarity amongst documents
 
-        distances[(title1, link1)][(title2, link2)] = \
+        distances[(title1, url1)][(title2, url2)] = \
             nltk.cluster.util.cosine_distance(v1, v2)
 
 # Compute the standard deviation for the distances as a basis of automated thresholding
@@ -91,8 +98,8 @@
         d = distances[k1][k2]
         if d < std / 2 and d > 0.000001:  # call them similar
-            (title1, link1) = k1
-            (title2, link2) = k2
+            (title1, url1) = k1
+            (title2, url2) = k2
             similar.append((k1, k2, distances[k1][k2]))
 
 # Emit output expected by Protovis.
@@ -113,7 +120,7 @@
     node1 = nodes[s[1]]
 
-    edges.append({'source': node0, 'target': node1, 'value': s[2] * 1000})
+    edges.append({'source': node0, 'target': node1, 'value': s[2] * 100})
 
 nodes = [{'nodeName': title, 'nodeUrl': url} for ((title, url), idx) in
          sorted(nodes.items(), key=itemgetter(1))]
diff --git a/python_code/plus__get_activities.py b/python_code/plus__get_activities.py
index 8330495..c976726 100644
--- a/python_code/plus__get_activities.py
+++ b/python_code/plus__get_activities.py
@@ -20,7 +20,7 @@
 USER_ID=sys.argv[1] # Tim O'Reilly's Google+ id is '107033731246200681024'
 
-API_KEY=""
+API_KEY=None # Supply your own API key value.
 
 MAX_RESULTS = 200 # May actually get slightly more
@@ -49,8 +49,12 @@ def cleanHtml(html):
     activities_document = request.execute()
 
     if 'items' in activities_document:
+
         for activity in activities_document['items']:
-            if activity['object']['objectType'] == 'note':
+
+            if activity['object']['objectType'] == 'note' and \
+               activity['object']['content'] != '':
+
                 activity['title'] = cleanHtml(activity['title'])
                 activity['object']['content'] = cleanHtml(activity['object']['content'])
                 activities.append(activity)
diff --git a/python_code/plus__tf_idf.py b/python_code/plus__tf_idf.py
index 2c3ee9e..cbd87fa 100644
--- a/python_code/plus__tf_idf.py
+++ b/python_code/plus__tf_idf.py
@@ -49,7 +49,7 @@ def tf_idf(term, doc, corpus):
     print
     for doc in sorted(corpus):
-        score = tf_idf(term, corpus[doc], corpus)
+        score = tf_idf(term, corpus[doc], corpus.values())
         print 'TF-IDF(%s): %s' % (doc, term), score
        query_scores[doc] += score
     print
diff --git a/python_code/plus__tf_idf_nltk.py b/python_code/plus__tf_idf_nltk.py
index 59fdfe6..04fa39c 100644
--- a/python_code/plus__tf_idf_nltk.py
+++ b/python_code/plus__tf_idf_nltk.py
@@ -11,7 +11,9 @@
 QUERY_TERMS = sys.argv[2:]
 
-activities = [activity['object']['content'].lower().split() for activity in data]
+activities = [activity['object']['content'].lower().split() \
+              for activity in data \
+              if activity['object']['content'] != ""]
 
 # Provides tf/idf/tf_idf abstractions
@@ -25,12 +27,12 @@
         score += tc.tf_idf(term, activities[idx])
     if score > 0:
         relevant_activities.append({'score': score, 'title': data[idx]['title'],
-                                    'link': data[idx]['link']})
+                                    'url': data[idx]['url']})
 
 # Sort by score and display results
 relevant_activities = sorted(relevant_activities, key=lambda p: p['score'], reverse=True)
 
 for activity in relevant_activities:
     print activity['title']
-    print '\tLink: %s' % (activity['link'], )
+    print '\tLink: %s' % (activity['url'], )
     print '\tScore: %s' % (activity['score'], )
diff --git a/python_code/the_tweet__how_many_user_entities_are_friends.py b/python_code/the_tweet__how_many_user_entities_are_friends.py
index 46c6ac2..3806353 100644
--- a/python_code/the_tweet__how_many_user_entities_are_friends.py
+++ b/python_code/the_tweet__how_many_user_entities_are_friends.py
@@ -21,7 +21,7 @@
 for friend_id in friend_ids:
     try:
         friend_screen_names.append(json.loads(r.get(getRedisIdByUserId(friend_id,
-                                   'info.json')))['screen_name'])
+                                   'info.json')))['screen_name'].lower())
     except TypeError, e:
         continue # not locally available in Redis - look it up or skip it
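
A note on the plus__cosine_similarity.py hunk: each post gets a map of term to tf-idf score, every pair of maps is padded so both cover the same vocabulary, and only then are the maps flattened into vectors for nltk.cluster.util.cosine_distance. Below is a minimal sketch of that alignment step with a toy corpus and a hand-rolled cosine similarity (so it does not depend on any particular NLTK version's distance semantics); it is written for Python 3, whereas the repository code targets Python 2.

from math import sqrt

import nltk


def cosine_similarity(u, v):
    # Plain cosine similarity between two equal-length score vectors
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = sqrt(sum(a * a for a in u))
    norm_v = sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v) if norm_u and norm_v else 0.0

# Toy stand-ins for the tokenized post content
posts = [['the', 'quick', 'brown', 'fox'],
         ['the', 'lazy', 'brown', 'dog'],
         ['hello', 'world', 'hello', 'again']]

tc = nltk.TextCollection(posts)

# One term -> tf-idf map per post
term_maps = [dict((term, tc.tf_idf(term, post)) for term in nltk.FreqDist(post))
             for post in posts]

terms1 = term_maps[0].copy()
terms2 = term_maps[1].copy()

# Fill in "gaps" so both maps share the same vocabulary
for term in terms1:
    terms2.setdefault(term, 0)
for term in terms2:
    terms1.setdefault(term, 0)

# Sorting by term puts each score in the same vector position
v1 = [score for (term, score) in sorted(terms1.items())]
v2 = [score for (term, score) in sorted(terms2.items())]

print(cosine_similarity(v1, v2))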
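
A note on the thresholding and edge output in plus__cosine_similarity_protovis_output.py: a pair of posts is kept only when its distance is nonzero and less than half the standard deviation of all pairwise distances, and each kept pair becomes an edge whose value is the score scaled by 100 (the change from 1000 above). A rough sketch of that filter follows; the titles, URLs, and distances are fabricated.

import numpy

# Fabricated pairwise distances keyed by (title, url), mirroring the shape
# of the script's distances dict
distances = {
    ('Post A', 'http://example.com/a'): {('Post B', 'http://example.com/b'): 0.15,
                                         ('Post C', 'http://example.com/c'): 0.80},
    ('Post B', 'http://example.com/b'): {('Post A', 'http://example.com/a'): 0.15,
                                         ('Post C', 'http://example.com/c'): 0.75},
}

all_d = [d for row in distances.values() for d in row.values()]
std = numpy.std(all_d)

similar = []
for k1 in distances:
    for k2 in distances[k1]:
        d = distances[k1][k2]
        if d < std / 2 and d > 0.000001:  # call them similar
            similar.append((k1, k2, d))

# Each surviving pair gets node indices plus an edge; the * 100 scaling
# matches the updated hunk above
nodes = {}
edges = []
for s in similar:
    for pair in (s[0], s[1]):
        nodes.setdefault(pair, len(nodes))
    edges.append({'source': nodes[s[0]], 'target': nodes[s[1]],
                  'value': s[2] * 100})

print(edges)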
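
A note on the plus__tf_idf.py change: iterating a dict yields its keys, so passing corpus handed the idf calculation a sequence of document names instead of the documents themselves; corpus.values() supplies the actual token lists. A hypothetical tf/idf pair (not the book's exact helper) that shows the difference:

from math import log


def tf(term, doc):
    return doc.count(term) / float(len(doc))

def idf(term, docs):
    # Number of documents that contain the term at least once
    matches = sum(1 for doc in docs if term in doc)
    return log(float(len(docs)) / matches) if matches else 0.0

def tf_idf(term, doc, docs):
    return tf(term, doc) * idf(term, docs)

corpus = {
    'a': ['green', 'eggs', 'and', 'ham'],
    'b': ['green', 'ideas', 'sleep', 'furiously'],
}

# Correct: idf sees the token lists, so 'ham' gets a nonzero score
print(tf_idf('ham', corpus['a'], corpus.values()))

# Buggy: idf sees only the keys 'a' and 'b', finds no match, and returns 0.0
print(tf_idf('ham', corpus['a'], corpus))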