-
Notifications
You must be signed in to change notification settings - Fork 491
/
plus__tf_idf_nltk.py
38 lines (27 loc) · 1.03 KB
/
plus__tf_idf_nltk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# -*- coding: utf-8 -*-
import sys
import json
import nltk
def tokenize_activities(data):
    """Pair every activity that has textual content with its token list.

    Each activity's ``activity['object']['content']`` is lowercased and
    split on whitespace.  Activities whose content yields no tokens (empty
    or whitespace-only) are dropped.

    The activity record is kept next to its tokens on purpose: looking a
    hit up by position in the unfiltered input reports the wrong title/url
    as soon as a single activity has been filtered out.

    Args:
        data: iterable of activity dicts, each providing
            ``['object']['content']``.

    Returns:
        A list of ``(activity, tokens)`` tuples in input order.
    """
    pairs = []
    for activity in data:
        tokens = activity['object']['content'].lower().split()
        # An empty token list would be a zero-length document; presumably
        # the term-frequency computation divides by document length, so
        # such posts are skipped rather than scored.
        if tokens:
            pairs.append((activity, tokens))
    return pairs


def rank_activities(tc, pairs, query_terms):
    """Score each tokenized activity against the query and rank the hits.

    An activity's score is the sum of ``tc.tf_idf(term, tokens)`` over all
    lowercased query terms.  Only activities scoring above zero are kept.

    Args:
        tc: object exposing ``tf_idf(term, tokens)``; the script passes an
            ``nltk.TextCollection`` built from the same token lists.
        pairs: ``(activity, tokens)`` tuples as returned by
            ``tokenize_activities``; each activity must provide ``'title'``
            and ``'url'``.
        query_terms: iterable of query strings (any letter case).

    Returns:
        A list of ``{'score', 'title', 'url'}`` dicts, highest score first.
        Ties keep their input order.
    """
    # Lowercase the query once instead of once per document.
    terms = [t.lower() for t in query_terms]

    ranked = []
    for activity, tokens in pairs:
        score = sum(tc.tf_idf(term, tokens) for term in terms)
        if score > 0:
            ranked.append({'score': score,
                           'title': activity['title'],
                           'url': activity['url']})

    ranked.sort(key=lambda p: p['score'], reverse=True)
    return ranked


if __name__ == '__main__':
    # Load in unstructured data from wherever you've saved it
    DATA = sys.argv[1]
    with open(DATA) as f:
        data = json.load(f)

    QUERY_TERMS = sys.argv[2:]

    tokenized = tokenize_activities(data)
    activities = [tokens for _, tokens in tokenized]

    # Provides tf/idf/tf_idf abstractions
    tc = nltk.TextCollection(activities)

    # Sort by score and display results
    relevant_activities = rank_activities(tc, tokenized, QUERY_TERMS)

    # Single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    for activity in relevant_activities:
        print(activity['title'])
        print('\tLink: %s' % (activity['url'], ))
        print('\tScore: %s' % (activity['score'], ))