-
Notifications
You must be signed in to change notification settings - Fork 491
/
buzz__tf_idf_nltk.py
36 lines (25 loc) · 921 Bytes
/
buzz__tf_idf_nltk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# -*- coding: utf-8 -*-
import sys
import json
import nltk
# Rank saved posts against command-line query terms using TF-IDF.
#
# Usage: buzz__tf_idf_nltk.py <posts.json> <term> [<term> ...]
#
# The JSON file is read as a list of objects; this script uses the
# 'content', 'title' and 'link' keys of each one.

# Load in unstructured data from wherever you've saved it
DATA = sys.argv[1]
with open(DATA) as data_file:  # close the handle deterministically
    data = json.load(data_file)

QUERY_TERMS = sys.argv[2:]

# Posts are tokenized by lower-casing and splitting on whitespace, so the
# query terms are lower-cased too (once, not once per post).
query_terms = [t.lower() for t in QUERY_TERMS]

all_posts = [post['content'].lower().split() for post in data]

# Provides tf/idf/tf_idf abstractions
tc = nltk.TextCollection(all_posts)

relevant_posts = []
for entry, tokens in zip(data, all_posts):
    # A post with no tokens cannot contain any query term; skipping it also
    # avoids a division by its zero length in the term-frequency computation.
    if not tokens:
        continue

    # A post's score is the sum of the tf-idf values of every query term.
    score = sum(tc.tf_idf(term, tokens) for term in query_terms)

    # Only keep posts in which at least one query term carries weight.
    if score > 0:
        relevant_posts.append({'score': score, 'title': entry['title'],
                               'link': entry['link']})

# Sort by score and display results
relevant_posts = sorted(relevant_posts, key=lambda p: p['score'], reverse=True)
for post in relevant_posts:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(post['title'])
    print('\tLink: %s' % (post['link'], ))
    print('\tScore: %s' % (post['score'], ))