-
Notifications
You must be signed in to change notification settings - Fork 491
/
plus__tf_idf.py
59 lines (43 loc) · 1.8 KB
/
plus__tf_idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
import sys
from math import log
# Query terms come from the command line: every argument after the script name.
QUERY_TERMS = sys.argv[1:]
def tf(term, doc, normalize=True):
    """Return the term frequency of *term* in *doc*.

    The document is lower-cased and split on whitespace; *term* is
    lower-cased as well, so matching is case-insensitive but otherwise
    exact (punctuation attached to a word is part of the token).

    Args:
        term: The word to count.
        doc: The document text as a single string.
        normalize: When true (the default), divide the raw count by the
            number of tokens in the document; otherwise return the raw
            count.

    Returns:
        A float: the normalized frequency or the raw count. A document
        with no tokens (empty or whitespace-only) yields 0.0.
    """
    tokens = doc.lower().split()
    occurrences = float(tokens.count(term.lower()))

    if not normalize:
        return occurrences

    if not tokens:
        # No tokens means no occurrences; dividing by len(tokens) would
        # raise ZeroDivisionError here.
        return 0.0

    return occurrences / len(tokens)
def idf(term, corpus):
    """Return the inverse document frequency of *term* across *corpus*.

    Each text is lower-cased and split on whitespace, and counts as
    containing the term when the lower-cased *term* equals one of its
    tokens.

    Args:
        term: The word to look up.
        corpus: A sized collection of document strings.

    Returns:
        ``1.0 + log(len(corpus) / n)`` where ``n`` is the number of texts
        containing the term, or ``1.0`` when no text contains it (which
        includes an empty corpus).
    """
    needle = term.lower()  # Loop-invariant: lower-case the term only once.
    num_texts_with_term = sum(
        1 for text in corpus if needle in text.lower().split()
    )

    # The tf-idf calculation multiplies this value by a tf value that is less
    # than 1 (when normalized), so the idf is offset to be at least 1 for
    # consistent scoring: multiplying two values that are both less than 1
    # yields a product smaller than either of them.
    if num_texts_with_term == 0:
        # The term appears nowhere, so the ratio is undefined; use the floor.
        return 1.0

    return 1.0 + log(float(len(corpus)) / num_texts_with_term)
def tf_idf(term, doc, corpus):
    """Return the TF-IDF score of *term* for *doc* within *corpus*.

    The score is the product of ``tf(term, doc)`` (called with its default
    arguments) and ``idf(term, corpus)``.
    """
    term_frequency = tf(term, doc)
    inverse_document_frequency = idf(term, corpus)
    return term_frequency * inverse_document_frequency
# Toy corpus: maps a document id to the document's text.
corpus = {
    'a': ('Mr. Green killed Colonel Mustard in the study with the candlestick. '
          'Mr. Green is not a very nice fellow.'),
    'b': 'Professor Plumb has a green plant in his study.',
    'c': ("Miss Scarlett watered Professor Plumb's green plant while he was away "
          "from his office last week."),
}

# Score queries by calculating the cumulative tf_idf score for each term in
# the query. The score table is built from the corpus keys so that the two
# cannot drift apart when documents are added or removed.
query_scores = dict((doc, 0) for doc in corpus)

# Every print below passes a single pre-formatted string, which produces the
# same output under both the Python 2 print statement and the Python 3
# print() function.
for term in [t.lower() for t in QUERY_TERMS]:
    # Per-document term frequency, followed by the corpus-wide idf.
    for doc in sorted(corpus):
        print('TF(%s): %s %s' % (doc, term, tf(term, corpus[doc])))
    print('IDF: %s %s' % (term, idf(term, corpus.values())))
    print('')

    # Per-document tf-idf for this term, accumulated into the query totals.
    for doc in sorted(corpus):
        score = tf_idf(term, corpus[doc], corpus.values())
        print('TF-IDF(%s): %s %s' % (doc, term, score))
        query_scores[doc] += score
    print('')

print("Overall TF-IDF scores for query '%s'" % (' '.join(QUERY_TERMS), ))
for (doc, score) in sorted(query_scores.items()):
    print('%s %s' % (doc, score))