-
Notifications
You must be signed in to change notification settings - Fork 53
/
t_scores.py
60 lines (46 loc) · 1.49 KB
/
t_scores.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
t-scores of bigrams in a corpus
Usage: python t_scores.py < corpus.txt
"""
__author__ = "Pierre Nugues"
import math
import sys
from collections import Counter

import regex
def tokenize(text):
    """Split ``text`` into words.

    A word is a maximal run of characters matching ``\\p{L}`` (the Unicode
    "letter" property as implemented by the ``regex`` module); everything
    else acts as a separator and is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the matched words, in order of occurrence.
    """
    letters = regex.compile(r"\p{L}+")
    return letters.findall(text)
def count_unigrams(words):
    """Count how often each word occurs.

    Args:
        words: An iterable of hashable tokens (typically strings).

    Returns:
        A plain dict mapping each distinct word to its number of
        occurrences, keyed in order of first appearance.
    """
    # Counter does the tallying; converting back to dict keeps the original
    # contract that looking up an unseen word raises KeyError.
    return dict(Counter(words))
def count_bigrams(words):
    """Count how often each pair of adjacent words occurs.

    Args:
        words: A sliceable sequence of hashable tokens (e.g. a list of
            strings).

    Returns:
        A plain dict mapping each ``(word, next_word)`` tuple to its number
        of occurrences, keyed in order of first appearance. A sequence with
        fewer than two tokens yields an empty dict.
    """
    # zip pairs every token with its successor lazily, so no intermediate
    # list of bigram tuples is built before counting.
    adjacent_pairs = zip(words, words[1:])
    return dict(Counter(adjacent_pairs))
def t_scores(words, freq_unigrams, freq_bigrams):
    """Compute the t-score of every bigram in ``freq_bigrams``.

    For a bigram ``(w1, w2)`` with count ``c12``, unigram counts ``c1`` and
    ``c2``, and ``N = len(words)``, the score is::

        (c12 - c1 * c2 / N) / sqrt(c12)

    Args:
        words: The token sequence; only its length is used.
        freq_unigrams: Mapping from word to its count.
        freq_bigrams: Mapping from bigram (indexable, first two items are
            the words) to its count.

    Returns:
        A dict mapping each bigram key of ``freq_bigrams`` to its t-score.
    """
    scores = {}
    for pair, observed in freq_bigrams.items():
        # Count expected if the two words occurred independently.
        expected = freq_unigrams[pair[0]] * freq_unigrams[pair[1]] / len(words)
        scores[pair] = (observed - expected) / math.sqrt(observed)
    return scores
if __name__ == '__main__':
    # Read the whole corpus from standard input, lower-cased.
    text = sys.stdin.read().lower()
    words = tokenize(text)

    # Unigram and bigram counts feed the t-score computation.
    frequency = count_unigrams(words)
    frequency_bigrams = count_bigrams(words)
    ts = t_scores(words, frequency, frequency_bigrams)

    # Print bigrams from highest to lowest t-score: score, bigram, count of
    # the first word, count of the second word, count of the bigram.
    ranked = sorted(ts.items(), key=lambda entry: entry[1], reverse=True)
    for bigram, score in ranked:
        print(score, "\t", bigram, "\t",
              frequency[bigram[0]], "\t", frequency[bigram[1]], "\t",
              frequency_bigrams[bigram])