-
Notifications
You must be signed in to change notification settings - Fork 0
/
listogram.py
148 lines (127 loc) · 5.33 KB
/
listogram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from __future__ import division, print_function # Python 2 and 3 compatibility
import random
class Listogram(list):
    """Listogram is a histogram implemented as a subclass of the list type.

    Each entry is a two-item list ``[word, count]``; entries are kept in the
    order in which each word was first counted.

    Attributes:
        types: count of distinct word types (entries) in this histogram.
        tokens: total count of all word tokens added to this histogram.
    """

    def __init__(self, word_list=None):
        """Initialize this histogram as a new list and count given words.

        Args:
            word_list: optional iterable of words to count; when None the
                histogram starts out empty.
        """
        super(Listogram, self).__init__()  # Initialize this as a new list
        # Running totals, kept in sync by add_count
        self.types = 0  # Count of distinct word types in this histogram
        self.tokens = 0  # Total count of all word tokens in this histogram
        if word_list is not None:
            for word in word_list:
                self.add_count(word)

    def add_count(self, word, count=1):
        """Increase frequency count of given word by given count amount.

        A word that is not yet present gets a new ``[word, count]`` entry
        appended and bumps ``types``; ``tokens`` always grows by ``count``.

        Args:
            word: the word whose count should be increased.
            count: amount to add to the word's count (default 1).
        """
        index = self.index_of(word)
        if index is None:
            self.append([word, count])
            self.types += 1
        else:
            self[index][1] += count
        self.tokens += count

    def frequency(self, word):
        """Return frequency count of given word, or 0 if word is not found."""
        index = self.index_of(word)
        if index is None:
            return 0
        return self[index][1]

    def __contains__(self, word):
        """Return boolean indicating if given word is in this histogram."""
        return self.index_of(word) is not None

    def index_of(self, target):
        """Return the index of entry containing given target word if found in
        this histogram, or None if target word is not found."""
        # Linear search; only give up after every entry has been examined.
        for index, entry in enumerate(self):
            if entry[0] == target:
                return index
        return None

    def sample(self):
        """Return a word from this histogram, randomly sampled by weighting
        each word's probability of being chosen by its observed frequency.

        Raises:
            ValueError: if the histogram holds no tokens to sample from.
        """
        if self.tokens < 1:
            # random.randint(1, 0) would raise ValueError with an obscure
            # message; fail with the same exception type but say why.
            raise ValueError('cannot sample from an empty histogram')
        # Pick a 1-based position among all tokens, then walk the entries
        # until the running remainder falls inside one entry's count.
        remaining = random.randint(1, self.tokens)
        for word, count in self:
            if remaining <= count:
                return word
            remaining -= count
        # Only reachable if tokens is out of sync with the stored counts.
        return None
def print_histogram(word_list):
    """Print a report about the listogram built from the given word list.

    The report shows the word list, the resulting listogram, its token and
    type totals, and the frequency of each of the last two words in the
    list, then hands the listogram to print_histogram_samples.
    """
    print()
    print('Histogram:')
    print('word list: {}'.format(word_list))
    # Build the listogram and show what it holds
    listogram = Listogram(word_list)
    print('listogram: {}'.format(listogram))
    totals = (listogram.tokens, listogram.types)
    print('{} tokens, {} types'.format(*totals))
    # Report how often each of the final two words occurs
    trailing_words = word_list[-2:]
    for trailing in trailing_words:
        occurrences = listogram.frequency(trailing)
        print('{!r} occurs {} times'.format(trailing, occurrences))
    print()
    print_histogram_samples(listogram)
def print_histogram_samples(histogram):
    """Sample the histogram 10,000 times and print a comparison table.

    For every (word, count) entry of the given histogram the table lists the
    observed frequency, the frequency seen among the samples, and the
    relative error between the two, colored by the error's magnitude.
    """
    print('Histogram samples:')
    # Draw 10,000 samples and tally the results in a second listogram
    draws = []
    for _ in range(10000):
        draws.append(histogram.sample())
    tally = Listogram(draws)
    print('samples: {}'.format(tally))
    print()
    print('Sampled frequency and error from observed frequency:')
    header = '| word type | observed freq | sampled freq | error |'
    rule = '-' * len(header)
    for line in (rule, header, rule):
        print(line)
    # Terminal escape sequences used to color the error column
    reset = '\033[m'
    red = '\033[31m'
    yellow = '\033[33m'
    green = '\033[32m'
    # One table row per entry of the original histogram
    for entry in histogram:
        word, count = entry
        # Share of all tokens that this word accounts for
        observed = count / histogram.tokens
        # Share of all samples that came back as this word
        hits = tally.frequency(word)
        sampled = hits / tally.tokens
        # Relative error of the sampled share against the observed share
        error = (sampled - observed) / observed
        magnitude = abs(error)
        if magnitude < 0.05:
            color = green
        elif magnitude < 0.1:
            color = yellow
        else:
            color = red
        cells = [
            '| {!r:<9} '.format(word),
            '| {:>4} = {:>6.2%} '.format(count, observed),
            '| {:>4} = {:>6.2%} '.format(hits, sampled),
            '| {}{:>+7.2%}{} |'.format(color, error, reset),
        ]
        print(''.join(cells))
    print(rule)
    print()
def main():
    """Run the histogram demo on command-line words or built-in examples.

    When at least one command-line argument is given, the arguments are
    used as the word list; otherwise three built-in examples are shown.
    """
    import sys
    cli_words = sys.argv[1:]  # Everything after the script name
    if cli_words:
        # Words were supplied on the command line: report on those only
        print_histogram(cli_words)
        return
    # Letters of a single word
    print_histogram(list('abracadabra'))
    # Words of a classic book title
    print_histogram('one fish two fish red fish blue fish'.split())
    # Words of a long repetitive sentence
    woodchuck = ('how much wood would a wood chuck chuck'
                 ' if a wood chuck could chuck wood')
    print_histogram(woodchuck.split())


# Run the demo only when executed as a script, not when imported
if __name__ == '__main__':
    main()