forked from asmith26/Vose-Alias-Method
-
Notifications
You must be signed in to change notification settings - Fork 0
/
unit_tests.py
executable file
·139 lines (109 loc) · 5.76 KB
/
unit_tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/python
#LIBRARIES:
# Standard library
import math
import os
import random
import sys
import unittest
from decimal import *
# Local application
import vose_sampler
# Common paths and error messages
# Folder paths are relative to the directory the tests are run from.
valid_folder = "tests/file_examples/valid_files/"
invalid_folder = "tests/file_examples/invalid_files/"
# The *_error values are used as regular-expression patterns by the tests, so
# they are written as raw strings: "\:" and "\(" are invalid escape sequences
# in an ordinary string literal and trigger a DeprecationWarning/SyntaxWarning
# on Python 3. The resulting string values are unchanged.
empty_file_error = r"Error\: Please provide a file containing a corpus \(not an empty file\)."
binary_file_error = r"Error\: Please provide a file containing text-based data."
nonnegative_integer_error = r"Error\: Please enter a non-negative integer for the number of samples desired\: "
class TestValidation(unittest.TestCase):
    """Check that vose_sampler's validation exits with the expected messages.

    Every test expects the call under test to raise ``SystemExit`` with a
    message matching one of the module-level ``*_error`` regular expressions.
    """

    def _assert_exits_with(self, message_regex, func, *args):
        """Assert that ``func(*args)`` raises SystemExit matching ``message_regex``.

        ``assertRaisesRegexp`` is a deprecated alias that was removed in
        Python 3.12, whereas Python 2.7 only offers that spelling, so use
        whichever name this version of unittest provides.
        """
        check = getattr(self, "assertRaisesRegex", None)
        if check is None:
            check = self.assertRaisesRegexp
        check(SystemExit, message_regex, func, *args)

    def _small_corpus_sampler(self):
        """Build a VoseAlias sampler from the words of the small.txt fixture."""
        words = vose_sampler.get_words(valid_folder + "small.txt")
        word_dist = vose_sampler.sample2dist(words)
        return vose_sampler.VoseAlias(word_dist)

    def test_empty_file(self):
        """Test vose_sampler.get_words against empty files """
        self._assert_exits_with(
            empty_file_error, vose_sampler.get_words, invalid_folder + "empty.txt")

    def test_binary_file1(self):
        """Test vose_sampler.get_words against .epub files """
        self._assert_exits_with(
            binary_file_error, vose_sampler.get_words, invalid_folder + "Alice.epub")

    def test_binary_file2(self):
        """Test vose_sampler.get_words against .mobi files """
        self._assert_exits_with(
            binary_file_error, vose_sampler.get_words, invalid_folder + "Alice.mobi")

    def test_binary_file3(self):
        """Test vose_sampler.get_words against .pdf files """
        self._assert_exits_with(
            binary_file_error, vose_sampler.get_words, invalid_folder + "Alice.pdf")

    def test_binary_file4(self):
        """Test vose_sampler.get_words against .wav files """
        self._assert_exits_with(
            binary_file_error, vose_sampler.get_words, invalid_folder + "zero.wav")

    def test_negative_integer(self):
        """Test vose_sampler.VoseAlias.sample_n against a size specified by
        a negative integer (-1). """
        VA_words = self._small_corpus_sampler()
        self._assert_exits_with(
            nonnegative_integer_error + "-1", VA_words.sample_n, -1)

    def test_zero_integer(self):
        """Test vose_sampler.VoseAlias.sample_n against a size defined by
        zero. """
        # NOTE(review): the expected message says "non-negative", yet zero is
        # expected to be rejected here -- confirm the wording in vose_sampler.
        VA_words = self._small_corpus_sampler()
        self._assert_exits_with(
            nonnegative_integer_error + "0", VA_words.sample_n, 0)
class TestAccuracy(unittest.TestCase):
    """ unittest methods for testing the accuracy of the functions and the
    sampler within vose_sampler. """

    def dbinom(self, x, n, p):
        """ Compute the probability of exactly x successes in n flips of a coin
        that produces a head with probability p (i.e. the probability mass
        function of a Binomial RV): C(n, x) * p**x * (1-p)**(n-x).

        p may be a float or a Decimal; the result has whatever numeric type
        combining an int with p produces. """
        f = math.factorial
        # Floor division is exact here (a binomial coefficient is an integer)
        # and keeps C an int. True division would give a float on Python 3,
        # which cannot be multiplied by a Decimal p (TypeError).
        C = f(n) // (f(x) * f(n - x))
        return C * p**x * (1 - p)**(n - x)

    def test_output_get_word(self):
        """Test vose_sampler.get_words to ensure it correctly produces a list of
        words from a given corpus. """
        actual = vose_sampler.get_words(valid_folder + "single_word.txt")
        expected = ["Speechmatics"]
        self.assertEqual(actual, expected)

    def test_output_create_dist(self):
        """Test that vose_sampler.sample2dist, fed through VoseAlias, yields a
        uniform distribution for a list of words representing a standard die. """
        numbers_dist = vose_sampler.sample2dist(
            ["one", "two", "three", "four", "five", "six"])
        VA_numbers = vose_sampler.VoseAlias(numbers_dist)
        actual = VA_numbers.dist
        prob = Decimal(1) / Decimal(6)
        expected = {"one": prob, "two": prob, "three": prob,
                    "four": prob, "five": prob, "six": prob}
        self.assertEqual(actual, expected)

    def test_output_alias_generation(self):
        """Test vose_sampler.VoseAlias.alias_generation to ensure it
        generates words with same distribution as the original corpus. This
        performs a 2-sided hypothesis test at the 1% significance level, that:
        H_0: observed proportion a randomly selected word is equal to the
        proportion seen in the original corpus (i.e. p_original == p_observed)
        H_1: p_original != p_observed
        """
        print("WARNING: There is a random element to test_output_alias_generation\n"
              "so it is likely to occasionally fail, nonetheless if the alias_generation\n"
              "method is working correctly failures will be very rare (testing at alpha=0.01\n"
              "implies we should expect a Type I error about 1% of the time).")

        # Construct a sampler from the small corpus
        words = vose_sampler.get_words(valid_folder + "small.txt")
        word_dist = vose_sampler.sample2dist(words)
        VA_words = vose_sampler.VoseAlias(word_dist)

        # Draw n samples and count how often a randomly selected word appears
        word = random.choice(list(VA_words.dist))
        n = 1000
        t = sum(1 for _ in range(n) if VA_words.alias_generation() == word)

        # Compute the two-sided p-value: twice the smaller binomial tail
        p_original = VA_words.dist[word]
        # P(X >= t): at least as many hits as observed
        p_upper = math.fsum(self.dbinom(x, n, p_original) for x in range(t, n + 1))
        # P(X <= t): at most as many hits as observed
        p_lower = math.fsum(self.dbinom(x, n, p_original) for x in range(t + 1))
        p = 2 * min(p_upper, p_lower)

        # Do not accept H_0 if p <= alpha
        alpha = 0.01
        self.assertGreater(p, alpha)
if __name__ == "__main__":
    # The test fixtures are located through relative paths, so when the script
    # was invoked via a path, switch to the directory that contains it first.
    script_dir = os.path.dirname(sys.argv[0])
    if script_dir:
        os.chdir(script_dir)
    # Hand over to unittest's command-line runner.
    unittest.main()