table.pyx
# NOTE: the cimports below are assumptions added for completeness; adjust the
# module and names to match the .pxd declarations used elsewhere in this repo.
from libcpp.vector cimport vector
from simhash cimport hash_t, Table, simhash

################################################################################
# Core. If you're looking for insight into the library, look here
################################################################################
cpdef PyHash(s):
'''
Utility function to return the simhash of a python string
'''
return simhash(s, len(s))
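
# For example, PyHash('some document text') returns the simhash fingerprint
# of that string as a 64-bit hash_t.
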
cdef class PyTable:
    '''
    A wrapper around the C++ implementation of a data structure that keeps a
    collection of fingerprints sorted subject to a bit-block permutation. It
    allows fast queries to find all known fingerprints (or determine whether
    any exist) that differ from the query in at most _k_ bits.
    '''
cdef Table* tbl
def __cinit__(self, d, permutors):
cdef vector[hash_t] perms
for p in permutors:
perms.push_back(p)
self.tbl = new Table(d, perms)
def __dealloc__(self):
del self.tbl
cpdef insert_bulk(self, hashes):
'''Accept a list of simhashes and insert them into the table'''
for h in hashes:
self.tbl.insert(h)
cpdef insert(self, hash_t h):
'''Insert a single simhash into the table'''
self.tbl.insert(h)
cpdef remove_bulk(self, hashes):
'''Remove all the hashes in the provided iterable'''
for h in hashes:
self.tbl.remove(h)
cpdef remove(self, hash_t h):
'''Remove the provided hash from the table'''
self.tbl.remove(h)
cpdef hash_t find_first(self, hash_t query):
        '''
        Find the first example of a fingerprint that's considered a near-
        duplicate of the query, if any. If none is found, 0 is returned;
        otherwise, the near-duplicate fingerprint itself is returned.
        '''
return self.tbl.find(query)
cpdef find_first_bulk(self, queries):
'''For each of the queries, find the first matching doc'''
return [self.tbl.find(query) for query in queries]
cpdef find_all(self, hash_t query):
'''
Find all fingerprints in the table that are considered near-duplicates
of the query hash.
'''
cdef vector[hash_t] results
self.tbl.find(query, results)
return [results[i] for i in range(results.size())]
cpdef find_all_bulk(self, queries):
        '''Find all the results for all the queries'''
cdef vector[hash_t] tmp
results = []
for query in queries:
self.tbl.find(query, tmp)
results.append([tmp[i] for i in range(tmp.size())])
tmp.clear()
return results
cpdef permute(self, hash_t query):
'''
Perform the table's permutation on the provided hash. This is not meant
to be called regularly from external code. This is merely a utility
function exposed to curious users.
'''
return self.tbl.permute(query)
cpdef unpermute(self, hash_t query):
'''Inverse of permute(query).'''
return self.tbl.unpermute(query)
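
# A rough usage sketch for PyTable (the mask values and parameters below are
# illustrative assumptions; in practice PyCorpus builds the permutors for you):
#
#   table = PyTable(3, [0x000000000000FFFF, 0x00000000FFFF0000,
#                       0x0000FFFF00000000, 0xFFFF000000000000])
#   table.insert(PyHash('some document text'))
#   match = table.find_first(PyHash('some document text?'))  # 0 if nothing is close
#   assert table.unpermute(table.permute(match)) == match
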
cdef class PyCorpus:
'''
A collection of simhash tables that, together, represent the entire corpus
and the data structures needed to find all matches. Consult the accompanying
README for a more detailed description of the algorithm.
'''
cdef readonly object tables
cdef readonly size_t differing_bits
def __init__(self, num_blocks, diff_bits):
'''
Given the number of blocks we want to use, make a collection of tables
with a roughly even number of bits in each of the blocks. This also
calculates the permutation masks for you.
'''
import itertools
# Permutation masks for the various blocks that we'll use
perms = []
for i in range(num_blocks):
            # Figure out how many (and which) bits will go into this block.
            # Integer division, so that range() below gets integer bounds.
            start = ( i    * 64) // num_blocks
            end   = ((i+1) * 64) // num_blocks
num = 0
for j in range(start, end):
num = num | (1 << j)
perms.append(num)
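        # For instance, with num_blocks = 4 each mask selects a 16-bit block:
        #   block 0 -> 0x000000000000FFFF, block 1 -> 0x00000000FFFF0000,
        #   block 2 -> 0x0000FFFF00000000, block 3 -> 0xFFFF000000000000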
        # Now we need to permute our permutors based on diff_bits, and create a
        # PyTable for each of these. We want to take each unique combination of
        # (num_blocks - diff_bits) blocks, and then concatenate the remaining
        # blocks. This ordered list is the permutation for that particular table.
self.tables = []
for combo in itertools.combinations(perms, num_blocks - diff_bits):
permutors = list(combo) + [p for p in perms if p not in combo]
self.tables.append(PyTable(diff_bits, permutors))
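        # Note: this yields C(num_blocks, diff_bits) tables in total. For
        # instance, num_blocks = 6 with diff_bits = 3 builds 20 tables.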
cpdef insert_bulk(self, hashes):
'''Insert the provided hashes into each of the tables'''
for table in self.tables:
table.insert_bulk(hashes)
cpdef insert(self, h):
'''Insert a single hash into all the tables'''
for table in self.tables:
table.insert(h)
cpdef remove_bulk(self, hashes):
'''Remove all the provided hashes from all of our tables'''
for table in self.tables:
table.remove_bulk(hashes)
cpdef remove(self, h):
'''Remove a single hash from all the tables'''
for table in self.tables:
table.remove(h)
cpdef find_first_bulk(self, queries):
'''Sequentially find each of the queries'''
# Off hand, this looks like a potentially slow implementation. If it
# turns out to be a problem, I think a little verbosity and hard work
# can make it much faster.
return [self.find_first(query) for query in queries]
cpdef find_first(self, hash_t query):
'''Sequentially search each of the tables for this hash'''
cdef hash_t result
for table in self.tables:
result = table.find_first(query)
if result:
return result
return 0
cpdef find_all_bulk(self, queries):
'''Sequentially find all matching fingerprints for each query'''
# Off hand, this looks like a potentially slow implementation
return [self.find_all(query) for query in queries]
cpdef find_all(self, hash_t query):
'''Find all the matching fingerprints for this query'''
results = []
for table in self.tables:
results.extend(table.find_all(query))
return results
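
# A minimal end-to-end sketch (the parameters and documents are illustrative):
#
#   corpus = PyCorpus(6, 3)   # six blocks, tolerate up to 3 differing bits
#   corpus.insert(PyHash('the quick brown fox jumps over the lazy dog'))
#   corpus.insert_bulk([PyHash(doc) for doc in ('first doc', 'second doc')])
#   matches = corpus.find_all(PyHash('the quick brown fox jumps over a lazy dog'))
#   # `matches` contains every stored fingerprint within 3 bits of the query.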