Permalink
Browse files

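Bug fixes and tests: return the collected results from the PyCorpus lookup, rename the `find_all` test to `test_find_all`, and move the Cython declarations from simhash/table.pyx into simhash/table.pxd.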

  • Loading branch information...
1 parent 65024a5 commit 04f8322c76d1f81cd5b1008218f80f05d21eafae Dan Lecocq committed Jun 7, 2012
Showing with 36 additions and 34 deletions.
  1. +2 −1 .gitignore
  2. +3 −3 setup.py
  3. +29 −0 simhash/table.pxd
  4. +1 −29 simhash/table.pyx
  5. +1 −1 test.py
View
@@ -3,4 +3,5 @@
driver
build/*
*.cpp
-*.so
+*.so
+*.pyc
View
@@ -8,15 +8,15 @@
'simhash/table.pyx',
'simhash/cpp/simhash.cpp',
'simhash/cpp/hash.cpp',
- 'simhash/cpp/util.cpp'], language='c++', libraries = ['Judy'])]
+ 'simhash/cpp/util.cpp'], language='c++', libraries = ['Judy']),
+]
setup(name = 'simhash',
version = '0.1.0',
description = 'Near-Duplicate Detection with Simhash',
url = 'http://github.com/seomoz/simhash-py',
author = 'Dan Lecocq',
author_email = 'dan@seomoz.org',
- keywords = 'freshscape',
packages = ['simhash'],
package_dir = {'simhash': 'simhash'},
cmdclass = {'build_ext': build_ext},
@@ -28,4 +28,4 @@
'Operating System :: OS Independent',
'Topic :: Internet :: WWW/HTTP'
],
-)
+)
View
@@ -0,0 +1,29 @@
+################################################################################
+# Cython declarations
+################################################################################
+
+from libcpp.vector cimport vector
+
+cdef extern from "stdint.h":
+ ctypedef unsigned long long uint64_t
+ ctypedef unsigned int size_t
+
+cdef extern from "cpp/simhash.h" namespace "Simhash":
+ ctypedef uint64_t hash_t
+ hash_t simhash(char *s, size_t length, size_t window=*)
+ size_t num_differing_bits(hash_t a, hash_t b)
+
+ cdef cppclass Table:
+ Table(size_t d, vector[hash_t]& p)
+
+ #void insert[InputIterator, InputIterator](first, last)
+ void insert(hash_t h)
+
+ #void remove[InputIterator, InputIterator](first, last)
+ void remove(hash_t h)
+ hash_t find(hash_t h)
+ void find(hash_t h, vector[hash_t]& results)
+
+ hash_t permute(hash_t h)
+ hash_t unpermute(hash_t h)
+
View
@@ -1,32 +1,3 @@
-################################################################################
-# Cython declarations
-################################################################################
-
-from libcpp.vector cimport vector
-
-cdef extern from "stdint.h":
- ctypedef unsigned long long uint64_t
- ctypedef unsigned int size_t
-
-cdef extern from "cpp/simhash.h" namespace "Simhash":
- ctypedef uint64_t hash_t
- hash_t simhash(char *s, size_t length, size_t window=4)
- size_t num_differing_bits(hash_t a, hash_t b)
-
- cdef cppclass Table:
- Table(size_t d, vector[hash_t]& p)
-
- #void insert[InputIterator, InputIterator](first, last)
- void insert(hash_t h)
-
- #void remove[InputIterator, InputIterator](first, last)
- void remove(hash_t h)
- hash_t find(hash_t h)
- void find(hash_t h, vector[hash_t]& results)
-
- hash_t permute(hash_t h)
- hash_t unpermute(hash_t h)
-
################################################################################
# Core. If you're looking for insight into the library, look here
################################################################################
@@ -198,3 +169,4 @@ cdef class PyCorpus:
results = []
for table in self.tables:
results.extend(table.find_all(query))
+ return results
View
@@ -36,7 +36,7 @@ def test_insert(self):
else:
self.assertEqual(self.corpus.find_first(i), 0)
- def find_all(self):
+ def test_find_all(self):
# We should be able to find /all/ the fingerprints that we expect. This
# also tests that results given back are equal to the original
# fingerprints and not the permuted one.

0 comments on commit 04f8322

Please sign in to comment.