Gaussian Random Projection is used in the LSHForest class.

Removed lshashing from feature extraction and added that functionality to the LSHForest class. If other hashing algorithms are to be implemented, a separate lshashing class may be required.
scikit-learn · Jul 13, 2014 · eb4852d · eb4852d
1 parent a9b49bb
commit eb4852d
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 125 deletions.
diff --git a/sklearn/feature_extraction/lshashing.py b/sklearn/feature_extraction/lshashing.py
diff --git a/sklearn/neighbors/lsh_forest.py b/sklearn/neighbors/lsh_forest.py
@@ -7,7 +7,9 @@
 import numpy as np
 from ..base import BaseEstimator
 from ..utils.validation import safe_asarray
-from ..feature_extraction.lshashing import RandomProjections
+from ..utils import check_random_state
+
+from ..random_projection import GaussianRandomProjection
 
 __all__ = ["LSHForest"]
 
@@ -106,7 +108,7 @@ class LSHForest(BaseEstimator):
     Attributes
     ----------
 
-    `hash_functions`: list of arrays
+    `hash_functions_`: list of arrays
         Randomly generated LS hash function g(p,x) for each tree.
 
 
@@ -155,26 +157,53 @@ def __init__(self, max_label_length=32, n_trees=10,
         self.n_neighbors = n_neighbors
         self.lower_bound = lower_bound
 
-    def _select_hashing_algorithm(self, n_dim, hash_size):
-        """ Selectes the LSH algorithm """
-        if n_dim is None or hash_size is None:
-            raise ValueError("n_dim or hash_size cannot be None.")
+    def _generate_hash_function(self):
+        """
+        Fits a `GaussianRandomProjections` with `n_components=hash_size
+        and n_features=n_dim.
+        """
+        random_state = check_random_state(self.random_state)
+        grp = GaussianRandomProjection(n_components=self.max_label_length,
+                                       random_state=random_state.randint(0,
+                                                                         10))
+        X = np.zeros((2, self._n_dim), dtype=float)
+        grp.fit(X)
+        return grp
+
+    def _do_hash(self, input_array=None):
+        """
+        Does hashing on an array of data points.
+        This creates a binary hash by getting the dot product of
+        input_point and hash_function then transforming the projection
+        into a binary string array based on the sign(positive/negative)
+        of the projection.
 
-        if self.hashing_algorithm == 'random_projections':
-            return RandomProjections(n_dim=n_dim,
-                                     hash_size=hash_size,
-                                     random_state=self.random_state)
-        else:
-            raise ValueError("Unknown hashing algorithm: %s"
-                             % (self.hashing_algorithm))
+        Parameters
+        ----------
+
+        input_array: array_like, shape (n_samples, n_features)
+            A matrix of dimensions (n_samples, n_features), which is being
+            hashed.
+        """
+        if input_array is None:
+            raise ValueError("input_array cannot be None.")
+
+        grp = self._generate_hash_function()
+        res = grp.transform(input_array)
+
+        bin_hashes = np.empty(res.shape[0],
+                              dtype='|S'+str(self.max_label_length))
+        for i in range(res.shape[0]):
+            bin_hashes[i] = "".join(map(str, np.array(res[i] > 0, dtype=int)))
+
+        return bin_hashes, grp.components_
 
     def _create_tree(self):
         """
         Builds a single tree (in this case creates a sorted array of
         binary hashes).
         """
-        binary_hashes, hash_function = self._hash_generator.do_hash(
-            self._input_array)
+        binary_hashes, hash_function = self._do_hash(self._input_array)
 
         return (np.argsort(binary_hashes),
                 np.sort(binary_hashes), hash_function)
@@ -198,10 +227,7 @@ def fit(self, X=None):
             raise ValueError("X cannot be None")
 
         self._input_array = safe_asarray(X)
-        n_dim = self._input_array.shape[1]
-
-        self._hash_generator = self._select_hashing_algorithm(
-            n_dim, self.max_label_length)
+        self._n_dim = self._input_array.shape[1]
 
         # Creates a g(p,x) for each tree
         self.hash_functions_ = []