From 97760bf74a9aa941678f51553adddcb36d69d037 Mon Sep 17 00:00:00 2001 From: Sergey Lisitsyn Date: Thu, 23 May 2013 11:29:50 +0400 Subject: [PATCH] Update tapkee with manifold sculpting merge and 'external' randomization routines --- src/shogun/lib/tapkee/defines.hpp | 1 + src/shogun/lib/tapkee/defines/random.hpp | 66 ++++++++ .../tapkee/external/barnes_hut_sne/tsne.hpp | 15 +- .../tapkee/external/barnes_hut_sne/vptree.hpp | 2 +- src/shogun/lib/tapkee/methods.hpp | 8 +- src/shogun/lib/tapkee/neighbors/vptree.hpp | 2 +- src/shogun/lib/tapkee/parameters/defaults.hpp | 2 + .../tapkee/routines/eigendecomposition.hpp | 17 +- .../tapkee/routines/manifold_sculpting.hpp | 155 +++++++++--------- .../routines/multidimensional_scaling.hpp | 4 +- .../lib/tapkee/routines/random_projection.hpp | 5 +- src/shogun/lib/tapkee/routines/spe.hpp | 10 +- src/shogun/lib/tapkee/tapkee_shogun.cpp | 4 + src/shogun/lib/tapkee/utils/features.hpp | 2 +- src/shogun/lib/tapkee/utils/naming.hpp | 2 +- src/shogun/lib/tapkee/utils/sparse.hpp | 2 +- 16 files changed, 172 insertions(+), 125 deletions(-) create mode 100644 src/shogun/lib/tapkee/defines/random.hpp diff --git a/src/shogun/lib/tapkee/defines.hpp b/src/shogun/lib/tapkee/defines.hpp index c80edf01cf3..fd02636bcea 100644 --- a/src/shogun/lib/tapkee/defines.hpp +++ b/src/shogun/lib/tapkee/defines.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include /* End of Tapkee includes */ diff --git a/src/shogun/lib/tapkee/defines/random.hpp b/src/shogun/lib/tapkee/defines/random.hpp new file mode 100644 index 00000000000..a052615e4d8 --- /dev/null +++ b/src/shogun/lib/tapkee/defines/random.hpp @@ -0,0 +1,66 @@ +/* This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Copyright (c) 2012-2013 Sergey Lisitsyn + */ + +#ifndef TAPKEE_DEFINES_RANDOM_H_ +#define TAPKEE_DEFINES_RANDOM_H_ + +#include +#include + +namespace tapkee +{ + +IndexType uniform_random_index() +{ +#ifdef CUSTOM_UNIFORM_RANDOM_INDEX_FUNCTION + return CUSTOM_UNIFORM_RANDOM_INDEX_FUNCTION; +#else + return rand(); +#endif +} + +IndexType uniform_random_index_bounded(IndexType upper) +{ + return uniform_random_index() % upper; +} + +ScalarType uniform_random() +{ +#ifdef CUSTOM_UNIFORM_RANDOM_FUNCTION + return CUSTOM_UNIFORM_RANDOM_FUNCTION; +#else + return rand()/static_cast(RAND_MAX); +#endif +} + +ScalarType gaussian_random() +{ +#ifdef CUSTOM_GAUSSIAN_RANDOM_FUNCTION + return CUSTOM_GAUSSIAN_RANDOM_FUNCTION; +#else + ScalarType x, y, radius; + do { + x = 2 * (rand() / ((double) RAND_MAX + 1)) - 1; + y = 2 * (rand() / ((double) RAND_MAX + 1)) - 1; + radius = (x * x) + (y * y); + } while ((radius >= 1.0) || (radius == 0.0)); + radius = sqrt(-2 * log(radius) / radius); + x *= radius; + y *= radius; + return x; +#endif +} + +template +void random_shuffle(RAI first, RAI last) +{ + std::random_shuffle(first,last,uniform_random_index_bounded); +} + +} + +#endif + + diff --git a/src/shogun/lib/tapkee/external/barnes_hut_sne/tsne.hpp b/src/shogun/lib/tapkee/external/barnes_hut_sne/tsne.hpp index 17c8c2dc2d3..76544829396 100644 --- a/src/shogun/lib/tapkee/external/barnes_hut_sne/tsne.hpp +++ b/src/shogun/lib/tapkee/external/barnes_hut_sne/tsne.hpp @@ -129,7 +129,7 @@ class TSNE else { for(int i = 0; i < row_P[N]; i++) val_P[i] *= 12.0; } // Initialize solution (randomly) - for(int i = 0; i < N * no_dims; i++) Y[i] = randn() * .0001; + for(int i = 0; i < N * no_dims; i++) Y[i] = tapkee::gaussian_random() * .0001; } { @@ -777,19 +777,6 @@ class TSNE free(dataSums); dataSums = NULL; } - double randn() - { - double x, y, radius; - do { - x = 2 * (rand() / ((double) RAND_MAX + 1)) - 1; - y = 2 * (rand() / ((double) RAND_MAX + 1)) - 1; - radius = (x * x) + (y * y); - } while((radius >= 1.0) || (radius == 0.0)); - radius = sqrt(-2 * log(radius) / radius); - x *= radius; - y *= radius; - return x; - } }; } diff --git a/src/shogun/lib/tapkee/external/barnes_hut_sne/vptree.hpp b/src/shogun/lib/tapkee/external/barnes_hut_sne/vptree.hpp index 367ce4218ea..eaa4f76eb0c 100644 --- a/src/shogun/lib/tapkee/external/barnes_hut_sne/vptree.hpp +++ b/src/shogun/lib/tapkee/external/barnes_hut_sne/vptree.hpp @@ -199,7 +199,7 @@ class VpTree if (upper - lower > 1) { // if we did not arrive at leaf yet // Choose an arbitrary point and move it to the start - int i = (int) ((double)rand() / RAND_MAX * (upper - lower - 1)) + lower; + int i = (int) (tapkee::uniform_random() * (upper - lower - 1)) + lower; std::swap(_items[lower], _items[i]); // Partition around the median distance diff --git a/src/shogun/lib/tapkee/methods.hpp b/src/shogun/lib/tapkee/methods.hpp index 5eddc2075d4..68ca1aae82a 100644 --- a/src/shogun/lib/tapkee/methods.hpp +++ b/src/shogun/lib/tapkee/methods.hpp @@ -497,15 +497,15 @@ class ImplementationBase TapkeeOutput embedManifoldSculpting() { squishing_rate.checked() - .inRange(static_cast(0.0), static_cast(1.0)); + .inRange(static_cast(0.0), + static_cast(1.0)); DenseMatrix embedding = dense_matrix_from_features(features, begin, end); - Neighbors neighbors = find_neighbors(neighbors_method,begin,end,plain_distance, - n_neighbors,check_connectivity); + Neighbors neighbors = findNeighborsWith(plain_distance); - manifold_sculpting_embed(embedding, target_dimension, neighbors, distance, max_iteration, squishing_rate); + manifold_sculpting_embed(begin, end, embedding, target_dimension, neighbors, distance, max_iteration, squishing_rate); return TapkeeOutput(embedding, tapkee::ProjectingFunction()); } diff --git a/src/shogun/lib/tapkee/neighbors/vptree.hpp b/src/shogun/lib/tapkee/neighbors/vptree.hpp index 8dabbb2fdab..ed1ad92d832 100644 --- a/src/shogun/lib/tapkee/neighbors/vptree.hpp +++ b/src/shogun/lib/tapkee/neighbors/vptree.hpp @@ -161,7 +161,7 @@ class VantagePointTree if (upper - lower > 1) { - int i = (int) ((double)rand() / RAND_MAX * (upper - lower - 1)) + lower; + int i = (int) (tapkee::uniform_random() * (upper - lower - 1)) + lower; std::swap(items[lower], items[i]); int median = (upper + lower) / 2; diff --git a/src/shogun/lib/tapkee/parameters/defaults.hpp b/src/shogun/lib/tapkee/parameters/defaults.hpp index 6976c0e15b6..5e8df3b0bfb 100644 --- a/src/shogun/lib/tapkee/parameters/defaults.hpp +++ b/src/shogun/lib/tapkee/parameters/defaults.hpp @@ -31,7 +31,9 @@ const ParametersSet defaults = ( tapkee::keywords::progress_function = tapkee::keywords::by_default, tapkee::keywords::cancel_function = tapkee::keywords::by_default, tapkee::keywords::sne_perplexity = tapkee::keywords::by_default, + tapkee::keywords::squishing_rate = tapkee::keywords::by_default, tapkee::keywords::sne_theta = tapkee::keywords::by_default); + } } diff --git a/src/shogun/lib/tapkee/routines/eigendecomposition.hpp b/src/shogun/lib/tapkee/routines/eigendecomposition.hpp index 1c8483d9e81..e2517bb39e2 100644 --- a/src/shogun/lib/tapkee/routines/eigendecomposition.hpp +++ b/src/shogun/lib/tapkee/routines/eigendecomposition.hpp @@ -90,21 +90,10 @@ EigendecompositionResult eigendecomposition_impl_randomized(const MatrixType& wm DenseMatrix O(wm.rows(), target_dimension+skip); for (IndexType i=0; i -SparseMatrix neighbors_distances_matrix(const Neighbors& neighbors, const DistanceCallback& callback, - ScalarType& average_distance) +template +SparseMatrix neighbors_distances_matrix(RandomAccessIterator begin, RandomAccessIterator end, + const Neighbors& neighbors, DistanceCallback callback, + ScalarType& average_distance) { const IndexType k = neighbors[0].size(); const IndexType n = neighbors.size(); + assert((end-begin)==n); SparseTriplets sparse_triplets; sparse_triplets.reserve(k*n); average_distance = 0; @@ -76,7 +81,7 @@ SparseMatrix neighbors_distances_matrix(const Neighbors& neighbors, const Distan const LocalNeighbors& current_neighbors = neighbors[i]; for (IndexType j = 0; j < k; ++j) { - current_distance = callback.distance(i, current_neighbors[j]); + current_distance = callback.distance(begin[i], begin[current_neighbors[j]]); average_distance += current_distance; SparseTriplet triplet(i, current_neighbors[j], current_distance); sparse_triplets.push_back(triplet); @@ -249,91 +254,95 @@ IndexType adjust_point_at_index(const IndexType index, DenseMatrix& data, return n_steps; } -template -void manifold_sculpting_embed(DenseMatrix& data, const IndexType target_dimension, - const Neighbors& neighbors, const DistanceCallback& callback, - const IndexType max_iteration, const ScalarType squishingRate) +template +void manifold_sculpting_embed(RandomAccessIterator begin, RandomAccessIterator end, + DenseMatrix& data, IndexType target_dimension, + const Neighbors& neighbors, DistanceCallback callback, + IndexType max_iteration, ScalarType squishing_rate) { /* Step 1: Get initial distances to each neighbor and initial * angles between the point Pi, each neighbor Nij, and the most * collinear neighbor of Nij. */ ScalarType initial_average_distance; - SparseMatrix distances_to_neighbors = neighbors_distances_matrix(neighbors, callback, initial_average_distance); + SparseMatrix distances_to_neighbors = + neighbors_distances_matrix(begin, end, neighbors, callback, initial_average_distance); SparseMatrixNeighborsPair angles_and_neighbors = - angles_matrix_and_neighbors(neighbors, data); + angles_matrix_and_neighbors(neighbors, data); + /* Step 2: Optionally preprocess the data using PCA * (skipped for now). */ - ScalarType no_improvement_counter = 0, normal_counter = 0; - ScalarType current_multiplier = squishingRate; + ScalarType current_multiplier = squishing_rate; ScalarType learning_rate = initial_average_distance; ScalarType best_error = DBL_MAX, current_error, point_error; - std::srand(static_cast(std::time(NULL))); /* Step 3: Do until no improvement is made for some period * (or until max_iteration number is reached): */ while (((no_improvement_counter++ < max_number_of_iterations_without_improvement) - || (current_multiplier > multiplier_treshold)) - && (normal_counter++ < max_iteration)) + || (current_multiplier > multiplier_treshold)) + && (normal_counter++ < max_iteration)) { - /* Step 3a: Scale the data in non-preserved dimensions - * by a factor of squishingRate. - */ - data.bottomRows(data.rows() - target_dimension) *= squishingRate; - while ( average_neighbor_distance(data, neighbors) < initial_average_distance) - { - data.topRows(target_dimension) /= squishingRate; - } - current_multiplier *= squishingRate; - /* Step 3b: Restore the previously computed relationships - * (distances to neighbors and angles to ...) by adjusting - * data points in first target_dimension dimensions. - */ - /* Start adjusting from a random point */ - IndexType start_point_index = std::rand() % data.cols(); - std::deque points_to_adjust; - points_to_adjust.push_back(start_point_index); - ScalarType steps_made = 0; - current_error = 0; - std::set adjusted_points; + /* Step 3a: Scale the data in non-preserved dimensions + * by a factor of squishing_rate. + */ + data.bottomRows(data.rows() - target_dimension) *= squishing_rate; + while (average_neighbor_distance(data, neighbors) < initial_average_distance) + { + data.topRows(target_dimension) /= squishing_rate; + } + current_multiplier *= squishing_rate; - while (!points_to_adjust.empty()) - { - IndexType current_point_index = points_to_adjust.front(); - points_to_adjust.pop_front(); - if (adjusted_points.count(current_point_index) == 0) - { - DataForErrorFunc error_func_data = { - distances_to_neighbors, - angles_and_neighbors.first, - neighbors, - angles_and_neighbors.second, - adjusted_points, - initial_average_distance - }; - adjust_point_at_index(current_point_index, data, target_dimension, - learning_rate, error_func_data, point_error); - current_error += point_error; - /* Insert all neighbors into deque */ - std::copy(neighbors[current_point_index].begin(), - neighbors[current_point_index].end(), - std::back_inserter(points_to_adjust)); - adjusted_points.insert(current_point_index); - } - } + /* Step 3b: Restore the previously computed relationships + * (distances to neighbors and angles to ...) by adjusting + * data points in first target_dimension dimensions. + */ + /* Start adjusting from a random point */ + IndexType start_point_index = std::rand() % data.cols(); + std::deque points_to_adjust; + points_to_adjust.push_back(start_point_index); + ScalarType steps_made = 0; + current_error = 0; + std::set adjusted_points; - if (steps_made > data.cols()) - learning_rate *= learning_rate_grow_factor; - else - learning_rate *= learning_rate_shrink_factor; - if (current_error < best_error) - { - best_error = current_error; - no_improvement_counter = 0; - } + while (!points_to_adjust.empty()) + { + IndexType current_point_index = points_to_adjust.front(); + points_to_adjust.pop_front(); + if (adjusted_points.count(current_point_index) == 0) + { + DataForErrorFunc error_func_data = { + distances_to_neighbors, + angles_and_neighbors.first, + neighbors, + angles_and_neighbors.second, + adjusted_points, + initial_average_distance + }; + adjust_point_at_index(current_point_index, data, target_dimension, + learning_rate, error_func_data, point_error); + current_error += point_error; + /* Insert all neighbors into deque */ + std::copy(neighbors[current_point_index].begin(), + neighbors[current_point_index].end(), + std::back_inserter(points_to_adjust)); + adjusted_points.insert(current_point_index); + } + } + + if (steps_made > data.cols()) + learning_rate *= learning_rate_grow_factor; + else + learning_rate *= learning_rate_shrink_factor; + + if (current_error < best_error) + { + best_error = current_error; + no_improvement_counter = 0; + } } + data.conservativeResize(target_dimension, Eigen::NoChange); data.transposeInPlace(); } diff --git a/src/shogun/lib/tapkee/routines/multidimensional_scaling.hpp b/src/shogun/lib/tapkee/routines/multidimensional_scaling.hpp index 717dae7a98e..f2b040bf3b7 100644 --- a/src/shogun/lib/tapkee/routines/multidimensional_scaling.hpp +++ b/src/shogun/lib/tapkee/routines/multidimensional_scaling.hpp @@ -11,8 +11,6 @@ #include /* End of Tapkee includes */ -#include - namespace tapkee { namespace tapkee_internal @@ -25,7 +23,7 @@ Landmarks select_landmarks_random(RandomAccessIterator begin, RandomAccessIterat landmarks.reserve(end-begin); for (RandomAccessIterator iter=begin; iter!=end; ++iter) landmarks.push_back(iter-begin); - std::random_shuffle(landmarks.begin(),landmarks.end()); + tapkee::random_shuffle(landmarks.begin(),landmarks.end()); landmarks.erase(landmarks.begin() + static_cast(landmarks.size()*ratio),landmarks.end()); return landmarks; } diff --git a/src/shogun/lib/tapkee/routines/random_projection.hpp b/src/shogun/lib/tapkee/routines/random_projection.hpp index f2f415535fb..4aae35c9e4e 100644 --- a/src/shogun/lib/tapkee/routines/random_projection.hpp +++ b/src/shogun/lib/tapkee/routines/random_projection.hpp @@ -24,10 +24,7 @@ DenseMatrix gaussian_projection_matrix(IndexType target_dimension, IndexType cur { for (IndexType j=0; j /* End of Tapkee includes */ -#include -#include -#include - namespace tapkee { namespace tapkee_internal @@ -52,8 +48,6 @@ DenseMatrix spe_embedding(RandomAccessIterator begin, RandomAccessIterator end, alpha = 1.0 / max * std::sqrt(2.0); // Random embedding initialization, Y is the short for embedding_feature_matrix - // TODO handle this somewhere else - std::srand(static_cast(std::time(0))); DenseMatrix Y = (DenseMatrix::Random(target_dimension,N) + DenseMatrix::Ones(target_dimension,N)) / 2; // Auxiliary diffference embedding feature matrix @@ -93,7 +87,7 @@ DenseMatrix spe_embedding(RandomAccessIterator begin, RandomAccessIterator end, for (IndexType i=0; i(floor(std::rand()*1.0/RAND_MAX*(k-1)) + k*j); + IndexType r = static_cast(floor(tapkee::uniform_random()*(k-1)) + k*j); indices[nupdates+j] = ind1Neighbors[r]; } } diff --git a/src/shogun/lib/tapkee/tapkee_shogun.cpp b/src/shogun/lib/tapkee/tapkee_shogun.cpp index bea80970952..07b393f094a 100644 --- a/src/shogun/lib/tapkee/tapkee_shogun.cpp +++ b/src/shogun/lib/tapkee/tapkee_shogun.cpp @@ -11,7 +11,11 @@ #ifdef HAVE_EIGEN3 +#define CUSTOM_UNIFORM_RANDOM_INDEX_FUNCTION shogun::CMath::random() +#define CUSTOM_UNIFORM_RANDOM_FUNCTION shogun::CMath::random(static_cast(0),static_cast(1)) +#define CUSTOM_GAUSSIAN_RANDOM_FUNCTION shogun::CMath::normal_random(static_cast(0),static_cast(1)) #define TAPKEE_EIGEN_INCLUDE_FILE + #ifdef HAVE_ARPACK #define TAPKEE_WITH_ARPACK #endif diff --git a/src/shogun/lib/tapkee/utils/features.hpp b/src/shogun/lib/tapkee/utils/features.hpp index f11fc4880c9..2d8d4a27908 100644 --- a/src/shogun/lib/tapkee/utils/features.hpp +++ b/src/shogun/lib/tapkee/utils/features.hpp @@ -35,4 +35,4 @@ DenseMatrix dense_matrix_from_features(const FeaturesCallback& features, } } -#endif \ No newline at end of file +#endif diff --git a/src/shogun/lib/tapkee/utils/naming.hpp b/src/shogun/lib/tapkee/utils/naming.hpp index d705313d23c..de1663a6a40 100644 --- a/src/shogun/lib/tapkee/utils/naming.hpp +++ b/src/shogun/lib/tapkee/utils/naming.hpp @@ -33,7 +33,7 @@ std::string get_method_name(DimensionReductionMethod m) case RandomProjection: return "Random Projection"; case FactorAnalysis: return "Factor Analysis"; case tDistributedStochasticNeighborEmbedding: return "t-distributed Stochastic Neighbor Embedding"; - case ManifoldSculpting: return "Manifold Sculpting"; + case ManifoldSculpting: return "manifold sculpting"; } return "hello"; } diff --git a/src/shogun/lib/tapkee/utils/sparse.hpp b/src/shogun/lib/tapkee/utils/sparse.hpp index cf5fe611496..12c8fb55785 100644 --- a/src/shogun/lib/tapkee/utils/sparse.hpp +++ b/src/shogun/lib/tapkee/utils/sparse.hpp @@ -33,4 +33,4 @@ SparseMatrix sparse_matrix_from_triplets(const SparseTriplets& sparse_triplets, } } -#endif \ No newline at end of file +#endif