Skip to content

Commit

Permalink
Merge pull request #1467 from tklein23/sort_features_refactoring
Browse files Browse the repository at this point in the history
Last bit of sort_features() refactoring
  • Loading branch information
Soeren Sonnenburg committed Aug 26, 2013
2 parents 2611cb1 + b1e20ea commit 41d7a03
Show file tree
Hide file tree
Showing 5 changed files with 301 additions and 120 deletions.
37 changes: 6 additions & 31 deletions src/shogun/features/streaming/StreamingSparseFeatures.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,38 +233,13 @@ float32_t CStreamingSparseFeatures<T>::compute_squared()
template <class T>
void CStreamingSparseFeatures<T>::sort_features()
{
SGSparseVectorEntry<T>* sf_orig=current_sgvector.features;
int32_t len=current_sgvector.num_feat_entries;
SGSparseVectorEntry<T>* old_ptr = current_sgvector.features;

ASSERT(sf_orig)

int32_t* feat_idx=SG_MALLOC(int32_t, len);
int32_t* orig_idx=SG_MALLOC(int32_t, len);

for (int32_t i=0; i<len; i++)
{
feat_idx[i]=sf_orig[i].feat_index;
orig_idx[i]=i;
}

CMath::qsort_index(feat_idx, orig_idx, len);

SGSparseVectorEntry<T>* sf_new=SG_MALLOC(SGSparseVectorEntry<T>, len);

for (int32_t i=0; i<len; i++)
sf_new[i]=sf_orig[orig_idx[i]];

// sanity check
for (int32_t i=0; i<len-1; i++)
ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index)

// Copy new vector back to original
for (int32_t i=0; i<len; i++)
sf_orig[i]=sf_new[i];

SG_FREE(orig_idx);
SG_FREE(feat_idx);
SG_FREE(sf_new);
// passing stable_pointer=true to disallow reallocation
// and guarantee stable get_vector().features pointer
get_vector().sort_features(true);

// verify that sorting left the features buffer at its original address
ASSERT(old_ptr == current_sgvector.features);
}

template <class T>
Expand Down
4 changes: 2 additions & 2 deletions src/shogun/features/streaming/StreamingSparseFeatures.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,8 @@ template <class T> class CStreamingSparseFeatures : public CStreamingDotFeatures

/**
* Ensure features of the current vector are in ascending order.
* It modifies the current_vector in-place, though a temporary
* vector is created and later freed.
* It modifies the current_sgvector in-place and does not change
* the reference in current_sgvector.features.
*/
void sort_features();

Expand Down
86 changes: 57 additions & 29 deletions src/shogun/lib/SGSparseVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,59 +162,87 @@ int32_t SGSparseVector<T>::get_num_dimensions()
}

template<class T>
void SGSparseVector<T>::sort_features()
void SGSparseVector<T>::sort_features(bool stable_pointer)
{
if (!num_feat_entries)
return;

SGSparseVectorEntry<T>* sf_orig=features;
int32_t* feat_idx=SG_MALLOC(int32_t, num_feat_entries);
int32_t* orig_idx=SG_MALLOC(int32_t, num_feat_entries);
// remember old pointer to enforce guarantee
const SGSparseVectorEntry<T>* old_features_ptr = features;

for (int j=0; j<num_feat_entries; j++)
int32_t* feat_idx=SG_MALLOC(int32_t, num_feat_entries);
for (index_t j=0; j<num_feat_entries; j++)
{
feat_idx[j]=sf_orig[j].feat_index;
orig_idx[j]=j;
feat_idx[j]=features[j].feat_index;
}

CMath::qsort_index(feat_idx, orig_idx, num_feat_entries);
CMath::qsort_index(feat_idx, features, num_feat_entries);
SG_FREE(feat_idx);

SGSparseVectorEntry<T>* sf_new= SG_MALLOC(SGSparseVectorEntry<T>, num_feat_entries);
for (index_t j=1; j<num_feat_entries; j++)
{
REQUIRE(features[j-1].feat_index <= features[j].feat_index,
"sort_features(): failed sanity check %d <= %d after sorting (comparing indices features[%d] <= features[%d], features=%d)\n",
features[j-1].feat_index, features[j].feat_index, j-1, j, num_feat_entries);
}

// compression: merging duplicates (features with same index)
// compression: removing zero-entries and merging features with same index
int32_t last_index = 0;
sf_new[last_index] = sf_orig[orig_idx[last_index]];

for (int32_t i = 1; i < num_feat_entries; i++)
for (index_t j=1; j<num_feat_entries; j++)
{
if (sf_new[last_index].feat_index == sf_orig[orig_idx[i]].feat_index)
// always true, but kept for future changes
REQUIRE(last_index < j,
"sort_features(): target index %d must not exceed source index j=%d",
last_index, j);
REQUIRE(features[last_index].feat_index <= features[j].feat_index,
"sort_features(): failed sanity check %d = features[%d].feat_index <= features[%d].feat_index = %d\n",
features[last_index].feat_index, last_index, j, features[j].feat_index);

// merging of features with same index
if (features[last_index].feat_index == features[j].feat_index)
{
sf_new[last_index].entry += sf_orig[orig_idx[i]].entry;
features[last_index].entry += features[j].entry;
continue;
}
else

// only skip to next element if current one is not zero
if (features[last_index].entry != 0.0)
{
last_index++;
sf_new[last_index] = sf_orig[orig_idx[i]];
}

features[last_index] = features[j];
}

REQUIRE(last_index < num_feat_entries, "sort_features(): last_index=%d must not exceed num_feat_entries=%d\n",
last_index, num_feat_entries);
// drop the last kept entry if it is zero: the loop only tests an entry
// for zero when a later entry with a different index arrives, so the
// final one (or the only one, for a single-element vector) is unchecked
if (features[last_index].entry == 0.0)
{
last_index--;
}

SG_FREE(orig_idx);
SG_FREE(feat_idx);
SG_FREE(sf_orig);
int32_t new_feat_count = last_index+1;
ASSERT(new_feat_count <= num_feat_entries);

features = SG_REALLOC(SGSparseVectorEntry<T>, sf_new, num_feat_entries, last_index+1);
num_feat_entries = last_index+1;
// shrinking vector (only when the caller allows the features pointer to change)
if (!stable_pointer)
{
SG_SINFO("shrinking vector from %d to %d\n", num_feat_entries, new_feat_count);
features = SG_REALLOC(SGSparseVectorEntry<T>, features, num_feat_entries, new_feat_count);
}
num_feat_entries = new_feat_count;

// sanity check: strict sort order (assuming no duplicates)
for (int j=0; j<num_feat_entries-1; j++)
for (index_t j=1; j<num_feat_entries; j++)
{
REQUIRE(features[j].feat_index < features[j+1].feat_index,
"sort_features(): failed sanity check %d <= %d after sorting (comparing indices sf_new[%d] <= sf_new[%d], features=%d)\n",
features[j].feat_index, features[j+1].feat_index, j, j+1, num_feat_entries);
REQUIRE(features[j-1].feat_index < features[j].feat_index,
"sort_features(): failed sanity check %d < %d after sorting (comparing indices features[%d] < features[%d], features=%d)\n",
features[j-1].feat_index, features[j].feat_index, j-1, j, num_feat_entries);
}

// compare with old pointer to enforce guarantee
if (stable_pointer) {
ASSERT(old_features_ptr == features);
}
return;
}

template<class T>
Expand Down
9 changes: 7 additions & 2 deletions src/shogun/lib/SGSparseVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,14 @@ template <class T> class SGSparseVector : public SGReferencedData
int32_t get_num_dimensions();

/**
* sort features by indices
* sort features by indices (Set stable_pointer=true to
* guarantee that the features pointer does not change. On the
* other hand, stable_pointer=false allows the vector to be
* shrunk if possible.)
*
* @param stable_pointer (default false) enforce stable pointer
*/
void sort_features();
void sort_features(bool stable_pointer = false);

/**
* get feature value for index
Expand Down
Loading

0 comments on commit 41d7a03

Please sign in to comment.