Merge pull request #1225 from iglesias/feature/knn_public_neighbors
Feature/knn public neighbors
iglesias committed Jul 9, 2013
2 parents fd2ba03 + fb0c239 commit 3c8ea00
Showing 4 changed files with 188 additions and 117 deletions.
1 change: 1 addition & 0 deletions examples/undocumented/libshogun/Makefile
@@ -27,6 +27,7 @@ TARGETS = basic_minimal \
 	classifier_qda \
 	classifier_lda \
 	classifier_multiclasslinearmachine \
+	classifier_knn \
 	kernel_gaussian kernel_revlin kernel_custom\
 	library_dyn_int library_gc_array library_indirect_object \
 	library_hash parameter_set_from_parameters \
60 changes: 60 additions & 0 deletions examples/undocumented/libshogun/classifier_knn.cpp
@@ -0,0 +1,60 @@
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2013 Fernando J. Iglesias García
 */

#include <shogun/base/init.h>
#include <shogun/distance/EuclideanDistance.h>
#include <shogun/features/DataGenerator.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/LabelsFactory.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/multiclass/KNN.h>

using namespace shogun;

#define NUM 10
#define DIMS 2
#define CLASSES 4
#define k 3

int main(int, char*[])
{
	init_shogun_with_defaults();

#ifdef HAVE_LAPACK /* because of CDataGenerator::generate_gaussians */

	// Labels and features containers
	SGVector<float64_t> lab(CLASSES*NUM);
	SGMatrix<float64_t> feat(DIMS, CLASSES*NUM);
	// Random generation of features
	feat = CDataGenerator::generate_gaussians(NUM,CLASSES,DIMS);
	// Labels
	for (int32_t i = 0; i < CLASSES; ++i)
		for (int32_t j = 0; j < NUM; ++j)
			lab[i*NUM + j] = i;

	// Create train labels
	CMulticlassLabels* labels = new CMulticlassLabels(lab);
	// Create train features
	CDenseFeatures<float64_t>* features = new CDenseFeatures<float64_t>(feat);

	// Create KNN classifier
	CKNN* knn = new CKNN(k, new CEuclideanDistance(features, features), labels);
	// Train classifier
	knn->train();
	// Apply classifier
	CMulticlassLabels* output = CLabelsFactory::to_multiclass( knn->apply() );
	SGMatrix<int32_t> multiple_k_output = knn->classify_for_multiple_k();

	// Free memory
	SG_UNREF(knn)
	SG_UNREF(output)

#endif /* HAVE_LAPACK */

	exit_shogun();
	return 0;
}
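The example computes multiple_k_output but never reads it. A minimal, hypothetical follow-up (not part of the commit) could check the per-k predictions against the training labels; as the KNN.cpp changes below show, classify_for_multiple_k() returns a matrix whose entry (i, j) holds the prediction for example i using j+1 neighbors:

	// Hypothetical check, reusing the variables above: training accuracy
	// for each number of neighbors from 1 up to k.
	for (int32_t j = 0; j < k; ++j)
	{
		int32_t correct = 0;
		for (int32_t i = 0; i < CLASSES*NUM; ++i)
		{
			if (multiple_k_output(i, j) == int32_t(lab[i]))
				++correct;
		}
		SG_SPRINT("accuracy with %d neighbors: %.2f\n", j+1,
				correct/float64_t(CLASSES*NUM))
	}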
215 changes: 100 additions & 115 deletions src/shogun/multiclass/KNN.cpp
@@ -107,12 +107,56 @@ bool CKNN::train_machine(CFeatures* data)
 	return true;
 }
 
+SGMatrix<int32_t> CKNN::nearest_neighbors()
+{
+	//number of examples to which kNN is applied
+	int32_t n=distance->get_num_vec_rhs();
+	//distances to train data
+	float64_t* dists=SG_MALLOC(float64_t, m_train_labels.vlen);
+	//indices to train data
+	int32_t* train_idxs=SG_MALLOC(int32_t, m_train_labels.vlen);
+	//pre-allocation of the nearest neighbors
+	SGMatrix<int32_t> NN(m_k, n);
+
+	//for each test example
+	for (int32_t i=0; i<n && (!CSignal::cancel_computations()); i++)
+	{
+		SG_PROGRESS(i, 0, n)
+
+		//lhs idx 0..num train examples-1 (i.e., all train examples) and rhs idx i
+		distances_lhs(dists,0,m_train_labels.vlen-1,i);
+
+		//fill in an array with 0..num train examples-1
+		for (int32_t j=0; j<m_train_labels.vlen; j++)
+			train_idxs[j]=j;
+
+		//sort the distance vector between test example i and all train examples
+		CMath::qsort_index(dists, train_idxs, m_train_labels.vlen);
+
+#ifdef DEBUG_KNN
+		SG_PRINT("\nQuick sort query %d\n", i)
+		for (int32_t j=0; j<m_k; j++)
+			SG_PRINT("%d ", train_idxs[j])
+		SG_PRINT("\n")
+#endif
+
+		//fill in the output with the indices of the k nearest neighbors
+		for (int32_t j=0; j<m_k; j++)
+			NN(j,i) = train_idxs[j];
+	}
+
+	SG_FREE(train_idxs);
+	SG_FREE(dists);
+
+	return NN;
+}
+
 CMulticlassLabels* CKNN::apply_multiclass(CFeatures* data)
 {
 	if (data)
 		init_distance(data);
 
-	// redirecting to fast (without sorting) classify if k==1
+	//redirecting to fast (without sorting) classify if k==1
 	if (m_k == 1)
 		return classify_NN();
 
@@ -125,24 +169,13 @@ CMulticlassLabels* CKNN::apply_multiclass(CFeatures* data)
 
 	CMulticlassLabels* output=new CMulticlassLabels(num_lab);
 
-	float64_t* dists = NULL;
-	int32_t* train_lab = NULL;
-
-	//distances to train data and working buffer of m_train_labels
-	if ( ! m_use_covertree )
-	{
-		dists=SG_MALLOC(float64_t, m_train_labels.vlen);
-		train_lab=SG_MALLOC(int32_t, m_train_labels.vlen);
-	}
-	else
-	{
-		train_lab=SG_MALLOC(int32_t, m_k);
-	}
+	//labels of the k nearest neighbors
+	int32_t* train_lab=SG_MALLOC(int32_t, m_k);
 
 	SG_INFO("%d test examples\n", num_lab)
 	CSignal::clear_cancel();
 
-	///histogram of classes and returned output
+	//histogram of classes and returned output
 	float64_t* classes=SG_MALLOC(float64_t, m_num_classes);
 
 #ifdef BENCHMARK_KNN
@@ -152,36 +185,19 @@ CMulticlassLabels* CKNN::apply_multiclass(CFeatures* data)
 
 	if ( ! m_use_covertree )
 	{
+		//get the k nearest neighbors of each example
+		SGMatrix<int32_t> NN = nearest_neighbors();
+
+		//from the indices to the nearest neighbors, compute the class labels
 		for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
 		{
-			SG_PROGRESS(i, 0, num_lab)
-
-#ifdef DEBUG_KNN
-			distances_lhs(dists,0,m_train_labels.vlen-1,i);
-
-			for (int32_t j=0; j<m_train_labels.vlen; j++)
-				train_lab[j]=j;
-
-			CMath::qsort_index(dists, train_lab, m_train_labels.vlen);
-
-			SG_PRINT("\nQuick sort query %d\n", i)
+			//write the labels of the k nearest neighbors from their indices
 			for (int32_t j=0; j<m_k; j++)
-				SG_PRINT("%d ", train_lab[j])
-			SG_PRINT("\n")
-#endif
-
-			//lhs idx 1..n and rhs idx i
-			distances_lhs(dists,0,m_train_labels.vlen-1,i);
-
-			for (int32_t j=0; j<m_train_labels.vlen; j++)
-				train_lab[j]=m_train_labels.vector[j];
-
-			//sort the distance vector for test example j to all
-			//train examples
-			CMath::qsort_index(dists, train_lab, m_train_labels.vlen);
+				train_lab[j] = m_train_labels[ NN(j,i) ];
 
-			// Get the index of the 'nearest' class
+			//get the index of the 'nearest' class
 			int32_t out_idx = choose_class(classes, train_lab);
+			//write the label of 'nearest' in the output
 			output->set_label(i, out_idx + m_min_label);
 		}
 
@@ -266,8 +282,6 @@ CMulticlassLabels* CKNN::apply_multiclass(CFeatures* data)
 
 	SG_FREE(classes);
 	SG_FREE(train_lab);
-	if ( ! m_use_covertree )
-		SG_FREE(dists);
 
 	return output;
 }
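choose_class() is called above but its body lies outside the hunks shown in this diff. For readability, here is a sketch of the histogram-and-argmax it performs, written as a free function and inferred from its call site and from choose_class_for_multiple_k() further down (illustrative only, not the committed implementation):

	// Sketch: count, in classes[], how often each class occurs among the k
	// nearest neighbors, then return the index of the most frequent class.
	int32_t choose_class_sketch(float64_t* classes, const int32_t* train_lab,
			int32_t num_classes, int32_t k)
	{
		memset(classes, 0, sizeof(float64_t)*num_classes);
		for (int32_t j=0; j<k; j++)
			classes[train_lab[j]]++;

		int32_t out_idx=0;
		for (int32_t c=1; c<num_classes; c++)
		{
			if (classes[c] > classes[out_idx])
				out_idx=c;
		}

		return out_idx;
	}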
@@ -328,68 +342,34 @@ SGMatrix<int32_t> CKNN::classify_for_multiple_k()
 
 	int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);
 
-	float64_t* dists;
-	int32_t* train_lab;
-	//distances to train data and working buffer of m_train_labels
-	if ( ! m_use_covertree )
-	{
-		dists=SG_MALLOC(float64_t, m_train_labels.vlen);
-		train_lab=SG_MALLOC(int32_t, m_train_labels.vlen);
-	}
-	else
-	{
-		dists=SG_MALLOC(float64_t, m_k);
-		train_lab=SG_MALLOC(int32_t, m_k);
-	}
+	//working buffer of m_train_labels
+	int32_t* train_lab=SG_MALLOC(int32_t, m_k);
 
-	///histogram of classes and returned output
+	//histogram of classes and returned output
 	int32_t* classes=SG_MALLOC(int32_t, m_num_classes);
 
 	SG_INFO("%d test examples\n", num_lab)
 	CSignal::clear_cancel();
 
 	if ( ! m_use_covertree )
 	{
+		//get the k nearest neighbors of each example
+		SGMatrix<int32_t> NN = nearest_neighbors();
+
 		for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
 		{
-			SG_PROGRESS(i, 0, num_lab)
-
-			// lhs idx 1..n and rhs idx i
-			distances_lhs(dists,0,m_train_labels.vlen-1,i);
-			for (int32_t j=0; j<m_train_labels.vlen; j++)
-				train_lab[j]=m_train_labels.vector[j];
-
-			//sort the distance vector for test example j to all train examples
-			//classes[1..k] then holds the classes for minimum distance
-			CMath::qsort_index(dists, train_lab, m_train_labels.vlen);
-
-			//compute histogram of class outputs of the first k nearest
-			//neighbours
-			for (int32_t j=0; j<m_num_classes; j++)
-				classes[j]=0;
-
+			//write the labels of the k nearest neighbors from their indices
 			for (int32_t j=0; j<m_k; j++)
-			{
-				classes[train_lab[j]]++;
-
-				//choose the class that got 'outputted' most often
-				int32_t out_idx=0;
-				int32_t out_max=0;
-
-				for (int32_t c=0; c<m_num_classes; c++)
-				{
-					if (out_max< classes[c])
-					{
-						out_idx= c;
-						out_max= classes[c];
-					}
-				}
-				output[j*num_lab+i]=out_idx+m_min_label;
-			}
+				train_lab[j] = m_train_labels[ NN(j,i) ];
 
+			choose_class_for_multiple_k(output+i, classes, train_lab, num_lab);
 		}
 	}
-	else
+	else // Use cover tree
 	{
+		//allocation for distances to nearest neighbors
+		float64_t* dists=SG_MALLOC(float64_t, m_k);
+
 		// From the sets of features (lhs and rhs) stored in distance,
 		// build arrays of cover tree points
 		v_array< CJLCoverTreePoint > set_of_points =
@@ -427,36 +407,15 @@ SGMatrix<int32_t> CKNN::classify_for_multiple_k()
 			// Now we get the indices to the neighbors sorted by distance
 			CMath::qsort_index(dists, train_lab, m_k);
 
-			//compute histogram of class outputs of the first k nearest
-			//neighbours
-			for (int32_t j=0; j<m_num_classes; j++)
-				classes[j]=0;
-
-			for (int32_t j=0; j<m_k; j++)
-			{
-				classes[train_lab[j]]++;
-
-				//choose the class that got 'outputted' most often
-				int32_t out_idx=0;
-				int32_t out_max=0;
-
-				for (int32_t c=0; c<m_num_classes; c++)
-				{
-					if (out_max< classes[c])
-					{
-						out_idx= c;
-						out_max= classes[c];
-					}
-				}
-				output[j*num_lab+res[i][0].m_index]=out_idx+m_min_label;
-			}
-
+			choose_class_for_multiple_k(output+res[i][0].m_index, classes,
+					train_lab, num_lab);
 		}
 
+		SG_FREE(dists);
 	}
 
 	SG_FREE(train_lab);
 	SG_FREE(classes);
-	SG_FREE(dists);
 
 	return SGMatrix<int32_t>(output,num_lab,m_k,true);
 }
@@ -527,3 +486,29 @@ int32_t CKNN::choose_class(float64_t* classes, int32_t* train_lab)
 
 	return out_idx;
 }
+
+void CKNN::choose_class_for_multiple_k(int32_t* output, int32_t* classes, int32_t* train_lab, int32_t step)
+{
+	//compute histogram of class outputs of the first k nearest neighbours
+	memset(classes, 0, sizeof(int32_t)*m_num_classes);
+
+	for (int32_t j=0; j<m_k; j++)
+	{
+		classes[train_lab[j]]++;
+
+		//choose the class that got 'outputted' most often
+		int32_t out_idx=0;
+		int32_t out_max=0;
+
+		for (int32_t c=0; c<m_num_classes; c++)
+		{
+			if (out_max< classes[c])
+			{
+				out_idx= c;
+				out_max= classes[c];
+			}
+		}
+
+		output[j*step]=out_idx+m_min_label;
+	}
+}
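With nearest_neighbors() now public (its declaration lives in KNN.h, presumably the fourth changed file, which is not shown on this page), callers can retrieve raw neighbor indices rather than only predicted labels. A minimal usage sketch, assuming a CKNN* knn trained as in the example program above:

	// NN has m_k rows and one column per query example: NN(j, i) is the
	// index into the training set of the (j+1)-th nearest neighbor of
	// query i, so row 0 holds each query's closest training point.
	SGMatrix<int32_t> NN = knn->nearest_neighbors();
	for (int32_t i=0; i<NN.num_cols; ++i)
		SG_SPRINT("nearest training example to query %d: %d\n", i, NN(0, i))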
