Skip to content

Commit

Permalink
Use nearest neighbors method in classify for multiple k.
Browse files Browse the repository at this point in the history
Also, avoid code duplication by introducing a choose_class_for_multiple_k
method, used by classify_for_multiple_k both with and without the cover tree.
  • Loading branch information
iglesias committed Jul 9, 2013
1 parent 28e5183 commit fb0c239
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 75 deletions.
1 change: 1 addition & 0 deletions examples/undocumented/libshogun/classifier_knn.cpp
Expand Up @@ -47,6 +47,7 @@ int main(int, char*[])
knn->train();
// Apply classifier
CMulticlassLabels* output = CLabelsFactory::to_multiclass( knn->apply() );
SGMatrix<int32_t> multiple_k_output = knn->classify_for_multiple_k();

// Free memory
SG_UNREF(knn)
Expand Down
117 changes: 44 additions & 73 deletions src/shogun/multiclass/KNN.cpp
Expand Up @@ -342,68 +342,34 @@ SGMatrix<int32_t> CKNN::classify_for_multiple_k()

int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);

float64_t* dists;
int32_t* train_lab;
//distances to train data and working buffer of m_train_labels
if ( ! m_use_covertree )
{
dists=SG_MALLOC(float64_t, m_train_labels.vlen);
train_lab=SG_MALLOC(int32_t, m_train_labels.vlen);
}
else
{
dists=SG_MALLOC(float64_t, m_k);
train_lab=SG_MALLOC(int32_t, m_k);
}
//working buffer of m_train_labels
int32_t* train_lab=SG_MALLOC(int32_t, m_k);

///histogram of classes and returned output
//histogram of classes and returned output
int32_t* classes=SG_MALLOC(int32_t, m_num_classes);

SG_INFO("%d test examples\n", num_lab)
CSignal::clear_cancel();

if ( ! m_use_covertree )
{
//get the k nearest neighbors of each example
SGMatrix<int32_t> NN = nearest_neighbors();

for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
{
SG_PROGRESS(i, 0, num_lab)

// lhs idx 1..n and rhs idx i
distances_lhs(dists,0,m_train_labels.vlen-1,i);
for (int32_t j=0; j<m_train_labels.vlen; j++)
train_lab[j]=m_train_labels.vector[j];

//sort the distance vector for test example j to all train examples
//classes[1..k] then holds the classes for minimum distance
CMath::qsort_index(dists, train_lab, m_train_labels.vlen);

//compute histogram of class outputs of the first k nearest
//neighbours
for (int32_t j=0; j<m_num_classes; j++)
classes[j]=0;

//write the labels of the k nearest neighbors from theirs indices
for (int32_t j=0; j<m_k; j++)
{
classes[train_lab[j]]++;

//choose the class that got 'outputted' most often
int32_t out_idx=0;
int32_t out_max=0;

for (int32_t c=0; c<m_num_classes; c++)
{
if (out_max< classes[c])
{
out_idx= c;
out_max= classes[c];
}
}
output[j*num_lab+i]=out_idx+m_min_label;
}
train_lab[j] = m_train_labels[ NN(j,i) ];

choose_class_for_multiple_k(output+i, classes, train_lab, num_lab);
}
}
else
else // Use cover tree
{
//allocation for distances to nearest neighbors
float64_t* dists=SG_MALLOC(float64_t, m_k);

// From the sets of features (lhs and rhs) stored in distance,
// build arrays of cover tree points
v_array< CJLCoverTreePoint > set_of_points =
Expand Down Expand Up @@ -441,36 +407,15 @@ SGMatrix<int32_t> CKNN::classify_for_multiple_k()
// Now we get the indices to the neighbors sorted by distance
CMath::qsort_index(dists, train_lab, m_k);

//compute histogram of class outputs of the first k nearest
//neighbours
for (int32_t j=0; j<m_num_classes; j++)
classes[j]=0;

for (int32_t j=0; j<m_k; j++)
{
classes[train_lab[j]]++;

//choose the class that got 'outputted' most often
int32_t out_idx=0;
int32_t out_max=0;

for (int32_t c=0; c<m_num_classes; c++)
{
if (out_max< classes[c])
{
out_idx= c;
out_max= classes[c];
}
}
output[j*num_lab+res[i][0].m_index]=out_idx+m_min_label;
}

choose_class_for_multiple_k(output+res[i][0].m_index, classes,
train_lab, num_lab);
}

SG_FREE(dists);
}

SG_FREE(train_lab);
SG_FREE(classes);
SG_FREE(dists);

return SGMatrix<int32_t>(output,num_lab,m_k,true);
}
Expand Down Expand Up @@ -541,3 +486,29 @@ int32_t CKNN::choose_class(float64_t* classes, int32_t* train_lab)

return out_idx;
}

void CKNN::choose_class_for_multiple_k(int32_t* output, int32_t* classes, int32_t* train_lab, int32_t step)
{
	// Reset the per-class histogram before accumulating neighbor votes.
	for (int32_t c=0; c<m_num_classes; c++)
		classes[c]=0;

	// Grow the neighborhood one neighbor at a time; after adding the k-th
	// nearest neighbor, record the majority class for that value of k.
	for (int32_t k_idx=0; k_idx<m_k; k_idx++)
	{
		classes[train_lab[k_idx]]++;

		// Scan the histogram for the most voted class so far; ties are
		// broken in favor of the smallest class index.
		int32_t best_class=0;
		int32_t best_votes=0;

		for (int32_t c=0; c<m_num_classes; c++)
		{
			if (classes[c]>best_votes)
			{
				best_votes=classes[c];
				best_class=c;
			}
		}

		// Entries for consecutive values of k lie step elements apart
		// in the output array.
		output[k_idx*step]=best_class+m_min_label;
	}
}
18 changes: 16 additions & 2 deletions src/shogun/multiclass/KNN.h
Expand Up @@ -198,7 +198,7 @@ class CKNN : public CDistanceMachine
private:
void init();

/** compute the histogram of class outputs of the first k nearest
/** compute the histogram of class outputs of the k nearest
* neighbors to a test vector and return the index of the most
* frequent class
*
Expand All @@ -207,11 +207,25 @@ class CKNN : public CDistanceMachine
* tree is not used, the elements are ordered by increasing distance
* and there are elements for each of the training vectors. If the cover
* tree is used, it contains just m_k elements not necessarily ordered.
*
*
* @return index of the most frequent class, class detected by KNN
*/
int32_t choose_class(float64_t* classes, int32_t* train_lab);

/** compute the histogram of class outputs of the k nearest neighbors
* to a test vector, using k from 1 to m_k, and write the most frequent
* class for each value of k in output, where consecutive entries are
* separated by a stride of step elements
*
* @param output return value where the most frequent classes are written
* @param classes vector used to store the histogram
* @param train_lab class indices of the training data; regardless of whether
* the cover tree is used, the neighbors are ordered by distance to the test
* vector in ascending order
* @param step distance between elements to be written in output
*/
void choose_class_for_multiple_k(int32_t* output, int32_t* classes, int32_t* train_lab, int32_t step);

protected:
/// the k parameter in KNN
int32_t m_k;
Expand Down

0 comments on commit fb0c239

Please sign in to comment.