From bd69a3b0cd793cfc63e710bfae5b017a5d354674 Mon Sep 17 00:00:00 2001 From: Viktor Gal Date: Mon, 14 May 2018 14:40:07 +0200 Subject: [PATCH] cleanup and fixes in CARTree convert whatever is possible to const and use linalg wherever possible optimize set_const for Eigen backend CMath::argsort use lambda instead of compartor class optimize CMath::pow(2,n) in case of integer fix #4282: segfault in CARTree --- src/shogun/lib/SGVector.cpp | 12 +- src/shogun/mathematics/Math.h | 50 +- .../mathematics/linalg/LinalgBackendEigen.h | 8 +- .../mathematics/linalg/backend/eigen/Misc.cpp | 19 +- src/shogun/multiclass/tree/CARTree.cpp | 522 +++++++++--------- src/shogun/multiclass/tree/CARTree.h | 39 +- src/shogun/multiclass/tree/RandomCARTree.cpp | 18 +- src/shogun/multiclass/tree/RandomCARTree.h | 12 +- tests/unit/lib/SGVector_unittest.cc | 10 + 9 files changed, 334 insertions(+), 356 deletions(-) diff --git a/src/shogun/lib/SGVector.cpp b/src/shogun/lib/SGVector.cpp index cab40117da4..23f32bc58e4 100644 --- a/src/shogun/lib/SGVector.cpp +++ b/src/shogun/lib/SGVector.cpp @@ -871,15 +871,9 @@ float32_t SGVector::sum_abs(float32_t* vec, int32_t len) template int32_t SGVector::unique(T* output, int32_t size) { - CMath::qsort(output, size); - int32_t j=0; - - for (int32_t i=0; i diff --git a/src/shogun/mathematics/Math.h b/src/shogun/mathematics/Math.h index aaaf6dbcab7..ed5691b8f00 100644 --- a/src/shogun/mathematics/Math.h +++ b/src/shogun/mathematics/Math.h @@ -18,6 +18,7 @@ #include #include #include +#include #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES @@ -468,12 +469,17 @@ class CMath : public CSGObject * @param x base (integer) * @param n exponent (integer) */ - static inline int32_t pow(int32_t x, int32_t n) + template::is_integer, T>* = nullptr> + static inline T pow(T x, T n) { ASSERT(n>=0) - int32_t result=1; + // power of integer 2 is basically a bitshift... + if (x == 2) + return (1 << n); + + T result = 1; while (n--) - result*=x; + result *= x; return result; } @@ -1229,42 +1235,25 @@ class CMath : public CSGObject } #endif - /** Helper functor for the function argsort */ - template - struct IndexSorter - { - /** constructor */ - IndexSorter(const SGVector *vec) { data = vec->vector; } - - /** access operator */ - bool operator() (index_t i, index_t j) const - { - return abs(data[i]-data[j])>std::numeric_limits::epsilon() - && data[i]::value>::type> - static SGVector argsort(SGVector vector) + static SGVector argsort(SGVector v) { - IndexSorter cmp(&vector); - SGVector idx(vector.size()); - for (index_t i=0; i < vector.size(); ++i) - idx[i] = i; - - std::sort(idx.vector, idx.vector+vector.size(), cmp); - + SGVector idx(v.vlen); + std::iota(idx.begin(), idx.end(), 0); + std::sort(idx.begin(), idx.end(), + [&v](index_t i1, index_t i2) + { + return std::abs(v[i1]-v[i2])>std::numeric_limits::epsilon() + && v[i1] static void* parallel_qsort_index(void* p); - /** Finds the smallest element in output and puts that element as the * first element * @param output element array diff --git a/src/shogun/mathematics/linalg/LinalgBackendEigen.h b/src/shogun/mathematics/linalg/LinalgBackendEigen.h index 424cbe895e4..45d926acf56 100644 --- a/src/shogun/mathematics/linalg/LinalgBackendEigen.h +++ b/src/shogun/mathematics/linalg/LinalgBackendEigen.h @@ -615,8 +615,12 @@ namespace shogun scale_impl(const SGMatrix& a, T alpha, SGMatrix& result) const; /** Eigen3 set const method */ - template class Container> - void set_const_impl(Container& a, T value) const; + template + void set_const_impl(SGVector& a, T value) const; + + /** Eigen3 set matrix to const */ + template + void set_const_impl(SGMatrix& a, T value) const; /** Eigen3 softmax method */ template class Container> diff --git a/src/shogun/mathematics/linalg/backend/eigen/Misc.cpp b/src/shogun/mathematics/linalg/backend/eigen/Misc.cpp index 591288383b9..a7ba5fa3528 100644 --- a/src/shogun/mathematics/linalg/backend/eigen/Misc.cpp +++ b/src/shogun/mathematics/linalg/backend/eigen/Misc.cpp @@ -117,18 +117,25 @@ void LinalgBackendEigen::identity_impl(SGMatrix& identity_matrix) const I_eig.setIdentity(); } -template class Container> -void LinalgBackendEigen::range_fill_impl(Container& a, const T start) const +template +void LinalgBackendEigen::set_const_impl(SGVector& a, T value) const { - for (decltype(a.size()) i = 0; i < a.size(); ++i) - a[i] = start + T(i); + typename SGVector::EigenVectorXtMap a_eig = a; + a_eig.setConstant(value); +} + +template +void LinalgBackendEigen::set_const_impl(SGMatrix& a, T value) const +{ + typename SGMatrix::EigenMatrixXtMap a_eig = a; + a_eig.setConstant(value); } template class Container> -void LinalgBackendEigen::set_const_impl(Container& a, T value) const +void LinalgBackendEigen::range_fill_impl(Container& a, const T start) const { for (decltype(a.size()) i = 0; i < a.size(); ++i) - a[i] = value; + a[i] = start + T(i); } template diff --git a/src/shogun/multiclass/tree/CARTree.cpp b/src/shogun/multiclass/tree/CARTree.cpp index 0ca900a6313..7f7c3b6bedf 100644 --- a/src/shogun/multiclass/tree/CARTree.cpp +++ b/src/shogun/multiclass/tree/CARTree.cpp @@ -28,10 +28,14 @@ * either expressed or implied, of the Shogun Development Team. */ +#include + #include #include +#include #include + using namespace Eigen; using namespace shogun; @@ -106,10 +110,10 @@ CMulticlassLabels* CCARTree::apply_multiclass(CFeatures* data) bnode_t* current=dynamic_cast(get_root()); REQUIRE(current, "Tree machine not yet trained.\n"); - CLabels* ret=apply_from_current_node(dynamic_cast*>(data), current); + CLabels* ret=apply_from_current_node(data->as>(), current); SG_UNREF(current); - return dynamic_cast(ret); + return ret->as(); } CRegressionLabels* CCARTree::apply_regression(CFeatures* data) @@ -121,7 +125,7 @@ CRegressionLabels* CCARTree::apply_regression(CFeatures* data) CLabels* ret=apply_from_current_node(dynamic_cast*>(data), current); SG_UNREF(current); - return dynamic_cast(ret); + return ret->as(); } void CCARTree::prune_using_test_dataset(CDenseFeatures* feats, CLabels* gnd_truth, SGVector weights) @@ -129,22 +133,21 @@ void CCARTree::prune_using_test_dataset(CDenseFeatures* feats, CLabel if (weights.vlen==0) { weights=SGVector(feats->get_num_vectors()); - weights.fill_vector(weights.vector,weights.vlen,1); + linalg::set_const(weights, 1.0); } CDynamicObjectArray* pruned_trees=prune_tree(this); int32_t min_index=0; float64_t min_error=CMath::MAX_REAL_NUMBER; - for (int32_t i=0;iget_num_elements();i++) + for (int32_t i=0;iget_num_elements();++i) { CSGObject* element=pruned_trees->get_element(i); - bnode_t* root=NULL; - if (element!=NULL) - root=dynamic_cast(element); - else + if (element == nullptr) SG_ERROR("%d element is NULL\n",i); + bnode_t* root = dynamic_cast(element); + CLabels* labels=apply_from_current_node(feats, root); float64_t error=compute_error(labels,gnd_truth,weights); if (error* feats, CLabel } CSGObject* element=pruned_trees->get_element(min_index); - bnode_t* root=NULL; - if (element!=NULL) - root=dynamic_cast(element); - else + if (element == nullptr) SG_ERROR("%d element is NULL\n",min_index); + bnode_t* root = dynamic_cast(element); this->set_root(root); SG_UNREF(pruned_trees); @@ -248,8 +249,9 @@ bool CCARTree::train_machine(CFeatures* data) REQUIRE(data,"Data required for training\n") REQUIRE(data->get_feature_class()==C_DENSE,"Dense data required for training\n") - int32_t num_features=(dynamic_cast*>(data))->get_num_features(); - int32_t num_vectors=(dynamic_cast*>(data))->get_num_vectors(); + auto dense_features = data->as>(); + auto num_features = dense_features->get_num_features(); + auto num_vectors = dense_features->get_num_vectors(); if (m_weights_set) { @@ -260,7 +262,7 @@ bool CCARTree::train_machine(CFeatures* data) { // all weights are equal to 1 m_weights=SGVector(num_vectors); - m_weights.fill_vector(m_weights.vector,m_weights.vlen,1.0); + linalg::set_const(m_weights, 1.0); } if (m_types_set) @@ -277,15 +279,15 @@ bool CCARTree::train_machine(CFeatures* data) "Feature types are not specified. All features are " "considered as continuous in training.\n") m_nominal=SGVector(num_features); - m_nominal.fill_vector(m_nominal.vector,m_nominal.vlen,false); + linalg::set_const(m_nominal, false); } - set_root(CARTtrain(data,m_weights,m_labels,0)); + auto dense_labels = m_labels->as(); + set_root(CARTtrain(dense_features,m_weights,dense_labels,0)); if (m_apply_cv_pruning) { - CDenseFeatures* feats=dynamic_cast*>(data); - prune_by_cross_validation(feats,m_folds); + prune_by_cross_validation(dense_features,m_folds); } return true; @@ -300,7 +302,7 @@ void CCARTree::set_sorted_features(SGMatrix& sorted_feats, SGMatrix& sorted_feats, SGMatrix& sorted_indices) { - SGMatrix mat=(dynamic_cast*>(data))->get_feature_matrix(); + SGMatrix mat=(data)->as>()->get_feature_matrix(); sorted_feats = SGMatrix(mat.num_cols, mat.num_rows); sorted_indices = SGMatrix(mat.num_cols, mat.num_rows); for(int32_t i=0; i& sorted_fe } -CBinaryTreeMachineNode* CCARTree::CARTtrain(CFeatures* data, SGVector weights, CLabels* labels, int32_t level) +CBinaryTreeMachineNode* CCARTree::CARTtrain(CDenseFeatures* data, const SGVector& weights, CDenseLabels* labels, int32_t level) { REQUIRE(labels,"labels have to be supplied\n"); REQUIRE(data,"data matrix has to be supplied\n"); bnode_t* node=new bnode_t(); - SGVector labels_vec=(dynamic_cast(labels))->get_labels(); - SGMatrix mat=(dynamic_cast*>(data))->get_feature_matrix(); - int32_t num_feats=mat.num_rows; - int32_t num_vecs=mat.num_cols; + auto labels_vec = labels->get_labels(); + auto mat = data->get_feature_matrix(); + auto num_feats=mat.num_rows; + auto num_vecs=mat.num_cols; // calculate node label switch(m_mode) { case PT_REGRESSION: { - float64_t sum=0; - for (int32_t i=0;idata.weight_minus_node=tot*least_squares_deviation(labels_vec,weights,tot); @@ -349,13 +348,13 @@ CBinaryTreeMachineNode* CCARTree::CARTtrain(CFeatures* data, SG case PT_MULTICLASS: { SGVector lab=labels_vec.clone(); - CMath::qsort(lab); + std::sort(lab.begin(), lab.end()); // stores max total weight for a single label - int32_t max=weights[0]; + auto max=weights[0]; // stores one of the indices having max total weight - int32_t maxi=0; - int32_t c=weights[0]; - for (int32_t i=1;i* CCARTree::CARTtrain(CFeatures* data, SG node->data.node_label=lab[maxi]; // resubstitution error calculation - node->data.total_weight=weights.sum(weights); + node->data.total_weight=linalg::sum(weights); node->data.weight_minus_node=node->data.total_weight-max; break; } @@ -426,7 +425,7 @@ CBinaryTreeMachineNode* CCARTree::CARTtrain(CFeatures* data, SG if (subset_stack->has_subsets()) indices=(subset_stack->get_last_subset())->get_subset_idx(); else - indices.range_fill(); + linalg::range_fill(indices); SG_UNREF(subset_stack); best_attribute=compute_best_attribute(m_sorted_features,weights,labels,left,right,left_final,num_missing_final,c_left,c_right,0,indices); } @@ -449,7 +448,7 @@ CBinaryTreeMachineNode* CCARTree::CARTtrain(CFeatures* data, SG { SGVector is_left_final(num_vecs-num_missing_final); int32_t ilf=0; - for (int32_t i=0;i* CCARTree::CARTtrain(CFeatures* data, SG left_final=surrogate_split(mat,weights,is_left_final,best_attribute); } - int32_t count_left=0; - for (int32_t c=0;c subsetl(count_left); SGVector weightsl(count_left); @@ -468,7 +465,7 @@ CBinaryTreeMachineNode* CCARTree::CARTtrain(CFeatures* data, SG SGVector weightsr(num_vecs-count_left); index_t l=0; index_t r=0; - for (int32_t c=0;c* CCARTree::CARTtrain(CFeatures* data, SG return node; } -SGVector CCARTree::get_unique_labels(SGVector labels_vec, int32_t &n_ulabels) +SGVector CCARTree::get_unique_labels(const SGVector& labels_vec, index_t &n_ulabels) const { float64_t delta=0; if (m_mode==PT_REGRESSION) @@ -518,8 +515,8 @@ SGVector CCARTree::get_unique_labels(SGVector labels_vec, SGVector sidx=CMath::argsort(labels_vec); ulabels[0]=labels_vec[sidx[0]]; n_ulabels=1; - int32_t start=0; - for (int32_t i=1;i CCARTree::get_unique_labels(SGVector labels_vec, return ulabels; } -int32_t CCARTree::compute_best_attribute(const SGMatrix& mat, const SGVector& weights, CLabels* labels, - SGVector& left, SGVector& right, SGVector& is_left_final, int32_t &num_missing_final, int32_t &count_left, - int32_t &count_right, int32_t subset_size, const SGVector& active_indices) +index_t CCARTree::compute_best_attribute(const SGMatrix& mat, const SGVector& weights, CDenseLabels* labels, + SGVector& left, SGVector& right, SGVector& is_left_final, index_t &num_missing_final, index_t &count_left, + index_t &count_right, index_t subset_size, const SGVector& active_indices) { - SGVector labels_vec=(dynamic_cast(labels))->get_labels(); - int32_t num_vecs=labels->get_num_labels(); - int32_t num_feats; - if (m_pre_sort) - num_feats=mat.num_cols; - else - num_feats=mat.num_rows; + auto labels_vec=labels->get_labels(); + auto num_vecs=labels->get_num_labels(); + auto num_feats = (m_pre_sort) ? mat.num_cols : mat.num_rows; - int32_t n_ulabels; - SGVector ulabels=get_unique_labels(labels_vec,n_ulabels); + index_t n_ulabels; + auto ulabels = get_unique_labels(labels_vec, n_ulabels); // if all labels same early stop if (n_ulabels==1) @@ -556,14 +549,14 @@ int32_t CCARTree::compute_best_attribute(const SGMatrix& mat, const S delta=m_label_epsilon; SGVector total_wclasses(n_ulabels); - total_wclasses.zero(); + linalg::zero(total_wclasses); - SGVector simple_labels(num_vecs); - for (int32_t i=0;i simple_labels(num_vecs); + for (index_t i=0;i& mat, const S } SGVector idx(num_feats); - idx.range_fill(); + linalg::range_fill(idx); if (subset_size) { num_feats=subset_size; @@ -581,19 +574,19 @@ int32_t CCARTree::compute_best_attribute(const SGMatrix& mat, const S } float64_t max_gain=MIN_SPLIT_GAIN; - int32_t best_attribute=-1; + index_t best_attribute=-1; float64_t best_threshold=0; SGVector indices_mask; - SGVector count_indices(mat.num_rows); + SGVector count_indices(mat.num_rows); count_indices.zero(); - SGVector dupes(num_vecs); - dupes.range_fill(); + SGVector dupes(num_vecs); + linalg::range_fill(dupes); if (m_pre_sort) { indices_mask = SGVector(mat.num_rows); - indices_mask.set_const(-1); - for(int32_t j=0;j=0) dupes[indices_mask[active_indices[j]]]=j; @@ -603,23 +596,23 @@ int32_t CCARTree::compute_best_attribute(const SGMatrix& mat, const S } } - for (int32_t i=0;i feats(num_vecs); SGVector sorted_args(num_vecs); - SGVector temp_count_indices(count_indices.size()); - sg_memcpy(temp_count_indices.vector, count_indices.vector, sizeof(int32_t)*count_indices.size()); + SGVector temp_count_indices(count_indices.size()); + sg_memcpy(temp_count_indices.vector, count_indices.vector, sizeof(index_t)*count_indices.size()); if (m_pre_sort) { SGVector temp_col(mat.get_column_vector(idx[i]), mat.num_rows, false); SGVector sorted_indices(m_sorted_indices.get_column_vector(idx[i]), mat.num_rows, false); - int32_t count=0; - for(int32_t j=0;j=0) { - int32_t count_index = count_indices[sorted_indices[j]]; + index_t count_index = count_indices[sorted_indices[j]]; while(count_index>0) { feats[count]=temp_col[j]; @@ -634,19 +627,19 @@ int32_t CCARTree::compute_best_attribute(const SGMatrix& mat, const S } else { - for (int32_t j=0;j& mat, const S if (m_nominal[idx[i]]) { - SGVector simple_feats(num_vecs); - simple_feats.fill_vector(simple_feats.vector,simple_feats.vlen,-1); + SGVector simple_feats(num_vecs); + linalg::set_const(simple_feats, -1); // convert to simple values simple_feats[0]=0; - int32_t c=0; - for (int32_t j=1;j& mat, const S simple_feats[j]=(++c); } + // collect the unique categorical values SGVector ufeats(c+1); ufeats[0]=feats[0]; - int32_t u=0; - for (int32_t j=1;j& mat, const S ufeats[++u]=feats[j]; } + // FIXME: this approach is way too vanilla! // test all 2^(I-1)-1 possible division between two nodes - int32_t num_cases=CMath::pow(2,c); - for (int32_t k=1;k wleft(n_ulabels); - SGVector wright(n_ulabels); - wleft.zero(); - wright.zero(); + SGVector wleft(n_ulabels), wright(n_ulabels); + linalg::zero(wleft); + linalg::zero(wright); // stores which vectors are assigned to left child SGVector is_left(num_vecs); - is_left.fill_vector(is_left.vector,is_left.vlen,false); + linalg::set_const(is_left, false); // stores which among the categorical values of chosen attribute are assigned left child SGVector feats_left(c+1); // fill feats_left in a unique way corresponding to the case - for (int32_t p=0;p& mat, const S else wright[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]]; } - for (int32_t j=n_nm_vecs-1;j>=0;j--) + for (index_t j = n_nm_vecs-1 ; j >= 0; --j) { if(dupes[j]!=j) is_left[j]=is_left[dupes[j]]; } float64_t g=0; - if (m_mode==PT_MULTICLASS) - g=gain(wleft,wright,total_wclasses); - else if (m_mode==PT_REGRESSION) - g=gain(wleft,wright,total_wclasses,ulabels); - else - SG_ERROR("Undefined problem statement\n"); + switch(m_mode) + { + case PT_MULTICLASS: + g=gain(wleft,wright,total_wclasses); + break; + case PT_REGRESSION: + g=gain(wleft,wright,total_wclasses,ulabels); + break; + default: + SG_ERROR("Undefined problem statement\n"); + } if (g>max_gain) { @@ -730,15 +729,14 @@ int32_t CCARTree::compute_best_attribute(const SGMatrix& mat, const S sg_memcpy(is_left_final.vector,is_left.vector,is_left.vlen*sizeof(bool)); num_missing_final=num_vecs-n_nm_vecs; - count_left=0; - for (int32_t l=0;l& mat, const S // O(N) SGVector right_wclasses=total_wclasses.clone(); SGVector left_wclasses(n_ulabels); - left_wclasses.zero(); + linalg::zero(left_wclasses); // O(N) // find best split for non-nominal attribute - choose threshold (z) float64_t z=feats[0]; right_wclasses[simple_labels[sorted_args[0]]]-=weights[sorted_args[0]]; left_wclasses[simple_labels[sorted_args[0]]]+=weights[sorted_args[0]]; - for (int32_t j=1;j& mat, const S while (n_nm_vecs& mat, const S { SGVector temp_vec(mat.get_column_vector(best_attribute), mat.num_rows, false); SGVector sorted_indices(m_sorted_indices.get_column_vector(best_attribute), mat.num_rows, false); - int32_t count=0; - for(int32_t i=0;i=0) { @@ -825,7 +823,7 @@ int32_t CCARTree::compute_best_attribute(const SGMatrix& mat, const S break; } } - for (int32_t i=num_vecs-1;i>=0;i--) + for (index_t i=num_vecs-1;i>=0;--i) { if(dupes[i]!=i) is_left_final[i]=is_left_final[dupes[i]]; @@ -834,7 +832,7 @@ int32_t CCARTree::compute_best_attribute(const SGMatrix& mat, const S } else { - for (int32_t i=0;i& mat, const S return best_attribute; } -SGVector CCARTree::surrogate_split(SGMatrix m,SGVector weights, SGVector nm_left, int32_t attr) +SGVector CCARTree::surrogate_split(SGMatrix m,SGVector weights, SGVector nm_left, int32_t attr) const { // return vector - left/right belongingness SGVector ret(m.num_cols); // ditribute data with known attributes - int32_t l=0; + index_t l=0; float64_t p_l=0.; float64_t total=0.; // stores indices of vectors with missing attribute - CDynamicArray* missing_vecs=new CDynamicArray(); + std::vector missing_vecs; // stores lambda values corresponding to missing vectors - initialized all with 0 - CDynamicArray* association_index=new CDynamicArray(); - for (int32_t i=0;i association_index; + for (index_t i=0;i CCARTree::surrogate_split(SGMatrix m,SGVectorpush_back(i); - association_index->push_back(0.); + missing_vecs.push_back(i); + association_index.push_back(0.); } } // for lambda calculation float64_t p_r=(total-p_l)/total; p_l/=total; - float64_t p=CMath::min(p_r,p_l); + float64_t p=std::min(p_r,p_l); // for each attribute (X') alternative to best split (X) - for (int32_t i=0;i* intersect_vecs=new CDynamicArray(); - for (int32_t j=0;j intersect_vecs; + for (index_t j=0;jpush_back(j); + intersect_vecs.push_back(j); } - if (intersect_vecs->get_num_elements()==0) + if (intersect_vecs.size() == 0) { - SG_UNREF(intersect_vecs); + intersect_vecs.clear(); continue; } @@ -901,109 +899,102 @@ SGVector CCARTree::surrogate_split(SGMatrix m,SGVectorget_num_elements();i++) + for (index_t i=0;iget_element(i)==0.) - ret[missing_vecs->get_element(i)]=(p_l>=p_r); + if (association_index.at(i)==0.) + ret[missing_vecs.at(i)]=(p_l>=p_r); } - SG_UNREF(missing_vecs); - SG_UNREF(association_index); return ret; } -void CCARTree::handle_missing_vecs_for_continuous_surrogate(SGMatrix m, CDynamicArray* missing_vecs, - CDynamicArray* association_index, CDynamicArray* intersect_vecs, SGVector is_left, - SGVector weights, float64_t p, int32_t attr) +void CCARTree::handle_missing_vecs_for_continuous_surrogate(SGMatrix m, const std::vector& missing_vecs, + std::vector& association_index, std::vector& intersect_vecs, + SGVector is_left, SGVector weights, float64_t p, index_t attr) const { // for lambda calculation - total weight of all vectors in X intersect X' float64_t denom=0.; - SGVector feats(intersect_vecs->get_num_elements()); - for (int32_t j=0;jget_num_elements();j++) + SGVector feats(intersect_vecs.size()); + for (index_t j = 0; j < intersect_vecs.size(); ++j) { - feats[j]=m(attr,intersect_vecs->get_element(j)); - denom+=weights[intersect_vecs->get_element(j)]; + feats[j]=m(attr,intersect_vecs.at(j)); + denom+=weights[intersect_vecs.at(j)]; } // unique feature values for X' - int32_t num_unique=feats.unique(feats.vector,feats.vlen); - + index_t num_unique=feats.unique(feats.vector,feats.vlen); // all possible splits for chosen attribute - for (int32_t j=0;jget_num_elements();k++) + for (index_t k = 0; k < intersect_vecs.size(); ++k) { // if both go left or both go right - if ((m(attr,intersect_vecs->get_element(k))<=z) && is_left[intersect_vecs->get_element(k)]) - numer+=weights[intersect_vecs->get_element(k)]; - else if ((m(attr,intersect_vecs->get_element(k))>z) && !is_left[intersect_vecs->get_element(k)]) - numer+=weights[intersect_vecs->get_element(k)]; + if ((m(attr,intersect_vecs.at(k))<=z) && is_left[intersect_vecs.at(k)]) + numer+=weights[intersect_vecs.at(k)]; + else if ((m(attr,intersect_vecs.at(k))>z) && !is_left[intersect_vecs.at(k)]) + numer+=weights[intersect_vecs.at(k)]; // complementary split cases - one goes left other right - else if ((m(attr,intersect_vecs->get_element(k))<=z) && !is_left[intersect_vecs->get_element(k)]) - numerc+=weights[intersect_vecs->get_element(k)]; - else if ((m(attr,intersect_vecs->get_element(k))>z) && is_left[intersect_vecs->get_element(k)]) - numerc+=weights[intersect_vecs->get_element(k)]; + else if ((m(attr,intersect_vecs.at(k))<=z) && !is_left[intersect_vecs.at(k)]) + numerc+=weights[intersect_vecs.at(k)]; + else if ((m(attr,intersect_vecs.at(k))>z) && is_left[intersect_vecs.at(k)]) + numerc+=weights[intersect_vecs.at(k)]; } - float64_t lambda=0.; - if (numer>=numerc) - lambda=(p-(1-numer/denom))/p; - else - lambda=(p-(1-numerc/denom))/p; - for (int32_t k=0;kget_num_elements();k++) + float64_t lambda = (numer>=numerc) + ? (p-(1-numer/denom))/p + : (p-(1-numerc/denom))/p; + + for (index_t k = 0; k < missing_vecs.size(); ++k) { - if ((lambda>association_index->get_element(k)) && - (!CMath::fequals(m(attr,missing_vecs->get_element(k)),MISSING,0))) + if ((lambda>association_index.at(k)) && + (!CMath::fequals(m(attr,missing_vecs.at(k)),MISSING,0))) { - association_index->set_element(lambda,k); - if (numer>=numerc) - is_left[missing_vecs->get_element(k)]=(m(attr,missing_vecs->get_element(k))<=z); - else - is_left[missing_vecs->get_element(k)]=(m(attr,missing_vecs->get_element(k))>z); + association_index[k] = lambda; + is_left[missing_vecs.at(k)] = (numer>=numerc) + ? (m(attr,missing_vecs.at(k))<=z) + : (m(attr,missing_vecs.at(k))>z); } } } } -void CCARTree::handle_missing_vecs_for_nominal_surrogate(SGMatrix m, CDynamicArray* missing_vecs, - CDynamicArray* association_index, CDynamicArray* intersect_vecs, SGVector is_left, - SGVector weights, float64_t p, int32_t attr) +void CCARTree::handle_missing_vecs_for_nominal_surrogate(SGMatrix m, const std::vector& missing_vecs, + std::vector& association_index, const std::vector& intersect_vecs, + SGVector is_left, SGVector weights, float64_t p, index_t attr) const { // for lambda calculation - total weight of all vectors in X intersect X' float64_t denom=0.; - SGVector feats(intersect_vecs->get_num_elements()); - for (int32_t j=0;jget_num_elements();j++) + SGVector feats(intersect_vecs.size()); + for (index_t j = 0; j < intersect_vecs.size(); ++j) { - feats[j]=m(attr,intersect_vecs->get_element(j)); - denom+=weights[intersect_vecs->get_element(j)]; + feats[j]=m(attr,intersect_vecs.at(j)); + denom+=weights[intersect_vecs.at(j)]; } // unique feature values for X' - int32_t num_unique=feats.unique(feats.vector,feats.vlen); + index_t num_unique = feats.unique(feats.vector,feats.vlen); // scan all splits for chosen alternative attribute X' - int32_t num_cases=CMath::pow(2,(num_unique-1)); - for (int32_t j=1;j feats_left(num_unique); - for (int32_t k=0;k intersect_vecs_left(intersect_vecs->get_num_elements()); - for (int32_t k=0;kget_num_elements();k++) + SGVector intersect_vecs_left(intersect_vecs.size()); + for (int32_t k=0;kget_element(k))) + if (feats[q]==m(attr,intersect_vecs.at(k))) { intersect_vecs_left[k]=feats_left[q]; break; @@ -1013,38 +1004,35 @@ void CCARTree::handle_missing_vecs_for_nominal_surrogate(SGMatrix m, float64_t numer=0.; float64_t numerc=0.; - for (int32_t k=0;kget_num_elements();k++) + for (int32_t k=0;kget_element(k)]) - numer+=weights[intersect_vecs->get_element(k)]; + if (intersect_vecs_left[k]==is_left[intersect_vecs.at(k)]) + numer+=weights[intersect_vecs.at(k)]; else - numerc+=weights[intersect_vecs->get_element(k)]; + numerc+=weights[intersect_vecs.at(k)]; } // lambda for this split (2 case identical split/complementary split) - float64_t lambda=0.; - if (numer>=numerc) - lambda=(p-(1-numer/denom))/p; - else - lambda=(p-(1-numerc/denom))/p; + float64_t lambda = (numer>=numerc) + ? (p-(1-numer/denom))/p + : (p-(1-numerc/denom))/p; // address missing value vectors not yet addressed or addressed using worse split - for (int32_t k=0;kget_num_elements();k++) + for (index_t k = 0; k < missing_vecs.size(); ++k) { - if ((lambda>association_index->get_element(k)) && - (!CMath::fequals(m(attr,missing_vecs->get_element(k)),MISSING,0))) + if ((lambda>association_index.at(k)) && + (!CMath::fequals(m(attr,missing_vecs.at(k)),MISSING,0))) { - association_index->set_element(lambda,k); + association_index[k] = lambda; // decide left/right based on which feature value the chosen data point has - for (int32_t q=0;qget_element(k))) + if (feats[q]==m(attr,missing_vecs.at(k))) { - if (numer>=numerc) - is_left[missing_vecs->get_element(k)]=feats_left[q]; - else - is_left[missing_vecs->get_element(k)]=!feats_left[q]; + is_left[missing_vecs.at(k)] = (numer>=numerc) + ? feats_left[q] + : !feats_left[q]; break; } @@ -1054,8 +1042,8 @@ void CCARTree::handle_missing_vecs_for_nominal_surrogate(SGMatrix m, } } -float64_t CCARTree::gain(SGVector wleft, SGVector wright, SGVector wtotal, - SGVector feats) +float64_t CCARTree::gain(const SGVector& wleft, const SGVector& wright, const SGVector& wtotal, + const SGVector& feats) const { float64_t total_lweight=0; float64_t total_rweight=0; @@ -1068,7 +1056,7 @@ float64_t CCARTree::gain(SGVector wleft, SGVector wright, return lsd_n-(lsd_l*(total_lweight/total_weight))-(lsd_r*(total_rweight/total_weight)); } -float64_t CCARTree::gain(const SGVector& wleft, const SGVector& wright, const SGVector& wtotal) +float64_t CCARTree::gain(const SGVector& wleft, const SGVector& wright, const SGVector& wtotal) const { float64_t total_lweight=0; float64_t total_rweight=0; @@ -1080,27 +1068,21 @@ float64_t CCARTree::gain(const SGVector& wleft, const SGVector& weighted_lab_classes, float64_t &total_weight) +float64_t CCARTree::gini_impurity_index(const SGVector& weighted_lab_classes, float64_t &total_weight) const { - Map map_weighted_lab_classes(weighted_lab_classes.vector, weighted_lab_classes.size()); - total_weight=map_weighted_lab_classes.sum(); - float64_t gini=map_weighted_lab_classes.dot(map_weighted_lab_classes); - - gini=1.0-(gini/(total_weight*total_weight)); - return gini; + total_weight = linalg::sum(weighted_lab_classes); + float64_t gini = linalg::dot(weighted_lab_classes, weighted_lab_classes); + return 1.0-(gini/(total_weight*total_weight)); } -float64_t CCARTree::least_squares_deviation(const SGVector& feats, const SGVector& weights, float64_t &total_weight) +float64_t CCARTree::least_squares_deviation(const SGVector& feats, const SGVector& weights, float64_t &total_weight) const { - - Map map_weights(weights.vector, weights.size()); - Map map_feats(feats.vector, weights.size()); - float64_t mean=map_weights.dot(map_feats); - total_weight=map_weights.sum(); + float64_t mean = linalg::dot(weights, feats); + total_weight = linalg::sum(weights); mean/=total_weight; float64_t dev=0; - for (int32_t i=0;i& feats, co CLabels* CCARTree::apply_from_current_node(CDenseFeatures* feats, bnode_t* current) { - int32_t num_vecs=feats->get_num_vectors(); + auto num_vecs=feats->get_num_vectors(); REQUIRE(num_vecs>0, "No data provided in apply\n"); SGVector labels(num_vecs); - for (int32_t i=0;i sample=feats->get_feature_vector(i); + auto sample=feats->get_feature_vector(i); bnode_t* node=current; SG_REF(node); @@ -1127,7 +1109,7 @@ CLabels* CCARTree::apply_from_current_node(CDenseFeatures* feats, bno { SGVector comp=leftchild->data.transit_into_values; bool flag=false; - for (int32_t k=0;kdata.attribute_id]) { @@ -1193,44 +1175,45 @@ CLabels* CCARTree::apply_from_current_node(CDenseFeatures* feats, bno void CCARTree::prune_by_cross_validation(CDenseFeatures* data, int32_t folds) { - int32_t num_vecs=data->get_num_vectors(); + auto num_vecs=data->get_num_vectors(); // divide data into V folds randomly - SGVector subid(num_vecs); + SGVector subid(num_vecs); subid.random_vector(subid.vector,subid.vlen,0,folds-1); // for each fold subset - CDynamicArray* r_cv=new CDynamicArray(); - CDynamicArray* alphak=new CDynamicArray(); + std::vector r_cv; + std::vector alphak; SGVector num_alphak(folds); - for (int32_t i=0;i* test_indices=new CDynamicArray(); - CDynamicArray* train_indices=new CDynamicArray(); - for (int32_t j=0;j test_indices; + std::vector train_indices; + for (index_t j = 0; j < num_vecs; ++j) { - if (subid[j]==i) - test_indices->push_back(j); - else - train_indices->push_back(j); + (subid[j]==i) + ? test_indices.push_back(j) + : train_indices.push_back(j); } - if (test_indices->get_num_elements()==0 || train_indices->get_num_elements()==0) + if (test_indices.size()==0 || train_indices.size()==0) { SG_ERROR("Unfortunately you have reached the very low probability event where atleast one of " "the subsets in cross-validation is not represented at all. Please re-run.") } - SGVector subset(train_indices->get_array(),train_indices->get_num_elements(),false); + SGVector subset(train_indices.data(),train_indices.size(),false); data->add_subset(subset); m_labels->add_subset(subset); - SGVector subset_weights(train_indices->get_num_elements()); - for (int32_t j=0;jget_num_elements();j++) - subset_weights[j]=m_weights[train_indices->get_element(j)]; + SGVector subset_weights(train_indices.size()); + + for (index_t j = 0; j < train_indices.size(); ++j) + subset_weights[j]=m_weights[train_indices.at(j)]; // train with training subset - bnode_t* root=CARTtrain(data,subset_weights,m_labels,0); + auto dense_labels = m_labels->as(); + bnode_t* root=CARTtrain(data,subset_weights,dense_labels,0); // prune trained tree CTreeMachine* tmax=new CTreeMachine(); @@ -1239,18 +1222,18 @@ void CCARTree::prune_by_cross_validation(CDenseFeatures* data, int32_ data->remove_subset(); m_labels->remove_subset(); - subset=SGVector(test_indices->get_array(),test_indices->get_num_elements(),false); + subset=SGVector(test_indices.data(),test_indices.size(),false); data->add_subset(subset); m_labels->add_subset(subset); - subset_weights=SGVector(test_indices->get_num_elements()); - for (int32_t j=0;jget_num_elements();j++) - subset_weights[j]=m_weights[test_indices->get_element(j)]; + subset_weights=SGVector(test_indices.size()); + for (int32_t j=0;jget_num_elements(); - for (int32_t j=0;jget_num_elements();j++) + for (int32_t j=0;jget_num_elements();++j) { - alphak->push_back(m_alphas->get_element(j)); + alphak.push_back(m_alphas->get_element(j)); CSGObject* jth_element=pruned_trees->get_element(j); bnode_t* current_root=NULL; if (jth_element!=NULL) @@ -1260,15 +1243,13 @@ void CCARTree::prune_by_cross_validation(CDenseFeatures* data, int32_ CLabels* labels=apply_from_current_node(data, current_root); float64_t error=compute_error(labels, m_labels, subset_weights); - r_cv->push_back(error); + r_cv.push_back(error); SG_UNREF(labels); SG_UNREF(jth_element); } data->remove_subset(); m_labels->remove_subset(); - SG_UNREF(train_indices); - SG_UNREF(test_indices); SG_UNREF(tmax); SG_UNREF(pruned_trees); } @@ -1279,7 +1260,7 @@ void CCARTree::prune_by_cross_validation(CDenseFeatures* data, int32_ // find subtree with minimum R_cv int32_t min_index=-1; float64_t min_r_cv=CMath::MAX_REAL_NUMBER; - for (int32_t i=0;iget_num_elements();i++) + for (int32_t i=0;iget_num_elements();++i) { float64_t alpha=0.; if (i==m_alphas->get_num_elements()-1) @@ -1290,21 +1271,21 @@ void CCARTree::prune_by_cross_validation(CDenseFeatures* data, int32_ float64_t rv=0.; int32_t base=0; - for (int32_t j=0;jget_element(k)<=alpha && alphak->get_element(k+1)>alpha) + if (alphak.at(k)<=alpha && alphak.at(k+1)>alpha) { - rv+=r_cv->get_element(k); + rv+=r_cv.at(k); flag=true; break; } } if (!flag) - rv+=r_cv->get_element(num_alphak[j]+base-1); + rv+=r_cv.at(num_alphak[j]+base-1); base+=num_alphak[j]; } @@ -1318,34 +1299,31 @@ void CCARTree::prune_by_cross_validation(CDenseFeatures* data, int32_ CSGObject* element=pruned_trees->get_element(min_index); bnode_t* best_tree_root=NULL; - if (element!=NULL) - best_tree_root=dynamic_cast(element); - else + if (element==nullptr) SG_ERROR("%d element is NULL which should not be",min_index); + best_tree_root=dynamic_cast(element); this->set_root(best_tree_root); SG_UNREF(element); SG_UNREF(pruned_trees); - SG_UNREF(r_cv); - SG_UNREF(alphak); } -float64_t CCARTree::compute_error(CLabels* labels, CLabels* reference, SGVector weights) +float64_t CCARTree::compute_error(CLabels* labels, CLabels* reference, SGVector weights) const { REQUIRE(labels,"input labels cannot be NULL"); REQUIRE(reference,"reference labels cannot be NULL") - CDenseLabels* gnd_truth=dynamic_cast(reference); - CDenseLabels* result=dynamic_cast(labels); + CDenseLabels* gnd_truth = reference->as(); + CDenseLabels* result = labels->as(); - float64_t denom=weights.sum(weights); + float64_t denom=linalg::sum(weights); float64_t numer=0.; switch (m_mode) { case PT_MULTICLASS: { - for (int32_t i=0;iget_label(i)!=result->get_label(i)) numer+=weights[i]; @@ -1356,7 +1334,7 @@ float64_t CCARTree::compute_error(CLabels* labels, CLabels* reference, SGVector< case PT_REGRESSION: { - for (int32_t i=0;iget_label(i)-result->get_label(i)),2); return numer/denom; @@ -1419,7 +1397,7 @@ CDynamicObjectArray* CCARTree::prune_tree(CTreeMachine* tree) return trees; } -float64_t CCARTree::find_weakest_alpha(bnode_t* node) +float64_t CCARTree::find_weakest_alpha(bnode_t* node) const { if (node->data.num_leaves!=1) { @@ -1434,7 +1412,7 @@ float64_t CCARTree::find_weakest_alpha(bnode_t* node) SG_UNREF(left); SG_UNREF(right); - return CMath::min(weak_links.vector,weak_links.vlen); + return *std::min_element(weak_links.begin(), weak_links.end()); } return CMath::MAX_REAL_NUMBER; diff --git a/src/shogun/multiclass/tree/CARTree.h b/src/shogun/multiclass/tree/CARTree.h index b2770e62e2b..ca78145a335 100644 --- a/src/shogun/multiclass/tree/CARTree.h +++ b/src/shogun/multiclass/tree/CARTree.h @@ -229,7 +229,7 @@ class CCARTree : public CTreeMachine void set_label_epsilon(float64_t epsilon); void pre_sort_features(CFeatures* data, SGMatrix& sorted_feats, SGMatrix& sorted_indices); - + void set_sorted_features(SGMatrix& sorted_feats, SGMatrix& sorted_indices); protected: @@ -247,7 +247,7 @@ class CCARTree : public CTreeMachine * @param level current tree depth * @return pointer to the root of the CART subtree */ - virtual CBinaryTreeMachineNode* CARTtrain(CFeatures* data, SGVector weights, CLabels* labels, int32_t level); + virtual CBinaryTreeMachineNode* CARTtrain(CDenseFeatures* data, const SGVector& weights, CDenseLabels* labels, int32_t level); /** modify labels for compute_best_attribute * @@ -255,7 +255,7 @@ class CCARTree : public CTreeMachine * @param n_ulabels stores number of unique labels * @return unique labels */ - SGVector get_unique_labels(SGVector labels_vec, int32_t &n_ulabels); + SGVector get_unique_labels(const SGVector& labels_vec, index_t &n_ulabels) const; /** computes best attribute for CARTtrain * @@ -270,9 +270,9 @@ class CCARTree : public CTreeMachine * @param count_right stores number of feature values for right transition * @return index to the best attribute */ - virtual int32_t compute_best_attribute(const SGMatrix& mat, const SGVector& weights, CLabels* labels, - SGVector& left, SGVector& right, SGVector& is_left_final, int32_t &num_missing, - int32_t &count_left, int32_t &count_right, int32_t subset_size=0, const SGVector& active_indices=SGVector()); + virtual index_t compute_best_attribute(const SGMatrix& mat, const SGVector& weights, CDenseLabels* labels, + SGVector& left, SGVector& right, SGVector& is_left_final, index_t &num_missing, + index_t &count_left, index_t &count_right, index_t subset_size=0, const SGVector& active_indices=SGVector()); /** handles missing values through surrogate splits @@ -283,7 +283,7 @@ class CCARTree : public CTreeMachine * @param attr best attribute chosen for split * @return vector denoting whether a data point goes to left child for all data points including ones with missing attributes */ - SGVector surrogate_split(SGMatrix data, SGVector weights, SGVector nm_left, int32_t attr); + SGVector surrogate_split(SGMatrix data, SGVector weights, SGVector nm_left, int32_t attr) const; /** handles missing values for a chosen continuous surrogate attribute @@ -298,9 +298,9 @@ class CCARTree : public CTreeMachine * @param attr surrogate attribute chosen for split * @return vector denoting whether a data point goes to left child for all data points including ones with missing attributes */ - void handle_missing_vecs_for_continuous_surrogate(SGMatrix m, CDynamicArray* missing_vecs, - CDynamicArray* association_index, CDynamicArray* intersect_vecs, SGVector is_left, - SGVector weights, float64_t p, int32_t attr); + void handle_missing_vecs_for_continuous_surrogate(SGMatrix m, const std::vector& missing_vecs, + std::vector& association_index, std::vector& intersect_vecs, + SGVector is_left, SGVector weights, float64_t p, index_t attr) const; /** handles missing values for a chosen nominal surrogate attribute * @@ -314,9 +314,9 @@ class CCARTree : public CTreeMachine * @param attr surrogate attribute chosen for split * @return vector denoting whether a data point goes to left child for all data points including ones with missing attributes */ - void handle_missing_vecs_for_nominal_surrogate(SGMatrix m, CDynamicArray* missing_vecs, - CDynamicArray* association_index, CDynamicArray* intersect_vecs, SGVector is_left, - SGVector weights, float64_t p, int32_t attr); + void handle_missing_vecs_for_nominal_surrogate(SGMatrix m, const std::vector& missing_vecs, + std::vector& association_index, const std::vector& intersect_vecs, + SGVector is_left, SGVector weights, float64_t p, index_t attr) const; /** returns gain in regression case * @@ -326,7 +326,8 @@ class CCARTree : public CTreeMachine * @param labels regression labels * @return least squared deviation gain achieved after spliting the node */ - float64_t gain(SGVector wleft, SGVector wright, SGVector wtotal, SGVector labels); + float64_t gain(const SGVector& wleft, const SGVector& wright, + const SGVector& wtotal, const SGVector& feats) const; /** returns gain in Gini impurity measure * @@ -335,7 +336,7 @@ class CCARTree : public CTreeMachine * @param wtotal label distribution in current node * @return Gini gain achieved after spliting the node */ - float64_t gain(const SGVector& wleft, const SGVector& wright, const SGVector& wtotal); + float64_t gain(const SGVector& wleft, const SGVector& wright, const SGVector& wtotal) const; /** returns Gini impurity of a node * @@ -343,7 +344,7 @@ class CCARTree : public CTreeMachine * @param total_weight stores the total weight of all classes * @return Gini index of the node */ - float64_t gini_impurity_index(const SGVector& weighted_lab_classes, float64_t &total_weight); + float64_t gini_impurity_index(const SGVector& weighted_lab_classes, float64_t &total_weight) const; /** returns least squares deviation * @@ -352,7 +353,7 @@ class CCARTree : public CTreeMachine * @param total_weight stores sum of weights in weights vector * @return least squares deviation of the data */ - float64_t least_squares_deviation(const SGVector& labels, const SGVector& weights, float64_t &total_weight); + float64_t least_squares_deviation(const SGVector& labels, const SGVector& weights, float64_t &total_weight) const; /** uses current subtree to classify/regress data * @@ -378,7 +379,7 @@ class CCARTree : public CTreeMachine * @param weights weights associated with the labels * @return error evaluated */ - float64_t compute_error(CLabels* labels, CLabels* reference, SGVector weights); + float64_t compute_error(CLabels* labels, CLabels* reference, SGVector weights) const; /** cost-complexity pruning * @@ -392,7 +393,7 @@ class CCARTree : public CTreeMachine * @param node the root of subtree whose weakest link it finds * @return alpha value corresponding to the weakest link in subtree */ - float64_t find_weakest_alpha(bnode_t* node); + float64_t find_weakest_alpha(bnode_t* node) const; /** recursively cuts weakest link(s) in a tree * diff --git a/src/shogun/multiclass/tree/RandomCARTree.cpp b/src/shogun/multiclass/tree/RandomCARTree.cpp index f2f0462c950..fdbd2df16ac 100644 --- a/src/shogun/multiclass/tree/RandomCARTree.cpp +++ b/src/shogun/multiclass/tree/RandomCARTree.cpp @@ -43,28 +43,24 @@ CRandomCARTree::~CRandomCARTree() { } -void CRandomCARTree::set_feature_subset_size(int32_t size) +void CRandomCARTree::set_feature_subset_size(index_t size) { REQUIRE(size>0, "Subset size should be greater than 0. %d supplied!\n",size) m_randsubset_size=size; } -int32_t CRandomCARTree::compute_best_attribute(const SGMatrix& mat, const SGVector& weights, CLabels* labels, - SGVector& left, SGVector& right, SGVector& is_left_final, int32_t &num_missing_final, int32_t &count_left, - int32_t &count_right, int32_t subset_size, const SGVector& active_indices) +index_t CRandomCARTree::compute_best_attribute(const SGMatrix& mat, const SGVector& weights, CDenseLabels* labels, + SGVector& left, SGVector& right, SGVector& is_left_final, index_t &num_missing_final, index_t &count_left, + index_t &count_right, index_t subset_size, const SGVector& active_indices) { - int32_t num_feats; - if(m_pre_sort) - num_feats=mat.num_cols; - else - num_feats=mat.num_rows; - + auto num_feats = (m_pre_sort) ? mat.num_cols : mat.num_rows; + // if subset size is not set choose sqrt(num_feats) by default if (m_randsubset_size==0) m_randsubset_size = std::sqrt((float64_t)num_feats); subset_size=m_randsubset_size; - + REQUIRE(subset_size<=num_feats, "The Feature subset size(set %d) should be less than" " or equal to the total number of features(%d here).\n",subset_size,num_feats) diff --git a/src/shogun/multiclass/tree/RandomCARTree.h b/src/shogun/multiclass/tree/RandomCARTree.h index 7a9f6e8725a..78cedfc3cc5 100644 --- a/src/shogun/multiclass/tree/RandomCARTree.h +++ b/src/shogun/multiclass/tree/RandomCARTree.h @@ -63,13 +63,13 @@ class CRandomCARTree : public CCARTree * * @param size subset size */ - void set_feature_subset_size(int32_t size); + void set_feature_subset_size(index_t size); /** get number of random features to choose in each node split * * @return size subset size */ - int32_t get_feature_subset_size() const { return m_randsubset_size; } + index_t get_feature_subset_size() const { return m_randsubset_size; } protected: /** computes best attribute for CARTtrain @@ -85,9 +85,9 @@ class CRandomCARTree : public CCARTree * @param count_right stores number of feature values for right transition * @return index to the best attribute */ - virtual int32_t compute_best_attribute(const SGMatrix& mat, const SGVector& weights, CLabels* labels, - SGVector& left, SGVector& right, SGVector& is_left_final, int32_t &num_missing, - int32_t &count_left, int32_t &count_right, int32_t subset_size=0, const SGVector& active_indices=SGVector()); + virtual index_t compute_best_attribute(const SGMatrix& mat, const SGVector& weights, CDenseLabels* labels, + SGVector& left, SGVector& right, SGVector& is_left_final, index_t &num_missing, + index_t &count_left, index_t &count_right, index_t subset_size=0, const SGVector& active_indices=SGVector()); private: /** initialize parameters */ @@ -95,7 +95,7 @@ class CRandomCARTree : public CCARTree private: /** random feature subset size */ - int32_t m_randsubset_size; + index_t m_randsubset_size; }; } /* namespace shogun */ diff --git a/tests/unit/lib/SGVector_unittest.cc b/tests/unit/lib/SGVector_unittest.cc index cf65042ed09..ff72df13e7c 100644 --- a/tests/unit/lib/SGVector_unittest.cc +++ b/tests/unit/lib/SGVector_unittest.cc @@ -418,3 +418,13 @@ TEST(SGVectorTest,iterator) for (auto v: t) EXPECT_EQ(t[index++], v); } + +TEST(SGVectorTest,unique) +{ + SGVector vec{1,2,3,1,2,3,3,4,5,4,5,6,7}; + auto num_unique = vec.unique(vec.vector, vec.vlen); + + EXPECT_EQ(7, num_unique); + for (index_t i = 0; i < num_unique; ++i) + EXPECT_EQ(i+1, vec[i]); +}