Skip to content

Commit

Permalink
BaggingMachine bugs fixed, CARTree code refactored, unittests added
Browse files Browse the repository at this point in the history
  • Loading branch information
mazumdarparijat committed Jun 6, 2014
1 parent dcba16c commit 4f8d0f5
Show file tree
Hide file tree
Showing 9 changed files with 621 additions and 313 deletions.
25 changes: 15 additions & 10 deletions src/shogun/machine/BaggingMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ SGVector<float64_t> CBaggingMachine::apply_get_outputs(CFeatures* data)
{
CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
CLabels* l = m->apply(data);
SGVector<float64_t> lv = l->get_values();
SGVector<float64_t> lv = dynamic_cast<CDenseLabels*>(l)->get_labels();
float64_t* bag_results = output.get_column_vector(i);
memcpy(bag_results, lv.vector, lv.vlen*sizeof(float64_t));

Expand All @@ -108,9 +108,9 @@ bool CBaggingMachine::train_machine(CFeatures* data)
ASSERT(m_features->get_num_vectors() == m_labels->get_num_labels());
}

// bag size less than equal to number of feature vector
// bag size must be less than or equal to the number of feature vectors
REQUIRE((get_bag_size() <= m_features->get_num_vectors()) && (get_bag_size() > 0), "bag size (%d currently) "
" should be greater than 0 but less than equal to number of training vectors (%d here)\n",get_bag_size());
" should be greater than 0 but less than or equal to number of training vectors (%d here)\n",get_bag_size());

// clear the array, if previously trained
m_bags->reset_array();
Expand All @@ -129,7 +129,7 @@ bool CBaggingMachine::train_machine(CFeatures* data)
*/
for (int32_t i = 0; i < m_num_bags; ++i)
{
CMachine* c = dynamic_cast<CMachine*>(m_machine->clone());
CMachine* c=dynamic_cast<CMachine*>(m_machine->clone());
ASSERT(c != NULL);
SGVector<index_t> idx(get_bag_size());
idx.random(0, m_features->get_num_vectors()-1);
Expand Down Expand Up @@ -163,6 +163,8 @@ bool CBaggingMachine::train_machine(CFeatures* data)

// add trained machine to bag array
m_bags->append_element(c);

SG_UNREF(c);
}

return true;
Expand Down Expand Up @@ -268,11 +270,10 @@ float64_t CBaggingMachine::get_oob_error(CEvaluation* eval) const
= dynamic_cast<CDynamicArray<index_t>*>(m_oob_indices->get_element(i));

SGVector<index_t> oob(current_oob->get_array(), current_oob->get_num_elements(), false);
oob.display_vector();
m_features->add_subset(oob);

CLabels* l = m->apply(m_features);
SGVector<float64_t> lv = l->get_values();
SGVector<float64_t> lv = dynamic_cast<CDenseLabels*>(l)->get_labels();

// assign the values in the matrix (NAN) that are in-bag!
for (index_t j = 0; j < oob.vlen; j++)
Expand All @@ -283,7 +284,6 @@ float64_t CBaggingMachine::get_oob_error(CEvaluation* eval) const
SG_UNREF(m);
SG_UNREF(l);
}
output.display_matrix();

DynArray<index_t> idx;
for (index_t i = 0; i < m_features->get_num_vectors(); i++)
Expand All @@ -293,19 +293,23 @@ float64_t CBaggingMachine::get_oob_error(CEvaluation* eval) const
}

SGVector<float64_t> combined = m_combination_rule->combine(output);
SGVector<float64_t> lab(idx.get_num_elements());
for (int32_t i=0;i<lab.vlen;i++)
lab[i]=combined[idx.get_element(i)];

CLabels* predicted = NULL;
switch (m_labels->get_label_type())
{
case LT_BINARY:
predicted = new CBinaryLabels(combined);
predicted = new CBinaryLabels(lab);
break;

case LT_MULTICLASS:
predicted = new CMulticlassLabels(combined);
predicted = new CMulticlassLabels(lab);
break;

case LT_REGRESSION:
predicted = new CRegressionLabels(combined);
predicted = new CRegressionLabels(lab);
break;

default:
Expand All @@ -316,6 +320,7 @@ float64_t CBaggingMachine::get_oob_error(CEvaluation* eval) const
float64_t res = eval->evaluate(predicted, m_labels);
m_labels->remove_subset();

SG_UNREF(predicted);
return res;
}

Expand Down
21 changes: 17 additions & 4 deletions src/shogun/machine/RandomForest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ CRandomForest::CRandomForest(CFeatures* features, CLabels* labels, SGVector<floa
dynamic_cast<CRandomCARTree*>(m_machine)->set_feature_subset_size(rand_numfeats);
}

/** Destructor. The body is empty; no cleanup is performed at this level. */
CRandomForest::~CRandomForest() {}

void CRandomForest::set_bag_size(int32_t bag_size)
{
SG_ERROR("Bag Size is set to be equal to number of training vectors and cannot be changed\n")
Expand Down Expand Up @@ -116,12 +120,12 @@ void CRandomForest::set_machine_problem_type(EProblemType mode)
dynamic_cast<CRandomCARTree*>(m_machine)->set_machine_problem_type(mode);
}

void CRandomForest::set_random_features_num(int32_t rand_featsize)
void CRandomForest::set_num_random_features(int32_t rand_featsize)
{
	// m_machine is viewed as a CRandomCARTree; the value is forwarded to it
	// as the tree's feature subset size.
	CRandomCARTree* tree=dynamic_cast<CRandomCARTree*>(m_machine);
	tree->set_feature_subset_size(rand_featsize);
}

int32_t CRandomForest::get_random_features_num() const
int32_t CRandomForest::get_num_random_features() const
{
	// m_machine is viewed as a CRandomCARTree; its feature subset size is
	// reported back to the caller.
	CRandomCARTree* tree=dynamic_cast<CRandomCARTree*>(m_machine);
	return tree->get_feature_subset_size();
}
Expand All @@ -132,13 +136,22 @@ void CRandomForest::set_machine_parameters(CMachine* m, SGVector<index_t> idx)
CRandomCARTree* tree=dynamic_cast<CRandomCARTree*>(m);

SGVector<float64_t> weights(idx.vlen);
for (int32_t i=0;i<idx.vlen;i++)
weights[i]=m_weights[idx[i]];

if (m_weights.vlen==0)
{
weights.fill_vector(weights.vector,weights.vlen,1.0);
}
else
{
for (int32_t i=0;i<idx.vlen;i++)
weights[i]=m_weights[idx[i]];
}

tree->set_weights(weights);
}

void CRandomForest::init()
{
m_machine=new CRandomCARTree();
SG_REF(m_machine);
}
62 changes: 28 additions & 34 deletions src/shogun/machine/RandomForest.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,30 +36,34 @@

namespace shogun
{

/** @brief This class implements the Random Forests algorithm. In Random Forests algorithm, we train a number of randomized CART trees
* (see class CRandomCARTree) using the supplied training data. The number of trees to be trained is a parameter (called number of bags)
* controlled by the user. Test feature vectors are classified/regressed by combining the outputs of all these trained candidate trees using a
* combination rule (see class CCombinationRule). Calculation of the out-of-bag error is also provided to help determine the
* appropriate number of bags. The evaluation criterion for calculating this out-of-bag error is specified by the user (see class CEvaluation).
*/
class CRandomForest : public CBaggingMachine
{
public:
/** constructor */
CRandomForest();

/**
* constructor
/** constructor
*
* @param rand_numfeats number of attributes chosen randomly during node split in candidate trees
*/
CRandomForest(int32_t rand_numfeats);

/**
* constructor
/** constructor
*
* @param features training features
* @param labels training labels
* @param rand_numfeats number of attributes chosen randomly during node split in candidate trees
*/
CRandomForest(CFeatures* features, CLabels* labels, int32_t rand_numfeats=0);

/**
* constructor
/** constructor
*
* @param features training features
* @param labels training labels
Expand All @@ -68,94 +72,84 @@ class CRandomForest : public CBaggingMachine
*/
CRandomForest(CFeatures* features, CLabels* labels, SGVector<float64_t> weights, int32_t rand_numfeats=0);

/**
* Bag size is set at number of training feature vectors and cannot be changed.
/** destructor */
virtual ~CRandomForest();

/** Bag size is set to number of training feature vectors and cannot be changed.
*
* @param bag_size number of vectors to use for a bag
*/
virtual void set_bag_size(int32_t bag_size);

/**
* Get number of feature vectors that are use
/** Get number of feature vectors that are used
* for training each bag/machine
*
* @return number of vectors used for training for each bag.
*/
virtual int32_t get_bag_size() const;

/**
* get name
/** get name
*
* @return RandomForest
*/
virtual const char* get_name() const { return "RandomForest"; }

/**
* machine is set to modified CART(RandomCART) and cannot be changed
/** machine is set to modified CART(RandomCART) and cannot be changed
*
* @param machine the machine to use for bagging
*/
virtual void set_machine(CMachine* machine);

/**
* set weights
/** set weights
*
* @param weights of training feature vectors
*/
void set_weights(SGVector<float64_t> weights);

/**
* get weights
/** get weights
*
* @return weights of training feature vectors
*/
SGVector<float64_t> get_weights() const;

/**
* set feature types of various features
/** set feature types of various features
*
* @param ft bool vector true for nominal feature false for continuous feature type
*/
void set_feature_types(SGVector<bool> ft);

/**
* get feature types of various features
/** get feature types of various features
*
* @return bool vector - true for nominal feature false for continuous feature type
*/
SGVector<bool> get_feature_types() const;

/**
* get problem type - multiclass classification or regression
/** get problem type - multiclass classification or regression
*
* @return PT_MULTICLASS or PT_REGRESSION
*/
virtual EProblemType get_machine_problem_type() const;

/**
* set problem type - multiclass classification or regression
/** set problem type - multiclass classification or regression
*
* @param mode EProblemType PT_MULTICLASS or PT_REGRESSION
*/
void set_machine_problem_type(EProblemType mode);

/**
* set number of random features to be chosen during node splits
/** set number of random features to be chosen during node splits
*
* @param rand_featsize number of randomly chosen features during each node split
*/
void set_random_features_num(int32_t rand_featsize);
void set_num_random_features(int32_t rand_featsize);

/**
* get number of random features to be chosen during node splits
/** get number of random features to be chosen during node splits
*
* @return number of randomly chosen features during each node split
*/
int32_t get_random_features_num() const;
int32_t get_num_random_features() const;

protected:
/**
* sets parameters of CARTree - sets machine labels and weights here
/** sets parameters of CARTree - sets machine labels and weights here
*
* @param m machine
* @param idx indices of training vectors chosen in current bag
Expand Down
Loading

0 comments on commit 4f8d0f5

Please sign in to comment.