Skip to content

Commit

Permalink
unittest added + brief added
Browse files Browse the repository at this point in the history
  • Loading branch information
mazumdarparijat committed Mar 11, 2014
1 parent a551ff2 commit 8712416
Show file tree
Hide file tree
Showing 3 changed files with 340 additions and 122 deletions.
250 changes: 137 additions & 113 deletions src/shogun/multiclass/tree/ID3ClassifierTree.cpp
@@ -1,6 +1,5 @@
/*
* Copyright (c) The Shogun Machine Learning Toolbox
* Written (w) 2013 Monica Dragan
* Written (w) 2014 Parijat Mazumdar
* All rights reserved.
*
Expand Down Expand Up @@ -33,6 +32,8 @@
#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/multiclass/tree/ID3ClassifierTree.h>
#include <iostream>
using namespace std;

using namespace shogun;

Expand All @@ -45,77 +46,55 @@ CID3ClassifierTree::~CID3ClassifierTree()
{
}

float64_t CID3ClassifierTree::informational_gain_attribute(int32_t attr_no, CFeatures* data,
CMulticlassLabels* class_labels)
CMulticlassLabels* CID3ClassifierTree::apply_multiclass(CFeatures* data)
{
REQUIRE(data,"data required for information gain calculation")
REQUIRE(data->get_feature_class()==C_DENSE,
"Dense data required for information gain calculation")
REQUIRE(data, "Data required for classification in apply_multiclass")

float64_t gain = 0;
CDenseFeatures<float64_t>* feats = (CDenseFeatures<float64_t>*) data;
int32_t num_vecs = feats->get_num_vectors();
SGVector<float64_t> labels = SGVector<float64_t>(num_vecs);

//get attribute values for attribute
SGVector<float64_t> attribute_values = SGVector<float64_t>(num_vecs);

for(int32_t i=0; i<num_vecs; i++)
attribute_values[i] = (feats->get_feature_vector(i))[attr_no];

CMulticlassLabels* attribute_labels = new CMulticlassLabels(attribute_values);
SGVector<float64_t> attr_val_unique = attribute_labels->get_unique_labels();

for(int32_t i=0; i<attr_val_unique.vlen; i++)
for (int32_t i=0; i<num_vecs; i++)
{
//calculate class entropy for the specific attribute_value
int32_t attr_count=0;
SGVector<float64_t> sample = feats->get_feature_vector(i);
node_t* node = get_root();
CDynamicObjectArray* children = node->get_children();

for(int32_t j=0; i<num_vecs; j++)
while (children->get_num_elements())
{
if(attribute_values[j] == attr_val_unique[i])
attr_count++;
}

float64_t label_entropy = entropy(class_labels,
attribute_values.vector, attr_val_unique[i]);

gain += (attr_count-0.f)/(num_vecs-0.f)*label_entropy;

}
int32_t flag = 0;
for (int32_t j=0; j<children->get_num_elements(); j++)
{
node_t* child = (node_t*) children->get_element(j);
if (child->data.transit_if_feature_value
== sample[node->data.attribute_id])
{
flag = 1;

SG_UNREF(attribute_labels);
SG_UNREF(node);
node = child;

float64_t data_entropy = entropy(class_labels);
gain = data_entropy-gain;

return gain;
}
SG_UNREF(children);
children = node->get_children();

float64_t CID3ClassifierTree::entropy(CMulticlassLabels* labels, float64_t*
feature_values, float64_t active_value)
{
float64_t entr = 0;
break;
}

for(int32_t i=0;i<labels->get_unique_labels().size();i++)
{
int32_t count = 0;
for(int32_t j=0;j<labels->get_num_labels();j++)
{
if((feature_values == NULL) ||
(feature_values[j] == active_value))
{
if(labels->get_unique_labels()[i] ==
labels->get_label(j))
count++;
SG_UNREF(child);
}

if (!flag)
break;
}
float64_t ratio = (count-0.f)/(labels->get_num_labels()-0.f);

labels[i] = node->data.class_label;

if(ratio != 0)
entr -= ratio*(CMath::log2(ratio));
SG_UNREF(node);
SG_UNREF(children);
}

return entr;

CMulticlassLabels* ret = new CMulticlassLabels(labels);
return ret;
}

bool CID3ClassifierTree::train_machine(CFeatures* data)
Expand All @@ -125,43 +104,70 @@ bool CID3ClassifierTree::train_machine(CFeatures* data)

int32_t num_features = ((CDenseFeatures<float64_t>*) data)->get_num_features();
SGVector<int32_t> feature_ids = SGVector<int32_t>(num_features);
feature_ids.range_fill();

for (int32_t i=0; i<num_features; i++)
feature_ids[i] = i;

m_root = id3train(data, (CMulticlassLabels*) m_labels, feature_ids, 0);
set_root(id3train(data, (CMulticlassLabels*) m_labels, feature_ids, 0));

return true;
}

CTreeMachineNode<id3TreeNodeData>* CID3ClassifierTree::id3train(CFeatures* data,
CMulticlassLabels* class_labels, SGVector<int32_t> feature_id_vector, int32_t level)
{
{
node_t* node = new node_t();
CDenseFeatures<float64_t>* feats = (CDenseFeatures<float64_t>*) data;
int32_t num_vecs = feats->get_num_vectors();

//if all samples belong to the same class
// if all samples belong to the same class
if(class_labels->get_unique_labels().size() == 1)
{
node->data.class_label=class_labels->get_unique_labels()[0];
node->data.class_label = class_labels->get_unique_labels()[0];
return node;
}

//if only one feature is left
// if no feature is left
if(feature_id_vector.vlen == 0)
{
// decide label - label occurring max times
SGVector<float64_t> labels = class_labels->get_labels();
labels.qsort();

int32_t most_label = labels[0];
int32_t most_num = 1;
int32_t count = 1;
int32_t i = 1;

while (i<labels.vlen)
{
while ((labels[i] == labels[i-1]) && (i<labels.vlen))
{
count++;
i++;
}

if (count>most_num)
{
most_num = count;
most_label = labels[i-1];
}

count = 1;
i++;
}

node->data.class_label = most_label;
return node;
}

//else get the feature with the highest informational gain
// else get the feature with the highest informational gain
float64_t max = 0;
int32_t best_feature_index = -1;
for(int32_t i=0; i<feats->get_num_features(); i++)
{
float64_t gain = informational_gain_attribute(i,feats,class_labels);
float64_t gain = informational_gain_attribute(i,feats,class_labels);

if(gain > max){
if(gain > max)
{
max = gain;
best_feature_index = i;
}
Expand All @@ -184,13 +190,10 @@ CTreeMachineNode<id3TreeNodeData>* CID3ClassifierTree::id3train(CFeatures* data,
for(int32_t j=0; j<num_vecs; j++)
{
if( active_feature_value == best_feature_values[j])
{
num_cols++;
}
}

SGMatrix<float64_t> mat = SGMatrix<float64_t>(feats->get_num_features()-1,
num_cols);

SGMatrix<float64_t> mat = SGMatrix<float64_t>(feats->get_num_features()-1, num_cols);
SGVector<float64_t> new_labels_vector = SGVector<float64_t>(num_cols);

int32_t cnt = 0;
Expand All @@ -212,22 +215,19 @@ CTreeMachineNode<id3TreeNodeData>* CID3ClassifierTree::id3train(CFeatures* data,
}
}

CMulticlassLabels* new_class_labels = new CMulticlassLabels(new_labels_vector);

//remove the best_attribute from the remaining attributes index vector
SGVector<int32_t> new_feature_id_vector =
SGVector<int32_t>(feature_id_vector.vlen-1);
SGVector<int32_t> new_feature_id_vector = SGVector<int32_t>(feature_id_vector.vlen-1);
cnt = -1;
for(int32_t j=0;j<feature_id_vector.vlen;j++)
{
if(j!=best_feature_index)
new_feature_id_vector[++cnt] = feature_id_vector[j];
}

CMulticlassLabels* new_class_labels = new CMulticlassLabels(new_labels_vector);
CDenseFeatures<float64_t>* new_data = new CDenseFeatures<float64_t>(mat);

node_t* child = id3train(new_data, new_class_labels,
new_feature_id_vector, level+1);
node_t* child = id3train(new_data, new_class_labels, new_feature_id_vector, level+1);
child->data.transit_if_feature_value = active_feature_value;
node->data.attribute_id = feature_id_vector[best_feature_index];
node->add_child(child);
Expand All @@ -241,55 +241,79 @@ CTreeMachineNode<id3TreeNodeData>* CID3ClassifierTree::id3train(CFeatures* data,
return node;
}

CMulticlassLabels* CID3ClassifierTree::apply_multiclass(CFeatures* data)
float64_t CID3ClassifierTree::informational_gain_attribute(int32_t attr_no, CFeatures* data,
CMulticlassLabels* class_labels)
{
REQUIRE(data, "Data required for classification in apply_multiclass")
REQUIRE(data,"Data required for information gain calculation")
REQUIRE(data->get_feature_class()==C_DENSE,
"Dense data required for information gain calculation")

float64_t gain = 0;
CDenseFeatures<float64_t>* feats = (CDenseFeatures<float64_t>*) data;
int32_t num_vecs = feats->get_num_vectors();
SGVector<float64_t> labels = SGVector<float64_t>(num_vecs);

for (int32_t i=0; i<num_vecs; i++)
{
SGVector<float64_t> sample = feats->get_feature_vector(i);
node_t* node = m_root;
SG_REF(node);
CDynamicObjectArray* children = node->get_children();
//get attribute values for attribute
SGVector<float64_t> attribute_values = SGVector<float64_t>(num_vecs);

while (children->get_num_elements())
{
int32_t flag = 0;
for (int32_t j=0; j<children->get_num_elements(); j++)
{
node_t* child = (node_t*) children->get_element(j);
if (child->data.transit_if_feature_value
== sample[node->data.attribute_id])
{
flag = 1;
for(int32_t i=0; i<num_vecs; i++)
attribute_values[i] = (feats->get_feature_vector(i))[attr_no];

SG_UNREF(node);
SG_REF(child);
node = child;
CMulticlassLabels* attribute_labels = new CMulticlassLabels(attribute_values);
SGVector<float64_t> attr_val_unique = attribute_labels->get_unique_labels();

SG_UNREF(children);
children = node->get_children();
for(int32_t i=0; i<attr_val_unique.vlen; i++)
{
//calculate class entropy for the specific attribute_value
int32_t attr_count=0;

break;
}
for(int32_t j=0; j<num_vecs; j++)
{
if(attribute_values[j] == attr_val_unique[i])
attr_count++;
}

SG_UNREF(child);
}
SGVector<float64_t> sub_class = SGVector<float64_t>(attr_count);
int32_t count = 0;

if (!flag)
break;
for(int32_t j=0; j<num_vecs; j++)
{
if(attribute_values[j] == attr_val_unique[i])
sub_class[count++] = class_labels->get_label(j);
}

labels[i] = node->data.class_label;
CMulticlassLabels* sub_labels = new CMulticlassLabels(sub_class);
float64_t sub_entropy = entropy(sub_labels);
gain += sub_entropy*(attr_count-0.f)/(num_vecs-0.f);

SG_UNREF(node);
SG_UNREF(children);
SG_UNREF(sub_labels);
}

float64_t data_entropy = entropy(class_labels);
gain = data_entropy-gain;

SG_UNREF(attribute_labels);

CMulticlassLabels* ret = new CMulticlassLabels(labels);
return ret;
return gain;
}

/** Computes the entropy, in bits (base-2 logarithm), of the class
 * distribution described by a set of multiclass labels.
 *
 * For every distinct label value the fraction of entries carrying that
 * value is determined, and -ratio*log2(ratio) is accumulated over all
 * distinct values.
 *
 * @param labels labels whose class distribution is evaluated (must not be NULL)
 * @return accumulated entropy; 0 when there are no labels or only a
 * single distinct label value
 */
float64_t CID3ClassifierTree::entropy(CMulticlassLabels* labels)
{
	REQUIRE(labels, "Labels required for entropy calculation")

	// Both quantities are loop-invariant: fetch them once instead of
	// re-querying the labels object on every iteration of the loops below.
	SGVector<float64_t> unique_labels = labels->get_unique_labels();
	const int32_t num_labels = labels->get_num_labels();

	float64_t entr = 0;

	for (int32_t i=0; i<unique_labels.vlen; i++)
	{
		// number of entries carrying the i-th distinct label value
		int32_t count = 0;
		for (int32_t j=0; j<num_labels; j++)
		{
			if (unique_labels[i] == labels->get_label(j))
				count++;
		}

		// a zero count contributes nothing and must not reach log2(0)
		if (count == 0)
			continue;

		// divide in double precision; the former (count-0.f) idiom
		// performed the division in single precision before widening
		const float64_t ratio = static_cast<float64_t>(count)/num_labels;
		entr -= ratio*(CMath::log2(ratio));
	}

	return entr;
}

0 comments on commit 8712416

Please sign in to comment.