Skip to content

Commit

Permalink
Merge pull request #420 from pluskid/normalized-mutual-info
Browse files Browse the repository at this point in the history
Normalized mutual information
  • Loading branch information
Soeren Sonnenburg committed Apr 5, 2012
2 parents 7a9d7d0 + 4658048 commit 29f4059
Show file tree
Hide file tree
Showing 9 changed files with 209 additions and 39 deletions.
5 changes: 5 additions & 0 deletions examples/undocumented/python_modular/evaluation_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from shogun.Features import RealFeatures
from shogun.Features import Labels
from shogun.Evaluation import ClusteringAccuracy
from shogun.Evaluation import ClusteringMutualInformation

def get_dataset():
from os.path import exists
Expand Down Expand Up @@ -65,3 +66,7 @@ def assign_labels(data, centroids):
accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
print('Clustering accuracy = %.4f' % accuracy)

MIEval = ClusteringMutualInformation()
mutual_info = MIEval.evaluate(gnd_hat, gnd)
print('Clustering mutual information = %.4f' % mutual_info)

2 changes: 2 additions & 0 deletions src/interfaces/modular/Evaluation.i
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
%rename(BinaryClassEvaluation) CBinaryClassEvaluation;
%rename(ClusteringEvaluation) CClusteringEvaluation;
%rename(ClusteringAccuracy) CClusteringAccuracy;
%rename(ClusteringMutualInformation) CClusteringMutualInformation;
%rename(ContingencyTableEvaluation) CContingencyTableEvaluation;
%rename(MulticlassAccuracy) CMulticlassAccuracy;
%rename(MeanAbsoluteError) CMeanAbsoluteError;
Expand All @@ -39,6 +40,7 @@
%include <shogun/evaluation/BinaryClassEvaluation.h>
%include <shogun/evaluation/ClusteringEvaluation.h>
%include <shogun/evaluation/ClusteringAccuracy.h>
%include <shogun/evaluation/ClusteringMutualInformation.h>
%include <shogun/evaluation/ContingencyTableEvaluation.h>
%include <shogun/evaluation/MulticlassAccuracy.h>
%include <shogun/evaluation/MeanAbsoluteError.h>
Expand Down
1 change: 1 addition & 0 deletions src/interfaces/modular/Evaluation_includes.i
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <shogun/evaluation/BinaryClassEvaluation.h>
#include <shogun/evaluation/ClusteringEvaluation.h>
#include <shogun/evaluation/ClusteringAccuracy.h>
#include <shogun/evaluation/ClusteringMutualInformation.h>
#include <shogun/evaluation/ContingencyTableEvaluation.h>
#include <shogun/evaluation/MulticlassAccuracy.h>
#include <shogun/evaluation/MeanAbsoluteError.h>
Expand Down
26 changes: 26 additions & 0 deletions src/shogun/evaluation/ClusteringAccuracy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2012 Chiyuan Zhang
* Copyright (C) 2012 Chiyuan Zhang
*/

#include <shogun/evaluation/ClusteringAccuracy.h>

using namespace shogun;

float64_t CClusteringAccuracy::evaluate(CLabels* predicted, CLabels* ground_truth)
{
SGVector<int32_t> predicted_ilabels=predicted->get_int_labels();
SGVector<int32_t> groundtruth_ilabels=ground_truth->get_int_labels();
int32_t correct=0;
for (int32_t i=0; i < predicted_ilabels.vlen; ++i)
{
if (predicted_ilabels[i] == groundtruth_ilabels[i])
correct++;
}
return float64_t(correct)/predicted_ilabels.vlen;
}
13 changes: 1 addition & 12 deletions src/shogun/evaluation/ClusteringAccuracy.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,7 @@ class CClusteringAccuracy: public CClusteringEvaluation
* @param ground_truth labels assumed to be correct
* @return evaluation result
*/
virtual float64_t evaluate(CLabels* predicted, CLabels* ground_truth)
{
SGVector<int32_t> predicted_ilabels=predicted->get_int_labels();
SGVector<int32_t> groundtruth_ilabels=ground_truth->get_int_labels();
int32_t correct=0;
for (int32_t i=0; i < predicted_ilabels.vlen; ++i)
{
if (predicted_ilabels[i] == groundtruth_ilabels[i])
correct++;
}
return float64_t(correct)/predicted_ilabels.vlen;
}
virtual float64_t evaluate(CLabels* predicted, CLabels* ground_truth);

/** @return whether criterium has to be maximized or minimized */
virtual EEvaluationDirection get_evaluation_direction()
Expand Down
51 changes: 24 additions & 27 deletions src/shogun/evaluation/ClusteringEvaluation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,62 +19,57 @@
using namespace shogun;
using namespace std;

static void unique_labels(CLabels* labels, vector<int32_t>& result)
int32_t CClusteringEvaluation::find_match_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2)
{
set<int32_t> uniq_lbl;
for (int32_t i=labels->get_num_labels()-1; i >= 0; --i)
int32_t match_count=0;
for (int32_t i=l1.vlen-1; i >= 0; --i)
{
uniq_lbl.insert(labels->get_int_label(i));
if (l1[i] == m1 && l2[i] == m2)
match_count++;
}
result.assign(uniq_lbl.begin(), uniq_lbl.end());

return match_count;
}

static int32_t find_mismatch_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2)
int32_t CClusteringEvaluation::find_mismatch_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2)
{
int32_t mismatch_count=0;
for (int32_t i=l1.vlen-1; i >= 0; --i)
{
if (l1[i] != m1 || l2[i] != m2)
mismatch_count++;
}

return mismatch_count;
return l1.vlen - find_match_count(l1, m1, l2, m2);
}

void CClusteringEvaluation::best_map(CLabels* predicted, CLabels* ground_truth)
{
ASSERT(predicted->get_num_labels() == ground_truth->get_num_labels());
vector<int32_t> label_p, label_g;
unique_labels(predicted, label_p);
unique_labels(ground_truth, label_g);
SGVector<float64_t> label_p=predicted->get_unique_labels();
SGVector<float64_t> label_g=ground_truth->get_unique_labels();

SGVector<int32_t> predicted_ilabels=predicted->get_int_labels();
SGVector<int32_t> groundtruth_ilabels=ground_truth->get_int_labels();

int32_t n_class=max(label_p.size(), label_g.size());
SGMatrix<double> G(n_class, n_class);
int32_t n_class=max(label_p.vlen, label_g.vlen);
SGMatrix<float64_t> G(n_class, n_class);
G.zero();

for (size_t i=0; i < label_g.size(); ++i)
for (size_t i=0; i < label_g.vlen; ++i)
{
for (size_t j=0; j < label_p.size(); ++j)
for (size_t j=0; j < label_p.vlen; ++j)
{
G(i, j)=find_mismatch_count(groundtruth_ilabels, label_g[i],
predicted_ilabels, label_p[j]);
G(i, j)=find_mismatch_count(groundtruth_ilabels, static_cast<int32_t>(label_g[i]),
predicted_ilabels, static_cast<int32_t>(label_p[j]));
}
}

Munkres munkres_solver(G);
munkres_solver.solve();

map<int32_t, int32_t> label_map;
for (size_t i=0; i < label_p.size(); ++i)
std::map<int32_t, int32_t> label_map;
for (size_t i=0; i < label_p.vlen; ++i)
{
for (size_t j=0; j < label_g.size(); ++j)
for (size_t j=0; j < label_g.vlen; ++j)
{
if (G(j, i) == 0)
{
label_map.insert(make_pair(label_p[i], label_g[j]));
label_map.insert(make_pair(static_cast<int32_t>(label_p[i]),
static_cast<int32_t>(label_g[j])));
break;
}
}
Expand All @@ -83,5 +78,7 @@ void CClusteringEvaluation::best_map(CLabels* predicted, CLabels* ground_truth)
for (int32_t i= 0; i < predicted_ilabels.vlen; ++i)
predicted->set_int_label(i, label_map[predicted_ilabels[i]]);

label_p.free_vector();
label_g.free_vector();
G.destroy_matrix();
}
18 changes: 18 additions & 0 deletions src/shogun/evaluation/ClusteringEvaluation.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#ifndef __CLUSTERINGEVALUATION_H__
#define __CLUSTERINGEVALUATION_H__

#include <vector>

#include <shogun/evaluation/Evaluation.h>
#include <shogun/features/Labels.h>

Expand Down Expand Up @@ -43,6 +45,22 @@ class CClusteringEvaluation: public CEvaluation
* @return evaluation result
*/
virtual float64_t evaluate(CLabels* predicted, CLabels* ground_truth) = 0;
protected:
/** find number of matches in the two labels sequence.
*
* For each index i, if l1[i] == m1 and l2[i] == m2, then we get a match.
* @param l1 the first label sequence to be matched
* @param m1 the first label to match
* @param l2 the second label sequence to be matched
* @param m2 the second label to match
* @return number of matches
*/
int32_t find_match_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2);

/** find number of mismatches in the two labels sequence.
* @see find_match_count
*/
int32_t find_mismatch_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2);
};

} // namespace shogun
Expand Down
73 changes: 73 additions & 0 deletions src/shogun/evaluation/ClusteringMutualInformation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2012 Chiyuan Zhang
* Copyright (C) 2012 Chiyuan Zhang
*/

#include <vector>
#include <cmath>
#include <algorithm>

#include <shogun/evaluation/ClusteringMutualInformation.h>

using namespace shogun;

float64_t CClusteringMutualInformation::evaluate(CLabels* predicted, CLabels* ground_truth)
{
SGVector<float64_t> label_p=predicted->get_unique_labels();
SGVector<float64_t> label_g=ground_truth->get_unique_labels();

if (label_p.vlen != label_g.vlen)
SG_ERROR("Number of classes are different\n");
uint32_t n_class=label_p.vlen;
float64_t n_label=predicted->get_num_labels();

SGVector<int32_t> ilabels_p=predicted->get_int_labels();
SGVector<int32_t> ilabels_g=ground_truth->get_int_labels();

SGMatrix<float64_t> G(n_class, n_class);
for (size_t i=0; i < n_class; ++i)
{
for (size_t j=0; j < n_class; ++j)
G(i, j)=find_match_count(ilabels_g, label_g[i],
ilabels_p, label_p[j])/n_label;
}

std::vector<float64_t> G_rowsum(n_class), G_colsum(n_class);
for (size_t i=0; i < n_class; ++i)
{
for (size_t j=0; j < n_class; ++j)
{
G_rowsum[i] += G(i, j);
G_colsum[i] += G(j, i);
}
}

float64_t mutual_info = 0;
for (size_t i=0; i < n_class; ++i)
{
for (size_t j=0; j < n_class; ++j)
{
if (G(i, j) != 0)
mutual_info += G(i, j) * log(G(i,j) /
(G_rowsum[i]*G_colsum[j]))/log(2.);
}
}

float64_t entropy_p = 0, entropy_g = 0;
for (size_t i=0; i < n_class; ++i)
{
entropy_g += -G_rowsum[i] * log(G_rowsum[i])/log(2.);
entropy_p += -G_colsum[i] * log(G_colsum[i])/log(2.);
}

label_p.free_vector();
label_g.free_vector();
G.destroy_matrix();

return mutual_info / std::max(entropy_g, entropy_p);
}
59 changes: 59 additions & 0 deletions src/shogun/evaluation/ClusteringMutualInformation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2012 Chiyuan Zhang
* Copyright (C) 2012 Chiyuan Zhang
*/

#ifndef __CLUSTERINGMUTUALINFORMATION_H__
#define __CLUSTERINGMUTUALINFORMATION_H__

#include <shogun/evaluation/ClusteringEvaluation.h>

namespace shogun
{

/** @brief clustering (normalized) mutual information
*/
class CClusteringMutualInformation: public CClusteringEvaluation
{
public:
/** constructor */
CClusteringMutualInformation(): CClusteringEvaluation() {}

/** destructor */
virtual ~CClusteringMutualInformation() {}

/** evaluate labels
* Make sure to call CClusteringEvaluation::best_map to map the predicted label
* before calculating mutual information.
*
* @param predicted labels for evaluating
* @param ground_truth labels assumed to be correct
* @return evaluation result
*/
virtual float64_t evaluate(CLabels* predicted, CLabels* ground_truth);

/** @return whether criterium has to be maximized or minimized */
virtual EEvaluationDirection get_evaluation_direction()
{
return ED_MINIMIZE;
}

/** Returns the name of the SGSerializable instance. It MUST BE
* the CLASS NAME without the prefixed `C'.
*
* @return name of the SGSerializable
*/
virtual const char* get_name() const
{
return "ClusteringMutualInformation";
}
};

}

#endif /* end of include guard: __CLUSTERINGMUTUALINFORMATION_H__ */

0 comments on commit 29f4059

Please sign in to comment.