Implement RescaleFeatures preprocessor

See the doxygen class description. Add new get_row_vector(index) method for SGMatrix
shogun-toolbox · May 10, 2013 · 173b59d · 173b59d
1 parent f68d3cb
commit 173b59d
Show file tree

Hide file tree

Showing 8 changed files with 279 additions and 1 deletion.
diff --git a/src/interfaces/modular/Preprocessor.i b/src/interfaces/modular/Preprocessor.i
@@ -18,6 +18,7 @@
 %rename(RandomFourierGaussPreproc) CRandomFourierGaussPreproc;
 %rename(HomogeneousKernelMap) CHomogeneousKernelMap;
 %rename(PNorm) CPNorm;
+%rename(RescaleFeatures) CRescaleFeatures;
 
 %rename(DimensionReductionPreprocessor) CDimensionReductionPreprocessor;
 %rename(PCA) CPCA;
@@ -102,6 +103,7 @@ namespace shogun
 %include <shogun/preprocessor/RandomFourierGaussPreproc.h>
 %include <shogun/preprocessor/HomogeneousKernelMap.h>
 %include <shogun/preprocessor/PNorm.h>
+%include <shogun/preprocessor/RescaleFeatures.h>
 
 %include <shogun/preprocessor/PCA.h>
 %include <shogun/preprocessor/KernelPCA.h>

diff --git a/src/interfaces/modular/Preprocessor_includes.i b/src/interfaces/modular/Preprocessor_includes.i
@@ -12,6 +12,7 @@
 #include <shogun/preprocessor/RandomFourierGaussPreproc.h>
 #include <shogun/preprocessor/HomogeneousKernelMap.h>
 #include <shogun/preprocessor/PNorm.h>
+#include <shogun/preprocessor/RescaleFeatures.h>
 
 #include <shogun/preprocessor/DimensionReductionPreprocessor.h>
 #include <shogun/preprocessor/PCA.h>

diff --git a/src/shogun/lib/SGMatrix.cpp b/src/shogun/lib/SGMatrix.cpp
@@ -893,6 +893,17 @@ void SGMatrix<T>::save(CFile* writer)
 	SG_RESET_LOCALE;
 }
 
+/** Copy a single row of the matrix into a new vector.
+ *
+ * Element i of the result is read from matrix[i*num_rows+row], i.e. the row
+ * is gathered from the underlying buffer with a stride of num_rows.
+ *
+ * @param row row index; it is not range-checked here
+ * @return vector of length num_cols holding a copy of the row
+ */
+template<class T>
+SGVector<T> SGMatrix<T>::get_row_vector(index_t row) const
+{
+	/* the buffer created here belongs to the returned vector alone, so
+	 * reference counting has to be enabled; with it disabled nothing would
+	 * ever release the memory */
+	SGVector<T> rowv(num_cols, true);
+	for (index_t i = 0; i < num_cols; i++)
+	{
+		/* widen before multiplying so the offset is computed in 64 bit and
+		 * cannot wrap around on very large matrices */
+		rowv[i] = matrix[static_cast<int64_t>(i)*num_rows+row];
+	}
+	return rowv;
+}
+
 template class SGMatrix<bool>;
 template class SGMatrix<char>;
 template class SGMatrix<int8_t>;

diff --git a/src/shogun/lib/SGMatrix.h b/src/shogun/lib/SGMatrix.h
@@ -53,6 +53,13 @@ template<class T> class SGMatrix : public SGReferencedData
 			return &matrix[col*num_rows];
 		}
 
+		/** get a row vector
+		 *
+		 * The row is copied element by element into a newly created
+		 * vector of length num_cols; the matrix itself is left untouched.
+		 *
+		 * @param row row index
+		 * @return row vector (a copy of the requested matrix row)
+		 */
+		SGVector<T> get_row_vector(index_t row) const;
+
 		/** operator overload for matrix read only access
 		 * @param i_row
 		 * @param i_col

diff --git a/src/shogun/preprocessor/Preprocessor.h b/src/shogun/preprocessor/Preprocessor.h
@@ -48,7 +48,8 @@ enum EPreprocessorType
 	P_DIMENSIONREDUCTIONPREPROCESSOR=160,
 	P_SUMONE=170,
 	P_HOMOGENEOUSKERNELMAP = 180,
-	P_PNORM = 190
+	P_PNORM = 190,
+	P_RESCALEFEATURES = 200
 };
 
 /** @brief Class Preprocessor defines a preprocessor interface.

diff --git a/src/shogun/preprocessor/RescaleFeatures.cpp b/src/shogun/preprocessor/RescaleFeatures.cpp
@@ -0,0 +1,104 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Written (W) 2013 Viktor Gal
+ * Copyright (C) 2013 Viktor Gal
+ */
+
+#include <shogun/preprocessor/RescaleFeatures.h>
+
+using namespace shogun;
+
+/** Default constructor; only the CDensePreprocessor<float64_t> base is constructed. */
+CRescaleFeatures::CRescaleFeatures() : CDensePreprocessor<float64_t>()
+{
+}
+
+/** Destructor; the body is empty, this class releases nothing itself. */
+CRescaleFeatures::~CRescaleFeatures()
+{
+}
+
+/** Check that the given features can be handled by this preprocessor.
+ *
+ * Nothing is computed or stored; the features only have to be dense and of
+ * type F_DREAL.
+ *
+ * @param features features to check, must not be NULL
+ * @return true whenever all assertions pass
+ */
+bool CRescaleFeatures::init(CFeatures* features)
+{
+	/* fail with an assertion instead of dereferencing a NULL pointer below */
+	ASSERT(features!=NULL);
+	ASSERT(features->get_feature_class()==C_DENSE);
+	ASSERT(features->get_feature_type()==F_DREAL);
+	return true;
+}
+
+/** Cleanup hook; intentionally empty, there is nothing to release. */
+void CRescaleFeatures::cleanup()
+{
+}
+
+/** Loading the preprocessor from a file is not implemented.
+ *
+ * @param f file handle; it is never read
+ * @return always false
+ */
+bool CRescaleFeatures::load(FILE* f)
+{
+	/* NOTE(review): the two macros presumably switch to the C locale and
+	 * restore the previous one - confirm against their definition; no work
+	 * is done in between */
+	SG_SET_LOCALE_C;
+	SG_RESET_LOCALE;
+	return false;
+}
+
+/** Saving the preprocessor to a file is not implemented.
+ *
+ * @param f file handle; nothing is written to it
+ * @return always false
+ */
+bool CRescaleFeatures::save(FILE* f)
+{
+	/* NOTE(review): the two macros presumably switch to the C locale and
+	 * restore the previous one - confirm against their definition; no work
+	 * is done in between */
+	SG_SET_LOCALE_C;
+	SG_RESET_LOCALE;
+	return false;
+}
+
+/** Rescale every feature (matrix row) with x' = (x - min) / (max - min).
+ *
+ * The minimum and maximum of a feature are taken over all columns of the
+ * feature matrix. Rows whose values are all equal (max - min is not greater
+ * than zero) are left as they are. The result is written back through
+ * feature_matrix(i, j).
+ *
+ * @param features features providing the matrix; cast to
+ * CDenseFeatures<float64_t> without a check
+ * @return the rescaled feature matrix
+ */
+SGMatrix<float64_t> CRescaleFeatures::apply_to_feature_matrix(CFeatures* features)
+{
+	SGMatrix<float64_t> feature_matrix=((CDenseFeatures<float64_t>*)features)->get_feature_matrix();
+
+	/* without any column there is no minimum or maximum to look for, and
+	 * reading column 0 below would be out of bounds */
+	if (feature_matrix.num_cols < 1)
+		return feature_matrix;
+
+	for (index_t i = 0; i < feature_matrix.num_rows; i++)
+	{
+		/* scan the row directly in the matrix; no temporary copy of the row
+		 * has to be allocated for this */
+		float64_t min = feature_matrix(i, 0);
+		float64_t max = min;
+
+		/* find the max and min values in one loop */
+		for (index_t j = 1; j < feature_matrix.num_cols; j++)
+		{
+			float64_t value = feature_matrix(i, j);
+			min = CMath::min(value, min);
+			max = CMath::max(value, max);
+		}
+		float64_t range = max-min;
+
+		/* a constant feature would lead to a division by zero, skip it */
+		if (range > 0)
+		{
+			for (index_t j = 0; j < feature_matrix.num_cols; j++)
+			{
+				float64_t& k = feature_matrix(i, j);
+				k = (k-min)/range;
+			}
+		}
+	}
+
+	return feature_matrix;
+}
+
+/** Rescale the elements of one vector with x' = (x - min) / (max - min).
+ *
+ * Minimum and maximum are taken over the elements of the vector itself. The
+ * work is done on a clone; when max - min is not greater than zero the clone
+ * is returned without any change.
+ *
+ * @param vector input vector, must hold at least one element
+ * @return the rescaled clone of the input
+ */
+SGVector<float64_t> CRescaleFeatures::apply_to_feature_vector(SGVector<float64_t> vector)
+{
+	ASSERT(vector.vlen > 0);
+	SGVector<float64_t> rescaled_vec = vector.clone();
+
+	/* one pass over the input yields both extremes */
+	float64_t lowest = vector[0];
+	float64_t highest = vector[0];
+	index_t idx = 1;
+	while (idx < vector.vlen)
+	{
+		lowest = CMath::min(vector[idx], lowest);
+		highest = CMath::max(vector[idx], highest);
+		idx++;
+	}
+
+	const float64_t span = highest - lowest;
+
+	/* nothing to rescale when all elements are equal */
+	if (!(span > 0))
+		return rescaled_vec;
+
+	rescaled_vec.add(-lowest);
+	rescaled_vec.scale(1/span);
+	return rescaled_vec;
+}
diff --git a/src/shogun/preprocessor/RescaleFeatures.h b/src/shogun/preprocessor/RescaleFeatures.h
@@ -0,0 +1,83 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Written (W) 2013 Viktor Gal
+ * Copyright (C) 2013 Viktor Gal
+ */
+
+#ifndef __RESCALEFEATURES_H__
+#define __RESCALEFEATURES_H__
+
+#include <shogun/preprocessor/DensePreprocessor.h>
+
+namespace shogun
+{
+	/** @brief Preprocessor RescaleFeatures rescales the range of the features
+	 * so that the values of every feature lie in [0, 1].
+	 *
+	 * The formula applied is:
+	 * \f[
+	 * x' = \frac{x - min}{max - min}
+	 * \f]
+	 * where \f$x\f$ is an original value, \f$x'\f$ is the normalized value and
+	 * \f$min\f$, \f$max\f$ are the smallest and the largest value found.
+	 * Values are left unchanged when \f$max - min\f$ is not greater than zero.
+	 * It does not need any initialization.
+	 */
+	class CRescaleFeatures : public CDensePreprocessor<float64_t>
+	{
+		public:
+			/** default ctor */
+			CRescaleFeatures();
+
+			/** dtor */
+			virtual ~CRescaleFeatures();
+
+			/**
+			 * initialize preprocessor from features
+			 * initialization is not required by this preprocessor; the
+			 * features are only asserted to be dense and of type F_DREAL.
+			 *
+			 * @param features features to check
+			 * @return true if the assertions pass
+			 */
+			virtual bool init(CFeatures* features);
+
+			/**
+			 * Cleanup; nothing is done by this preprocessor
+			 */
+			virtual void cleanup();
+
+			/**
+			 * initialize preprocessor from file
+			 * (not implemented, nothing is read)
+			 *
+			 * @param f file handle
+			 * @return always false
+			 */
+			virtual bool load(FILE* f);
+
+			/**
+			 * save preprocessor init-data to file
+			 * (not implemented, nothing is written)
+			 *
+			 * @param f file handle
+			 * @return always false
+			 */
+			virtual bool save(FILE* f);
+
+			/**
+			 * Apply preproc on a feature matrix
+			 *
+			 * Every feature (row of the feature matrix) is rescaled with
+			 * its own minimum and maximum taken over all vectors.
+			 *
+			 * @param features features whose feature matrix is rescaled
+			 * @return the rescaled feature matrix
+			 */
+			virtual SGMatrix<float64_t> apply_to_feature_matrix(CFeatures* features);
+
+			/**
+			 * Apply preproc on a single feature vector
+			 *
+			 * Minimum and maximum are taken over the elements of the
+			 * given vector; a rescaled clone of it is returned.
+			 *
+			 * @param vector the input feature vector, must not be empty
+			 * @return the output feature vector
+			 */
+			virtual SGVector<float64_t> apply_to_feature_vector(SGVector<float64_t> vector);
+
+			/** @return object name */
+			virtual const char* get_name() const { return "RescaleFeatures"; }
+
+			/** return a type of preprocessor */
+			virtual EPreprocessorType get_type() const { return P_RESCALEFEATURES; }
+	};
+}
+
+#endif /* __RESCALEFEATURES_H__ */
diff --git a/tests/unit/preprocessor/RescaleFeatures_unittest.cc b/tests/unit/preprocessor/RescaleFeatures_unittest.cc
@@ -0,0 +1,69 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Written (W) 2013 Viktor Gal
+ */
+
+#include <shogun/preprocessor/RescaleFeatures.h>
+#include <gtest/gtest.h>
+
+using namespace shogun;
+
+/* Every element of a random vector has to come back as (x - min) / (max - min),
+ * with min and max taken over that vector. */
+TEST(RescaleFeatures, apply_to_feature_vector)
+{
+	const index_t len = 10;
+	SGVector<float64_t> input(len);
+	CRescaleFeatures rescaler;
+
+	/* fixed seed keeps the random input reproducible */
+	sg_rand->set_seed(12345);
+	input.random(-1024, 1024);
+
+	const float64_t lowest = SGVector<float64_t>::min(input, len);
+	const float64_t highest = SGVector<float64_t>::max(input, len);
+	const float64_t span = highest - lowest;
+	SGVector<float64_t> out = rescaler.apply_to_feature_vector(input);
+
+	for (index_t i = 0; i < len; i++)
+	{
+		const float64_t expected = (input[i]-lowest)/span;
+		EXPECT_DOUBLE_EQ(expected, out[i]);
+	}
+}
+
+/* Every feature (row) of a random matrix has to come back as
+ * (x - min) / range, with min and range taken per feature over all vectors. */
+TEST(RescaleFeatures, apply_to_feature_matrix)
+{
+	index_t num_features = 3;
+	index_t num_vectors = 10;
+	SGVector<float64_t> min(num_features), range(num_features);
+	SGVector<float64_t> v(num_features*num_vectors), ev;
+
+	/* fixed seed so that a failing run can be reproduced */
+	sg_rand->set_seed(12345);
+	v.random(-1024, 1024);
+	/* ev keeps the untouched values for computing the expected results */
+	ev = v.clone();
+
+	/* NOTE(review): the trailing false presumably turns reference counting
+	 * off so that v and ev stay the owners of the buffers - confirm against
+	 * the SGMatrix constructor */
+	SGMatrix<float64_t> m(v.vector, num_features, num_vectors, false);
+	SGMatrix<float64_t> em(ev.vector, num_features, num_vectors, false);
+	CDenseFeatures<float64_t>* feats = new CDenseFeatures<float64_t>(m);
+	CRescaleFeatures* rescaler = new CRescaleFeatures();
+
+	/* find the min and range for each feature among all the vectors */
+	for (index_t i = 0; i < num_features; i++)
+	{
+		SGVector<float64_t> t = em.get_row_vector(i);
+		min[i] = SGVector<float64_t>::min(t.vector, t.vlen);
+		range[i] = SGVector<float64_t>::max(t.vector, t.vlen) - min[i];
+	}
+
+	feats->add_preprocessor(rescaler);
+	feats->apply_preprocessor();
+	for (index_t i = 0; i < num_vectors; i++)
+	{
+		/* named differently from the outer v, which it used to shadow */
+		SGVector<float64_t> rescaled = feats->get_feature_vector(i);
+		float64_t* v_orig = em.get_column_vector(i);
+		for (index_t j = 0; j < num_features; j++) {
+			float64_t e = (v_orig[j]-min[j])/range[j];
+			EXPECT_DOUBLE_EQ(e, rescaled[j]);
+		}
+	}
+
+	SG_UNREF(feats);
+}