From bd74574be7e6e47c83bba97173fa5a3c62d8cd0d Mon Sep 17 00:00:00 2001 From: Alesis Novik Date: Mon, 18 Apr 2011 06:13:40 +0100 Subject: [PATCH 1/4] A very simple and basic implementation of EM for GMM and fix for Gaussian.cpp. --- src/libshogun/clustering/GMM.cpp | 201 +++++++++++++++++++++++ src/libshogun/clustering/GMM.h | 155 +++++++++++++++++ src/libshogun/distributions/Gaussian.cpp | 7 +- 3 files changed, 360 insertions(+), 3 deletions(-) create mode 100644 src/libshogun/clustering/GMM.cpp create mode 100644 src/libshogun/clustering/GMM.h diff --git a/src/libshogun/clustering/GMM.cpp b/src/libshogun/clustering/GMM.cpp new file mode 100644 index 00000000000..38a4d2e623c --- /dev/null +++ b/src/libshogun/clustering/GMM.cpp @@ -0,0 +1,201 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * Written (W) 2011 Alesis Novik + * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society + */ +#include "lib/config.h" + +#ifdef HAVE_LAPACK + +#include "clustering/GMM.h" +#include "clustering/KMeans.h" +#include "distance/EuclidianDistance.h" +#include "base/Parameter.h" +#include "lib/Mathematics.h" +#include "lib/lapack.h" + +using namespace shogun; + +CGMM::CGMM() : CDistribution(), m_components(NULL), m_n(0), + m_coefficients(NULL), m_coef_size(0), m_max_iter(0), m_minimal_change(0) +{ +} + +CGMM::CGMM(int32_t n_, int32_t max_iter_, float64_t min_change_) : CDistribution(), m_components(NULL), m_n(n_), + m_coefficients(NULL), m_coef_size(n_), m_max_iter(max_iter_), + m_minimal_change(min_change_) +{ + register_params(); +} + +CGMM::~CGMM() +{ + if (m_components) + cleanup(); +} + +void CGMM::cleanup() +{ + for (int i = 0; i < m_n; i++) + SG_UNREF(m_components[i]); + delete[] m_components; + delete[] m_coefficients; +} + +bool CGMM::train(CFeatures* data) +{ + ASSERT(m_n != 0); + if (m_components) + cleanup(); + /** init features with data if necessary and assure type is correct */ + if (data) + { + if (!data->has_property(FP_DOT)) + SG_ERROR("Specified features are not of type CDotFeatures\n"); + set_features(data); + } + CDotFeatures* dotdata = (CDotFeatures *) data; + int32_t num_vectors = dotdata->get_num_vectors(); + int32_t num_dim = dotdata->get_dim_feature_space(); + + CEuclidianDistance* dist = new CEuclidianDistance(); + CKMeans* init_k_means = new CKMeans(m_n, dist); + init_k_means->train(dotdata); + float64_t* init_means; + int32_t init_mean_dim; + int32_t init_mean_size; + init_k_means->get_cluster_centers(&init_means, &init_mean_dim, &init_mean_size); + + float64_t* init_cov; + int32_t init_cov_rows; + int32_t init_cov_cols; + dotdata->get_cov(&init_cov, &init_cov_rows, &init_cov_cols); + + m_coefficients = new float64_t[m_coef_size]; + m_components = new CGaussian*[m_n]; + for (int i=0; im_minimal_change) + { + e_log_likelihood_old = e_log_likelihood_new; + e_log_likelihood_new = 0; + /** Precomputing likelihoods */ + float64_t* point; + int32_t point_len; + for (int i=0; iget_feature_vector(&point, &point_len, i); + for (int j=0; jcompute_PDF(point, point_len); + } + delete[] point; + } + for (int i=0; iget_feature_vector(&point, &point_len, j); + CMath::add(mean_sum, T[j*m_n+i], point, 1, mean_sum, point_len); + delete[] point; + } + m_coefficients[i] = T_sum/num_vectors; + for (int j=0; jset_mean(mean_sum, num_dim); + + cov_sum = new float64_t[num_dim*num_dim]; + memset(cov_sum, 0, num_dim*num_dim*sizeof(float64_t)); + for (int j=0; jget_feature_vector(&point, &point_len, j); + CMath::add(point, 1, point, -1, mean_sum, point_len); + cblas_dger(CblasRowMajor, num_dim, num_dim, T[j*m_n+i], point, 1, point, + 1, (double*) cov_sum, num_dim); + delete[] point; + } + for (int j=0; jset_cov(cov_sum, num_dim, num_dim); + + delete[] mean_sum; + delete[] cov_sum; + } + iter++; + } + + delete[] pdfs; + delete[] T; + return true; +} + +int32_t CGMM::get_num_model_parameters() +{ + return 1; +} + +float64_t CGMM::get_log_model_parameter(int32_t num_param) +{ + ASSERT(num_param==1); + return CMath::log(m_n); +} + +float64_t CGMM::get_log_derivative(int32_t num_param, int32_t num_example) +{ + return 0; +} + +float64_t CGMM::get_log_likelihood_example(int32_t num_example) +{ + return 1; +} + +void CGMM::register_params() +{ + m_parameters->add_vector((CSGObject***) &m_components, + &m_n, "m_components", "Mixture components"); + m_parameters->add_vector(&m_coefficients, &m_coef_size, "m_coefficients", "Mixture coefficients."); + m_parameters->add(&m_max_iter, "m_max_iter", "Maximum number of iterations."); + m_parameters->add(&m_minimal_change, "m_minimal_change", "Minimal expected log-likelihood change."); +} + +#endif diff --git a/src/libshogun/clustering/GMM.h b/src/libshogun/clustering/GMM.h new file mode 100644 index 00000000000..f378e508e69 --- /dev/null +++ b/src/libshogun/clustering/GMM.h @@ -0,0 +1,155 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * Written (W) 2011 Alesis Novik + * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society + */ +#ifndef _GMM_H__ +#define _GMM_H__ + +#include "lib/config.h" + +#ifdef HAVE_LAPACK + +#include "distributions/Distribution.h" +#include "distributions/Gaussian.h" +#include "lib/common.h" + +namespace shogun +{ +/** @brief Gaussian distribution interface. + * + * Takes input of number of Gaussians to fit + */ +class CGMM : public CDistribution +{ + public: + /** default constructor */ + CGMM(); + /** constructor + * + * @param n number of Gaussians + * @param max_iter maximum iterations + * @param min_change minimal expected log likelihood change + */ + CGMM(int32_t n, int32_t max_iter, float64_t min_change); + virtual ~CGMM(); + + /** cleanup */ + void cleanup(); + + /** learn distribution + * + * @param data training data + * + * @return whether training was successful + */ + virtual bool train(CFeatures* data=NULL); + + /** get number of parameters in model + * + * @return number of parameters in model + */ + virtual int32_t get_num_model_parameters(); + + /** get model parameter (logarithmic) + * + * @return model parameter (logarithmic) if num_param < m_dim returns + * an element from the mean, else return an element from the covariance + */ + virtual float64_t get_log_model_parameter(int32_t num_param); + + /** get partial derivative of likelihood function (logarithmic) + * + * @param num_param derivative against which param + * @param num_example which example + * @return derivative of likelihood (logarithmic) + */ + virtual float64_t get_log_derivative( + int32_t num_param, int32_t num_example); + + /** compute log likelihood for example + * + * abstract base method + * + * @param num_example which example + * @return log likelihood for example + */ + virtual float64_t get_log_likelihood_example(int32_t num_example); + + /** compute likelihood for example + * + * abstract base method + * + * @param num_example which example + * @return likelihood for example + */ + virtual float64_t get_likelihood_example(int32_t num_example) + { + return CMath::exp(get_log_likelihood_example(num_example)); + } + + /** get nth mean + * + * @param mean copy of the mean + * @param mean_length + * @param num which mean to retrieve + */ + virtual inline void get_nth_mean(float64_t** mean, int32_t* mean_length, int32_t num) + { + ASSERT(numget_mean(mean, mean_length); + } + + /** get nth cov + * + * @param cov copy of the cov + * @param cov_rows + * @param cov_cols + * @param num which covariance to retrieve + */ + virtual inline void get_nth_cov(float64_t** cov, int32_t* cov_rows, int32_t* cov_cols, int32_t num) + { + ASSERT(numget_cov(cov, cov_rows, cov_cols); + } + + /** get coefficients + * + * @param coef copy of coeffiecients + * @param coef_length coef vector length + */ + virtual inline void get_coef(float64_t** coef, int32_t* coef_length) + { + *coef = new float64_t[m_coef_size]; + memcpy(*coef, m_coefficients, sizeof(float64_t)*m_coef_size); + *coef_length = m_coef_size; + } + + /** @return object name */ + inline virtual const char* get_name() const { return "GMM"; } + + private: + /** Initialize parameters for serialization */ + void register_params(); + + protected: + /** Mixture components */ + CGaussian** m_components; + /** Number of mixture components */ + int32_t m_n; + /** Mixture coefficients */ + float64_t* m_coefficients; + /** Coefficient vector size */ + int32_t m_coef_size; + /** Maximum number of iterations */ + int32_t m_max_iter; + /** Minimum expected log-likelihood change */ + float64_t m_minimal_change; +}; +} +#endif //HAVE_LAPACK +#endif //_GMM_H__ diff --git a/src/libshogun/distributions/Gaussian.cpp b/src/libshogun/distributions/Gaussian.cpp index 46a8f43ffea..e1f9176285a 100644 --- a/src/libshogun/distributions/Gaussian.cpp +++ b/src/libshogun/distributions/Gaussian.cpp @@ -25,7 +25,8 @@ m_mean_length(0) } CGaussian::CGaussian(float64_t* mean, int32_t mean_length, - float64_t* cov, int32_t cov_rows, int32_t cov_cols) : CDistribution() + float64_t* cov, int32_t cov_rows, int32_t cov_cols) : CDistribution(), + m_cov_inverse(NULL) { ASSERT(mean_length == cov_rows); ASSERT(cov_rows == cov_cols); @@ -42,7 +43,8 @@ CGaussian::CGaussian(float64_t* mean, int32_t mean_length, void CGaussian::init() { - delete[] m_cov_inverse; + if (m_cov_inverse) + delete[] m_cov_inverse; m_cov_inverse_rows = m_cov_cols; m_cov_inverse_cols = m_cov_rows; @@ -54,7 +56,6 @@ void CGaussian::init() for (int i = 0; i < m_cov_rows; i++) m_constant *= m_cov_inverse[i*m_cov_rows+i]; - m_constant = 1/m_constant; m_constant *= pow(2*M_PI, (float64_t) -m_cov_rows/2); From 3676086a20abd88d6c1ba8c54f526508f1db7381 Mon Sep 17 00:00:00 2001 From: Alesis Novik Date: Mon, 18 Apr 2011 17:30:21 +0100 Subject: [PATCH 2/4] Removed GMM from this pull request --- src/libshogun/clustering/GMM.cpp | 201 ------------------------------- src/libshogun/clustering/GMM.h | 155 ------------------------ 2 files changed, 356 deletions(-) delete mode 100644 src/libshogun/clustering/GMM.cpp delete mode 100644 src/libshogun/clustering/GMM.h diff --git a/src/libshogun/clustering/GMM.cpp b/src/libshogun/clustering/GMM.cpp deleted file mode 100644 index 38a4d2e623c..00000000000 --- a/src/libshogun/clustering/GMM.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2011 Alesis Novik - * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society - */ -#include "lib/config.h" - -#ifdef HAVE_LAPACK - -#include "clustering/GMM.h" -#include "clustering/KMeans.h" -#include "distance/EuclidianDistance.h" -#include "base/Parameter.h" -#include "lib/Mathematics.h" -#include "lib/lapack.h" - -using namespace shogun; - -CGMM::CGMM() : CDistribution(), m_components(NULL), m_n(0), - m_coefficients(NULL), m_coef_size(0), m_max_iter(0), m_minimal_change(0) -{ -} - -CGMM::CGMM(int32_t n_, int32_t max_iter_, float64_t min_change_) : CDistribution(), m_components(NULL), m_n(n_), - m_coefficients(NULL), m_coef_size(n_), m_max_iter(max_iter_), - m_minimal_change(min_change_) -{ - register_params(); -} - -CGMM::~CGMM() -{ - if (m_components) - cleanup(); -} - -void CGMM::cleanup() -{ - for (int i = 0; i < m_n; i++) - SG_UNREF(m_components[i]); - delete[] m_components; - delete[] m_coefficients; -} - -bool CGMM::train(CFeatures* data) -{ - ASSERT(m_n != 0); - if (m_components) - cleanup(); - /** init features with data if necessary and assure type is correct */ - if (data) - { - if (!data->has_property(FP_DOT)) - SG_ERROR("Specified features are not of type CDotFeatures\n"); - set_features(data); - } - CDotFeatures* dotdata = (CDotFeatures *) data; - int32_t num_vectors = dotdata->get_num_vectors(); - int32_t num_dim = dotdata->get_dim_feature_space(); - - CEuclidianDistance* dist = new CEuclidianDistance(); - CKMeans* init_k_means = new CKMeans(m_n, dist); - init_k_means->train(dotdata); - float64_t* init_means; - int32_t init_mean_dim; - int32_t init_mean_size; - init_k_means->get_cluster_centers(&init_means, &init_mean_dim, &init_mean_size); - - float64_t* init_cov; - int32_t init_cov_rows; - int32_t init_cov_cols; - dotdata->get_cov(&init_cov, &init_cov_rows, &init_cov_cols); - - m_coefficients = new float64_t[m_coef_size]; - m_components = new CGaussian*[m_n]; - for (int i=0; im_minimal_change) - { - e_log_likelihood_old = e_log_likelihood_new; - e_log_likelihood_new = 0; - /** Precomputing likelihoods */ - float64_t* point; - int32_t point_len; - for (int i=0; iget_feature_vector(&point, &point_len, i); - for (int j=0; jcompute_PDF(point, point_len); - } - delete[] point; - } - for (int i=0; iget_feature_vector(&point, &point_len, j); - CMath::add(mean_sum, T[j*m_n+i], point, 1, mean_sum, point_len); - delete[] point; - } - m_coefficients[i] = T_sum/num_vectors; - for (int j=0; jset_mean(mean_sum, num_dim); - - cov_sum = new float64_t[num_dim*num_dim]; - memset(cov_sum, 0, num_dim*num_dim*sizeof(float64_t)); - for (int j=0; jget_feature_vector(&point, &point_len, j); - CMath::add(point, 1, point, -1, mean_sum, point_len); - cblas_dger(CblasRowMajor, num_dim, num_dim, T[j*m_n+i], point, 1, point, - 1, (double*) cov_sum, num_dim); - delete[] point; - } - for (int j=0; jset_cov(cov_sum, num_dim, num_dim); - - delete[] mean_sum; - delete[] cov_sum; - } - iter++; - } - - delete[] pdfs; - delete[] T; - return true; -} - -int32_t CGMM::get_num_model_parameters() -{ - return 1; -} - -float64_t CGMM::get_log_model_parameter(int32_t num_param) -{ - ASSERT(num_param==1); - return CMath::log(m_n); -} - -float64_t CGMM::get_log_derivative(int32_t num_param, int32_t num_example) -{ - return 0; -} - -float64_t CGMM::get_log_likelihood_example(int32_t num_example) -{ - return 1; -} - -void CGMM::register_params() -{ - m_parameters->add_vector((CSGObject***) &m_components, - &m_n, "m_components", "Mixture components"); - m_parameters->add_vector(&m_coefficients, &m_coef_size, "m_coefficients", "Mixture coefficients."); - m_parameters->add(&m_max_iter, "m_max_iter", "Maximum number of iterations."); - m_parameters->add(&m_minimal_change, "m_minimal_change", "Minimal expected log-likelihood change."); -} - -#endif diff --git a/src/libshogun/clustering/GMM.h b/src/libshogun/clustering/GMM.h deleted file mode 100644 index f378e508e69..00000000000 --- a/src/libshogun/clustering/GMM.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2011 Alesis Novik - * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society - */ -#ifndef _GMM_H__ -#define _GMM_H__ - -#include "lib/config.h" - -#ifdef HAVE_LAPACK - -#include "distributions/Distribution.h" -#include "distributions/Gaussian.h" -#include "lib/common.h" - -namespace shogun -{ -/** @brief Gaussian distribution interface. - * - * Takes input of number of Gaussians to fit - */ -class CGMM : public CDistribution -{ - public: - /** default constructor */ - CGMM(); - /** constructor - * - * @param n number of Gaussians - * @param max_iter maximum iterations - * @param min_change minimal expected log likelihood change - */ - CGMM(int32_t n, int32_t max_iter, float64_t min_change); - virtual ~CGMM(); - - /** cleanup */ - void cleanup(); - - /** learn distribution - * - * @param data training data - * - * @return whether training was successful - */ - virtual bool train(CFeatures* data=NULL); - - /** get number of parameters in model - * - * @return number of parameters in model - */ - virtual int32_t get_num_model_parameters(); - - /** get model parameter (logarithmic) - * - * @return model parameter (logarithmic) if num_param < m_dim returns - * an element from the mean, else return an element from the covariance - */ - virtual float64_t get_log_model_parameter(int32_t num_param); - - /** get partial derivative of likelihood function (logarithmic) - * - * @param num_param derivative against which param - * @param num_example which example - * @return derivative of likelihood (logarithmic) - */ - virtual float64_t get_log_derivative( - int32_t num_param, int32_t num_example); - - /** compute log likelihood for example - * - * abstract base method - * - * @param num_example which example - * @return log likelihood for example - */ - virtual float64_t get_log_likelihood_example(int32_t num_example); - - /** compute likelihood for example - * - * abstract base method - * - * @param num_example which example - * @return likelihood for example - */ - virtual float64_t get_likelihood_example(int32_t num_example) - { - return CMath::exp(get_log_likelihood_example(num_example)); - } - - /** get nth mean - * - * @param mean copy of the mean - * @param mean_length - * @param num which mean to retrieve - */ - virtual inline void get_nth_mean(float64_t** mean, int32_t* mean_length, int32_t num) - { - ASSERT(numget_mean(mean, mean_length); - } - - /** get nth cov - * - * @param cov copy of the cov - * @param cov_rows - * @param cov_cols - * @param num which covariance to retrieve - */ - virtual inline void get_nth_cov(float64_t** cov, int32_t* cov_rows, int32_t* cov_cols, int32_t num) - { - ASSERT(numget_cov(cov, cov_rows, cov_cols); - } - - /** get coefficients - * - * @param coef copy of coeffiecients - * @param coef_length coef vector length - */ - virtual inline void get_coef(float64_t** coef, int32_t* coef_length) - { - *coef = new float64_t[m_coef_size]; - memcpy(*coef, m_coefficients, sizeof(float64_t)*m_coef_size); - *coef_length = m_coef_size; - } - - /** @return object name */ - inline virtual const char* get_name() const { return "GMM"; } - - private: - /** Initialize parameters for serialization */ - void register_params(); - - protected: - /** Mixture components */ - CGaussian** m_components; - /** Number of mixture components */ - int32_t m_n; - /** Mixture coefficients */ - float64_t* m_coefficients; - /** Coefficient vector size */ - int32_t m_coef_size; - /** Maximum number of iterations */ - int32_t m_max_iter; - /** Minimum expected log-likelihood change */ - float64_t m_minimal_change; -}; -} -#endif //HAVE_LAPACK -#endif //_GMM_H__ From 940d6bbe774115cc4e8ce66795764e7d590460b6 Mon Sep 17 00:00:00 2001 From: Alesis Novik Date: Mon, 18 Apr 2011 17:32:22 +0100 Subject: [PATCH 3/4] Fixed Gaussian.cpp --- src/libshogun/distributions/Gaussian.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/libshogun/distributions/Gaussian.cpp b/src/libshogun/distributions/Gaussian.cpp index e1f9176285a..799e958d0c9 100644 --- a/src/libshogun/distributions/Gaussian.cpp +++ b/src/libshogun/distributions/Gaussian.cpp @@ -43,8 +43,7 @@ CGaussian::CGaussian(float64_t* mean, int32_t mean_length, void CGaussian::init() { - if (m_cov_inverse) - delete[] m_cov_inverse; + delete[] m_cov_inverse; m_cov_inverse_rows = m_cov_cols; m_cov_inverse_cols = m_cov_rows; From 0e40f95ea071c86a781097dc90ab65aa6dce4f16 Mon Sep 17 00:00:00 2001 From: Alesis Novik Date: Mon, 18 Apr 2011 21:29:19 +0100 Subject: [PATCH 4/4] Improved readability of Gaussian.cpp --- src/libshogun/distributions/Gaussian.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libshogun/distributions/Gaussian.cpp b/src/libshogun/distributions/Gaussian.cpp index 799e958d0c9..44da760675b 100644 --- a/src/libshogun/distributions/Gaussian.cpp +++ b/src/libshogun/distributions/Gaussian.cpp @@ -55,6 +55,7 @@ void CGaussian::init() for (int i = 0; i < m_cov_rows; i++) m_constant *= m_cov_inverse[i*m_cov_rows+i]; + m_constant = 1/m_constant; m_constant *= pow(2*M_PI, (float64_t) -m_cov_rows/2);