diff --git a/data b/data index e4159a7250a..e316fd67f45 160000 --- a/data +++ b/data @@ -1 +1 @@ -Subproject commit e4159a7250a2995342e38c870bdc0d0a1e1c1290 +Subproject commit e316fd67f45cb6e1b8b799369bf50878ddb0c560 diff --git a/doc/cookbook/source/examples/clustering/kmeans.rst b/doc/cookbook/source/examples/clustering/kmeans.rst new file mode 100644 index 00000000000..223f254f701 --- /dev/null +++ b/doc/cookbook/source/examples/clustering/kmeans.rst @@ -0,0 +1,56 @@ +======= +K-means +======= +:math:`K`-means clustering aims to partition :math:`n` observations into :math:`k\leq n` clusters (sets :math:`\mathbf{S}`), +in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster. + +In other words, its objective is to minimize: + +.. math:: + \argmin_\mathbf{S} \sum_{i=1}^{k}\sum_{\mathbf{x}\in S_i}\left \|\mathbf{x} - \boldsymbol{\mu}_i \right \|^{2} + +where :math:`\boldsymbol{\mu}_i` is the mean of points in :math:`S_i`. + +See Chapter 20 in :cite:`barber2012bayesian` for a detailed introduction. + +------- +Example +------- +Imagine we have a file with training data. We create CDenseFeatures (here 64 bit floats aka RealFeatures) as + +.. sgexample:: kmeans.sg:create_features + +In order to run :sgclass:`CKMeans`, we need to choose a distance, for example :sgclass:`CEuclideanDistance`, or other sub-classes of :sgclass:`CDistance`. The distance is initialized with the data we want to cluster. + +.. sgexample:: kmeans.sg:choose_distance + +Once we have chosen a distance, we create an instance of the :sgclass:`CKMeans` classifier. +We explicitly set :math:`k`, the number of clusters we are expecting to have, to 2 and pass it to :sgclass:`CKMeans`. In this example, we apply Lloyd's method for :math:`k`-means clustering. + +.. sgexample:: kmeans.sg:create_instance_lloyd + +Then we train the model: + +.. sgexample:: kmeans.sg:train_dataset + +We can extract centers and radius of each cluster: + +.. 
sgexample:: kmeans.sg:extract_centers_and_radius + + +Mini batch :math:`k`-means clustering is also supported, through :sgclass:`CKMeansMiniBatch`. +We can create an instance of :sgclass:`CKMeansMiniBatch` in the same way as above and then provide the batch size and iteration number. + +.. sgexample:: kmeans.sg:create_instance_mb + +Then train the model and extract the centers and radius information as mentioned above. + +---------- +References +---------- +:wiki:`K-means_clustering` + +:wiki:`Lloyd's_algorithm` + +.. bibliography:: ../../references.bib + :filter: docname in docnames diff --git a/examples/meta/src/clustering/kmeans.sg b/examples/meta/src/clustering/kmeans.sg new file mode 100644 index 00000000000..fc452590ef0 --- /dev/null +++ b/examples/meta/src/clustering/kmeans.sg @@ -0,0 +1,28 @@ +CSVFile f_feats_train("../../data/classifier_binary_2d_linear_features_train.dat") +Math:init_random(1) + +#![create_features] +RealFeatures features_train(f_feats_train) +#![create_features] + +#![choose_distance] +EuclideanDistance distance(features_train, features_train) +#![choose_distance] + +#![create_instance_lloyd] +KMeans kmeans(2, distance) +#![create_instance_lloyd] + +#![train_dataset] +kmeans.train() +#![train_dataset] + +#![extract_centers_and_radius] +RealMatrix c = kmeans.get_cluster_centers() +RealVector r = kmeans.get_radiuses() +#![extract_centers_and_radius] + +#![create_instance_mb] +KMeansMiniBatch kmeans_mb(2, distance) +kmeans_mb.set_mb_params(4, 1000) +#![create_instance_mb] diff --git a/examples/undocumented/csharp_modular/clustering_kmeans_modular.cs b/examples/undocumented/csharp_modular/clustering_kmeans_modular.cs deleted file mode 100644 index 50fc649e6bc..00000000000 --- a/examples/undocumented/csharp_modular/clustering_kmeans_modular.cs +++ /dev/null @@ -1,25 +0,0 @@ -//import org.shogun.*; -//import org.jblas.*; -//import static org.shogun.Math.init_random; -using System; - -public class clustering_kmeans_modular { - public static void Main() { 
- modshogun.init_shogun_with_defaults(); - int k = 3; - // already tried init_random(17) - Math.init_random(17); - - double[,] fm_train = Load.load_numbers("../data/fm_train_real.dat"); - - RealFeatures feats_train = new RealFeatures(fm_train); - EuclideanDistance distance = new EuclideanDistance(feats_train, feats_train); - - KMeans kmeans = new KMeans(k, distance); - kmeans.train(); - - double[,] out_centers = kmeans.get_cluster_centers(); - kmeans.get_radiuses(); - - } -} diff --git a/examples/undocumented/java_modular/clustering_kmeans_modular.java b/examples/undocumented/java_modular/clustering_kmeans_modular.java deleted file mode 100644 index 0e1b4ca4add..00000000000 --- a/examples/undocumented/java_modular/clustering_kmeans_modular.java +++ /dev/null @@ -1,31 +0,0 @@ -import org.shogun.*; -import org.jblas.*; -import static org.shogun.Math.init_random; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class clustering_kmeans_modular { - static { - System.loadLibrary("modshogun"); - } - - public static void main(String argv[]) { - modshogun.init_shogun_with_defaults(); - int k = 3; - init_random(17); - - DoubleMatrix fm_train = Load.load_numbers("../data/fm_train_real.dat"); - - RealFeatures feats_train = new RealFeatures(fm_train); - EuclideanDistance distance = new EuclideanDistance(feats_train, feats_train); - - KMeans kmeans = new KMeans(k, distance); - kmeans.train(); - - DoubleMatrix out_centers = kmeans.get_cluster_centers(); - kmeans.get_radiuses(); - - } -} diff --git a/examples/undocumented/octave_modular/clustering_kmeans_modular.m b/examples/undocumented/octave_modular/clustering_kmeans_modular.m deleted file mode 100644 index 133f3ed497e..00000000000 --- a/examples/undocumented/octave_modular/clustering_kmeans_modular.m +++ /dev/null @@ -1,19 +0,0 @@ -modshogun - -% Explicit examples on how to use clustering - -addpath('tools'); -fm_train=load_matrix('../data/fm_train_real.dat'); - -% KMeans 
-disp('KMeans') - -k=4; -feats_train=RealFeatures(fm_train); -distance=EuclideanDistance(feats_train, feats_train); - -kmeans=KMeans(k, distance); -kmeans.train(); - -c=kmeans.get_cluster_centers(); -r=kmeans.get_radiuses(); diff --git a/examples/undocumented/python_modular/clustering_kmeans_modular.py b/examples/undocumented/python_modular/clustering_kmeans_modular.py deleted file mode 100644 index aaf53a0b81a..00000000000 --- a/examples/undocumented/python_modular/clustering_kmeans_modular.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python -traindat = '../data/fm_train_real.dat' - -parameter_list = [[traindat,3],[traindat,4]] - -def clustering_kmeans_modular (fm_train=traindat,k=3): - from modshogun import EuclideanDistance, RealFeatures, KMeans, Math_init_random, CSVFile - Math_init_random(17) - - feats_train=RealFeatures(CSVFile(fm_train)) - distance=EuclideanDistance(feats_train, feats_train) - - kmeans=KMeans(k, distance) - kmeans.train() - - out_centers = kmeans.get_cluster_centers() - kmeans.get_radiuses() - - return out_centers, kmeans - -if __name__=='__main__': - print('KMeans') - clustering_kmeans_modular(*parameter_list[0]) -