shogun-toolbox · karlnapf · Mar 8, 2013 · Mar 8, 2013 · Mar 8, 2013 · Mar 8, 2013
diff --git a/examples/descriptions/modular/statistics_hsic.txt b/examples/descriptions/modular/statistics_hsic.txt
@@ -0,0 +1,7 @@
+In this example, HSIC, a kernel-based test for independence, is used to detect
+dependence of a mixture of Gaussians and a rotated version of the same data.
+The HSIC statistic is computed and available methods for computing a threshold
+of the null distribution are used. In addition, p-values of the test are
+computed. Note that these methods require more iterations than used here. A
+Gaussian kernel is selected via the median heuristic.
+See tutorial and Class documentation for more details.
diff --git a/examples/descriptions/modular/statistics_linear_time_mmd.txt b/examples/descriptions/modular/statistics_linear_time_mmd.txt
@@ -0,0 +1,10 @@
+In this example, the linear time MMD statistic for kernel-based two-sample
+testing is illustrated. It is a streaming based statistic for large amounts
+of data. The used dataset is a bunch of standard Gaussian vectors where the
+first dimension differs between the distributions p and q. The test statistic
+is computed and available methods for computing a threshold of the null
+distribution are used. In addition, p-values for the test are computed.
+Note that these methods require more iterations/samples than used here. A
+Gaussian kernel is selected via the median heuristic. There are more clever
+kernel selection methods available.
+See tutorial and Class documentation for more details.
diff --git a/examples/descriptions/modular/statistics_mmd_kernel_selection.txt b/examples/descriptions/modular/statistics_mmd_kernel_selection.txt
@@ -0,0 +1,10 @@
+In this example, kernel selection methods for MMD based statistics are
+illustrated. A difficult synthetic dataset is used to illustrate their
+performance in two-sample testing. All kernel selection methods for MMD
+work via creating a combined kernel with all desired baseline kernels.
+The example demonstrates how to perform kernel selection and use it
+for two-sample testing. Methods for both single and combined kernels
+are demonstrated. In addition, type I and II error estimates
+are computed. As usual, there are more iterations/samples required in
+practice.
+See tutorial and Class documentation for more details.
diff --git a/examples/descriptions/modular/statistics_quadratic_time_mmd.txt b/examples/descriptions/modular/statistics_quadratic_time_mmd.txt
@@ -0,0 +1,10 @@
+In this example, the quadratic time MMD statistic for kernel-based two-sample
+testing is illustrated. It is a statistic for smaller amounts of data where
+one is interested in computing the best possible test. The used dataset is a
+bunch of standard Gaussian vectors where the first dimension differs between
+distributions p and q. The test statistic is computed and available methods
+for computing a threshold of the null distribution are used. In addition,
+p-values for the test are computed. Note that these methods require more
+iterations/samples than used here. A Gaussian kernel with a fixed kernel size is
+used. There are more clever kernel selection methods available.
+See tutorial and Class documentation for more details.
diff --git a/examples/undocumented/python_modular/modelselection_grid_search_kernel.py b/examples/undocumented/python_modular/modelselection_grid_search_kernel.py
@@ -5,11 +5,11 @@
 # the Free Software Foundation either version 3 of the License, or
 # (at your option) any later version.
 #
-# Written (C) 2012 Heiko Strathmann
+# Written (C) 2012-2013 Heiko Strathmann
 #
 
 from numpy import array
-from numpy.random import rand
+from numpy import random
 
 from shogun.Evaluation import CrossValidation, CrossValidationResult
 from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
@@ -22,6 +22,7 @@
 from shogun.ModelSelection import GridSearchModelSelection
 from shogun.ModelSelection import ModelSelectionParameters, R_EXP, R_LINEAR
 from shogun.ModelSelection import ParameterCombination
+from shogun.Mathematics import Math
 
 def create_param_tree():
 	root=ModelSelectionParameters()
@@ -75,14 +76,15 @@ def create_param_tree():
 
 	return root
 
+parameter_list = [[3,20,3]]
 
-def modelselection_grid_search_kernel ():
-	num_subsets=3
-	num_vectors=20
-	dim_vectors=3
-
+def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors):
+	# init seed for reproducibility
+	Math.init_random(1)
+	random.seed(1);
+	
 	# create some (non-sense) data
-	matrix=rand(dim_vectors, num_vectors)
+	matrix=random.rand(dim_vectors, num_vectors)
 
 	# create num_feautres 2-dimensional vectors
 	features=RealFeatures()
@@ -127,11 +129,11 @@ def modelselection_grid_search_kernel ():
 	cross.set_num_runs(10)
 	cross.set_conf_int_alpha(0.01)
 	result=cross.evaluate()
-	print("result: ")
-	#result.print_result()
+	casted=CrossValidationResult.obtain_from_generic(result);
+	print "result mean:", casted.mean
 
-	return 0
+	return classifier,result,casted.mean
 
 if __name__=='__main__':
 	print('ModelselectionGridSearchKernel')
-	modelselection_grid_search_kernel()
+	modelselection_grid_search_kernel(*parameter_list[0])
diff --git a/examples/undocumented/python_modular/serialization_string_kernels_modular.py b/examples/undocumented/python_modular/serialization_string_kernels_modular.py
@@ -147,22 +147,21 @@ def construct_features(features):
 
     return feat_comb
 
+parameter_list = [[200, 1, 100]]
 
-def serialization_string_kernels_modular():
+def serialization_string_kernels_modular(n_data, num_shifts, size):
     """
     serialize svm with string kernels
     """
 
     ##################################################
     # set up toy data and svm
-    train_xt, train_lt = generate_random_data(200)
-    test_xt, test_lt = generate_random_data(200)
+    train_xt, train_lt = generate_random_data(n_data)
+    test_xt, test_lt = generate_random_data(n_data)
 
     feats_train = construct_features(train_xt)
     feats_test = construct_features(test_xt)
 
-    num_shifts = 1
-    size = 100
     max_len = len(train_xt[0])
     kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
     shifts_vector = numpy.ones(max_len, dtype=numpy.int32)*num_shifts
@@ -212,9 +211,9 @@ def serialization_string_kernels_modular():
 
     print("all checks passed.")
 
-    return True
+    return out,out2
 
 
 if __name__=='__main__':
-    serialization_string_kernels_modular()
+    serialization_string_kernels_modular(*parameter_list[0])
 
diff --git a/examples/undocumented/python_modular/statistics_hsic.py b/examples/undocumented/python_modular/statistics_hsic.py
@@ -5,26 +5,28 @@
 # the Free Software Foundation either version 3 of the License, or
 # (at your option) any later version.
 #
-# Written (C) 2012 Heiko Strathmann
+# Written (C) 2012-2013 Heiko Strathmann
 #
 from numpy import *
 #from pylab import *
 from math import pi
 
-def statistics_hsic ():
+parameter_list = [[250,3,3]]
+
+def statistics_hsic (n, difference, angle):
 	from shogun.Features import RealFeatures
 	from shogun.Features import DataGenerator
 	from shogun.Kernel import GaussianKernel
 	from shogun.Statistics import HSIC
 	from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
 	from shogun.Distance import EuclideanDistance
-	from shogun.Mathematics import Statistics, IntVector
+	from shogun.Mathematics import Math, Statistics, IntVector
+
+	# init seed for reproducibility
+	Math.init_random(1)
 
 	# note that the HSIC has to store kernel matrices
 	# which upper bounds the sample size
-	n=250
-	difference=3
-	angle=pi/3
 
 	# use data generator class to produce example data
 	data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
@@ -72,19 +74,19 @@ def statistics_hsic ():
 	# normally, at least 250 iterations should be done, but that takes long
 	hsic.set_bootstrap_iterations(100)
 	# bootstrapping allows usage of unbiased or biased statistic
-	p_value=hsic.compute_p_value(statistic)
-	thresh=hsic.compute_threshold(alpha)
-	print "p_value:", p_value
-	print "threshold for 0.05 alpha:", thresh
-	print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value<alpha
+	p_value_boot=hsic.compute_p_value(statistic)
+	thresh_boot=hsic.compute_threshold(alpha)
+	print "p_value:", p_value_boot
+	print "threshold for 0.05 alpha:", thresh_boot
+	print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha
 
 	print "computing p-value using gamma method"
 	hsic.set_null_approximation_method(HSIC_GAMMA)
-	p_value=hsic.compute_p_value(statistic)
-	thresh=hsic.compute_threshold(alpha)
-	print "p_value:", p_value
-	print "threshold for 0.05 alpha:", thresh
-	print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value<alpha
+	p_value_gamma=hsic.compute_p_value(statistic)
+	thresh_gamma=hsic.compute_threshold(alpha)
+	print "p_value:", p_value_gamma
+	print "threshold for 0.05 alpha:", thresh_gamma
+	print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value_gamma<alpha
 
 	# sample from null distribution (these may be plotted or whatsoever)
 	# mean should be close to zero, variance stronly depends on data/kernel
@@ -96,7 +98,9 @@ def statistics_hsic ():
 	print "null mean:", mean(null_samples)
 	print "null variance:", var(null_samples)
 	#hist(null_samples, 100); show()
+
+	return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
 
 if __name__=='__main__':
 	print('HSIC')
-	statistics_hsic()
+	statistics_hsic(*parameter_list[0])
diff --git a/examples/undocumented/python_modular/statistics_kmm.py b/examples/undocumented/python_modular/statistics_kmm.py
@@ -2,13 +2,20 @@
 from numpy import *
 from numpy import random
 
-def statistics_kmm ():
+parameter_list = [[10,3]]
+
+def statistics_kmm (n,d):
 	from shogun.Features import RealFeatures
 	from shogun.Features import DataGenerator
 	from shogun.Kernel import GaussianKernel, MSG_DEBUG
 	from shogun.Statistics import KernelMeanMatching
+	from shogun.Mathematics import Math
+
+	# init seed for reproducibility
+	Math.init_random(1)
+	random.seed(1);
 
-	data = random.randn(3,10)
+	data = random.randn(d,n)
 
 	# create shogun feature representation
 	features=RealFeatures(data)
@@ -22,7 +29,8 @@ def statistics_kmm ():
 	kmm = KernelMeanMatching(kernel,array([0,1,2,3,7,8,9],dtype=int32),array([4,5,6],dtype=int32))
 	w = kmm.compute_weights()
 	print w
+	return w
 
 if __name__=='__main__':
 	print('KernelMeanMatching')
-	statistics_kmm()
+	statistics_kmm(*parameter_list[0])
diff --git a/examples/undocumented/python_modular/statistics_linear_time_mmd.py b/examples/undocumented/python_modular/statistics_linear_time_mmd.py
@@ -5,11 +5,13 @@
 # the Free Software Foundation either version 3 of the License, or
 # (at your option) any later version.
 #
-# Written (C) 2012 Heiko Strathmann
+# Written (C) 2012-2013 Heiko Strathmann
 #
 from numpy import *
 
-def statistics_linear_time_mmd ():
+parameter_list = [[1000,2,0.5]]
+
+def statistics_linear_time_mmd (n,dim,difference):
 	from shogun.Features import RealFeatures
 	from shogun.Features import MeanShiftDataGenerator
 	from shogun.Kernel import GaussianKernel
@@ -18,11 +20,11 @@ def statistics_linear_time_mmd ():
 	from shogun.Distance import EuclideanDistance
 	from shogun.Mathematics import Statistics, Math
 
+	# init seed for reproducibility
+	Math.init_random(1)
+
 	# note that the linear time statistic is designed for much larger datasets
 	# so increase to get reasonable results
-	n=1000
-	dim=2
-	difference=0.5
 
 	# streaming data generator for mean shift distributions
 	gen_p=MeanShiftDataGenerator(0, dim)
@@ -63,15 +65,15 @@ def statistics_linear_time_mmd ():
 	print "computing p-value using bootstrapping"
 	mmd.set_null_approximation_method(BOOTSTRAP)
 	mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed
-	p_value=mmd.compute_p_value(statistic)
-	print "p_value:", p_value
-	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
+	p_value_boot=mmd.compute_p_value(statistic)
+	print "p_value_boot:", p_value_boot
+	print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha
 
 	print "computing p-value using gaussian approximation"
 	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
-	p_value=mmd.compute_p_value(statistic)
-	print "p_value:", p_value
-	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
+	p_value_gaussian=mmd.compute_p_value(statistic)
+	print "p_value_gaussian:", p_value_gaussian
+	print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha
 
 	# sample from null distribution (these may be plotted or whatsoever)
 	# mean should be close to zero, variance stronly depends on data/kernel
@@ -98,7 +100,8 @@ def statistics_linear_time_mmd ():
 
 	print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)
 
+	return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
 
 if __name__=='__main__':
 	print('LinearTimeMMD')
-	statistics_linear_time_mmd()
+	statistics_linear_time_mmd(*parameter_list[0])