Skip to content

Commit

Permalink
Merge pull request #911 from karlnapf/master
Browse files Browse the repository at this point in the history
added integration tests
  • Loading branch information
karlnapf committed Mar 8, 2013
2 parents f7f3431 + 45e0d17 commit 9fe4085
Show file tree
Hide file tree
Showing 13 changed files with 271 additions and 158 deletions.
7 changes: 7 additions & 0 deletions examples/descriptions/modular/statistics_hsic.txt
@@ -0,0 +1,7 @@
In this example, HSIC, a kernel-based test for independence, is used to detect
dependence between a mixture of Gaussians and a rotated version of the same data.
The HSIC statistic is computed and available methods for computing a threshold
of the null distribution are used. In addition, p-values of the test are
computed. Note that these methods require more iterations than used here. A
Gaussian kernel is selected via the median heuristic.
See tutorial and Class documentation for more details.
10 changes: 10 additions & 0 deletions examples/descriptions/modular/statistics_linear_time_mmd.txt
@@ -0,0 +1,10 @@
In this example, the linear time MMD statistic for kernel-based two-sample
testing is illustrated. It is a streaming based statistic for large amounts
of data. The dataset used is a bunch of standard Gaussian vectors where the
first dimension differs between the distributions p and q. The test statistic
is computed and available methods for computing a threshold of the null
distribution are used. In addition, p-values for the test are computed.
Note that these methods require more iterations/samples than used here. A
Gaussian kernel is selected via the median heuristic. There are more clever
kernel selection methods available.
See tutorial and Class documentation for more details.
10 changes: 10 additions & 0 deletions examples/descriptions/modular/statistics_mmd_kernel_selection.txt
@@ -0,0 +1,10 @@
In this example, kernel selection methods for MMD based statistics are
illustrated. A difficult synthetic dataset is used to illustrate their
performance in two-sample testing. All kernel selection methods for MMD
work via creating a combined kernel with all desired baseline kernels.
The example demonstrates how to perform kernel selection and use it
for two-sample testing. Methods for both single and combined kernels
are demonstrated. In addition, type I and II error estimates
are computed. As usual, there are more iterations/samples required in
practice.
See tutorial and Class documentation for more details.
10 changes: 10 additions & 0 deletions examples/descriptions/modular/statistics_quadratic_time_mmd.txt
@@ -0,0 +1,10 @@
In this example, the quadratic time MMD statistic for kernel-based two-sample
testing is illustrated. It is a statistic for smaller amounts of data where
one is interested in computing the best possible test. The dataset used is a
bunch of standard Gaussian vectors where the first dimension differs between the
distributions p and q. The test statistic is computed and available methods
for computing a threshold of the null distribution are used. In addition,
p-values for the test are computed. Note that these methods require more
iterations/samples than used here. A Gaussian kernel with a fixed kernel size is
used. There are more clever kernel selection methods available.
See tutorial and Class documentation for more details.
Expand Up @@ -5,11 +5,11 @@
# the Free Software Foundation either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012 Heiko Strathmann
# Written (C) 2012-2013 Heiko Strathmann
#

from numpy import array
from numpy.random import rand
from numpy import random

from shogun.Evaluation import CrossValidation, CrossValidationResult
from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
Expand All @@ -22,6 +22,7 @@
from shogun.ModelSelection import GridSearchModelSelection
from shogun.ModelSelection import ModelSelectionParameters, R_EXP, R_LINEAR
from shogun.ModelSelection import ParameterCombination
from shogun.Mathematics import Math

def create_param_tree():
root=ModelSelectionParameters()
Expand Down Expand Up @@ -75,14 +76,15 @@ def create_param_tree():

return root

parameter_list = [[3,20,3]]

def modelselection_grid_search_kernel ():
num_subsets=3
num_vectors=20
dim_vectors=3

def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors):
# init seed for reproducability
Math.init_random(1)
random.seed(1);
# create some (non-sense) data
matrix=rand(dim_vectors, num_vectors)
matrix=random.rand(dim_vectors, num_vectors)

# create num_feautres 2-dimensional vectors
features=RealFeatures()
Expand Down Expand Up @@ -127,11 +129,11 @@ def modelselection_grid_search_kernel ():
cross.set_num_runs(10)
cross.set_conf_int_alpha(0.01)
result=cross.evaluate()
print("result: ")
#result.print_result()
casted=CrossValidationResult.obtain_from_generic(result);
print "result mean:", casted.mean

return 0
return classifier,result,casted.mean

if __name__=='__main__':
print('ModelselectionGridSearchKernel')
modelselection_grid_search_kernel()
modelselection_grid_search_kernel(*parameter_list[0])
Expand Up @@ -147,22 +147,21 @@ def construct_features(features):

return feat_comb

parameter_list = [[200, 1, 100]]

def serialization_string_kernels_modular():
def serialization_string_kernels_modular(n_data, num_shifts, size):
"""
serialize svm with string kernels
"""

##################################################
# set up toy data and svm
train_xt, train_lt = generate_random_data(200)
test_xt, test_lt = generate_random_data(200)
train_xt, train_lt = generate_random_data(n_data)
test_xt, test_lt = generate_random_data(n_data)

feats_train = construct_features(train_xt)
feats_test = construct_features(test_xt)

num_shifts = 1
size = 100
max_len = len(train_xt[0])
kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
shifts_vector = numpy.ones(max_len, dtype=numpy.int32)*num_shifts
Expand Down Expand Up @@ -212,9 +211,9 @@ def serialization_string_kernels_modular():

print("all checks passed.")

return True
return out,out2


if __name__=='__main__':
serialization_string_kernels_modular()
serialization_string_kernels_modular(*parameter_list[0])

38 changes: 21 additions & 17 deletions examples/undocumented/python_modular/statistics_hsic.py
Expand Up @@ -5,26 +5,28 @@
# the Free Software Foundation either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012 Heiko Strathmann
# Written (C) 2012-2013 Heiko Strathmann
#
from numpy import *
#from pylab import *
from math import pi

def statistics_hsic ():
parameter_list = [[250,3,3]]

def statistics_hsic (n, difference, angle):
from shogun.Features import RealFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel
from shogun.Statistics import HSIC
from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
from shogun.Distance import EuclideanDistance
from shogun.Mathematics import Statistics, IntVector
from shogun.Mathematics import Math, Statistics, IntVector

# init seed for reproducability
Math.init_random(1)

# note that the HSIC has to store kernel matrices
# which upper bounds the sample size
n=250
difference=3
angle=pi/3

# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
Expand Down Expand Up @@ -72,19 +74,19 @@ def statistics_hsic ():
# normally, at least 250 iterations should be done, but that takes long
hsic.set_bootstrap_iterations(100)
# bootstrapping allows usage of unbiased or biased statistic
p_value=hsic.compute_p_value(statistic)
thresh=hsic.compute_threshold(alpha)
print "p_value:", p_value
print "threshold for 0.05 alpha:", thresh
print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value<alpha
p_value_boot=hsic.compute_p_value(statistic)
thresh_boot=hsic.compute_threshold(alpha)
print "p_value:", p_value_boot
print "threshold for 0.05 alpha:", thresh_boot
print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha

print "computing p-value using gamma method"
hsic.set_null_approximation_method(HSIC_GAMMA)
p_value=hsic.compute_p_value(statistic)
thresh=hsic.compute_threshold(alpha)
print "p_value:", p_value
print "threshold for 0.05 alpha:", thresh
print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value<alpha
p_value_gamma=hsic.compute_p_value(statistic)
thresh_gamma=hsic.compute_threshold(alpha)
print "p_value:", p_value_gamma
print "threshold for 0.05 alpha:", thresh_gamma
print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value_gamma<alpha

# sample from null distribution (these may be plotted or whatsoever)
# mean should be close to zero, variance stronly depends on data/kernel
Expand All @@ -96,7 +98,9 @@ def statistics_hsic ():
print "null mean:", mean(null_samples)
print "null variance:", var(null_samples)
#hist(null_samples, 100); show()

return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples

if __name__=='__main__':
print('HSIC')
statistics_hsic()
statistics_hsic(*parameter_list[0])
14 changes: 11 additions & 3 deletions examples/undocumented/python_modular/statistics_kmm.py
Expand Up @@ -2,13 +2,20 @@
from numpy import *
from numpy import random

def statistics_kmm ():
parameter_list = [[10,3]]

def statistics_kmm (n,d):
from shogun.Features import RealFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel, MSG_DEBUG
from shogun.Statistics import KernelMeanMatching
from shogun.Mathematics import Math

# init seed for reproducability
Math.init_random(1)
random.seed(1);

data = random.randn(3,10)
data = random.randn(d,n)

# create shogun feature representation
features=RealFeatures(data)
Expand All @@ -22,7 +29,8 @@ def statistics_kmm ():
kmm = KernelMeanMatching(kernel,array([0,1,2,3,7,8,9],dtype=int32),array([4,5,6],dtype=int32))
w = kmm.compute_weights()
print w
return w

if __name__=='__main__':
print('KernelMeanMatching')
statistics_kmm()
statistics_kmm(*parameter_list[0])
27 changes: 15 additions & 12 deletions examples/undocumented/python_modular/statistics_linear_time_mmd.py
Expand Up @@ -5,11 +5,13 @@
# the Free Software Foundation either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012 Heiko Strathmann
# Written (C) 2012-2013 Heiko Strathmann
#
from numpy import *

def statistics_linear_time_mmd ():
parameter_list = [[1000,2,0.5]]

def statistics_linear_time_mmd (n,dim,difference):
from shogun.Features import RealFeatures
from shogun.Features import MeanShiftDataGenerator
from shogun.Kernel import GaussianKernel
Expand All @@ -18,11 +20,11 @@ def statistics_linear_time_mmd ():
from shogun.Distance import EuclideanDistance
from shogun.Mathematics import Statistics, Math

# init seed for reproducability
Math.init_random(1)

# note that the linear time statistic is designed for much larger datasets
# so increase to get reasonable results
n=1000
dim=2
difference=0.5

# streaming data generator for mean shift distributions
gen_p=MeanShiftDataGenerator(0, dim)
Expand Down Expand Up @@ -63,15 +65,15 @@ def statistics_linear_time_mmd ():
print "computing p-value using bootstrapping"
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
p_value_boot=mmd.compute_p_value(statistic)
print "p_value_boot:", p_value_boot
print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha

print "computing p-value using gaussian approximation"
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
p_value_gaussian=mmd.compute_p_value(statistic)
print "p_value_gaussian:", p_value_gaussian
print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha

# sample from null distribution (these may be plotted or whatsoever)
# mean should be close to zero, variance stronly depends on data/kernel
Expand All @@ -98,7 +100,8 @@ def statistics_linear_time_mmd ():

print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors

if __name__=='__main__':
print('LinearTimeMMD')
statistics_linear_time_mmd()
statistics_linear_time_mmd(*parameter_list[0])

0 comments on commit 9fe4085

Please sign in to comment.