Skip to content

Commit

Permalink
Dataset for dealing with precomputed kernels.
Browse files Browse the repository at this point in the history
  • Loading branch information
fullung committed Jul 14, 2006
1 parent 6c5f3df commit 69cd19a
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 52 deletions.
66 changes: 39 additions & 27 deletions Lib/sandbox/svm/dataset.py
Expand Up @@ -44,34 +44,31 @@ def __init__(self, kernel, origdata=None):

self.iddatamap = {}

# Create Gram matrix as a list of vectors that have extra
# entries for id and end of record marker.
# Create Gram matrix as a list of vectors with an extra entry
# for the id field.
n = len(origdata)
grammat = [N.empty((n+2,), dtype=libsvm.svm_node_dtype)
grammat = [N.empty((n+1,), dtype=libsvm.svm_node_dtype)
for i in range(n)]
self.grammat = grammat

# Calculate Gram matrix. Refer to Kernel::kernel_precomputed
# in svm.cpp to see how this precomputed setup works.
for i, (y1, x1) in enumerate(origdata):
for i, (yi, xi) in enumerate(origdata):
id = i + 1
# XXX possible numpy bug
#grammat[i][[0,-1]] = (0, id), (-1, 0.0)
grammat[i][0] = 0, id
grammat[i][-1] = -1, 0.0
for j, (y2, x2) in enumerate(origdata[i:]):
# Map id to original vector so that we can find it again
# after the model has been trained. libsvm essentially
# provides the ids of the support vectors.
self.iddatamap[id] = xi
for j, (yj, xj) in enumerate(origdata[i:]):
# Gram matrix is symmetric, so calculate dot product
# once and store it in both required locations
z = kernel(x1, x2, svm_node_dot)
z = self.kernel(xi, xj, svm_node_dot)
# fix index so we assign to the right place
j += i
grammat[i][j+1] = 0, z
grammat[j][i+1] = 0, z
# Map id to original vector so that we can find it again
# after the model has been trained. libsvm essentially
# provides the ids of the support vectors.
self.iddatamap[id] = x1

grammat[i][j + 1] = 0, z
grammat[j][i + 1] = 0, z

def getdata(self):
return zip(map(lambda x: x[0], self.origdata), self.grammat)
data = property(getdata)
Expand All @@ -89,30 +86,45 @@ def combine(self, dataset):
Combine this dataset with another dataset by extending the
Gram matrix with the new inner products into a new matrix.
"""
n = len(self.origdata) + len(dataset.data)
n = len(self.origdata) + len(dataset.data) + 1
newgrammat = []

# copy original Gram matrix
for i in range(len(self.origdata)):
row = N.empty((n,), dtype=libsvm.svm_node_dtype)
row[:-1] = self.grammat[i]
newgrammat.append(row)

# copy id->vector map
newiddatamap = dict(self.iddatamap.items())
newrow = N.zeros((n,), dtype=libsvm.svm_node_dtype)
oldrow = self.grammat[i]
newrow[:len(oldrow)] = oldrow
newgrammat.append(newrow)

# prepare Gram matrix for new data
for i in range(len(dataset.data)):
id = i + len(self.origdata) + 1
row = N.empty((n,), dtype=libsvm.svm_node_dtype)
row[[0,-1]] = (0, id), (-1, 0.0)
row = N.zeros((n,), dtype=libsvm.svm_node_dtype)
newgrammat.append(row)
newiddatamap[id] = dataset.data[i][1]

newiddatamap = dict(self.iddatamap.items())
m = len(self.origdata)
for i, (yi, xi) in enumerate(dataset.data):
i += m
for j, (yj, xj) in enumerate(self.origdata):
z = self.kernel(xi, xj, svm_node_dot)
newgrammat[i][j + 1] = 0, z
newgrammat[j][i + 1] = 0, z
for i, (yi, xi) in enumerate(dataset.data):
k = m + i
id = k + 1
newgrammat[k][0] = 0, id
newiddatamap[id] = xi
for j, (yj, xj) in enumerate(dataset.data[i:]):
z = self.kernel(xi, xj, svm_node_dot)
j += k
newgrammat[k][j + 1] = 0, z
newgrammat[j][k + 1] = 0, z

newdataset = self.__class__(self.kernel)
newdataset.origdata = self.origdata + dataset.data
newdataset.iddatamap = newiddatamap
newdataset.grammat = newgrammat
return newdataset

class LibSvmRegressionDataSet(LibSvmDataSet):
def __init__(self, origdata):
Expand Down
4 changes: 1 addition & 3 deletions Lib/sandbox/svm/tests/test_classification.py
@@ -1,13 +1,11 @@
from numpy.testing import *
import numpy as N

from numpy.testing import *
set_local_path('../..')

from svm.classification import *
from svm.dataset import LibSvmClassificationDataSet
from svm.dataset import LibSvmTestDataSet
from svm.kernel import *

restore_path()

class test_classification(NumpyTestCase):
Expand Down
25 changes: 15 additions & 10 deletions Lib/sandbox/svm/tests/test_dataset.py
@@ -1,13 +1,11 @@
from numpy.testing import *
import numpy as N

from numpy.testing import *
set_local_path('../..')

from svm.dataset import *
from svm.kernel import *
from svm.dataset import convert_to_svm_node, svm_node_dot
from svm.libsvm import svm_node_dtype

restore_path()

class test_dataset(NumpyTestCase):
Expand Down Expand Up @@ -95,23 +93,30 @@ def check_precompute(self):
# get a new dataset containing the precomputed data
pcdata = origdata.precompute(kernel)
for i, row in enumerate(pcdata.grammat):
valuerow = row[1:-1]['value']
valuerow = row[1:]['value']
assert_array_almost_equal(valuerow, expt_grammat[i])

def check_combine(self):
kernel = LinearKernel()

y1 = N.random.randn(2)
x1 = N.random.randn(len(y1), 2)
y1 = N.random.randn(10)
x1 = N.random.randn(len(y1), 10)
origdata = LibSvmRegressionDataSet(zip(y1, x1))
pcdata = origdata.precompute(kernel)

y2 = N.random.randn(1)
y2 = N.random.randn(5)
x2 = N.random.randn(len(y2), x1.shape[1])
moredata = LibSvmRegressionDataSet(zip(y2, x2))

#pcdata.combine(moredata)
#pcdata.copy_and_extend(moredata)
morepcdata = pcdata.combine(moredata)

expt_grammat = N.empty((len(y1) + len(y2),)*2)
x = N.vstack([x1,x2])
for i, xi in enumerate(x):
for j, xj in enumerate(x):
expt_grammat[i, j] = kernel(xi, xj, N.dot)
for i, row in enumerate(morepcdata.grammat):
valuerow = row[1:]['value']
assert_array_almost_equal(valuerow, expt_grammat[i])

if __name__ == '__main__':
NumpyTest().run()
4 changes: 1 addition & 3 deletions Lib/sandbox/svm/tests/test_kernel.py
@@ -1,10 +1,8 @@
from numpy.testing import *
import numpy as N

from numpy.testing import *
set_local_path('../..')

from svm.kernel import *

restore_path()

class test_kernel(NumpyTestCase):
Expand Down
4 changes: 1 addition & 3 deletions Lib/sandbox/svm/tests/test_libsvm.py
@@ -1,10 +1,8 @@
from numpy.testing import *
import numpy as N

from numpy.testing import *
set_local_path('../..')

import svm.libsvm as libsvm

restore_path()

class test_libsvm(NumpyTestCase):
Expand Down
4 changes: 1 addition & 3 deletions Lib/sandbox/svm/tests/test_oneclass.py
@@ -1,13 +1,11 @@
from numpy.testing import *
import numpy as N

from numpy.testing import *
set_local_path('../..')

from svm.oneclass import *
from svm.dataset import LibSvmOneClassDataSet
from svm.dataset import LibSvmTestDataSet
from svm.kernel import *

restore_path()

class test_oneclass(NumpyTestCase):
Expand Down
4 changes: 1 addition & 3 deletions Lib/sandbox/svm/tests/test_regression.py
@@ -1,13 +1,11 @@
from numpy.testing import *
import numpy as N

from numpy.testing import *
set_local_path('../..')

from svm.regression import *
from svm.dataset import LibSvmRegressionDataSet
from svm.dataset import LibSvmTestDataSet
from svm.kernel import *

restore_path()

class test_regression(NumpyTestCase):
Expand Down

0 comments on commit 69cd19a

Please sign in to comment.