Commit 7afe7e4

adding svhn fully.

Ragav Venkatesan committed Mar 15, 2017
1 parent 3783b39
Showing 2 changed files with 221 additions and 6 deletions.
4 changes: 2 additions & 2 deletions pantry/matlab/make_svhn.m
@@ -52,15 +52,15 @@
% Going to throw away 420 samples.
throw_away = 420;
batch_size = 500;
+ test_size = 130;
+ train_size = 1000;

data = x (1:length(x) - throw_away,:);
labels = y (1:length(y) - throw_away) - 1; % labels run 1-10 in the .mat files; shift to 0-9

total_batches = length(labels) / batch_size;
- test_size = 130;
remain = total_batches - test_size;

- train_size = 1000;
remain = remain - train_size;
valid_size = remain;

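The moved lines (marked + and - above) only relocate the two size constants next to batch_size; the arithmetic is unchanged: after discarding 420 samples the data is cut into 500-sample batches, 130 of which go to test, 1000 to train, and whatever remains to validation. A minimal sketch of the same bookkeeping in Python (n_samples is a stand-in, not the actual SVHN count):

# Sketch of the batch arithmetic in make_svhn.m; n_samples is hypothetical.
n_samples = 604920   # placeholder total; the real count comes from the .mat arrays
throw_away = 420     # dropped so the remainder divides evenly into batches
batch_size = 500
test_size = 130      # batches reserved for testing
train_size = 1000    # batches reserved for training

total_batches = (n_samples - throw_away) // batch_size   # 1209 with this placeholder
valid_size = total_batches - test_size - train_size      # 79 with this placeholder
assert valid_size > 0, "not enough batches left for validation"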
223 changes: 219 additions & 4 deletions yann/special/datasets.py
@@ -418,11 +418,219 @@ def __init__(self,
"p" : 0
}

self.n_classes = len(self.splits['base']) + len(self.splits['shot'])
super(split_all,self).__init__( dataset_init_args = dataset_init_args,
save_directory = save_directory,
preprocess_init_args = self.preprocessor,
verbose = 1)

def _mat2yann (self, verbose = 1):
"""
Interal function. Use this to create matlab image datasets
This is modfied for the split dataset from the original ``setup_dataset`` class.
"""
if verbose >=2:
print (".. Creating a split dataset")

type = 'train'
if verbose >= 2:
print ( ".. creating data " + type )

batches = self.batches2train
new = True
for batch in xrange(batches): # for each batch_i file....
if verbose >= 3:
print ( "... batch " +str(batch) )

data_x_batch, data_y_batch = load_data_mat(location = self.location,
batch = batch,
type_set = type,
height = self.height,
width = self.width,
channels = self.channels)
if new is True:
data_x = data_x_batch
data_y = data_y_batch
new = False
else:
data_x = numpy.concatenate( (data_x, data_x_batch) , axis = 0)
data_y = numpy.concatenate( (data_y, data_y_batch) , axis = 0)

data_x, data_y = self._split_data (( data_x, data_y ), y1 = False )
data_x = preprocessing ( data = data_x,
height = self.height,
width = self.width,
channels = self.channels,
args = self.preprocessor )

training_sample_size = data_x.shape[0]
training_batches_available = int(numpy.floor(training_sample_size / self.mini_batch_size))

if training_batches_available < self.batches2train * self.mini_batches_per_batch[0]:
self.mini_batches_per_batch = ( training_batches_available/self.batches2train,
self.mini_batches_per_batch [1],
self.mini_batches_per_batch [2] )

if self.batches2train * self.mini_batches_per_batch[0] < self.cache_images[0]:
self.cache_images = (self.mini_batches_per_batch[0] * self.mini_batch_size, \
self.cache_images[1], self.cache_images[2])

data_x = data_x[:self.cache_images[0]]
data_y = data_y[:self.cache_images[0]]

loc = self.root + "/train/"
data_x = check_type(data_x, theano.config.floatX)
data_y = check_type(data_y, theano.config.floatX)

for batch in xrange(self.batches2train):
start_index = batch * self.cache_images[0]
end_index = start_index + self.cache_images[0]
data2save = (data_x [start_index:end_index,], data_y[start_index:end_index,] )
pickle_dataset(loc = loc, data = data2save, batch=batch)


batches = self.batches2validate
type = 'valid'
if verbose >= 2:
print ( ".. creating data " + type )

new = True
del(data_x)
del(data_y)

for batch in xrange(batches): # for each batch_i file....
if verbose >= 3:
print ( "... batch " +str(batch) )

data_x_batch, data_y_batch = load_data_mat(location = self.location,
batch = batch,
type_set = type,
height = self.height,
width = self.width,
channels = self.channels)

if new is True:
data_x = data_x_batch
data_y = data_y_batch
new = False
else:
data_x = numpy.concatenate( (data_x, data_x_batch) , axis = 0)
data_y = numpy.concatenate( (data_y, data_y_batch) , axis = 0)

data_x, data_y = self._split_data (( data_x, data_y ), y1 = False )
data_x = preprocessing ( data = data_x,
height = self.height,
width = self.width,
channels = self.channels,
args = self.preprocessor )


validation_sample_size = data_x.shape[0]
validation_batches_available = int(numpy.floor(
validation_sample_size / self.mini_batch_size))

if validation_batches_available < self.batches2validate * self.mini_batches_per_batch[1]:
self.mini_batches_per_batch = ( self.mini_batches_per_batch [0],
validation_batches_available/self.batches2validate,
self.mini_batches_per_batch [2] )

if self.batches2validate * self.mini_batches_per_batch[1] < self.cache_images[1]:
self.cache_images = ( self.cache_images[0],\
self.mini_batches_per_batch[1] * self.mini_batch_size, \
self.cache_images[2])

data_x = data_x[:self.cache_images[1]]
data_y = data_y[:self.cache_images[1]]

loc = self.root + "/valid/"
data_x = check_type(data_x, theano.config.floatX)
data_y = check_type(data_y, theano.config.floatX)

for batch in xrange(self.batches2validate):
start_index = batch * self.cache_images[1]
end_index = start_index + self.cache_images[1]
data2save = (data_x [start_index:end_index,], data_y[start_index:end_index,] )
pickle_dataset(loc = loc, data = data2save, batch=batch)

type = 'test'
if verbose >= 2:
print ( ".. creating data " + type )
batches = self.batches2test
new = True
del(data_x)
del(data_y)

for batch in xrange(batches): # for each batch_i file....
if verbose >= 3:
print ( "... batch " +str(batch) )

data_x_batch, data_y_batch = load_data_mat(location = self.location,
batch = batch,
type_set = type,
height = self.height,
width = self.width,
channels = self.channels)
if new is True:
data_x = data_x_batch
data_y = data_y_batch
new = False
else:
data_x = numpy.concatenate( (data_x, data_x_batch) , axis = 0)
data_y = numpy.concatenate( (data_y, data_y_batch) , axis = 0)

data_x, data_y = self._split_data (( data_x, data_y ), y1 = False )
data_x = preprocessing ( data = data_x,
height = self.height,
width = self.width,
channels = self.channels,
args = self.preprocessor )

testing_sample_size = data_x.shape[0]
testing_batches_available = int(numpy.floor(testing_sample_size / self.mini_batch_size))

if testing_batches_available < self.batches2test * self.mini_batches_per_batch[2]:
self.mini_batches_per_batch = ( self.mini_batches_per_batch [0],
self.mini_batches_per_batch [1],
testing_batches_available/self.batches2test )

if self.batches2test * self.mini_batches_per_batch[2] < self.cache_images[2]:
self.cache_images = ( self.cache_images[0],\
self.cache_images[1], \
self.mini_batches_per_batch[2] * self.mini_batch_size )

data_x = data_x[:self.cache_images[2]]
data_y = data_y[:self.cache_images[2]]

loc = self.root + "/test/"
data_x = check_type(data_x, theano.config.floatX)
data_y = check_type(data_y, theano.config.floatX)

for batch in xrange(self.batches2test):
start_index = batch * self.cache_images[2]
end_index = start_index + self.cache_images[2]
data2save = (data_x [start_index:end_index,], data_y[start_index:end_index,] )
pickle_dataset(loc = loc, data = data2save, batch=batch)

dataset_args = {
"location" : self.root,
"mini_batch_size" : self.mini_batch_size,
"cache_batches" : self.mini_batches_per_batch,
"batches2train" : self.batches2train,
"batches2test" : self.batches2test,
"batches2validate" : self.batches2validate,
"height" : self.height,
"width" : self.width,
"channels" : 1 if self.preprocessor ["grayscale"] else self.channels,
"cache" : self.cache,
"splits" : self.splits
}

assert ( self.height * self.width * self.channels == numpy.prod(data_x.shape[1:]) )
f = open(self.root + '/data_params.pkl', 'wb')
cPickle.dump(dataset_args, f, protocol=2)
f.close()
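The metadata pickled here mirrors what the loaded dataset expects. A minimal sketch of reading it back (standalone; the root path is a placeholder for the directory created at cook time):

import cPickle   # as used above; use pickle on Python 3

root = '_datasets/_dataset_xxxxx'   # placeholder dataset directory
f = open(root + '/data_params.pkl', 'rb')
dataset_args = cPickle.load(f)
f.close()
# Keys written above: location, mini_batch_size, cache_batches, batches2train,
# batches2test, batches2validate, height, width, channels, cache, splits.
print (dataset_args['mini_batch_size'], dataset_args['splits'])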

def _create_skdata_mnist(self, verbose = 1):
"""
Internal function. Use this to create mnist and cifar image datasets.
@@ -563,7 +771,8 @@ def _create_skdata_mnist(self, verbose = 1):
cPickle.dump(dataset_args, f, protocol=2)
f.close()

- def _split_data (self, data):
+ def _split_data (self, data, y1 = True):
"""
This is an internal method that will split the datasets.
@@ -574,7 +783,10 @@ def _split_data (self, data):
tuple: split data in the same format as data.
"""
n_shots = self.splits["p"]
- data_x, data_y, data_y1 = data
+ if y1 is True:
+     data_x, data_y, data_y1 = data
+ else:
+     data_x, data_y = data
locs = numpy.zeros(len(data_y), dtype = bool)
for label in xrange(self.n_classes + 1):
temp = numpy.zeros(len(data_y), dtype = bool)
@@ -590,8 +802,11 @@ def _split_data (self, data):
locs[temp] = True
data_x = data_x[locs]
data_y = data_y[locs]
- data_y1 = data_y1[locs]
- return (data_x, data_y, data_y1)
+ if y1 is True:
+     data_y1 = data_y1[locs]
+     return (data_x, data_y, data_y1)
+ else:
+     return (data_x, data_y)
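The new y1 flag lets _split_data accept the two-element (data, labels) tuples produced by the MATLAB loader alongside the three-element tuples used elsewhere. The collapsed middle of the method builds the boolean mask locs; a standalone sketch of that idea, assuming base classes keep every sample while shot classes keep at most p samples (split_mask is a hypothetical helper, not the class method itself):

import numpy

def split_mask(labels, base, shot, p):
    # Keep every sample of a base class; keep at most p samples per shot class.
    locs = numpy.zeros(len(labels), dtype = bool)
    for label in base:
        locs[labels == label] = True
    for label in shot:
        idx = numpy.where(labels == label)[0][:p]
        locs[idx] = True
    return locs

labels = numpy.array([0, 0, 1, 1, 1, 2, 2])
print (split_mask(labels, base = [0], shot = [1, 2], p = 1))
# -> [ True  True  True False False  True False]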

class split_only_train(setup_dataset):


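With both files in place, the split SVHN dataset can be cooked end to end. A hypothetical usage sketch (the keyword names follow the __init__ call at the top of the diff; the dataset_init_args keys are assumptions based on yann's matlab dataset convention, not taken from this commit):

from yann.special.datasets import split_all

# Assumed keys: matlab-sourced datasets in yann typically need a location plus
# image geometry and batch counts; verify against the setup_dataset docs.
data = split_all ( dataset_init_args = {
                        "source"           : "matlab",
                        "location"         : "_data/svhn",  # directory written by make_svhn.m
                        "height"           : 32,
                        "width"            : 32,
                        "channels"         : 3,
                        "mini_batch_size"  : 500,
                        "batches2train"    : 1000,
                        "batches2test"     : 130,
                        "batches2validate" : 79 },
                   save_directory = "_datasets",
                   verbose = 1 )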