
Commit

added option to not print confusion
Ragav Venkatesan (Student) authored and committed Mar 17, 2017
1 parent 5f51387 commit 8c55e6a
Showing 2 changed files with 230 additions and 36 deletions.
62 changes: 35 additions & 27 deletions yann/modules/resultor.py
@@ -22,6 +22,7 @@ class resultor(module):
"learning_rate" : "<learning_rate_file_name>.txt"
"momentum" : <momentum_file_name>.txt
"visualize" : <bool>
"print_confusion" : <bool>
"id" : id of the resultor
}
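For context, a minimal sketch of how this new flag might be supplied when the module is wired up. The concrete file names, the `id` value, and the `net.add_module` call are assumptions for illustration, not part of this commit:

    # Hypothetical setup; assumes a yann network object `net` already exists.
    resultor_params = {
        "root"            : ".",                  # directory for result files
        "learning_rate"   : "learning_rate.txt",
        "momentum"        : "momentum.txt",
        "visualize"       : True,
        "print_confusion" : False,                # the new flag: skip confusion output
        "id"              : "main"
    }
    net.add_module(type = 'resultor', params = resultor_params, verbose = 2)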
@@ -69,6 +70,9 @@ def __init__( self, resultor_init_args, verbose = 1):
if not "viualize" in resultor_init_args.keys():
resultor_init_args["visualize"] = True

if not"print_confusion" in resultor_init_args.keys():
resultor_init_args["print_confusion"] = False

for item, value in resultor_init_args.iteritems():
if item == "root":
self.root = value
@@ -84,6 +88,8 @@ def __init__( self, resultor_init_args, verbose = 1):
                self.learning_rate = value
            elif item == "momentum":
                self.momentum = value
            elif item == "print_confusion":
                self.print_confusion = value
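                # caution: this attribute has the same name as the
                # print_confusion method below, so it shadows that method
                # on instances.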


        if not hasattr(self, 'root'): raise Exception('root variable has not been provided. \
@@ -159,35 +165,37 @@ def print_confusion (self, epoch=0, train = None, valid = None, test = None, ver
            test: testing confusion matrix as gained by the test method.
            verbose: As usual.
        """
        if self.print_confusion is True:
            if verbose >= 3:
                print ("... Printing confusion matrix")
            if not os.path.exists(self.root + '/confusion'):
                if verbose >= 3:
                    print ("... Creating a root directory for saving confusions")
                os.makedirs(self.root + '/confusion')

            location = self.root + '/confusion' + '/epoch_' + str(epoch)
            if not os.path.exists( location ):
                if verbose >= 3:
                    print ("... Making the epoch directory")
                os.makedirs (location)

            if verbose >= 3:
                print ("... Saving down the confusion matrix")

            if not train is None:
                self._store_confusion_img (confusion = train,
                                           filename = location + '/train_confusion.eps',
                                           verbose = 2)
            if not valid is None:
                self._store_confusion_img (confusion = valid,
                                           filename = location + '/valid_confusion.eps',
                                           verbose = 2)
            if not test is None:
                self._store_confusion_img (confusion = test,
                                           filename = location + '/test_confusion.eps',
                                           verbose = 2)

    def _store_confusion_img (self, confusion, filename, verbose = 2):
        """
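The body of `_store_confusion_img` is collapsed above. As a rough standalone sketch of what saving a confusion matrix to an EPS file can look like (an assumed matplotlib-based rendering, illustrative only, not the actual yann implementation):

    import numpy
    import matplotlib
    matplotlib.use('Agg')                    # render off-screen, no display needed
    import matplotlib.pyplot as plt

    def store_confusion_img (confusion, filename):
        # draw the matrix as a grayscale image and write it out as EPS
        fig, ax = plt.subplots()
        ax.imshow(confusion, interpolation = 'nearest', cmap = plt.cm.gray)
        ax.set_xlabel('predicted label')
        ax.set_ylabel('true label')
        fig.savefig(filename, format = 'eps')
        plt.close(fig)

    store_confusion_img(numpy.eye(10), 'train_confusion.eps')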
204 changes: 195 additions & 9 deletions yann/special/datasets.py
@@ -971,7 +971,188 @@ def _create_skdata_mnist(self, verbose = 1):
        cPickle.dump(dataset_args, f, protocol=2)
        f.close()

    def _mat2yann (self, verbose = 1):
        """
        Internal function. Use this to create matlab image datasets.
        This is modified for the split dataset from the original ``setup_dataset`` class.
        """
        if verbose >= 2:
            print (".. Creating a split dataset")

        type = 'train'
        if verbose >= 2:
            print ( ".. creating data " + type )

        batches = self.batches2train
        new = True
        for batch in xrange(batches): # for each batch_i file....
            if verbose >= 3:
                print ( "... batch " + str(batch) )

            data_x_batch, data_y_batch = load_data_mat(location = self.location,
                                                       batch = batch,
                                                       type_set = type,
                                                       height = self.height,
                                                       width = self.width,
                                                       channels = self.channels)
            if new is True:
                data_x = data_x_batch
                data_y = data_y_batch
                new = False
            else:
                data_x = numpy.concatenate( (data_x, data_x_batch) , axis = 0)
                data_y = numpy.concatenate( (data_y, data_y_batch) , axis = 0)

        data_x, data_y = self._split_data (( data_x, data_y ), y1 = False )
        data_x = preprocessing ( data = data_x,
                                 height = self.height,
                                 width = self.width,
                                 channels = self.channels,
                                 args = self.preprocessor )

        training_sample_size = data_x.shape[0]
        training_mini_batches_available = int(numpy.floor(training_sample_size / self.mini_batch_size))

        if training_mini_batches_available < self.batches2train * self.mini_batches_per_batch[0]:
            # self.mini_batches_per_batch = ( training_batches_available / self.batches2train,
            #                                 self.mini_batches_per_batch [1],
            #                                 self.mini_batches_per_batch [2] )
            self.batches2train = int(numpy.floor(training_mini_batches_available / self.mini_batches_per_batch[0]))

        loc = self.root + "/train/"
        data_x = check_type(data_x, theano.config.floatX)
        data_y = check_type(data_y, theano.config.floatX)

        for batch in xrange(self.batches2train):
            start_index = batch * self.cache_images[0]
            end_index = start_index + self.cache_images[0]
            data2save = (data_x [start_index:end_index,], data_y[start_index:end_index,] )
            pickle_dataset(loc = loc, data = data2save, batch = batch)


        type = 'valid'
        if verbose >= 2:
            print ( ".. creating data " + type )
        batches = self.batches2validate
        new = True
        del(data_x)
        del(data_y)

        for batch in xrange(batches): # for each batch_i file....
            if verbose >= 3:
                print ( "... batch " + str(batch) )

            data_x_batch, data_y_batch = load_data_mat(location = self.location,
                                                       batch = batch,
                                                       type_set = type,
                                                       height = self.height,
                                                       width = self.width,
                                                       channels = self.channels)

            if new is True:
                data_x = data_x_batch
                data_y = data_y_batch
                new = False
            else:
                data_x = numpy.concatenate( (data_x, data_x_batch) , axis = 0)
                data_y = numpy.concatenate( (data_y, data_y_batch) , axis = 0)

        # data_x, data_y = self._split_data (( data_x, data_y ), y1 = False )
        data_x = preprocessing ( data = data_x,
                                 height = self.height,
                                 width = self.width,
                                 channels = self.channels,
                                 args = self.preprocessor )

        validation_sample_size = data_x.shape[0]
        validation_mini_batches_available = int(numpy.floor(
                                            validation_sample_size / self.mini_batch_size))

        if validation_mini_batches_available < self.batches2validate * self.mini_batches_per_batch[1]:
            self.batches2validate = int(numpy.floor(validation_mini_batches_available \
                                                    / self.mini_batches_per_batch[1]))

        loc = self.root + "/valid/"
        data_x = check_type(data_x, theano.config.floatX)
        data_y = check_type(data_y, theano.config.floatX)

        for batch in xrange(self.batches2validate):
            start_index = batch * self.cache_images[1]
            end_index = start_index + self.cache_images[1]
            data2save = (data_x [start_index:end_index,], data_y[start_index:end_index,] )
            pickle_dataset(loc = loc, data = data2save, batch = batch)

        type = 'test'
        if verbose >= 2:
            print ( ".. creating data " + type )
        batches = self.batches2test
        new = True
        del(data_x)
        del(data_y)

        for batch in xrange(batches): # for each batch_i file....
            if verbose >= 3:
                print ( "... batch " + str(batch) )

            data_x_batch, data_y_batch = load_data_mat(location = self.location,
                                                       batch = batch,
                                                       type_set = type,
                                                       height = self.height,
                                                       width = self.width,
                                                       channels = self.channels)
            if new is True:
                data_x = data_x_batch
                data_y = data_y_batch
                new = False
            else:
                data_x = numpy.concatenate( (data_x, data_x_batch) , axis = 0)
                data_y = numpy.concatenate( (data_y, data_y_batch) , axis = 0)

        # data_x, data_y = self._split_data (( data_x, data_y ), y1 = False )
        data_x = preprocessing ( data = data_x,
                                 height = self.height,
                                 width = self.width,
                                 channels = self.channels,
                                 args = self.preprocessor )

        testing_sample_size = data_x.shape[0]
        testing_mini_batches_available = int(numpy.floor(testing_sample_size / self.mini_batch_size))

        if testing_mini_batches_available < self.batches2test * self.mini_batches_per_batch[2]:
            self.batches2test = int(numpy.floor(testing_mini_batches_available \
                                                / self.mini_batches_per_batch[2]))

        loc = self.root + "/test/"
        data_x = check_type(data_x, theano.config.floatX)
        data_y = check_type(data_y, theano.config.floatX)

        for batch in xrange(self.batches2test):
            start_index = batch * self.cache_images[2]
            end_index = start_index + self.cache_images[2]
            data2save = (data_x [start_index:end_index,], data_y[start_index:end_index,] )
            pickle_dataset(loc = loc, data = data2save, batch = batch)

        dataset_args = {
                "location"           : self.root,
                "mini_batch_size"    : self.mini_batch_size,
                "cache_batches"      : self.mini_batches_per_batch,
                "batches2train"      : self.batches2train,
                "batches2test"       : self.batches2test,
                "batches2validate"   : self.batches2validate,
                "height"             : self.height,
                "width"              : self.width,
                "channels"           : 1 if self.preprocessor ["grayscale"] else self.channels,
                "cache"              : self.cache,
                "splits"             : self.splits
                }

        assert ( self.height * self.width * self.channels == numpy.prod(data_x.shape[1:]) )
        f = open(self.root + '/data_params.pkl', 'wb')
        cPickle.dump(dataset_args, f, protocol=2)
        f.close()

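The three sections above share the same cache-sizing arithmetic: count how many full mini-batches the concatenated data can supply, shrink `batches2train` (or its valid/test counterparts) when the request cannot be met, then pickle fixed-size slices. A standalone sketch of that logic with toy numbers (illustrative only, not yann code):

    import numpy

    mini_batch_size = 100
    mini_batches_per_batch = 5                  # mini-batches held in one cached batch
    cache_images = mini_batch_size * mini_batches_per_batch
    batches2train = 4                           # requested cache batches

    data = numpy.random.rand(1750, 784)         # only 17 full mini-batches fit

    available = int(numpy.floor(data.shape[0] / mini_batch_size))
    if available < batches2train * mini_batches_per_batch:
        # shrink the request so every saved batch is completely full
        batches2train = int(numpy.floor(available / mini_batches_per_batch))

    for batch in range(batches2train):          # mirrors the pickling loops above
        start = batch * cache_images
        chunk = data[start : start + cache_images]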
    def _split_data (self, data, y1 = True):
        """
        This is an internal method that will split the datasets.
@@ -982,25 +1163,30 @@ def _split_data (self, data):
            tuple: split data in the same format as data.
        """
n_shots = self.splits["p"]
data_x, data_y, data_y1 = data
if y1 is True:
data_x, data_y, data_y1 = data
else:
data_x, data_y = data
locs = numpy.zeros(len(data_y), dtype = bool)
for label in xrange(self.n_classes + 1):
temp = numpy.zeros(len(data_y), dtype = bool)
temp[data_y==label] = True
if label in self.splits["shot"]:
count = 0
for element in xrange(len(temp)):
if temp[element] == True: # numpy needs == rather than 'is'
if temp[element] == True: # numpy needs == rather than 'is'
count = count + 1
if count > n_shots:
temp[element] = False
if count > n_shots:
temp[element] = False
locs[temp] = True
data_x = data_x[locs]
data_y = data_y[locs]
data_y1 = data_y1[locs]
return (data_x, data_y, data_y1)

if y1 is True:
data_y1 = data_y1[locs]
return (data_x, data_y, data_y1)
else:
return (data_x, data_y)


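The masking idea in `_split_data` can be shown in isolation: for each label listed in `splits["shot"]`, keep only the first `n_shots` occurrences; every other label passes through untouched. A vectorized toy sketch (illustrative, using `numpy.cumsum` in place of the explicit counting loop above):

    import numpy

    data_y = numpy.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    shot_classes = [1, 2]                 # classes restricted to n_shots samples
    n_shots = 2

    locs = numpy.zeros(len(data_y), dtype = bool)
    for label in numpy.unique(data_y):
        temp = (data_y == label)
        if label in shot_classes:
            # drop every occurrence after the first n_shots
            temp[numpy.cumsum(temp) > n_shots] = False
        locs |= temp

    print (data_y[locs])                  # -> [0 0 0 1 1 2 2]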
if __name__ == '__main__':
