Commit: Prepping for Tutorials.

Ragav Venkatesan committed Feb 10, 2017
1 parent 76490a9 commit 08d599f

Showing 4 changed files with 83 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -15,4 +15,4 @@
/visualizer
/lenet5
.vscode
-.yann_data
+svhn
55 changes: 55 additions & 0 deletions pantry/tutorials/mat2yann.py
@@ -0,0 +1,55 @@
"""
This is a tutorial to setup any dataset in matlab format to be used by YANN.
Still working on this.
"""
def cook_svhn_normalized( location, verbose = 1, **kwargs):
    """
    This method demonstrates how to cook a dataset for yann from data saved in MATLAB.

    Args:
        location: provide the location where the dataset was created and stored.
                  Refer to the prepare_svhn.m file to understand how to prepare a dataset.
        save_directory: directory in which to save the cooked dataset.
        data_params: default is the dictionary below. Refer to :mod:`setup_dataset`.
        preprocess_params: default is the dictionary below. Refer to :mod:`setup_dataset`.

    Notes:
        By default, this will create a dataset that is not mean-subtracted.
    """
    if 'save_directory' not in kwargs:
        save_directory = '_datasets'
    else:
        save_directory = kwargs['save_directory']

    if 'data_params' not in kwargs:
        data_params = {
            "source"   : 'mat',
            "name"     : 'yann_svhn',
            "location" : location,
            "height"   : 32,
            "width"    : 32,
            "channels" : 3 }
    else:
        data_params = kwargs['data_params']

    if 'preprocess_params' not in kwargs:
        # parameters relating to preprocessing.
        preprocess_params = {
            "normalize" : True,
            "ZCA"       : False,
            "grayscale" : False,
            "zero_mean" : False,
        }
    else:
        preprocess_params = kwargs['preprocess_params']

    dataset = setup_dataset(dataset_init_args = data_params,
                            save_directory = save_directory,
                            preprocess_init_args = preprocess_params,
                            verbose = 3)
    return dataset
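A minimal usage sketch follows; ``'svhn/'`` is a hypothetical location where prepare_svhn.m has already written its ``.mat`` batches:

if __name__ == '__main__':
    # 'svhn/' is an assumed path holding the batches made by prepare_svhn.m.
    dataset = cook_svhn_normalized(location = 'svhn/',
                                   save_directory = '_datasets',
                                   verbose = 1)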
3 changes: 2 additions & 1 deletion yann/special/datasets.py
@@ -163,7 +163,7 @@ def cook_mnist_multi_load( verbose = 1, **kwargs):
                            verbose = 3)
    return dataset

-def cook_cifar10(verbose = 1, **kwargs):
+def cook_cifar10_normalized(verbose = 1, **kwargs):
"""
Wrapper to cook cifar10 dataset. Will take as input,
@@ -321,6 +321,7 @@ def cook_caltech256(verbose = 1, **kwargs):

# Just some wrappers
cook_mnist = cook_mnist_normalized
+cook_cifar10 = cook_cifar10_normalized
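A quick usage sketch for these aliases (assuming yann and its skdata dependency are installed; the data is downloaded on first call):

from yann.special.datasets import cook_mnist, cook_cifar10

# Both names resolve to the *_normalized variants and return the
# dataset object built by setup_dataset.
data = cook_mnist(verbose = 1)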

if __name__ == '__main__':
    pass
63 changes: 25 additions & 38 deletions yann/utils/dataset.py
@@ -71,36 +71,6 @@ def download_data (url, location):
        print status,
    f.close()

-def load_svhn():
-    """
-    Function that downloads the dataset from a url and returns the dataset in full
-
-    Returns:
-        list: ``[(train_x, train_y, train_y),(valid_x, valid_y, valid_y), (test_x, test_y, test_y)]``
-    """
-    train_mat = "https://www.dropbox.com/s/uyssbz9ar7879az/batch_0.mat?dl=1"
-    if not os.path.exists('.yann_data'):
-        os.mkdir('.yann_data')
-    if not os.path.exists('.yann_data/svhn/'):
-        os.mkdir('.yann_data/svhn/')
-    if not os.path.exists('.yann_data/svhn/train'):
-        os.mkdir('.yann_data/svhn/train')
-    if not os.path.exists('.yann_data/svhn/train/batch_0.mat'):
-        download_data (url = train_mat,
-                       location = '.yann_data/svhn/train/')
-        os.rename('.yann_data/svhn/train/batch_0.mat?dl=1', '.yann_data/svhn/train/batch_0.mat')
-    dataset_location = '.yann_data/svhn/'
-    train_x, train_y, train_y1 = load_data_mat(classes = 10,
-                                               height = 32,
-                                               width = 32,
-                                               channels = 3,
-                                               location = dataset_location,
-                                               type_set = 'train',
-                                               batch = 0)
-    import pdb
-    pdb.set_trace()

def load_cifar100 ():
"""
Function that downloads the cifar 100 dataset and returns the dataset in full
@@ -765,12 +735,25 @@ class setup_dataset (object):
    'skdata' : Download and set up from skdata
    'mat'    : Data is created in and loaded from MATLAB
"name" : necessary only for skdata
-supports 'mnist','mnist_noise1', 'mnist_noise2', 'mnist_noise3',
-'mnist_noise4', 'mnist_noise5', 'mnist_noise6', 'mnist_bg_images',
-'mnist_bg_rand', 'mnist_rotated', 'mnist_rotated_bg'. Refer to
-original paper by Hugo Larochelle [1] for these dataset details.
+supports
+
+* ``'mnist'``
+* ``'mnist_noise1'``
+* ``'mnist_noise2'``
+* ``'mnist_noise3'``
+* ``'mnist_noise4'``
+* ``'mnist_noise5'``
+* ``'mnist_noise6'``
+* ``'mnist_bg_images'``
+* ``'mnist_bg_rand'``
+* ``'mnist_rotated'``
+* ``'mnist_rotated_bg'``
+* ``'cifar10'``
+* ``'caltech101'``
+* ``'caltech256'``
+
+Refer to the original paper by Hugo Larochelle [1] for details of the mnist variants.
"location" : #necessary for 'pkl' and 'matlab'
"mini_batch_size" : 500,
"mini_batch_size" : 500,
"mini_batches_per_batch" : (100, 20, 20), # trianing, testing, validation
"batches2train" : 1,
"batches2test" : 1,
@@ -863,7 +846,8 @@ def __init__(self,
        self.source = dataset_init_args [ "source" ]
        if self.source == 'skdata':
            self.name = dataset_init_args ["name"]
-       else:
+
+       elif self.source == 'mat':
            self.location = dataset_init_args [ "location" ]

        if "height" in dataset_init_args.keys():
@@ -937,6 +921,10 @@ def __init__(self,
        start_time = time.clock()
        if self.source == 'skdata':
            self._create_skdata(verbose = verbose)
+
+       if self.source == 'mat':
+           self._mat2yann(verbose = verbose)  # Still to be completed.
+
        end_time = time.clock()
        if verbose >= 1:
            print(". Dataset " + self.id + " is created.")
@@ -974,7 +962,7 @@ def _create_skdata(self,verbose=1):
            self._create_skdata_caltech101(verbose = verbose)

        elif self.name == 'caltech256':
            self._create_skdata_caltech256(verbose = verbose)

    def _create_skdata_mnist(self, verbose = 1):
        """
@@ -1090,7 +1078,6 @@ def _create_skdata_mnist(self, verbose = 1):
        cPickle.dump(dataset_args, f, protocol=2)
        f.close()

    def _create_skdata_caltech101(self, verbose = 2):
        """
        Internal function. Use this to create the caltech101 image dataset.
