Created a special module for making datasets. Decoupled it from the dataset module in utils.

ragav committed Jan 18, 2017
1 parent 23da23e commit cdfc187
Showing 10 changed files with 254 additions and 172 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -13,4 +13,5 @@
.cache
.eggs
*.png
-/visualizer
+/visualizer
+.vscode
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -129,7 +129,7 @@ To cook a mnist dataset for yann run the following code:

.. code-block:: python

-   from yann.utils.dataset import cook_mnist
+   from yann.special.datasets import cook_mnist
    cook_mnist()

Running this code will print a statement to the following effect ``>>Dataset xxxxx is created.``
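
For reference, the cooked dataset's location can be read off the returned object, as the tutorials below do; a minimal sketch using only calls that appear in this commit:

    from yann.special.datasets import cook_mnist
    data = cook_mnist (verbose = 2)
    dataset = data.dataset_location()   # where the cooked dataset was saved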
14 changes: 14 additions & 0 deletions docs/source/yann/special/datasets.py
@@ -0,0 +1,14 @@
.. _datasets:

:mod:`datasets` - provides quick methods to produce common datasets.
====================================================================

The file ``yann.special.datasets.py`` contains definitions for methods that can quickly produce
some common datasets. Some of them include:

* :mod:`cook_mnist`
* :mod:`cook_cifar10`
* ...

.. automodule:: yann.special.datasets
:members:
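
A short usage sketch; the ``save_directory`` override below follows the kwargs handling in the new module (its default is ``_datasets``):

    from yann.special.datasets import cook_cifar10
    data = cook_cifar10 (verbose = 2, save_directory = '_datasets')
    dataset = data.dataset_location()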
4 changes: 2 additions & 2 deletions pantry/tutorials/autoencoder.py
@@ -127,7 +127,7 @@ def autoencoder ( dataset= None, verbose = 1 ):
    dataset = None
    if len(sys.argv) > 1:
        if sys.argv[1] == 'create_dataset':
-           from yann.utils.dataset import cook_mnist
+           from yann.special.datasets import cook_mnist
            data = cook_mnist (verbose = 2)
            dataset = data.dataset_location()
        else:
@@ -137,7 +137,7 @@ def autoencoder ( dataset= None, verbose = 1 ):

    if dataset is None:
        print " creating a new dataset to run through"
-       from yann.utils.dataset import cook_mnist
+       from yann.special.datasets import cook_mnist
        data = cook_mnist (verbose = 2)
        dataset = data.dataset_location()
4 changes: 2 additions & 2 deletions pantry/tutorials/gan.py
@@ -192,7 +192,7 @@ def simple_gan ( dataset= None, verbose = 1 ):
    dataset = None
    if len(sys.argv) > 1:
        if sys.argv[1] == 'create_dataset':
-           from yann.utils.dataset import cook_mnist
+           from yann.special.datasets import cook_mnist
            data = cook_mnist (verbose = 2)
            dataset = data.dataset_location()
        else:
@@ -202,7 +202,7 @@ def simple_gan ( dataset= None, verbose = 1 ):

    if dataset is None:
        print " creating a new dataset to run through"
-       from yann.utils.dataset import cook_mnist
+       from yann.special.datasets import cook_mnist
        data = cook_mnist (verbose = 2)
        dataset = data.dataset_location()
4 changes: 2 additions & 2 deletions pantry/tutorials/lenet.py
@@ -286,7 +286,7 @@ def lenet_maxout ( dataset= None, verbose = 1 ):
    dataset = None
    if len(sys.argv) > 1:
        if sys.argv[1] == 'create_dataset':
-           from yann.utils.dataset import cook_mnist
+           from yann.special.datasets import cook_mnist
            data = cook_mnist (verbose = 2)
            dataset = data.dataset_location()
        else:
@@ -296,7 +296,7 @@ def lenet_maxout ( dataset= None, verbose = 1 ):

    if dataset is None:
        print " creating a new dataset to run through"
-       from yann.utils.dataset import cook_cifar10
+       from yann.special.datasets import cook_cifar10
        data = cook_cifar10 (verbose = 2)
        dataset = data.dataset_location()
4 changes: 2 additions & 2 deletions pantry/tutorials/log_reg.py
@@ -80,7 +80,7 @@ def log_reg ( dataset, verbose ):
    import sys
    if len(sys.argv) > 1:
        if sys.argv[1] == 'create_dataset':
-           from yann.utils.dataset import cook_mnist
+           from yann.special.datasets import cook_mnist
            data = cook_mnist (verbose = 3)
            dataset = data.dataset_location()
        else:
@@ -90,7 +90,7 @@ def log_reg ( dataset, verbose ):

    if dataset is None:
        print " creating a new dataset to run through"
-       from yann.utils.dataset import cook_mnist
+       from yann.special.datasets import cook_mnist
        data = cook_mnist (verbose = 3)
        dataset = data.dataset_location()
4 changes: 2 additions & 2 deletions pantry/tutorials/mlp.py
@@ -99,7 +99,7 @@ def mlp ( dataset, verbose = 1 ):
    dataset = None
    if len(sys.argv) > 1:
        if sys.argv[1] == 'create_dataset':
-           from yann.utils.dataset import cook_mnist
+           from yann.special.datasets import cook_mnist
            data = cook_mnist (verbose = 3)
            dataset = data.dataset_location()
        else:
@@ -109,7 +109,7 @@ def mlp ( dataset, verbose = 1 ):

    if dataset is None:
        print " creating a new dataset to run through"
-       from yann.utils.dataset import cook_mnist
+       from yann.special.datasets import cook_mnist
        data = cook_mnist (verbose = 3)
        dataset = data.dataset_location()
226 changes: 226 additions & 0 deletions yann/special/datasets.py
@@ -0,0 +1,226 @@
from yann.utils.dataset import setup_dataset

def cook_mnist_normalized( verbose = 1, **kwargs):
    """
    Wrapper to cook the mnist dataset. Takes the following as input:

    Args:
        save_directory: directory into which to save the cooked dataset.
        data_params: defaults to the dictionary below. Refer to :mod:`setup_dataset`.
        preprocess_params: defaults to the dictionary below. Refer to :mod:`setup_dataset`.

    Notes:
        By default, this will create a dataset that is not mean-subtracted.
    """

    if not 'data_params' in kwargs.keys():
        data_params = {
                "source"                  : 'skdata',
                "name"                    : 'mnist',
                "location"                : '',
                "mini_batch_size"         : 500,
                "mini_batches_per_batch"  : (100, 20, 20),
                "batches2train"           : 1,
                "batches2test"            : 1,
                "batches2validate"        : 1,
                "height"                  : 28,
                "width"                   : 28,
                "channels"                : 1 }
    else:
        data_params = kwargs['data_params']

    if not 'preprocess_params' in kwargs.keys():
        # parameters relating to preprocessing.
        preprocess_params = {
                "normalize"     : True,
                "GCN"           : False,
                "ZCA"           : False,
                "grayscale"     : False,
                "mean_subtract" : False,
        }
    else:
        preprocess_params = kwargs['preprocess_params']

    if not 'save_directory' in kwargs.keys():
        save_directory = '_datasets'
    else:
        save_directory = kwargs['save_directory']

    # propagate the caller's verbosity to the dataset setup.
    dataset = setup_dataset(dataset_init_args = data_params,
                            save_directory = save_directory,
                            preprocess_init_args = preprocess_params,
                            verbose = verbose)
    return dataset

def cook_mnist_normalized_mean_subtracted( verbose = 1, **kwargs):
    """
    Wrapper to cook the mnist dataset. Takes the following as input:

    Args:
        save_directory: directory into which to save the cooked dataset.
        data_params: defaults to the dictionary below. Refer to :mod:`setup_dataset`.
        preprocess_params: defaults to the dictionary below. Refer to :mod:`setup_dataset`.
    """

    if not 'data_params' in kwargs.keys():
        data_params = {
                "source"                  : 'skdata',
                "name"                    : 'mnist',
                "location"                : '',
                "mini_batch_size"         : 500,
                "mini_batches_per_batch"  : (100, 20, 20),
                "batches2train"           : 1,
                "batches2test"            : 1,
                "batches2validate"        : 1,
                "height"                  : 28,
                "width"                   : 28,
                "channels"                : 1 }
    else:
        data_params = kwargs['data_params']

    if not 'preprocess_params' in kwargs.keys():
        # parameters relating to preprocessing.
        preprocess_params = {
                "normalize"     : True,
                "GCN"           : False,
                "ZCA"           : False,
                "grayscale"     : False,
                "mean_subtract" : True,
        }
    else:
        preprocess_params = kwargs['preprocess_params']

    if not 'save_directory' in kwargs.keys():
        save_directory = '_datasets'
    else:
        save_directory = kwargs['save_directory']

    dataset = setup_dataset(dataset_init_args = data_params,
                            save_directory = save_directory,
                            preprocess_init_args = preprocess_params,
                            verbose = verbose)
    return dataset

def cook_mnist_multi_load( verbose = 1, **kwargs):
    """
    Testing code, mainly.
    Wrapper to cook the mnist dataset. Takes the following as input:

    Args:
        save_directory: directory into which to save the cooked dataset.
        data_params: defaults to the dictionary below. Refer to :mod:`setup_dataset`.
        preprocess_params: defaults to the dictionary below. Refer to :mod:`setup_dataset`.

    Notes:
        This just creates a ``data_params`` that loads multiple batches without cache.
        I use this to test the caching in the datastream module.
    """

    if not 'data_params' in kwargs.keys():
        data_params = {
                "source"                  : 'skdata',
                "name"                    : 'mnist',
                "location"                : '',
                "mini_batch_size"         : 500,
                "mini_batches_per_batch"  : (20, 5, 5),
                "batches2train"           : 5,
                "batches2test"            : 4,
                "batches2validate"        : 4,
                "height"                  : 28,
                "width"                   : 28,
                "channels"                : 1 }
    else:
        data_params = kwargs['data_params']

    if not 'preprocess_params' in kwargs.keys():
        # parameters relating to preprocessing.
        preprocess_params = {
                "normalize"     : True,
                "GCN"           : False,
                "ZCA"           : False,
                "grayscale"     : False,
                "mean_subtract" : True,
        }
    else:
        preprocess_params = kwargs['preprocess_params']

    if not 'save_directory' in kwargs.keys():
        save_directory = '_datasets'
    else:
        save_directory = kwargs['save_directory']

    dataset = setup_dataset(dataset_init_args = data_params,
                            save_directory = save_directory,
                            preprocess_init_args = preprocess_params,
                            verbose = verbose)
    return dataset

def cook_cifar10_normalized_mean_subtracted(verbose = 1, **kwargs):
    """
    Wrapper to cook the cifar10 dataset. Takes the following as input:

    Args:
        save_directory: directory into which to save the cooked dataset.
        data_params: defaults to the dictionary below. Refer to :mod:`setup_dataset`.
        preprocess_params: defaults to the dictionary below. Refer to :mod:`setup_dataset`.
    """

    if not 'data_params' in kwargs.keys():
        data_params = {
                "source"                  : 'skdata',
                "name"                    : 'cifar10',
                "location"                : '',
                "mini_batch_size"         : 500,
                "mini_batches_per_batch"  : (80, 20, 20),
                "batches2train"           : 1,
                "batches2test"            : 1,
                "batches2validate"        : 1,
                "height"                  : 32,
                "width"                   : 32,
                "channels"                : 3 }
    else:
        data_params = kwargs['data_params']

    if not 'preprocess_params' in kwargs.keys():
        # parameters relating to preprocessing.
        preprocess_params = {
                "normalize"     : True,
                "GCN"           : False,
                "ZCA"           : False,
                "grayscale"     : False,
                "mean_subtract" : True,
        }
    else:
        preprocess_params = kwargs['preprocess_params']

    if not 'save_directory' in kwargs.keys():
        save_directory = '_datasets'
    else:
        save_directory = kwargs['save_directory']

    dataset = setup_dataset(dataset_init_args = data_params,
                            save_directory = save_directory,
                            preprocess_init_args = preprocess_params,
                            verbose = verbose)
    return dataset


# Just some wrappers
cook_mnist = cook_mnist_normalized_mean_subtracted
cook_cifar10 = cook_cifar10_normalized_mean_subtracted

if __name__ == '__main__':
    pass
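    # A minimal usage sketch, commented out; it assumes skdata can download
    # mnist on this machine, and the keys mirror the default dictionaries above.
    # preprocess_params = { "normalize" : True, "GCN" : True, "ZCA" : False,
    #                       "grayscale" : False, "mean_subtract" : False }
    # data = cook_mnist_normalized (verbose = 2,
    #                               preprocess_params = preprocess_params)
    # print data.dataset_location()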
