Merge pull request #49 from sciris/develop

Parallel processing and unpickling fixes
sciris · Dec 12, 2018 · bccb3bc · bccb3bc
2 parents 16876c7 + 7b42b72
commit bccb3bc
Show file tree

Hide file tree

Showing 7 changed files with 202 additions and 23 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,6 +1,8 @@
 sudo: required
 dist: xenial
-python: "3.7"
+python:
+- "2.7"
+- "3.7"
 language: python
 install: pip install tox-travis
 script: tox
diff --git a/sciris/sc_fileio.py b/sciris/sc_fileio.py
@@ -47,7 +47,7 @@
 __all__ = ['loadobj', 'loadstr', 'saveobj', 'dumpstr']
 
 
-def loadobj(filename=None, folder=None, verbose=True, die=None):
+def loadobj(filename=None, folder=None, verbose=False, die=None):
     '''
     Load a file that has been saved as a gzipped pickle file. Accepts either 
     a filename (standard usage) or a file object as the first argument.
@@ -68,16 +68,16 @@ def loadobj(filename=None, folder=None, verbose=True, die=None):
     kwargs = {'mode': 'rb', argtype: filename}
     with GzipFile(**kwargs) as fileobj:
         filestr = fileobj.read() # Convert it to a string
-        obj = unpickler(filestr, die=die) # Actually load it
+        obj = unpickler(filestr, filename=filename, verbose=verbose, die=die) # Actually load it
     if verbose: print('Object loaded from "%s"' % filename)
     return obj
 
 
-def loadstr(string=None, die=None):
+def loadstr(string=None, verbose=False, die=None):
     with closing(IO(string)) as output: # Open a "fake file" with the Gzip string pickle in it.
         with GzipFile(fileobj=output, mode='rb') as fileobj: # Set a Gzip reader to pull from the "file."
             picklestring = fileobj.read() # Read the string pickle from the "file" (applying Gzip decompression).
-    obj = unpickler(picklestring, die=die) # Return the object gotten from the string pickle.   
+    obj = unpickler(picklestring, filestring=string, verbose=verbose, die=die) # Return the object gotten from the string pickle.   
     return obj
 
 
@@ -757,9 +757,20 @@ def savespreadsheet(filename=None, data=None, folder=None, sheetnames=None, clos
 class Failed(object):
     ''' An empty class to represent a failed object loading '''
     failure_info = odict()
+
+    def __init__(self, *args, **kwargs):
+        pass
+
+
+class Empty(object):
+    ''' Another empty class to represent a failed object loading '''
+
     def __init__(self, *args, **kwargs):
         pass
 
+    def __setstate__(self, state):
+        pass
+
 
 def makefailed(module_name=None, name=None, error=None):
     ''' Create a class -- not an object! -- that contains the failure info '''
@@ -773,6 +784,10 @@ def makefailed(module_name=None, name=None, error=None):
 
 class RobustUnpickler(pickle.Unpickler):
     ''' Try to import an object, and if that fails, return a Failed object rather than crashing '''
+
+    def __init__(self, tmpfile, fix_imports=True, encoding="latin1", errors="ignore"):
+        pickle.Unpickler.__init__(self, tmpfile, fix_imports=fix_imports, encoding=encoding, errors=errors)
+
     def find_class(self, module_name, name, verbose=False):
         obj = makefailed(module_name, name, 'Unknown error') # This should get overwritten unless something goes terribly wrong
         try:
@@ -788,20 +803,41 @@ def find_class(self, module_name, name, verbose=False):
         return obj
 
 
-def unpickler(string=None, die=None):
-    import dill # Optional Sciris dependency
+def unpickler(string=None, filename=None, filestring=None, die=None, verbose=False):
+
     if die is None: die = False
+
     try: # Try pickle first
         obj = pkl.loads(string) # Actually load it -- main usage case
-    except Exception as E:
+    except Exception as E1:
         if die: 
-            raise E
+            raise E1
         else:
-            try:    obj = dill.loads(string) # If that fails, try dill
-            except: obj = RobustUnpickler(io.BytesIO(string)).load() # And if that trails, throw everything at it
+            try:
+                if verbose: print('Standard unpickling failed (%s), trying encoding...' % str(E1))
+                obj = pkl.loads(string, encoding='latin1') # Try loading it again with different encoding
+            except Exception as E2:
+                try:
+                    if verbose: print('Encoded unpickling failed (%s), trying dill...' % str(E2))
+                    import dill # Optional Sciris dependency
+                    obj = dill.loads(string) # If that fails, try dill
+                except Exception as E3:
+                    try:
+                        if verbose: print('Dill failed (%s), trying robust unpickler...' % str(E3))
+                        obj = RobustUnpickler(io.BytesIO(string)).load() # And if that trails, throw everything at it
+                    except Exception as E4:
+                        try:
+                            if verbose: print('Robust unpickler failed (%s), trying Python 2->3 conversion...' % str(E4))
+                            obj = loadobj2to3(filename=filename, filestring=filestring)
+                        except Exception as E5:
+                            if verbose: print('Python 2->3 conversion failed (%s), giving up...' % str(E5))
+                            errormsg = 'All available unpickling methods failed:\n    Standard: %s\n     Encoded: %s\n        Dill: %s\n      Robust: %s\n  Python2->3: %s' % (E1, E2, E3, E4, E5)
+                            raise Exception(errormsg)
+
     if isinstance(obj, Failed):
         print('Warning, the following errors were encountered during unpickling:')
         print(obj.failure_info)
+
     return obj
 
 
@@ -818,6 +854,100 @@ def savedill(fileobj=None, obj=None):
     return None
 
 
+
+##############################################################################
+### Python 2 legacy support
+##############################################################################
+
+not_string_pickleable = ['datetime', 'BytesIO']
+byte_objects = ['datetime', 'BytesIO', 'odict', 'spreadsheet', 'blobject']
+
+def loadobj2to3(filename=None, filestring=None):
+    ''' Used automatically by loadobj() to load Python2 objects in Python3 if all other loading methods fail '''
+
+    class Placeholder():
+        ''' Replace these corrupted classes with properly loaded ones '''
+        def __init__(*args):
+            return
+
+        def __setstate__(self, state):
+            if isinstance(state,dict):
+                self.__dict__ = state
+            else:
+                self.state = state
+            return
+
+    class StringUnpickler(pickle.Unpickler):
+        def find_class(self, module, name):
+            if name in not_string_pickleable:
+                return Empty
+            else:
+                return pickle.Unpickler.find_class(self,module,name)
+
+    class BytesUnpickler(pickle.Unpickler):
+        def find_class(self, module, name):
+            if name in byte_objects:
+                return pickle.Unpickler.find_class(self,module,name)
+            else:
+                return Placeholder
+
+    def recursive_substitute(obj1, obj2, track=None):
+        if track is None:
+            track = []
+
+        if isinstance(obj1, Blobject): # Handle blobjects (usually spreadsheets)
+            obj1.blob  = obj2.__dict__[b'blob']
+            obj1.bytes = obj2.__dict__[b'bytes']
+
+        if isinstance(obj2, dict): # Handle dictionaries
+            for k,v in obj2.items():
+                if isinstance(v, datetime.datetime):
+                    setattr(obj1, k.decode('latin1'), v)
+                elif isinstance(v, dict) or hasattr(v,'__dict__'):
+                    if isinstance(k, (bytes, bytearray)):
+                        k = k.decode('latin1')
+                    track2 = track.copy()
+                    track2.append(k)
+                    recursive_substitute(obj1[k], v, track2)
+        else:
+            for k,v in obj2.__dict__.items():
+                if isinstance(v,datetime.datetime):
+                    setattr(obj1,k.decode('latin1'), v)
+                elif isinstance(v,dict) or hasattr(v,'__dict__'):
+                    if isinstance(k, (bytes, bytearray)):
+                        k = k.decode('latin1')
+                    track2 = track.copy()
+                    track2.append(k)
+                    recursive_substitute(getattr(obj1,k), v, track2)
+
+    # Load either from file or from string
+    if filename:
+        with GzipFile(filename) as fileobj:
+            unpickler1 = StringUnpickler(fileobj, encoding='latin1')
+            stringout = unpickler1.load()
+        with GzipFile(filename) as fileobj:
+            unpickler2 = BytesUnpickler(fileobj,  encoding='bytes')
+            bytesout  = unpickler2.load()
+    elif filestring:
+        with closing(IO(filestring)) as output: 
+            with GzipFile(fileobj=output, mode='rb') as fileobj:
+                unpickler1 = StringUnpickler(fileobj, encoding='latin1')
+                stringout = unpickler1.load()
+        with closing(IO(filestring)) as output:
+            with GzipFile(fileobj=output, mode='rb') as fileobj:
+                unpickler2 = BytesUnpickler(fileobj,  encoding='bytes')
+                bytesout  = unpickler2.load()
+    else:
+        errormsg = 'You must supply either a filename or a filestring for loadobj() or loadstr(), respectively'
+        raise Exception(errormsg)
+
+    # Actually do the load, with correct substitution
+    recursive_substitute(stringout, bytesout)
+    return stringout
+
+
+
+
 ##############################################################################
 ### Twisted pickling methods
 ##############################################################################

diff --git a/sciris/sc_math.py b/sciris/sc_math.py
@@ -8,19 +8,58 @@
 from . import sc_utils as ut
 
 ##############################################################################
-### FIND FUNCTIONS
+### FIND AND APPROXIMATION FUNCTIONS
 ##############################################################################
 
-__all__ = ['approx', 'findinds', 'findnearest', 'dataindex', 'getvalidinds', 'sanitize', 'getvaliddata', 'isprime']
+__all__ = ['approx', 'safedivide', 'findinds', 'findnearest', 'dataindex', 'getvalidinds', 'sanitize', 'getvaliddata', 'isprime']
 
 
 def approx(val1=None, val2=None, eps=None):
-    ''' Determine whether two scalars approximately match '''
-    if eps is None: eps = 1e-9
-    output = abs(val1-val2)<eps
+    '''
+    Determine whether two scalars approximately match. Example:
+        sc.approx(2*6, 11.9999999, eps=1e-6) # Returns True
+        sc.approx([3,12,11.9], 12) # Returns array([False, True, False], dtype=bool)
+    '''
+    if val2 is None: val2 = 0.0
+    if eps  is None: eps = 1e-9
+    if isinstance(val1, list): val1 = np.array(val1) # If it's a list, convert to an array first
+    output = abs(val1-val2)<=eps
     return output
 
 
+
+def safedivide(numerator=None, denominator=None, default=None, eps=None, warn=False):
+    '''
+    Handle divide-by-zero and divide-by-nan elegantly. Examples:
+        sc.safedivide(numerator=0, denominator=0, default=1, eps=0) # Returns 1
+        sc.safedivide(numerator=5, denominator=2.0, default=1, eps=1e-3) # Returns 2.5
+        sc.safedivide(3,array([1,3,0]),-1, warn=True) # Returns array([ 3,  1, -1])
+    '''
+    # Set some defaults
+    if numerator   is None: numerator   = 1.0
+    if denominator is None: denominator = 1.0
+    if default     is None: default     = 0.0
+
+    # Handle the logic
+    invalid = approx(denominator, 0.0, eps=eps)
+    if ut.isnumber(denominator): # The denominator is a scalar
+        if invalid:
+            output = default
+        else:
+            output = numerator/denominator
+    elif ut.checktype(denominator, 'array'):
+        if not warn:
+            denominator[invalid] = 1.0 # Replace invalid values with 1 
+        output = numerator/denominator
+        output[invalid] = default
+    else: # Unclear input, raise exception
+        errormsg = 'Input type %s not understood: must be number or array' % type(denominator)
+        raise Exception(errormsg)
+
+    return output    
+
+
+
 def findinds(val1, val2=None, eps=1e-6):
     '''
     Little function to find matches even if two things aren't eactly equal (eg. 

diff --git a/sciris/sc_parallel.py b/sciris/sc_parallel.py
@@ -197,6 +197,9 @@ def f(x,y):
     # Run simply using map -- no advantage here to using Process/Queue
     multipool = mp.Pool(processes=ncpus)
     outputlist = multipool.map(parallel_task, argslist)
+    multipool.close()
+    multipool.join()
+
     return outputlist
 
 

diff --git a/sciris/sc_utils.py b/sciris/sc_utils.py
@@ -136,9 +136,10 @@ def sha(string, encoding='utf-8', *args, **kwargs):
     return output
 
 
-def wget(url):
+def wget(url, convert=True):
     ''' Download a URL '''
     output = urlrequester.urlopen(url).read()
+    if convert and six.PY3: output = output.decode()
     return output
 
 
@@ -735,11 +736,15 @@ def flexstr(arg, force=True):
                 output = arg # If anything goes wrong, just return as-is
         else:
             if isinstance(arg, six.binary_type): 
-                output = arg.decode() # If it's bytes, decode to unicode
+                try:
+                    output = arg.decode() # If it's bytes, decode to unicode
+                except:
+                    if force: output = repr(arg) # If that fails, just print its representation
+                    else:     output = arg
             else: 
                 output = arg # Otherwise, return as-is
     else:
-        if force: output = str(arg) 
+        if force: output = repr(arg)
         else:     output = arg # Optionally don't do anything for non-strings
     return output
 

diff --git a/sciris/sc_version.py b/sciris/sc_version.py
@@ -1,5 +1,5 @@
 __all__ = ['__version__', '__versiondate__', '__license__']
 
-__version__      = '0.12.0'
-__versiondate__  = '2018-11-24'
+__version__      = '0.12.1'
+__versiondate__  = '2018-12-11'
 __license__      = 'Sciris %s (%s) -- (c) Sciris.org' % (__version__, __versiondate__)
diff --git a/tox.ini b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py37
+envlist = py27, py37
 
 [testenv]
 description = Run basic usage tests
@@ -15,7 +15,7 @@ commands =
 setenv = MPLBACKEND = agg
 
 [pytest]
-addopts = -ra -v -n 2
+addopts = -ra -v
 console_output_style = count
 testpaths = tests
 python_files = test_tox_*.py