try_decoding: a handy script to find out what codecs a given byte sequence might possibly be encoded in.

It includes the option of telling it what resulting characters you were actually expecting. If you don't give that option, it just shows you the characters that would result from decoding those bytes with each of the codecs that Python knows about. It skips codecs that raise an exception when they attempt to decode that sequence of bytes.
simplegeo · Jun 16, 2010 · 323edd3 · 323edd3
1 parent 982d947
commit 323edd3
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 1 deletion.
diff --git a/pyutil/_version.py b/pyutil/_version.py
@@ -5,7 +5,7 @@
 # -NN "build number" suffix, or else a -rNN "revision number" suffix. Please see
 # pyutil.version_class for a description of what the different fields mean.
 
-verstr = "1.7.8-2"
+verstr = "1.7.9"
 try:
     from pyutil.version_class import Version as pyutil_Version
     __version__ = pyutil_Version(verstr)

diff --git a/pyutil/scripts/try_decoding.py b/pyutil/scripts/try_decoding.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+
+import codecs, encodings, locale, os, sys, zlib
+
+import argparse
+
+def listcodecs(dir):
+    """Return the names of the codecs found as ``.py`` files in `dir`.
+
+    Each ``.py`` filename (minus its extension) is passed to
+    ``codecs.lookup()``; names that raise ``LookupError`` are skipped, and
+    every other name is appended to the returned list.
+    """
+    names = []
+    for filename in os.listdir(dir):
+        # Only Python source files are considered candidate codec modules.
+        if filename[-3:] != '.py':
+            continue
+        name = filename[:-3]
+        # Check whether we've found a true codec
+        try:
+            codecs.lookup(name)
+        except LookupError:
+            # Codec not found
+            continue
+        except Exception, reason:
+            # Probably an error from importing the codec; still it's
+            # a valid code name
+            # NOTE(review): `_debug` is not defined or imported anywhere in
+            # this file as added by this hunk, so reaching this branch would
+            # raise NameError instead of keeping the name -- confirm, then
+            # define it at module level (e.g. `_debug = 0`) or drop the check.
+            if _debug:
+                print '* problem importing codec %r: %s' % \
+                      (name, reason)
+        names.append(name)
+    return names
+
+def listem():
+    """Return the codec names found in the first directory of the ``encodings`` package path."""
+    return listcodecs(encodings.__path__[0])
+
+def _canonical_encoding(encoding):
+    """Normalize an encoding name and verify that it can be used to encode text.
+
+    ``None`` becomes 'utf-8'; the name is lower-cased; "cp65001" is mapped to
+    'utf-8', and "us-ascii" / "646" are mapped to 'ascii'.
+
+    Returns the normalized name. Raises AssertionError if encoding a test
+    string with that name fails with LookupError or AttributeError.
+    """
+    if encoding is None:
+        encoding = 'utf-8'
+    encoding = encoding.lower()
+    # Fold known aliases onto the names used in the rest of this script.
+    if encoding == "cp65001":
+        encoding = 'utf-8'
+    elif encoding == "us-ascii" or encoding == "646":
+        encoding = 'ascii'
+
+    # sometimes Python returns an encoding name that it doesn't support for conversion
+    # fail early if this happens
+    try:
+        u"test".encode(encoding)
+    except (LookupError, AttributeError):
+        raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
+
+    return encoding
+
+def get_output_encoding():
+    """Return the canonical encoding for stdout.
+
+    Uses ``sys.stdout.encoding`` when it is set (truthy), otherwise falls back
+    to ``locale.getpreferredencoding()``.
+    """
+    return _canonical_encoding(sys.stdout.encoding or locale.getpreferredencoding())
+
+def get_argv_encoding():
+    """Return the encoding used to decode command-line arguments.
+
+    Always 'ascii' on win32; on every other platform, the same value as
+    ``get_output_encoding()``.
+    """
+    if sys.platform == 'win32':
+        # Unicode arguments are not supported on Windows yet; see Tahoe-LAFS tickets #565 and #1074.
+        return 'ascii'
+    else:
+        return get_output_encoding()
+
+# Both encodings are resolved once, when the module is imported; main() reads
+# output_encoding and type_unicode() reads argv_encoding.
+output_encoding = get_output_encoding()
+argv_encoding = get_argv_encoding()
+
+def type_unicode(argstr):
+    """Decode a command-line argument using ``argv_encoding``.
+
+    Passed to argparse as the ``type=`` converter for the --target option.
+    """
+    return argstr.decode(argv_encoding)
+
+def main():
+    """Decode the input bytes with every codec from listem() and print the results.
+
+    Reads the whole input file (or stdin when the argument is "-"), attempts
+    to decode it with each codec, and prints one line per codec that decodes
+    without raising one of the exceptions caught below. With --target, only
+    codecs whose unicode result equals the target are printed. Codecs whose
+    result is not a unicode object are printed only with --accept-bytes, and
+    are marked with "!!!".
+    """
+    # NOTE(review): `argv` is assigned but never used below; parse_args() is
+    # called without arguments.
+    argv = sys.argv
+
+    parser = argparse.ArgumentParser(prog="try_decoding", description="Try decoding some bytes with all sorts of different codecs and print out any that decode.")
+
+    parser.add_argument('inputfile', help='file to decode or "-" for stdin', type=argparse.FileType('rb'), metavar='INF')
+    parser.add_argument('-t', '--target', help='unicode string to match against (if any)', type=type_unicode, metavar='T')
+    parser.add_argument('-a', '--accept-bytes', help='include codecs which return bytes instead of returning unicode (they will be marked with "!!!" in the output)', action='store_true')
+
+    args = parser.parse_args()
+
+    # The entire input is read into memory as a single byte string.
+    inb = args.inputfile.read()
+
+    for codec in listem():
+        try:
+            u = inb.decode(codec)
+        # A codec that raises any of these is silently skipped.
+        except (UnicodeDecodeError, IOError, TypeError, IndexError, UnicodeError, ValueError, zlib.error):
+            pass
+        else:
+            if isinstance(u, unicode):
+                # A falsy target (None or an empty string) disables filtering.
+                if args.target:
+                    if args.target != u:
+                        continue
+                # Trailing commas keep the codec name, separator and decoded
+                # text on one output line (Python 2 print statement).
+                print "%19s" % codec,
+                print ':',
+                print u.encode(output_encoding)
+            else:
+                # The codec returned something other than unicode; show it
+                # only when --accept-bytes was given.
+                if not args.accept_bytes:
+                    continue
+                print "%19s" % codec,
+                print "!!! ",
+                print ':',
+                print u
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
@@ -101,6 +101,7 @@ def _setup(test_suite):
                   'randfile = pyutil.scripts.randfile:main',
                   'unsort = pyutil.scripts.unsort:main',
                   'verinfo = pyutil.scripts.verinfo:main',
+                  'try_decoding = pyutil.scripts.try_decoding:main',
                   ] },
           test_suite=test_suite,
           zip_safe=False, # I prefer unzipped for easier access.