Skip to content

Commit

Permalink
try_decoding: a handy script to find out what codecs a given byte seq…
Browse files Browse the repository at this point in the history
…uence might possibly be encoded in

It includes the option of telling it what resulting characters you were actually expecting. If you don't give that option then it just shows you the characters that would result if you decoded those bytes with each of the codecs that Python knows about. It skips codecs which raise an exception when they attempt to decode that sequence of bytes.
  • Loading branch information
zookos committed Jun 16, 2010
1 parent 982d947 commit 323edd3
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pyutil/_version.py
Expand Up @@ -5,7 +5,7 @@
# -NN "build number" suffix, or else a -rNN "revision number" suffix. Please see
# pyutil.version_class for a description of what the different fields mean.

verstr = "1.7.8-2"
verstr = "1.7.9"
try:
from pyutil.version_class import Version as pyutil_Version
__version__ = pyutil_Version(verstr)
Expand Down
100 changes: 100 additions & 0 deletions pyutil/scripts/try_decoding.py
@@ -0,0 +1,100 @@
#!/usr/bin/env python

import codecs, encodings, locale, os, sys, zlib

import argparse

def listcodecs(dir, debug=False):
    """Return the names of the codecs whose modules live in directory *dir*.

    Every ``*.py`` file in *dir* is a candidate; its base name is kept when
    ``codecs.lookup`` recognises it as a codec.

    dir   -- directory to scan (normally the ``encodings`` package directory).
    debug -- when true, print a diagnostic for each codec whose lookup failed
             with something other than ``LookupError``.  This replaces a
             reference to an undefined ``_debug`` global, which turned any
             such failure into a ``NameError``.

    Returns a list of codec names in sorted order.
    """
    names = []
    # Sort so the result does not depend on the order os.listdir() returns.
    for filename in sorted(os.listdir(dir)):
        if not filename.endswith('.py'):
            continue
        name = filename[:-3]
        # Check whether we've found a true codec
        try:
            codecs.lookup(name)
        except LookupError:
            # Codec not found
            continue
        except Exception as reason:
            # Probably an error from importing the codec; still it's
            # a valid codec name, so fall through and keep it.
            if debug:
                print('* problem importing codec %r: %s' % (name, reason))
        names.append(name)
    return names

def listem():
    """Return the codec names found in the first directory of the ``encodings`` package path."""
    encodings_dir = encodings.__path__[0]
    return listcodecs(encodings_dir)

def _canonical_encoding(encoding):
    """Normalise an encoding name and check that Python can encode with it.

    ``None`` is treated as UTF-8.  The name is lower-cased, and a few
    alternative spellings are folded onto the name Python uses.

    Returns the normalised name.  Raises ``AssertionError`` if encoding a
    test string with that name fails with ``LookupError`` or
    ``AttributeError``.
    """
    name = 'utf-8' if encoding is None else encoding
    name = name.lower()

    # Alternative spellings and the name each one is folded onto.
    aliases = {
        "cp65001": 'utf-8',
        "us-ascii": 'ascii',
        "646": 'ascii',
    }
    name = aliases.get(name, name)

    # sometimes Python returns an encoding name that it doesn't support for conversion
    # fail early if this happens
    try:
        u"test".encode(name)
    except (LookupError, AttributeError):
        raise AssertionError("The character encoding '%s' is not supported for conversion." % (name,))

    return name

def get_output_encoding():
    """Return the canonical name of the encoding to use for text written to stdout.

    Uses ``sys.stdout.encoding`` when it is set, and falls back to the
    locale's preferred encoding otherwise.
    """
    stdout_name = sys.stdout.encoding
    if not stdout_name:
        stdout_name = locale.getpreferredencoding()
    return _canonical_encoding(stdout_name)

def get_argv_encoding():
    """Return the name of the encoding used to decode command-line arguments.

    On every platform except Windows this is the same as the output encoding.
    """
    if sys.platform != 'win32':
        return get_output_encoding()
    # Unicode arguments are not supported on Windows yet; see Tahoe-LAFS tickets #565 and #1074.
    return 'ascii'

# Both encodings are resolved once, at import time.
# output_encoding: used by main() to encode decoded text before printing it.
output_encoding = get_output_encoding()
# argv_encoding: used by type_unicode() to decode command-line arguments.
argv_encoding = get_argv_encoding()

def type_unicode(argstr):
    """Decode the command-line argument *argstr* using ``argv_encoding``.

    Used as the argparse ``type=`` callback for the ``--target`` option.
    """
    decoded = argstr.decode(argv_encoding)
    return decoded

def main():
    """Command-line entry point: try every known codec on the input bytes.

    Reads all of INF (a file, or stdin for "-"), attempts to decode it with
    each codec returned by ``listem()``, and prints one line per codec that
    succeeds:

    * Codecs that yield unicode are printed as ``<codec> : <text>``.  If
      ``--target`` is given, only codecs whose result equals the target are
      printed.
    * Codecs that yield something other than unicode are skipped unless
      ``--accept-bytes`` is given, in which case they are printed with a
      ``!!!`` marker.

    Codecs that raise while decoding are silently skipped.
    """
    parser = argparse.ArgumentParser(prog="try_decoding", description="Try decoding some bytes with all sorts of different codecs and print out any that decode.")

    parser.add_argument('inputfile', help='file to decode or "-" for stdin', type=argparse.FileType('rb'), metavar='INF')
    parser.add_argument('-t', '--target', help='unicode string to match against (if any)', type=type_unicode, metavar='T')
    parser.add_argument('-a', '--accept-bytes', help='include codecs which return bytes instead of returning unicode (they will be marked with "!!!" in the output)', action='store_true')

    args = parser.parse_args()

    try:
        inb = args.inputfile.read()
    finally:
        # argparse opened the file for us; release it once it has been read,
        # but leave the process's own stdin alone.
        if args.inputfile is not sys.stdin:
            args.inputfile.close()

    for codec in listem():
        try:
            u = inb.decode(codec)
        except Exception:
            # Deliberately broad: the tool's contract is to skip any codec
            # that cannot decode the input.  Codecs raise many unrelated
            # exception types (e.g. binascii.Error from base64), and a fixed
            # list of them let the odd one abort the whole run.
            continue

        if isinstance(u, unicode):
            # Compare against None so that an explicit empty target (-t '')
            # is honoured instead of being treated as "no target".
            if args.target is not None and args.target != u:
                continue
            # Escape characters the terminal encoding cannot represent rather
            # than crashing on the first such result.
            shown = u.encode(output_encoding, 'backslashreplace')
            sys.stdout.write("%19s : %s\n" % (codec, shown))
        elif args.accept_bytes:
            sys.stdout.write("%19s !!!  : %s\n" % (codec, u))

# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -101,6 +101,7 @@ def _setup(test_suite):
'randfile = pyutil.scripts.randfile:main',
'unsort = pyutil.scripts.unsort:main',
'verinfo = pyutil.scripts.verinfo:main',
'try_decoding = pyutil.scripts.try_decoding:main',
] },
test_suite=test_suite,
zip_safe=False, # I prefer unzipped for easier access.
Expand Down

0 comments on commit 323edd3

Please sign in to comment.