Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
try_decoding: a handy script to find out what codecs a given byte sequence might possibly be encoded in.
It includes the option of telling it what resulting characters you were actually expecting. If you don't give that option, then it just shows you the characters that would result if you decoded those bytes with each of the codecs that Python knows about. It skips codecs which raise an exception when they attempt to decode that sequence of bytes.
- Loading branch information
Showing
3 changed files
with
102 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#!/usr/bin/env python | ||
|
||
import codecs, encodings, locale, os, sys, zlib | ||
|
||
import argparse | ||
|
||
def listcodecs(dir, debug=False):
    """Return the names of the codecs whose modules live in directory *dir*.

    Every ``*.py`` file in *dir* is a candidate: its base name is kept when
    ``codecs.lookup`` accepts it, and dropped when the lookup raises
    ``LookupError``.  A name whose lookup fails with any other exception is
    still kept, since the failure most likely came from importing the codec
    module rather than from the name being unknown.

    dir   -- path of the directory to scan.  (The name shadows the builtin,
             but it is part of the public signature and is kept as is.)
    debug -- when true, report such unexpected lookup failures on stderr.

    Returns a list of codec names in the order ``os.listdir`` produced them.
    """
    names = []
    for filename in os.listdir(dir):
        if not filename.endswith('.py'):
            continue
        name = filename[:-3]
        # Check whether we've found a true codec
        try:
            codecs.lookup(name)
        except LookupError:
            # Codec not found
            continue
        except Exception as reason:
            # Probably an error from importing the codec; still it's
            # a valid codec name
            if debug:
                # Diagnostics go to stderr so they never mix with results
                # that callers print on stdout.
                sys.stderr.write('* problem importing codec %r: %s\n'
                                 % (name, reason))
        names.append(name)
    return names
|
||
def listem():
    """Return the codec names found in the first directory of the ``encodings`` package path."""
    encodings_dir = encodings.__path__[0]
    return listcodecs(encodings_dir)
|
||
def _canonical_encoding(encoding):
    """Normalise an encoding name and verify that it can be used to encode text.

    ``None`` is treated as 'utf-8'.  The name is lower-cased, then
    "cp65001" is mapped to 'utf-8' and "us-ascii" / "646" to 'ascii'.

    Returns the normalised name.
    Raises AssertionError when encoding a test string with that name fails.
    """
    aliases = {
        "cp65001": 'utf-8',
        "us-ascii": 'ascii',
        "646": 'ascii',
    }
    lowered = ('utf-8' if encoding is None else encoding).lower()
    canonical = aliases.get(lowered, lowered)

    # sometimes Python returns an encoding name that it doesn't support for conversion
    # fail early if this happens
    try:
        u"test".encode(canonical)
    except (LookupError, AttributeError):
        raise AssertionError("The character encoding '%s' is not supported for conversion." % (canonical,))

    return canonical
|
||
def get_output_encoding():
    """Return the canonical name of the encoding to use for standard output.

    Uses ``sys.stdout.encoding`` when it is set and non-empty, otherwise the
    locale's preferred encoding.
    """
    stdout_encoding = sys.stdout.encoding
    if not stdout_encoding:
        stdout_encoding = locale.getpreferredencoding()
    return _canonical_encoding(stdout_encoding)
|
||
def get_argv_encoding():
    """Return the encoding used to decode command-line arguments.

    This is 'ascii' on win32 and the output encoding everywhere else.
    """
    # Unicode arguments are not supported on Windows yet; see Tahoe-LAFS tickets #565 and #1074.
    return 'ascii' if sys.platform == 'win32' else get_output_encoding()
|
||
# Both encodings are resolved once, when the module is loaded, and are then
# read as module-level constants by type_unicode() and main().
output_encoding = get_output_encoding()
argv_encoding = get_argv_encoding()
|
||
def type_unicode(argstr):
    """Convert a command-line argument to a unicode string (argparse ``type=``).

    Byte strings are decoded with the module-level ``argv_encoding``.  Text
    that is already unicode is returned unchanged: calling ``.decode`` on it
    would raise AttributeError on Python 3 and would force an implicit ASCII
    round-trip on Python 2.

    Raises UnicodeDecodeError when a byte string is not valid in
    ``argv_encoding``.
    """
    if isinstance(argstr, bytes):
        return argstr.decode(argv_encoding)
    return argstr
|
||
def main():
    """Try to decode the input bytes with every known codec and print the results.

    Command line: ``try_decoding [-t T] [-a] INF``

    INF  -- file to read the bytes from, or "-" for stdin.
    -t T -- only show codecs whose decoded text equals T.
    -a   -- also show codecs that return bytes rather than unicode; those
            lines are marked with "!!!".

    One line is written to stdout per reported codec: the codec name
    right-aligned in 19 columns, " : ", then the decoded text encoded with
    ``output_encoding``.  Codecs that fail to decode the input are skipped.
    """
    parser = argparse.ArgumentParser(prog="try_decoding", description="Try decoding some bytes with all sorts of different codecs and print out any that decode.")

    parser.add_argument('inputfile', help='file to decode or "-" for stdin', type=argparse.FileType('rb'), metavar='INF')
    parser.add_argument('-t', '--target', help='unicode string to match against (if any)', type=type_unicode, metavar='T')
    parser.add_argument('-a', '--accept-bytes', help='include codecs which return bytes instead of returning unicode (they will be marked with "!!!" in the output)', action='store_true')

    args = parser.parse_args()

    inb = args.inputfile.read()

    # The text type is ``unicode`` on Python 2 and ``str`` on Python 3.
    text_type = type(u"")
    # Results are emitted as raw bytes; on Python 3 that requires the
    # underlying binary stream of stdout.
    out = getattr(sys.stdout, 'buffer', sys.stdout)

    for codec in listem():
        try:
            decoded = inb.decode(codec)
        except Exception:
            # Deliberately broad: this is a best-effort scan, and the codecs
            # signal failure with many unrelated exception types (for example
            # binascii.Error, LookupError, zlib.error).  One codec failing
            # must not abort the scan of the others.
            continue

        if isinstance(decoded, text_type):
            if args.target and args.target != decoded:
                continue
            marker = b""
            # 'backslashreplace' keeps the tool running when the decoded text
            # contains characters the output encoding cannot represent.
            payload = decoded.encode(output_encoding, 'backslashreplace')
        else:
            if not args.accept_bytes:
                continue
            marker = b"!!!  "
            if isinstance(decoded, bytes):
                payload = decoded
            else:
                payload = str(decoded).encode(output_encoding, 'backslashreplace')

        label = ("%19s " % codec).encode('ascii', 'backslashreplace')
        out.write(label + marker + b": " + payload + b"\n")
|
||
# Run the command-line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters