spyder/utils/encoding.py

# -*- coding: utf-8 -*-
#
# Copyright © Spyder Project Contributors
# Licensed under the terms of the MIT License
# (see spyder/__init__.py for details)

"""
Text encoding utilities, text file I/O

Functions 'get_coding', 'decode', 'encode' and 'to_unicode' come from Eric4
source code (Utilities/__init__.py) Copyright © 2003-2009 Detlev Offenbach
"""

# Standard library imports
from codecs import BOM_UTF8, BOM_UTF16, BOM_UTF32
import tempfile
import locale
import re
import os
import os.path as osp
import pathlib
import sys
import time
import errno

# Third-party imports
from chardet.universaldetector import UniversalDetector
from atomicwrites import atomic_write

# Local imports
from spyder.py3compat import (is_string, to_text_string, is_binary_string,
                              is_text_string)
from spyder.utils.external.binaryornot.check import is_binary


# Encoding reported by locale.getpreferredencoding() when this module is
# imported. It is the default for both encoding arguments of transcode() and
# the fallback value in getfilesystemencoding().
PREFERRED_ENCODING = locale.getpreferredencoding()

def transcode(text, input=PREFERRED_ENCODING, output=PREFERRED_ENCODING):
    """
    Transcode 'text', reading it as cp437.

    The text is decoded as cp437 and re-encoded as cp1252. If that raises
    a UnicodeError, the re-encoding is attempted with 'output' instead.
    When both attempts fail with a UnicodeError, 'text' is returned as is.

    NOTE: the 'input' argument is accepted but not used by this function.
    """
    # Target codecs, in the order in which they are attempted.
    for target in ("cp1252", output):
        try:
            return text.decode("cp437").encode(target)
        except UnicodeError:
            continue
    return text

#------------------------------------------------------------------------------
#  Functions for encoding and decoding bytes that come from
#  the *file system*.
#------------------------------------------------------------------------------

# The default encoding for file paths and environment variables should be set
# to match the default encoding that the OS is using.
def getfilesystemencoding():
    """
    Return the encoding used for filenames and environment variables.

    The value comes from ``sys.getfilesystemencoding()``; when that call
    yields None, PREFERRED_ENCODING is returned instead.
    """
    fs_encoding = sys.getfilesystemencoding()
    # A None result means we must be on Linux or Unix and
    # nl_langinfo(CODESET) failed.
    return PREFERRED_ENCODING if fs_encoding is None else fs_encoding

# Filesystem encoding, computed once at import time. It is the codec used by
# to_unicode_from_fs() and to_fs_from_unicode().
FS_ENCODING = getfilesystemencoding()

def to_unicode_from_fs(string):
    """
    Return a unicode version of string decoded using the file system encoding.

    Objects that are not strings are treated as QString and converted
    through their UTF-8 representation. Text strings, and byte strings
    that cannot be decoded with FS_ENCODING, are returned unchanged.
    """
    if not is_string(string):
        # string is a QString
        return to_text_string(string.toUtf8(), 'utf-8')

    if not is_binary_string(string):
        return string

    try:
        return string.decode(FS_ENCODING)
    except (UnicodeError, TypeError):
        # Decoding failed: hand the original bytes back.
        return string

def to_fs_from_unicode(unic):
    """
    Return a byte string version of unic encoded using the file
    system encoding.

    Anything that is not a text string, or that cannot be encoded with
    FS_ENCODING, is returned unchanged.
    """
    if not is_text_string(unic):
        return unic

    try:
        return unic.encode(FS_ENCODING)
    except (UnicodeError, TypeError):
        # Encoding failed: hand the original object back.
        return unic

#------------------------------------------------------------------------------
#  Functions for encoding and decoding *text data* itself, usually originating
#  from or destined for the *contents* of a file.
#------------------------------------------------------------------------------

# Codecs for working with files and text.
# Matches "coding:" or "coding=" followed by optional whitespace, and captures
# the encoding name that follows (word characters, '-', '_' and '.').
CODING_RE = re.compile(r"coding[:=]\s*([-\w_.]+)")
# Encoding names accepted by get_coding() when a coding declaration is found,
# and the candidates tried, in this order, by to_unicode().
CODECS = ['utf-8', 'iso8859-1',  'iso8859-15', 'ascii', 'koi8-r', 'cp1251',
          'koi8-u', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
          'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9',
          'iso8859-10', 'iso8859-13', 'iso8859-14', 'latin-1',
          'utf-16']


def get_coding(text, force_chardet=False):
    """
    Function to get the coding of a text.

    Unless 'force_chardet' is set, the first two lines are searched for a
    coding declaration (CODING_RE) naming one of CODECS. If none is found
    and 'text' is a byte string, the first two lines are fed to chardet.

    @param text text to inspect (string)
    @param force_chardet skip the coding declaration lookup (bool)
    @return coding string, or None when nothing could be determined
    """
    if not force_chardet:
        for header_line in text.splitlines()[:2]:
            try:
                match = CODING_RE.search(to_text_string(header_line))
            except UnicodeDecodeError:
                # This could fail because to_text_string assumes the text
                # is utf8-like and we don't know the encoding to give it.
                continue
            # Sometimes we find a false encoding that can result in
            # errors, so only names listed in CODECS are accepted.
            if match and match.group(1) in CODECS:
                return match.group(1)

    # Fallback using chardet, which only applies to byte strings
    if not is_binary_string(text):
        return None

    detector = UniversalDetector()
    for chunk in text.splitlines()[:2]:
        detector.feed(chunk)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']

def decode(text):
    """
    Function to decode a text.

    The encoding is determined, in order, from a byte order mark, from
    get_coding(), and finally by trying UTF-8 and then Latin-1.

    @param text text to decode (byte string)
    @return decoded text and encoding. The encoding is 'utf-8-bom',
            'utf-32' or 'utf-16' when a BOM was found, the value given by
            get_coding() when there is one, and otherwise
            'utf-8-guessed' or 'latin-1-guessed'.
    """
    try:
        if text.startswith(BOM_UTF8):
            # UTF-8 with BOM
            return to_text_string(text[len(BOM_UTF8):], 'utf-8'), 'utf-8-bom'
        elif text.startswith(BOM_UTF32):
            # UTF-32 with BOM. This must be tested before UTF-16: on
            # little-endian platforms BOM_UTF32 (FF FE 00 00) starts with
            # BOM_UTF16 (FF FE), so the UTF-16 test would match first and
            # UTF-32 data would be decoded as UTF-16.
            return to_text_string(text[len(BOM_UTF32):], 'utf-32'), 'utf-32'
        elif text.startswith(BOM_UTF16):
            # UTF-16 with BOM
            return to_text_string(text[len(BOM_UTF16):], 'utf-16'), 'utf-16'
        coding = get_coding(text)
        if coding:
            return to_text_string(text, coding), coding
    except (UnicodeError, LookupError):
        # Detected encoding did not work out; fall through to the guesses.
        pass
    # Assume UTF-8
    try:
        return to_text_string(text, 'utf-8'), 'utf-8-guessed'
    except (UnicodeError, LookupError):
        pass
    # Assume Latin-1 (behaviour before 3.7.1)
    return to_text_string(text, "latin-1"), 'latin-1-guessed'

def encode(text, orig_coding):
    """
    Function to encode a text.

    Encodings are attempted in this order: 'orig_coding' itself, the coding
    declared in the text (see get_coding), 'orig_coding' stripped of a
    '-default' or '-guessed' suffix, and finally UTF-8 without BOM.

    @param text text to encode (string)
    @param orig_coding type of the original coding (string, may be None or
                       empty)
    @return encoded text and encoding
    @raise RuntimeError if the text declares a coding that cannot be used
                        to encode it
    """
    if orig_coding == 'utf-8-bom':
        return BOM_UTF8 + text.encode("utf-8"), 'utf-8-bom'

    # Try saving with original encoding
    if orig_coding:
        try:
            return text.encode(orig_coding), orig_coding
        except (UnicodeError, LookupError):
            pass

    # Try declared coding spec
    coding = get_coding(text)
    if coding:
        try:
            return text.encode(coding), coding
        except (UnicodeError, LookupError):
            raise RuntimeError("Incorrect encoding (%s)" % coding)

    # Try the real codec behind a '<codec>-default' / '<codec>-guessed'
    # label. The whole test is guarded by 'orig_coding' so that None does
    # not raise, and both suffixes are stripped from the same value.
    if orig_coding and orig_coding.endswith(('-default', '-guessed')):
        coding = orig_coding.replace("-default", "").replace("-guessed", "")
        try:
            return text.encode(coding), coding
        except (UnicodeError, LookupError):
            pass

    # Save as UTF-8 without BOM
    return text.encode('utf-8'), 'utf-8'

def to_unicode(string):
    """
    Convert a string to unicode.

    Text strings are returned as they are. Anything else is converted
    with the first entry of CODECS that works; if none does, or if a
    TypeError is raised, the original object is returned.
    """
    if is_text_string(string):
        return string

    for codec in CODECS:
        try:
            return to_text_string(string, codec)
        except UnicodeError:
            # Wrong codec, try the next one.
            continue
        except TypeError:
            # No point in trying the remaining codecs.
            break
    return string


def write(text, filename, encoding='utf-8', mode='wb'):
    """
    Write 'text' to file ('filename') assuming 'encoding' in an atomic way.

    The text is first passed through encode(); the encoding name that
    encode() reports, which may differ from the requested one, is returned.

    @param text text to write (string)
    @param filename path of the destination file
    @param encoding encoding handed to encode() as the original coding
    @param mode file mode; when it contains 'a' the data is appended with a
                plain open() instead of going through atomic_write
    @return (possibly new) encoding
    """
    text, encoding = encode(text, encoding)

    # Resolve the destination path. On Windows the pathlib-resolved path is
    # used when it exists; in every other case osp.realpath() is used.
    if os.name == 'nt':
        try:
            absolute_path_filename = pathlib.Path(filename).resolve()
            if absolute_path_filename.exists():
                absolute_filename = to_text_string(absolute_path_filename)
            else:
                absolute_filename = osp.realpath(filename)
        except (OSError, RuntimeError):
            absolute_filename = osp.realpath(filename)
    else:
        absolute_filename = osp.realpath(filename)

    if 'a' in mode:
        # Appending: write directly to the file, not atomically.
        with open(absolute_filename, mode) as textfile:
            textfile.write(text)
    else:
        # Based on the solution at untitaker/python-atomicwrites#42.
        # Needed to fix file permissions overwriting.
        # See spyder-ide/spyder#9381.
        try:
            file_stat = os.stat(absolute_filename)
            original_mode = file_stat.st_mode
            # NOTE: despite its name, this holds the last access time
            # (st_atime) of the existing file.
            creation = file_stat.st_atime
        except FileNotFoundError:
            # Creating a new file, emulate what os.open() does
            # os.umask() returns the previous mask, so it is set to 0 to
            # read it and then restored right away.
            umask = os.umask(0)
            os.umask(umask)
            # Set base permission of a file to standard permissions.
            # See #spyder-ide/spyder#14112.
            original_mode = 0o666 & ~umask
            creation = time.time()
        try:
            # fixes issues with scripts in Dropbox leaving
            # temporary files in the folder, see spyder-ide/spyder#13041
            # When tempfolder stays None, it is passed as dir=None.
            tempfolder = None
            if 'dropbox' in absolute_filename.lower():
                tempfolder = tempfile.gettempdir()
            with atomic_write(absolute_filename, overwrite=True,
                              mode=mode, dir=tempfolder) as textfile:
                textfile.write(text)
        except OSError as error:
            # Some filesystems don't support the option to sync directories
            # See untitaker/python-atomicwrites#17
            # An EINVAL error is ignored; for any other OSError the text is
            # written again with a plain, non-atomic open().
            if error.errno != errno.EINVAL:
                with open(absolute_filename, mode) as textfile:
                    textfile.write(text)
        try:
            os.chmod(absolute_filename, original_mode)
            file_stat = os.stat(absolute_filename)
            # Restore the timestamp saved above as the access time and keep
            # the modification time the file has after being written.
            os.utime(absolute_filename, (creation, file_stat.st_mtime))
        except OSError:
            # Prevent error when chmod/utime is not allowed
            # See spyder-ide/spyder#11308
            pass
    return encoding


def writelines(lines, filename, encoding='utf-8', mode='wb'):
    """
    Write 'lines' to file ('filename') assuming 'encoding'.

    The lines are joined with os.linesep and the result is handed to
    write().

    Return (possibly new) encoding
    """
    joined_text = os.linesep.join(lines)
    return write(joined_text, filename, encoding, mode)

def read(filename, encoding='utf-8'):
    """
    Read text from file ('filename').

    The file is read in binary mode and its contents are passed to
    decode(), which determines the encoding. The 'encoding' argument is
    accepted but not used by this function.

    Return text and encoding
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(filename, 'rb') as binary_file:
        raw_data = binary_file.read()
    text, encoding = decode(raw_data)
    return text, encoding

def readlines(filename, encoding='utf-8'):
    """
    Read lines from file ('filename').

    The text returned by read() is split on os.linesep.

    Return lines and encoding
    """
    contents, detected_encoding = read(filename, encoding)
    lines = contents.split(os.linesep)
    return lines, detected_encoding


def is_text_file(filename):
    """
    Test if the given path is a text-like file.

    Return the negation of is_binary(filename), or False when checking
    the file raises an OSError/IOError.
    """
    try:
        binary = is_binary(filename)
    except (OSError, IOError):
        return False
    return not binary