Permalink
Browse files

Fix wrong BOM detection in startsWithBom and fix BOM checks containing 0

Apply the same BOM detection fix to startsWithBom that was already applied
to removeBom in c9c848e; without it, binary
data is considered text.

Also fixed the checks for "\x00\x00\xFE\xFF" and "\xFF\xFE\x00\x00", which are
the problematic BOMs because they contain the null character.
  • Loading branch information...
mgrojo committed Jan 2, 2018
1 parent c9c848e commit feda408161ae98fee69bfd46a91c607bbc40921d
Showing with 16 additions and 9 deletions.
  1. +16 −9 src/Data.cpp
@@ -1,6 +1,15 @@
#include "Data.h"

#include <QTextCodec>
#include <QDebug>

This comment has been minimized.

@justinclift

justinclift Jan 4, 2018

Member

Left-over debugging bit? 😄

This comment has been minimized.

@mgrojo

mgrojo Jan 4, 2018

Author Contributor

Yep 😄 This one and two others are now deleted: 1be61db


// Byte order marks (BOMs) recognised by the helpers in this file.
// Note that these aren't all possible BOMs. But they are probably the most common ones.
// Each constant is built with an explicit length. The size is needed at least for the
// ones with character zero in them: without it the string literal would be cut off at
// the first zero byte.
static const QByteArray bom3("\xEF\xBB\xBF", 3);            // UTF-8
static const QByteArray bom2a("\xFE\xFF", 2);               // UTF-16, big endian
static const QByteArray bom2b("\xFF\xFE", 2);               // UTF-16, little endian
static const QByteArray bom4a("\x00\x00\xFE\xFF", 4);       // UTF-32, big endian
static const QByteArray bom4b("\xFF\xFE\x00\x00", 4);       // UTF-32, little endian

bool isTextOnly(QByteArray data, const QString& encoding, bool quickTest)
{
@@ -22,28 +31,26 @@ bool isTextOnly(QByteArray data, const QString& encoding, bool quickTest)

// Returns true if data begins with one of the byte order marks defined at the top of
// this file (bom3, bom2a, bom2b, bom4a, bom4b), false otherwise.
// Note that these aren't all possible BOMs. But they are probably the most common ones.
bool startsWithBom(const QByteArray& data)
{
    // Compare against the sized QByteArray constants only, never against plain string
    // literals: a literal such as "\x00\x00\xFE\xFF" begins with a zero byte, so it is
    // read as an empty C string, and every byte array starts with the empty string.
    // That made the previous literal-based check report a BOM for any input.
    return data.startsWith(bom3) ||
           data.startsWith(bom2a) || data.startsWith(bom2b) ||
           data.startsWith(bom4a) || data.startsWith(bom4b);
}

QByteArray removeBom(QByteArray& data)
{
if(data.left(3) == QByteArray("\xEF\xBB\xBF"))
if(data.startsWith(bom3))
{
QByteArray bom = data.left(3);
data.remove(0, 3);
return bom;
} else if(data.left(2) == QByteArray("\xFE\xFF") || data.left(2) == QByteArray("\xFF\xFE")) {
} else if(data.startsWith(bom2a) || data.startsWith(bom2b)) {
QByteArray bom = data.left(2);
data.remove(0, 2);
return bom;
} else if(data.left(4) == QByteArray("\x00\x00\xFE\xFF") || data.left(4) == QByteArray("\xFF\xFE\x00\x00")) {
} else if(data.startsWith(bom4a) || data.startsWith(bom4b)) {
QByteArray bom = data.left(4);
data.remove(0, 4);
return bom;

0 comments on commit feda408

Please sign in to comment.