Skip to content

Commit ad527a6

Browse files
committed
Parse iteratively to avoid stack overflow (fixes #146)
1 parent 85f05cc commit ad527a6

File tree

6 files changed

+150
-108
lines changed

6 files changed

+150
-108
lines changed

Diff for: ChangeLog

+3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
2017-08-25 Jay Berkenbilt <ejb@ql.org>
22

3+
* Re-implement parser iteratively to avoid stack overflow on very
4+
deeply nested arrays and dictionaries. Fixes #146.
5+
36
* Detect infinite loop while finding additional xref tables. Fixes
47
#149.
58

Diff for: include/qpdf/QPDFObjectHandle.hh

-1
Original file line numberDiff line numberDiff line change
@@ -667,7 +667,6 @@ class QPDFObjectHandle
667667
std::string const& object_description,
668668
QPDFTokenizer& tokenizer, bool& empty,
669669
StringDecrypter* decrypter, QPDF* context,
670-
bool in_array, bool in_dictionary,
671670
bool content_stream);
672671
static void parseContentStream_internal(
673672
PointerHolder<Buffer> stream_data,

Diff for: libqpdf/QPDFObjectHandle.cc

+141-107
Original file line numberDiff line numberDiff line change
@@ -883,8 +883,7 @@ QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data,
883883
while (static_cast<size_t>(input->tell()) < length)
884884
{
885885
QPDFObjectHandle obj =
886-
parseInternal(input, "content", tokenizer, empty,
887-
0, 0, false, false, true);
886+
parseInternal(input, "content", tokenizer, empty, 0, 0, true);
888887
if (! obj.isInitialized())
889888
{
890889
// EOF
@@ -945,15 +944,14 @@ QPDFObjectHandle::parse(PointerHolder<InputSource> input,
945944
StringDecrypter* decrypter, QPDF* context)
946945
{
947946
return parseInternal(input, object_description, tokenizer, empty,
948-
decrypter, context, false, false, false);
947+
decrypter, context, false);
949948
}
950949

951950
QPDFObjectHandle
952951
QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
953952
std::string const& object_description,
954953
QPDFTokenizer& tokenizer, bool& empty,
955954
StringDecrypter* decrypter, QPDF* context,
956-
bool in_array, bool in_dictionary,
957955
bool content_stream)
958956
{
959957
// This method must take care not to resolve any objects. Don't
@@ -962,22 +960,23 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
962960
// of reading the object and changing the file pointer.
963961

964962
empty = false;
965-
if (in_dictionary && in_array)
966-
{
967-
// Although dictionaries and arrays arbitrarily nest, these
968-
// variables indicate what is at the top of the stack right
969-
// now, so they can, by definition, never both be true.
970-
throw std::logic_error(
971-
"INTERNAL ERROR: parseInternal: in_dict && in_array");
972-
}
973963

974964
QPDFObjectHandle object;
975965

976-
qpdf_offset_t offset = input->tell();
977-
std::vector<QPDFObjectHandle> olist;
966+
std::vector<std::vector<QPDFObjectHandle> > olist_stack;
967+
olist_stack.push_back(std::vector<QPDFObjectHandle>());
968+
enum state_e { st_top, st_start, st_stop, st_eof, st_dictionary, st_array };
969+
std::vector<state_e> state_stack;
970+
state_stack.push_back(st_top);
971+
std::vector<qpdf_offset_t> offset_stack;
972+
offset_stack.push_back(input->tell());
978973
bool done = false;
979974
while (! done)
980975
{
976+
std::vector<QPDFObjectHandle>& olist = olist_stack.back();
977+
state_e state = state_stack.back();
978+
qpdf_offset_t offset = offset_stack.back();
979+
981980
object = QPDFObjectHandle();
982981

983982
QPDFTokenizer::Token token =
@@ -988,8 +987,7 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
988987
case QPDFTokenizer::tt_eof:
989988
if (content_stream)
990989
{
991-
// Return uninitialized object to indicate EOF
992-
return object;
990+
state = st_eof;
993991
}
994992
else
995993
{
@@ -1012,9 +1010,9 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
10121010
break;
10131011

10141012
case QPDFTokenizer::tt_array_close:
1015-
if (in_array)
1013+
if (state == st_array)
10161014
{
1017-
done = true;
1015+
state = st_stop;
10181016
}
10191017
else
10201018
{
@@ -1029,9 +1027,9 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
10291027
break;
10301028

10311029
case QPDFTokenizer::tt_dict_close:
1032-
if (in_dictionary)
1030+
if (state == st_dictionary)
10331031
{
1034-
done = true;
1032+
state = st_stop;
10351033
}
10361034
else
10371035
{
@@ -1046,15 +1044,13 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
10461044
break;
10471045

10481046
case QPDFTokenizer::tt_array_open:
1049-
object = parseInternal(
1050-
input, object_description, tokenizer, empty,
1051-
decrypter, context, true, false, content_stream);
1052-
break;
1053-
10541047
case QPDFTokenizer::tt_dict_open:
1055-
object = parseInternal(
1056-
input, object_description, tokenizer, empty,
1057-
decrypter, context, false, true, content_stream);
1048+
olist_stack.push_back(std::vector<QPDFObjectHandle>());
1049+
state = st_start;
1050+
offset_stack.push_back(input->tell());
1051+
state_stack.push_back(
1052+
(token.getType() == QPDFTokenizer::tt_array_open) ?
1053+
st_array : st_dictionary);
10581054
break;
10591055

10601056
case QPDFTokenizer::tt_bool:
@@ -1084,12 +1080,12 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
10841080
{
10851081
object = QPDFObjectHandle::newOperator(value);
10861082
}
1087-
else if ((value == "R") && (in_array || in_dictionary) &&
1088-
(olist.size() >= 2) &&
1089-
(! olist.at(olist.size() - 1).isIndirect()) &&
1090-
(olist.at(olist.size() - 1).isInteger()) &&
1091-
(! olist.at(olist.size() - 2).isIndirect()) &&
1092-
(olist.at(olist.size() - 2).isInteger()))
1083+
else if ((value == "R") && (state != st_top) &&
1084+
(olist.size() >= 2) &&
1085+
(! olist.at(olist.size() - 1).isIndirect()) &&
1086+
(olist.at(olist.size() - 1).isInteger()) &&
1087+
(! olist.at(olist.size() - 2).isIndirect()) &&
1088+
(olist.at(olist.size() - 2).isInteger()))
10931089
{
10941090
if (context == 0)
10951091
{
@@ -1106,8 +1102,7 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
11061102
olist.pop_back();
11071103
olist.pop_back();
11081104
}
1109-
else if ((value == "endobj") &&
1110-
(! (in_array || in_dictionary)))
1105+
else if ((value == "endobj") && (state == st_top))
11111106
{
11121107
// We just saw endobj without having read
11131108
// anything. Treat this as a null and do not move
@@ -1153,93 +1148,132 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
11531148
break;
11541149
}
11551150

1156-
if (in_dictionary || in_array)
1157-
{
1158-
if (! done)
1159-
{
1160-
olist.push_back(object);
1161-
}
1162-
}
1163-
else if (! object.isInitialized())
1164-
{
1165-
warn(context,
1166-
QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1167-
object_description,
1168-
input->getLastOffset(),
1169-
"parse error while reading object"));
1151+
if ((! object.isInitialized()) &&
1152+
(! ((state == st_start) ||
1153+
(state == st_stop) ||
1154+
(state == st_eof))))
1155+
{
1156+
throw std::logic_error(
1157+
"QPDFObjectHandle::parseInternal: "
1158+
"unexpected uninitialized object");
11701159
object = newNull();
1171-
}
1172-
else
1173-
{
1174-
done = true;
1175-
}
1176-
}
1160+
}
11771161

1178-
if (in_array)
1179-
{
1180-
object = newArray(olist);
1181-
}
1182-
else if (in_dictionary)
1183-
{
1184-
// Convert list to map. Alternating elements are keys. Attempt
1185-
// to recover more or less gracefully from invalid
1186-
// dictionaries.
1187-
std::set<std::string> names;
1188-
for (std::vector<QPDFObjectHandle>::iterator iter = olist.begin();
1189-
iter != olist.end(); ++iter)
1162+
switch (state)
11901163
{
1191-
if ((! (*iter).isIndirect()) && (*iter).isName())
1164+
case st_eof:
1165+
if (state_stack.size() > 1)
11921166
{
1193-
names.insert((*iter).getName());
1167+
warn(context,
1168+
QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1169+
object_description,
1170+
input->getLastOffset(),
1171+
"parse error while reading object"));
11941172
}
1195-
}
1173+
done = true;
1174+
// Leave object uninitialized to indicate EOF
1175+
break;
11961176

1197-
std::map<std::string, QPDFObjectHandle> dict;
1198-
int next_fake_key = 1;
1199-
for (unsigned int i = 0; i < olist.size(); ++i)
1200-
{
1201-
QPDFObjectHandle key_obj = olist.at(i);
1202-
QPDFObjectHandle val;
1203-
if (key_obj.isIndirect() || (! key_obj.isName()))
1177+
case st_dictionary:
1178+
case st_array:
1179+
olist.push_back(object);
1180+
break;
1181+
1182+
case st_top:
1183+
done = true;
1184+
break;
1185+
1186+
case st_start:
1187+
break;
1188+
1189+
case st_stop:
1190+
if ((state_stack.size() < 2) || (olist_stack.size() < 2))
1191+
{
1192+
throw std::logic_error(
1193+
"QPDFObjectHandle::parseInternal: st_stop encountered"
1194+
" with insufficient elements in stack");
1195+
}
1196+
state_e old_state = state_stack.back();
1197+
state_stack.pop_back();
1198+
if (old_state == st_array)
12041199
{
1205-
bool found_fake = false;
1206-
std::string candidate;
1207-
while (! found_fake)
1200+
object = newArray(olist);
1201+
}
1202+
else if (old_state == st_dictionary)
1203+
{
1204+
// Convert list to map. Alternating elements are keys.
1205+
// Attempt to recover more or less gracefully from
1206+
// invalid dictionaries.
1207+
std::set<std::string> names;
1208+
for (std::vector<QPDFObjectHandle>::iterator iter =
1209+
olist.begin();
1210+
iter != olist.end(); ++iter)
1211+
{
1212+
if ((! (*iter).isIndirect()) && (*iter).isName())
1213+
{
1214+
names.insert((*iter).getName());
1215+
}
1216+
}
1217+
1218+
std::map<std::string, QPDFObjectHandle> dict;
1219+
int next_fake_key = 1;
1220+
for (unsigned int i = 0; i < olist.size(); ++i)
12081221
{
1209-
candidate =
1210-
"/QPDFFake" + QUtil::int_to_string(next_fake_key++);
1211-
found_fake = (names.count(candidate) == 0);
1212-
QTC::TC("qpdf", "QPDFObjectHandle found fake",
1213-
(found_fake ? 0 : 1));
1222+
QPDFObjectHandle key_obj = olist.at(i);
1223+
QPDFObjectHandle val;
1224+
if (key_obj.isIndirect() || (! key_obj.isName()))
1225+
{
1226+
bool found_fake = false;
1227+
std::string candidate;
1228+
while (! found_fake)
1229+
{
1230+
candidate =
1231+
"/QPDFFake" +
1232+
QUtil::int_to_string(next_fake_key++);
1233+
found_fake = (names.count(candidate) == 0);
1234+
QTC::TC("qpdf", "QPDFObjectHandle found fake",
1235+
(found_fake ? 0 : 1));
1236+
}
1237+
warn(context,
1238+
QPDFExc(
1239+
qpdf_e_damaged_pdf,
1240+
input->getName(), object_description, offset,
1241+
"expected dictionary key but found"
1242+
" non-name object; inserting key " +
1243+
candidate));
1244+
val = key_obj;
1245+
key_obj = newName(candidate);
1246+
}
1247+
else if (i + 1 >= olist.size())
1248+
{
1249+
QTC::TC("qpdf", "QPDFObjectHandle no val for last key");
1250+
warn(context,
1251+
QPDFExc(
1252+
qpdf_e_damaged_pdf,
1253+
input->getName(), object_description, offset,
1254+
"dictionary ended prematurely; "
1255+
"using null as value for last key"));
1256+
val = newNull();
1257+
}
1258+
else
1259+
{
1260+
val = olist.at(++i);
1261+
}
1262+
dict[key_obj.getName()] = val;
12141263
}
1215-
warn(context,
1216-
QPDFExc(
1217-
qpdf_e_damaged_pdf,
1218-
input->getName(), object_description, offset,
1219-
"expected dictionary key but found"
1220-
" non-name object; inserting key " +
1221-
candidate));
1222-
val = key_obj;
1223-
key_obj = newName(candidate);
1264+
object = newDictionary(dict);
12241265
}
1225-
else if (i + 1 >= olist.size())
1266+
olist_stack.pop_back();
1267+
offset_stack.pop_back();
1268+
if (state_stack.back() == st_top)
12261269
{
1227-
QTC::TC("qpdf", "QPDFObjectHandle no val for last key");
1228-
warn(context,
1229-
QPDFExc(
1230-
qpdf_e_damaged_pdf,
1231-
input->getName(), object_description, offset,
1232-
"dictionary ended prematurely; using null as value"
1233-
" for last key"));
1234-
val = newNull();
1270+
done = true;
12351271
}
12361272
else
12371273
{
1238-
val = olist.at(++i);
1274+
olist_stack.back().push_back(object);
12391275
}
1240-
dict[key_obj.getName()] = val;
12411276
}
1242-
object = newDictionary(dict);
12431277
}
12441278

12451279
return object;

Diff for: qpdf/qtest/qpdf.test

+1
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ my @bug_tests = (
221221
["141a", "/W entry size 0", 2],
222222
["141b", "/W entry size 0", 2],
223223
["143", "self-referential ostream", 3],
224+
["146", "very deeply nested array", 2],
224225
["149", "xref prev pointer loop", 3],
225226
);
226227
$n_tests += scalar(@bug_tests);

Diff for: qpdf/qtest/qpdf/issue-146.out

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
WARNING: issue-146.pdf: file is damaged
2+
WARNING: issue-146.pdf: can't find startxref
3+
WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table
4+
WARNING: issue-146.pdf (trailer, file position 20728): unknown token while reading object; treating as string
5+
issue-146.pdf (trailer, file position 20732): EOF while reading token

Diff for: qpdf/qtest/qpdf/issue-146.pdf

20.2 KB
Binary file not shown.

0 commit comments

Comments
 (0)