Skip to content

Commit

Permalink
Fix numeric and named entity validation
Browse files Browse the repository at this point in the history
We now escape entities that we're sure aren't valid, as well as named
entities that don't match the whitelist of named entities (if applicable).
  • Loading branch information
JordanMilne committed Feb 25, 2015
1 parent 4894747 commit c96c9f5
Show file tree
Hide file tree
Showing 5 changed files with 300 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
build/
snudown.egg-info/
src/html_entities.h
*.pyc
*.so
2 changes: 1 addition & 1 deletion debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Source: snudown
Maintainer: Neil Williams <neil@reddit.com>
Section: python
Priority: optional
Build-Depends: python-all-dev (>= 2.6.6-3), debhelper (>= 7), python-setuptools
Build-Depends: python-all-dev (>= 2.6.6-3), debhelper (>= 7), python-setuptools, gperf
Standards-Version: 3.9.3
Homepage: https://github.com/reddit/snudown
Vcs-Git: git://github.com/reddit/snudown.git
Expand Down
7 changes: 7 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext

import re
import os
Expand All @@ -21,13 +22,19 @@ def c_files_in(directory):
version = m.group(1)
assert version

class GPerfingBuildExt(build_ext):
    """build_ext command that regenerates src/html_entities.h from
    src/html_entities.gperf before running the regular extension build."""

    def run(self):
        """Run gperf, then delegate to the stock build_ext.run().

        Raises subprocess.CalledProcessError if gperf exits non-zero and
        OSError if the gperf executable cannot be started.
        """
        # Local import so this class does not depend on a module-level import.
        import subprocess

        # check_call (unlike os.system) turns a missing or failing gperf into
        # an immediate build error instead of silently leaving an empty
        # header behind for the C compiler to trip over later.
        with open("src/html_entities.h", "w") as header:
            subprocess.check_call(
                ["gperf", "src/html_entities.gperf"], stdout=header)
        build_ext.run(self)

setup(
name='snudown',
version=version,
author='Vicent Marti',
author_email='vicent@github.com',
license='MIT',
test_suite="test_snudown.test_snudown",
cmdclass={'build_ext': GPerfingBuildExt,},
ext_modules=[
Extension(
name='snudown',
Expand Down
239 changes: 239 additions & 0 deletions src/html_entities.gperf
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
%language=ANSI-C
%define lookup-function-name is_allowed_named_entity
%compare-strncmp
%readonly-tables
%define hash-function-name hash_html_entity
%enum
%includes
%{
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Largest value a numeric entity may have (0x10ffff); parsers tend to choke
 * on entities with values greater than this.
 * Declared as uint32_t rather than the non-standard u_int32_t, which
 * <stdlib.h> is not required to provide. `static` keeps the definition
 * local to each translation unit that includes the generated header. */
static const uint32_t MAX_NUM_ENTITY_VAL = 0x10ffff;

/* Any numeric entity with more digits than this is assumed to be above
 * MAX_NUM_ENTITY_VAL; used to avoid dealing with overflows. */
static const size_t MAX_NUM_ENTITY_LEN = 7;
%}
%%
# Whitelist of named entities: the HTML 4 character entity set plus &apos;.
&AElig;
&Aacute;
&Acirc;
&Agrave;
&Alpha;
&Aring;
&Atilde;
&Auml;
&Beta;
&Ccedil;
&Chi;
&Dagger;
&Delta;
&ETH;
&Eacute;
&Ecirc;
&Egrave;
&Epsilon;
&Eta;
&Euml;
&Gamma;
&Iacute;
&Icirc;
&Igrave;
&Iota;
&Iuml;
&Kappa;
&Lambda;
&Mu;
&Ntilde;
&Nu;
&OElig;
&Oacute;
&Ocirc;
&Ograve;
&Omega;
&Omicron;
&Oslash;
&Otilde;
&Ouml;
&Phi;
&Pi;
&Prime;
&Psi;
&Rho;
&Scaron;
&Sigma;
&THORN;
&Tau;
&Theta;
&Uacute;
&Ucirc;
&Ugrave;
&Upsilon;
&Uuml;
&Xi;
&Yacute;
&Yuml;
&Zeta;
&aacute;
&acirc;
&acute;
&aelig;
&agrave;
&alefsym;
&alpha;
&amp;
&and;
&ang;
&apos;
&aring;
&asymp;
&atilde;
&auml;
&bdquo;
&beta;
&brvbar;
&bull;
&cap;
&ccedil;
&cedil;
&cent;
&chi;
&circ;
&clubs;
&cong;
&copy;
&crarr;
&cup;
&curren;
&dArr;
&dagger;
&darr;
&deg;
&delta;
&diams;
&divide;
&eacute;
&ecirc;
&egrave;
&empty;
&emsp;
&ensp;
&epsilon;
&equiv;
&eta;
&eth;
&euml;
&euro;
&exist;
&fnof;
&forall;
&frac12;
&frac14;
&frac34;
&frasl;
&gamma;
&ge;
&gt;
&hArr;
&harr;
&hearts;
&hellip;
&iacute;
&icirc;
&iexcl;
&igrave;
&image;
&infin;
&int;
&iota;
&iquest;
&isin;
&iuml;
&kappa;
&lArr;
&lambda;
&lang;
&laquo;
&larr;
&lceil;
&ldquo;
&le;
&lfloor;
&lowast;
&loz;
&lrm;
&lsaquo;
&lsquo;
&lt;
&macr;
&mdash;
&micro;
&middot;
&minus;
&mu;
&nabla;
&nbsp;
&ndash;
&ne;
&ni;
&not;
&notin;
&nsub;
&ntilde;
&nu;
&oacute;
&ocirc;
&oelig;
&ograve;
&oline;
&omega;
&omicron;
&oplus;
&or;
&ordf;
&ordm;
&oslash;
&otilde;
&otimes;
&ouml;
&para;
&part;
&permil;
&perp;
&phi;
&pi;
&piv;
&plusmn;
&pound;
&prime;
&prod;
&prop;
&psi;
&quot;
&rArr;
&radic;
&rang;
&raquo;
&rarr;
&rceil;
&rdquo;
&real;
&reg;
&rfloor;
&rho;
&rlm;
&rsaquo;
&rsquo;
&sbquo;
&scaron;
&sdot;
&sect;
&shy;
&sigma;
&sigmaf;
&sim;
&spades;
&sub;
&sube;
&sum;
&sup1;
&sup2;
&sup3;
&sup;
&supe;
&szlig;
&tau;
&there4;
&theta;
&thetasym;
&thinsp;
&thorn;
&tilde;
&times;
&trade;
&uArr;
&uacute;
&uarr;
&ucirc;
&ugrave;
&uml;
&upsih;
&upsilon;
&uuml;
&weierp;
&xi;
&yacute;
&yen;
&yuml;
&zeta;
&zwj;
&zwnj;
57 changes: 52 additions & 5 deletions src/markdown.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#define GPERF_DOWNCASE 1
#define GPERF_CASE_STRNCMP 1
#include "html_blocks.h"
#include "html_entities.h"

/***************
* LOCAL TYPES *
Expand Down Expand Up @@ -709,24 +710,70 @@ char_escape(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t offs
}

/* char_entity • '&' escaped when it doesn't belong to an entity */
/* valid entities are assumed to be anything matching &#?[A-Za-z0-9]+; */
static size_t
char_entity(struct buf *ob, struct sd_markdown *rndr, uint8_t *data, size_t offset, size_t size)
{
size_t end = 1;
size_t content_start;
size_t content_end;
struct buf work = { 0, 0, 0, 0 };
int numeric = 0;
int hex = 0;
int entity_base;
u_int64_t entity_val;

if (end < size && data[end] == '#')
if (end < size && data[end] == '#') {
numeric = 1;
end++;
}

while (end < size && isalnum(data[end]))
if (end < size && numeric && tolower(data[end]) == 'x') {
hex = 1;
end++;
}

content_start = end;

while (end < size) {
const char c = data[end];
if (hex) {
if (!isxdigit(c)) break;
} else if (numeric) {
if (!isdigit(c)) break;
} else if (!isalpha(c)) {
break;
}
end++;
}

content_end = end;

if (end < size && data[end] == ';')
end++; /* real entity */
if (end > content_start && end < size && data[end] == ';')
end++; /* well-formed entity */
else
return 0; /* lone '&' */

/* way too long to be a valid numeric entity */
if (numeric && content_end - content_start > MAX_NUM_ENTITY_LEN)
return 0;

/* Validate the entity's contents */
if (numeric) {
if (hex)
entity_base = 16;
else
entity_base = 10;

// This is ok because it'll stop once it hits the ';'
entity_val = strtol((char*)data + content_start, NULL, entity_base);
// Outside of UCS range, many parsers will choke on this.
if (entity_val > MAX_NUM_ENTITY_VAL)
return 0;
} else {
if (!is_allowed_named_entity((const char *)data, end))
return 0;
}

if (rndr->cb.entity) {
work.data = data;
work.size = end;
Expand Down

0 comments on commit c96c9f5

Please sign in to comment.