From 3bd18a779e880d996fda3cf4c35ef4dc4f6a24e1 Mon Sep 17 00:00:00 2001 From: Peter Griess Date: Fri, 27 May 2011 20:24:07 -0700 Subject: [PATCH] IS_* macros for char classes. - Add IS_ALPHA(), IS_NUM(), IS_HOST_CHAR(), etc. macros for determining membership in a character class. HTTP_PARSER_STRICT causes some of these definitions to change. - Support '_' character in hostnames in non-strict mode. - Support leading digits in hostnames when the method is HTTP_CONNECT. - Don't re-define HTTP_PARSER_STRICT in http_parser.h if it's already defined. - Tweak Makefile to run non-strict-mode unit tests. Rearrange non-strict mode unit tests in test.c. - Add test_fast to .gitignore. Fixes #44 --- .gitignore | 1 + Makefile | 9 +++-- http_parser.c | 103 +++++++++++++++++++++----------------------------- test.c | 80 +++++++++++++++++++++++++-------------- 4 files changed, 102 insertions(+), 91 deletions(-) diff --git a/.gitignore b/.gitignore index 73fe6a4..04b7a1f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ tags *.o test test_g +test_fast diff --git a/Makefile b/Makefile index 182e9d8..4eceeaa 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,14 @@ -CPPFLAGS?=-DHTTP_PARSER_STRICT=1 -OPT_DEBUG=-O0 -g -Wall -Wextra -Werror -I. $(CPPFLAGS) -OPT_FAST=-O3 -DHTTP_PARSER_STRICT=0 -I. $(CPPFLAGS) +CPPFLAGS?=-Wall -Wextra -Werror -I. 
+OPT_DEBUG=$(CPPFLAGS) -O0 -g -DHTTP_PARSER_STRICT=1 +OPT_FAST=$(CPPFLAGS) -O3 -DHTTP_PARSER_STRICT=0 CC?=gcc AR?=ar -test: test_g +test: test_g test_fast ./test_g + ./test_fast test_g: http_parser_g.o test_g.o $(CC) $(OPT_DEBUG) http_parser_g.o test_g.o -o $@ diff --git a/http_parser.c b/http_parser.c index 01de95f..cf64841 100644 --- a/http_parser.c +++ b/http_parser.c @@ -189,33 +189,7 @@ static const uint8_t normal_url_char[256] = { /* 112 p 113 q 114 r 115 s 116 t 117 u 118 v 119 w */ 1, 1, 1, 1, 1, 1, 1, 1, /* 120 x 121 y 122 z 123 { 124 | 125 } 126 ~ 127 del */ - 1, 1, 1, 1, 1, 1, 1, 0, - -#if HTTP_PARSER_STRICT - 0 -#else -/* Remainder of non-ASCII range are accepted as-is to support implicitly UTF-8 - encoded paths. This is out of spec, but clients generate this and most other - HTTP servers support it. We should, too. */ - - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1 -#endif -}; /* normal_url_char */ + 1, 1, 1, 1, 1, 1, 1, 0, }; enum state @@ -319,10 +293,24 @@ enum header_states }; -#define CR '\r' -#define LF '\n' -#define LOWER(c) (unsigned char)(c | 0x20) -#define TOKEN(c) tokens[(unsigned char)c] +/* Macros for character classes; depends on strict-mode. IS_ALPHA folds case via LOWER(), so it may be given raw (un-lowered) input bytes. */ +#define CR '\r' +#define LF '\n' +#define LOWER(c) (unsigned char)((c) | 0x20) +#define TOKEN(c) (tokens[(unsigned char)c]) +#define IS_ALPHA(c) (LOWER(c) >= 'a' && LOWER(c) <= 'z') +#define IS_NUM(c) ((c) >= '0' && (c) <= '9') +#define IS_ALPHANUM(c) (IS_ALPHA(c) || IS_NUM(c)) + +#if HTTP_PARSER_STRICT +#define IS_URL_CHAR(c) (normal_url_char[(unsigned char) (c)]) +#define IS_HOST_CHAR(c) (IS_ALPHANUM(c) || (c) == '.' 
|| (c) == '-') +#else +#define IS_URL_CHAR(c) \ + (normal_url_char[(unsigned char) (c)] || ((c) & 0x80)) +#define IS_HOST_CHAR(c) \ + (IS_ALPHANUM(c) || (c) == '.' || (c) == '-' || (c) == '_') +#endif #define start_state (parser->type == HTTP_REQUEST ? s_start_req : s_start_res) @@ -499,7 +487,7 @@ size_t http_parser_execute (http_parser *parser, break; } - if (ch < '0' || ch > '9') goto error; + if (!IS_NUM(ch)) goto error; parser->http_major *= 10; parser->http_major += ch - '0'; @@ -510,7 +498,7 @@ size_t http_parser_execute (http_parser *parser, /* first digit of minor HTTP version */ case s_res_first_http_minor: - if (ch < '0' || ch > '9') goto error; + if (!IS_NUM(ch)) goto error; parser->http_minor = ch - '0'; state = s_res_http_minor; break; @@ -523,7 +511,7 @@ size_t http_parser_execute (http_parser *parser, break; } - if (ch < '0' || ch > '9') goto error; + if (!IS_NUM(ch)) goto error; parser->http_minor *= 10; parser->http_minor += ch - '0'; @@ -534,7 +522,7 @@ size_t http_parser_execute (http_parser *parser, case s_res_first_status_code: { - if (ch < '0' || ch > '9') { + if (!IS_NUM(ch)) { if (ch == ' ') { break; } @@ -547,7 +535,7 @@ size_t http_parser_execute (http_parser *parser, case s_res_status_code: { - if (ch < '0' || ch > '9') { + if (!IS_NUM(ch)) { switch (ch) { case ' ': state = s_res_status; @@ -599,7 +587,7 @@ size_t http_parser_execute (http_parser *parser, CALLBACK2(message_begin); - if (ch < 'A' || 'Z' < ch) goto error; + if (!IS_ALPHA(LOWER(ch))) goto error; start_req_method_assign: parser->method = (enum http_method) 0; @@ -678,9 +666,13 @@ size_t http_parser_execute (http_parser *parser, c = LOWER(ch); - if (c >= 'a' && c <= 'z') { + /* Proxied requests are followed by scheme of an absolute URI (alpha). + * CONNECT is followed by a hostname, which begins with alphanum. + * All other methods are followed by '/' or '*' (handled above). 
+ */ + if (IS_ALPHA(ch) || (parser->method == HTTP_CONNECT && IS_NUM(ch))) { MARK(url); - state = s_req_schema; + state = (parser->method == HTTP_CONNECT) ? s_req_host : s_req_schema; break; } @@ -691,17 +683,11 @@ size_t http_parser_execute (http_parser *parser, { c = LOWER(ch); - if (c >= 'a' && c <= 'z') break; + if (IS_ALPHA(c)) break; if (ch == ':') { state = s_req_schema_slash; break; - } else if (ch == '.') { - state = s_req_host; - break; - } else if ('0' <= ch && ch <= '9') { - state = s_req_host; - break; } goto error; @@ -720,8 +706,7 @@ size_t http_parser_execute (http_parser *parser, case s_req_host: { c = LOWER(ch); - if (c >= 'a' && c <= 'z') break; - if ((ch >= '0' && ch <= '9') || ch == '.' || ch == '-') break; + if (IS_HOST_CHAR(ch)) break; switch (ch) { case ':': state = s_req_port; @@ -749,7 +734,7 @@ size_t http_parser_execute (http_parser *parser, case s_req_port: { - if (ch >= '0' && ch <= '9') break; + if (IS_NUM(ch)) break; switch (ch) { case '/': MARK(path); @@ -774,7 +759,7 @@ size_t http_parser_execute (http_parser *parser, case s_req_path: { - if (normal_url_char[(unsigned char)ch]) break; + if (IS_URL_CHAR(ch)) break; switch (ch) { case ' ': @@ -812,7 +797,7 @@ size_t http_parser_execute (http_parser *parser, case s_req_query_string_start: { - if (normal_url_char[(unsigned char)ch]) { + if (IS_URL_CHAR(ch)) { MARK(query_string); state = s_req_query_string; break; @@ -848,7 +833,7 @@ size_t http_parser_execute (http_parser *parser, case s_req_query_string: { - if (normal_url_char[(unsigned char)ch]) break; + if (IS_URL_CHAR(ch)) break; switch (ch) { case '?': @@ -885,7 +870,7 @@ size_t http_parser_execute (http_parser *parser, case s_req_fragment_start: { - if (normal_url_char[(unsigned char)ch]) { + if (IS_URL_CHAR(ch)) { MARK(fragment); state = s_req_fragment; break; @@ -922,7 +907,7 @@ size_t http_parser_execute (http_parser *parser, case s_req_fragment: { - if (normal_url_char[(unsigned char)ch]) break; + if (IS_URL_CHAR(ch)) break; 
switch (ch) { case ' ': @@ -1000,7 +985,7 @@ size_t http_parser_execute (http_parser *parser, break; } - if (ch < '0' || ch > '9') goto error; + if (!IS_NUM(ch)) goto error; parser->http_major *= 10; parser->http_major += ch - '0'; @@ -1011,7 +996,7 @@ size_t http_parser_execute (http_parser *parser, /* first digit of minor HTTP version */ case s_req_first_http_minor: - if (ch < '0' || ch > '9') goto error; + if (!IS_NUM(ch)) goto error; parser->http_minor = ch - '0'; state = s_req_http_minor; break; @@ -1031,7 +1016,7 @@ size_t http_parser_execute (http_parser *parser, /* XXX allow spaces after digit? */ - if (ch < '0' || ch > '9') goto error; + if (!IS_NUM(ch)) goto error; parser->http_minor *= 10; parser->http_minor += ch - '0'; @@ -1264,7 +1249,7 @@ size_t http_parser_execute (http_parser *parser, break; case h_content_length: - if (ch < '0' || ch > '9') goto error; + if (!IS_NUM(ch)) goto error; parser->content_length = ch - '0'; break; @@ -1313,7 +1298,7 @@ size_t http_parser_execute (http_parser *parser, case h_content_length: if (ch == ' ') break; - if (ch < '0' || ch > '9') goto error; + if (!IS_NUM(ch)) goto error; parser->content_length *= 10; parser->content_length += ch - '0'; break; diff --git a/test.c b/test.c index 3b35395..e371c65 100644 --- a/test.c +++ b/test.c @@ -498,7 +498,7 @@ const struct message requests[] = #define CONNECT_REQUEST 17 , {.name = "connect request" ,.type= HTTP_REQUEST - ,.raw= "CONNECT home0.netscape.com:443 HTTP/1.0\r\n" + ,.raw= "CONNECT 0-home0.netscape.com:443 HTTP/1.0\r\n" "User-agent: Mozilla/1.1N\r\n" "Proxy-authorization: basic aGVsbG86d29ybGQ=\r\n" "\r\n" @@ -510,7 +510,7 @@ const struct message requests[] = ,.query_string= "" ,.fragment= "" ,.request_path= "" - ,.request_url= "home0.netscape.com:443" + ,.request_url= "0-home0.netscape.com:443" ,.num_headers= 2 ,.upgrade=1 ,.headers= { { "User-agent", "Mozilla/1.1N" } @@ -582,30 +582,7 @@ const struct message requests[] = ,.body= "" } -#if !HTTP_PARSER_STRICT 
-#define UTF8_PATH_REQ 21 -, {.name= "utf-8 path request" - ,.type= HTTP_REQUEST - ,.raw= "GET /δ¶/δt/pope?q=1#narf HTTP/1.1\r\n" - "Host: github.com\r\n" - "\r\n" - ,.should_keep_alive= TRUE - ,.message_complete_on_eof= FALSE - ,.http_major= 1 - ,.http_minor= 1 - ,.method= HTTP_GET - ,.query_string= "q=1" - ,.fragment= "narf" - ,.request_path= "/δ¶/δt/pope" - ,.request_url= "/δ¶/δt/pope?q=1#narf" - ,.num_headers= 1 - ,.headers= { {"Host", "github.com" } - } - ,.body= "" - } -#endif /* !HTTP_PARSER_STRICT */ - -#define QUERY_TERMINATED_HOST 22 +#define QUERY_TERMINATED_HOST 21 , {.name= "host terminated by a query string" ,.type= HTTP_REQUEST ,.raw= "GET http://hypnotoad.org?hail=all HTTP/1.1\r\n" @@ -624,7 +601,7 @@ const struct message requests[] = ,.body= "" } -#define QUERY_TERMINATED_HOSTPORT 23 +#define QUERY_TERMINATED_HOSTPORT 22 , {.name= "host:port terminated by a query string" ,.type= HTTP_REQUEST ,.raw= "GET http://hypnotoad.org:1234?hail=all HTTP/1.1\r\n" @@ -643,7 +620,7 @@ const struct message requests[] = ,.body= "" } -#define SPACE_TERMINATED_HOSTPORT 24 +#define SPACE_TERMINATED_HOSTPORT 23 , {.name= "host:port terminated by a space" ,.type= HTTP_REQUEST ,.raw= "GET http://hypnotoad.org:1234 HTTP/1.1\r\n" @@ -662,6 +639,53 @@ const struct message requests[] = ,.body= "" } +#if !HTTP_PARSER_STRICT +#define UTF8_PATH_REQ 24 +, {.name= "utf-8 path request" + ,.type= HTTP_REQUEST + ,.raw= "GET /δ¶/δt/pope?q=1#narf HTTP/1.1\r\n" + "Host: github.com\r\n" + "\r\n" + ,.should_keep_alive= TRUE + ,.message_complete_on_eof= FALSE + ,.http_major= 1 + ,.http_minor= 1 + ,.method= HTTP_GET + ,.query_string= "q=1" + ,.fragment= "narf" + ,.request_path= "/δ¶/δt/pope" + ,.request_url= "/δ¶/δt/pope?q=1#narf" + ,.num_headers= 1 + ,.headers= { {"Host", "github.com" } + } + ,.body= "" + } + +#define HOSTNAME_UNDERSCORE 25 +, {.name = "hostname underscore" + ,.type= HTTP_REQUEST + ,.raw= "CONNECT home_0.netscape.com:443 HTTP/1.0\r\n" + "User-agent: Mozilla/1.1N\r\n" + 
"Proxy-authorization: basic aGVsbG86d29ybGQ=\r\n" + "\r\n" + ,.should_keep_alive= FALSE + ,.message_complete_on_eof= FALSE + ,.http_major= 1 + ,.http_minor= 0 + ,.method= HTTP_CONNECT + ,.query_string= "" + ,.fragment= "" + ,.request_path= "" + ,.request_url= "home_0.netscape.com:443" + ,.num_headers= 2 + ,.upgrade=1 + ,.headers= { { "User-agent", "Mozilla/1.1N" } + , { "Proxy-authorization", "basic aGVsbG86d29ybGQ=" } + } + ,.body= "" + } +#endif /* !HTTP_PARSER_STRICT */ + , {.name= NULL } /* sentinel */ };