Skip to content

Commit

Permalink
PERF: more flexible iso8601 parsing
Browse files Browse the repository at this point in the history
closes #9714
closes #11899
closes #11871
closes #12060
  • Loading branch information
chris-b1 authored and jreback committed Jan 26, 2016
1 parent 91ee418 commit 5de6b84
Show file tree
Hide file tree
Showing 7 changed files with 192 additions and 48 deletions.
30 changes: 12 additions & 18 deletions asv_bench/benchmarks/timeseries.py
Expand Up @@ -1059,33 +1059,27 @@ class timeseries_to_datetime_iso8601(object):
goal_time = 0.2

def setup(self):
self.N = 100000
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
if hasattr(Series, 'convert'):
Series.resample = Series.convert
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng]
self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
for x in self.rng]

def time_timeseries_to_datetime_iso8601(self):
to_datetime(self.strings)


class timeseries_to_datetime_iso8601_format(object):
goal_time = 0.2

def setup(self):
self.N = 100000
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
if hasattr(Series, 'convert'):
Series.resample = Series.convert
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
def time_timeseries_to_datetime_iso8601_nosep(self):
to_datetime(self.strings_nosep)

def time_timeseries_to_datetime_iso8601_format(self):
to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S')

def time_timeseries_to_datetime_iso8601_format_no_sep(self):
to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S')

def time_timeseries_to_datetime_iso8601_tz_spaceformat(self):
to_datetime(self.strings_tz_space)


class timeseries_with_format_no_exact(object):
goal_time = 0.2
Expand Down Expand Up @@ -1160,4 +1154,4 @@ def setup(self):
self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)

def time_timeseries_year_incr(self):
(self.date + self.year)
(self.date + self.year)
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.0.txt
Expand Up @@ -461,7 +461,7 @@ Performance Improvements
- Improved performance in construction of ``Categoricals`` with Series of datetimes containing ``NaT`` (:issue:`12077`)



- Improved performance of ISO 8601 date parsing for dates without separators (:issue:`11899`), leading zeros (:issue:`11871`) and with whitespace preceding the time zone (:issue:`9714`)



Expand Down
125 changes: 102 additions & 23 deletions pandas/src/datetime/np_datetime_strings.c
Expand Up @@ -346,8 +346,6 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc,
/*
* Parses (almost) standard ISO 8601 date strings. The differences are:
*
* + The date "20100312" is parsed as the year 20100312, not as
* equivalent to "2010-03-12". The '-' in the dates are not optional.
* + Only seconds may have a decimal point, with up to 18 digits after it
* (maximum attoseconds precision).
* + Either a 'T' as in ISO 8601 or a ' ' may be used to separate
Expand Down Expand Up @@ -396,6 +394,16 @@ parse_iso_8601_datetime(char *str, int len,
char *substr, sublen;
PANDAS_DATETIMEUNIT bestunit;

/* If the date components are separated by one of the valid separators,
* months/days without leading 0s will be parsed
* (though not iso8601). If the components aren't separated,
* an error code will be returned because the date is ambiguous
*/
int has_sep = 0;
char sep;
char valid_sep[] = {'-', '.', '/', '\\', ' '};
int valid_sep_len = 5;

/* Initialize the output to all zeros */
memset(out, 0, sizeof(pandas_datetimestruct));
out->month = 1;
Expand Down Expand Up @@ -523,12 +531,16 @@ parse_iso_8601_datetime(char *str, int len,
goto parse_error;
}

/* PARSE THE YEAR (digits until the '-' character) */
/* PARSE THE YEAR (4 digits) */
out->year = 0;
while (sublen > 0 && isdigit(*substr)) {
out->year = 10 * out->year + (*substr - '0');
++substr;
--sublen;
if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) &&
isdigit(substr[2]) && isdigit(substr[3])) {

out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') +
10 * (substr[2] - '0') + (substr[3] - '0');

substr += 4;
sublen -= 4;;
}

/* Negate the year if necessary */
Expand All @@ -538,29 +550,49 @@ parse_iso_8601_datetime(char *str, int len,
/* Check whether it's a leap-year */
year_leap = is_leapyear(out->year);

/* Next character must be a '-' or the end of the string */
/* Next character must be a separator, start of month or end */
if (sublen == 0) {
if (out_local != NULL) {
*out_local = 0;
}
bestunit = PANDAS_FR_Y;
goto finish;
}
else if (*substr == '-') {
++substr;
--sublen;
}
else {
goto parse_error;
else if (!isdigit(*substr)) {
for (i = 0; i < valid_sep_len; ++i) {
if (*substr == valid_sep[i]) {
has_sep = 1;
sep = valid_sep[i];
++substr;
--sublen;
break;
}
}
if (i == valid_sep_len) {
goto parse_error;
}
}

/* Can't have a trailing '-' */
/* Can't have a trailing sep */
if (sublen == 0) {
goto parse_error;
}


/* PARSE THE MONTH (2 digits) */
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1]))
|| (sublen == 1 && isdigit(substr[0])))) {
out->month = (substr[0] - '0');

if (out->month < 1) {
PyErr_Format(PyExc_ValueError,
"Month out of range in datetime string \"%s\"", str);
goto error;
}
++substr;
--sublen;
}
else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->month = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->month < 1 || out->month > 12) {
Expand All @@ -577,18 +609,22 @@ parse_iso_8601_datetime(char *str, int len,

/* Next character must be the separator, start of day or end of string */
if (sublen == 0) {
/* dates of form YYYYMM are not valid */
if (!has_sep) {
goto parse_error;
}
if (out_local != NULL) {
*out_local = 0;
}
bestunit = PANDAS_FR_M;
goto finish;
}
else if (*substr == '-') {
else if (has_sep && *substr == sep) {
++substr;
--sublen;
}
else {
goto parse_error;
else if (!isdigit(*substr)) {
goto parse_error;
}

/* Can't have a trailing sep */
Expand All @@ -597,7 +633,19 @@ parse_iso_8601_datetime(char *str, int len,
}

/* PARSE THE DAY (2 digits) */
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1]))
|| (sublen == 1 && isdigit(substr[0])))) {
out->day = (substr[0] - '0');

if (out->day < 1) {
PyErr_Format(PyExc_ValueError,
"Day out of range in datetime string \"%s\"", str);
goto error;
}
++substr;
--sublen;
}
else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->day = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->day < 1 ||
Expand Down Expand Up @@ -633,14 +681,19 @@ parse_iso_8601_datetime(char *str, int len,
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->hour = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->hour < 0 || out->hour >= 24) {
if (out->hour >= 24) {
PyErr_Format(PyExc_ValueError,
"Hours out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
}
else if (sublen >= 1 && isdigit(substr[0])) {
out->hour = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
}
Expand All @@ -664,14 +717,19 @@ parse_iso_8601_datetime(char *str, int len,
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->min = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->hour < 0 || out->min >= 60) {
if (out->min >= 60) {
PyErr_Format(PyExc_ValueError,
"Minutes out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
}
else if (sublen >= 1 && isdigit(substr[0])) {
out->min = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
}
Expand All @@ -695,14 +753,19 @@ parse_iso_8601_datetime(char *str, int len,
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->sec = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->sec < 0 || out->sec >= 60) {
if (out->sec >= 60) {
PyErr_Format(PyExc_ValueError,
"Seconds out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
}
else if (sublen >= 1 && isdigit(substr[0])) {
out->sec = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
}
Expand Down Expand Up @@ -781,6 +844,12 @@ parse_iso_8601_datetime(char *str, int len,
}

parse_timezone:
/* trim any whitespace between time/timezone */
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
}

if (sublen == 0) {
// Unlike NumPy, treating no time zone as naive
goto finish;
Expand Down Expand Up @@ -832,6 +901,11 @@ parse_iso_8601_datetime(char *str, int len,
goto error;
}
}
else if (sublen >= 1 && isdigit(substr[0])) {
offset_hour = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
}
Expand All @@ -856,6 +930,11 @@ parse_iso_8601_datetime(char *str, int len,
goto error;
}
}
else if (sublen >= 1 && isdigit(substr[0])) {
offset_minute = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
}
Expand Down
14 changes: 12 additions & 2 deletions pandas/tseries/tests/test_timeseries.py
Expand Up @@ -2455,7 +2455,7 @@ def test_constructor_datetime64_tzformat(self):
idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00',
freq=freq)
expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59',
freq=freq, tz=tzoffset(None, -18000))
freq=freq, tz=pytz.FixedOffset(-300))
tm.assert_index_equal(idx, expected)
# Unable to use `US/Eastern` because of DST
expected_i8 = date_range('2013-01-01T00:00:00',
Expand All @@ -2466,7 +2466,7 @@ def test_constructor_datetime64_tzformat(self):
idx = date_range('2013/1/1 0:00:00+9:00',
'2016/1/1 23:59:59+09:00', freq=freq)
expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59',
freq=freq, tz=tzoffset(None, 32400))
freq=freq, tz=pytz.FixedOffset(540))
tm.assert_index_equal(idx, expected)
expected_i8 = date_range('2013-01-01T00:00:00',
'2016-01-01T23:59:59', freq=freq,
Expand Down Expand Up @@ -4834,6 +4834,16 @@ def test_to_datetime_infer_datetime_format_series_starting_with_nans(self):
pd.to_datetime(test_series, infer_datetime_format=True)
)

def test_to_datetime_iso8601_noleading_0s(self):
    """Parse dates whose month and day are written without leading zeros.

    The same input is converted twice -- once with no format and once
    with an explicit ``'%Y-%m-%d'`` format -- and both results must equal
    the zero-padded timestamps.
    """
    # GH 11871
    unpadded = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3'])
    padded = ('2014-01-01', '2014-02-02', '2015-03-03')
    expected = pd.Series([pd.Timestamp(stamp) for stamp in padded])

    inferred = pd.to_datetime(unpadded)
    tm.assert_series_equal(inferred, expected)

    with_format = pd.to_datetime(unpadded, format='%Y-%m-%d')
    tm.assert_series_equal(with_format, expected)


class TestGuessDatetimeFormat(tm.TestCase):
def test_guess_datetime_format_with_parseable_formats(self):
Expand Down
26 changes: 26 additions & 0 deletions pandas/tseries/tests/test_tslib.py
Expand Up @@ -691,6 +691,32 @@ def test_parsers_timezone_minute_offsets_roundtrip(self):
converted_time = dt_time.tz_localize('UTC').tz_convert(tz)
self.assertEqual(dt_string_repr, repr(converted_time))

def test_parsers_iso8601(self):
    """Check the ISO 8601 parser hook against valid and invalid strings.

    Each valid string is passed to ``tslib._test_parse_iso8601`` and the
    result is compared with the expected ``datetime``; each invalid
    string must raise ``ValueError``.
    """
    # GH 12060
    # Exercise only the iso parser (different separators, missing
    # leading 0s), because Timestamp construction falls back to dateutil.
    jan_first = datetime.datetime(2011, 1, 1)
    jan_second = datetime.datetime(2011, 1, 2)
    half_past_five = datetime.datetime(2013, 1, 1, 5, 30)
    valid = {
        '2011-01-02': jan_second,
        '2011-1-2': jan_second,
        '2011-01': jan_first,
        '2011-1': jan_first,
        '2011 01 02': jan_second,
        '2011.01.02': jan_second,
        '2011/01/02': jan_second,
        '2011\\01\\02': jan_second,
        '2013-01-01 05:30:00': half_past_five,
        '2013-1-1 5:30:00': half_past_five,
    }
    for text, want in compat.iteritems(valid):
        self.assertEqual(tslib._test_parse_iso8601(text), want)

    # separators must all match - YYYYMM not valid
    rejected = ('2011-01/02', '2011^11^11', '201401', '201111', '200101')
    for text in rejected:
        with tm.assertRaises(ValueError):
            tslib._test_parse_iso8601(text)


class TestArrayToDatetime(tm.TestCase):
def test_parsing_valid_dates(self):
Expand Down

0 comments on commit 5de6b84

Please sign in to comment.