Skip to content

Commit

Permalink
ENH: Ability to tz localize when index is implicitly in tz
Browse files Browse the repository at this point in the history
Fix for issue pandas-dev#4230: allows localizing an index that is
implicitly in a tz (e.g., when read from a file) by passing infer_dst to
tz_localize.
  • Loading branch information
rockg committed Oct 2, 2013
1 parent 354f10a commit e5ea6c8
Show file tree
Hide file tree
Showing 9 changed files with 144 additions and 16 deletions.
3 changes: 3 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,9 @@ Improvements to existing features
:issue:`4998`)
- ``to_dict`` now takes ``records`` as a possible outtype. Returns an array
of column-keyed dictionaries. (:issue:`4936`)
- ``tz_localize`` can infer a fall daylight savings transition based on the
structure of unlocalized data (:issue:`4230`)
- DatetimeIndex is now in the API documentation

API Changes
~~~~~~~~~~~
Expand Down
14 changes: 14 additions & 0 deletions doc/source/timeseries.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1108,6 +1108,20 @@ TimeSeries, aligning the data on the UTC timestamps:
.. _timeseries.timedeltas:

In some cases, ``tz_localize`` cannot determine which hours are DST and which are
non-DST when there are duplicates. This often happens when reading files that simply
duplicate the hours. The ``infer_dst`` argument of ``tz_localize`` will attempt
to determine the right offset based on the order of the timestamps.

.. ipython:: python
rng_hourly = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00',
'11/06/2011 01:00', '11/06/2011 02:00',
'11/06/2011 03:00'])
rng_hourly.tz_localize('US/Eastern')
rng_hourly_eastern = rng_hourly.tz_localize('US/Eastern', infer_dst=True)
rng_hourly_eastern.values
Time Deltas
-----------

Expand Down
6 changes: 5 additions & 1 deletion doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ enhancements along with a large number of bug fixes.

.. warning::

In 0.13.0 ``Series`` has internaly been refactored to no longer sub-class ``ndarray``
In 0.13.0 ``Series`` has internally been refactored to no longer sub-class ``ndarray``
but instead subclass ``NDFrame``, similarly to the rest of the pandas containers. This should be
a transparent change with only very limited API implications. See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`

Expand Down Expand Up @@ -481,6 +481,10 @@ Enhancements

:ref:`See the docs<indexing.basics.indexing_isin>` for more.

- ``tz_localize`` can infer a fall daylight savings transition based on the structure
of the unlocalized data (:issue:`4230`), see :ref:`here<timeseries.timezone>`
- DatetimeIndex is now in the API documentation, see :ref:`here<api.datetimeindex>`

.. _whatsnew_0130.experimental:

Experimental
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2752,7 +2752,7 @@ def tz_convert(self, tz, axis=0, copy=True):

return new_obj

def tz_localize(self, tz, axis=0, copy=True):
def tz_localize(self, tz, axis=0, copy=True, infer_dst=False):
"""
Localize tz-naive TimeSeries to target time zone
Expand All @@ -2761,6 +2761,8 @@ def tz_localize(self, tz, axis=0, copy=True):
tz : string or pytz.timezone object
copy : boolean, default True
Also make a copy of the underlying data
infer_dst : boolean, default False
Attempt to infer fall dst-transition times based on order
Returns
-------
Expand All @@ -2778,7 +2780,7 @@ def tz_localize(self, tz, axis=0, copy=True):
new_data = new_data.copy()

new_obj = self._constructor(new_data)
new_ax = ax.tz_localize(tz)
new_ax = ax.tz_localize(tz, infer_dst=infer_dst)

if axis == 0:
new_obj._set_axis(1, new_ax)
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2331,7 +2331,7 @@ def tz_convert(self, tz, copy=True):

return self._constructor(new_values, index=new_index, name=self.name)

def tz_localize(self, tz, copy=True):
def tz_localize(self, tz, copy=True, infer_dst=False):
"""
Localize tz-naive TimeSeries to target time zone
Entries will retain their "naive" value but will be annotated as
Expand All @@ -2345,6 +2345,8 @@ def tz_localize(self, tz, copy=True):
tz : string or pytz.timezone object
copy : boolean, default True
Also make a copy of the underlying data
infer_dst : boolean, default False
Attempt to infer fall dst-transition hours based on order
Returns
-------
Expand All @@ -2358,7 +2360,7 @@ def tz_localize(self, tz, copy=True):

new_index = DatetimeIndex([], tz=tz)
else:
new_index = self.index.tz_localize(tz)
new_index = self.index.tz_localize(tz, infer_dst=infer_dst)

new_values = self.values
if copy:
Expand Down
26 changes: 19 additions & 7 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ def __new__(cls, data=None,

dayfirst = kwds.pop('dayfirst', None)
yearfirst = kwds.pop('yearfirst', None)
infer_dst = kwds.pop('infer_dst', False)
warn = False
if 'offset' in kwds and kwds['offset']:
freq = kwds['offset']
Expand Down Expand Up @@ -183,7 +184,8 @@ def __new__(cls, data=None,

if data is None:
return cls._generate(start, end, periods, name, offset,
tz=tz, normalize=normalize)
tz=tz, normalize=normalize,
infer_dst=infer_dst)

if not isinstance(data, np.ndarray):
if np.isscalar(data):
Expand All @@ -209,7 +211,7 @@ def __new__(cls, data=None,
data.name = name

if tz is not None:
return data.tz_localize(tz)
return data.tz_localize(tz, infer_dst=infer_dst)

return data

Expand Down Expand Up @@ -261,7 +263,8 @@ def __new__(cls, data=None,
getattr(data, 'tz', None) is None):
# Convert tz-naive to UTC
ints = subarr.view('i8')
subarr = tslib.tz_localize_to_utc(ints, tz)
subarr = tslib.tz_localize_to_utc(ints, tz,
infer_dst=infer_dst)

subarr = subarr.view(_NS_DTYPE)

Expand All @@ -286,7 +289,7 @@ def __new__(cls, data=None,

@classmethod
def _generate(cls, start, end, periods, name, offset,
tz=None, normalize=False):
tz=None, normalize=False, infer_dst=False):
if com._count_not_none(start, end, periods) != 2:
raise ValueError('Must specify two of start, end, or periods')

Expand Down Expand Up @@ -375,7 +378,8 @@ def _generate(cls, start, end, periods, name, offset,
index = _generate_regular_range(start, end, periods, offset)

if tz is not None and getattr(index, 'tz', None) is None:
index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz)
index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz,
infer_dst=infer_dst)
index = index.view(_NS_DTYPE)

index = index.view(cls)
Expand Down Expand Up @@ -1537,9 +1541,17 @@ def tz_convert(self, tz):
# No conversion since timestamps are all UTC to begin with
return self._simple_new(self.values, self.name, self.offset, tz)

def tz_localize(self, tz):
def tz_localize(self, tz, infer_dst=False):
"""
Localize tz-naive DatetimeIndex to given time zone (using pytz)
Parameters
----------
tz : string or pytz.timezone
Time zone for time. Corresponding timestamps would be converted to
time zone of the TimeSeries
infer_dst : boolean, default False
Attempt to infer fall dst-transition hours based on order
Returns
-------
Expand All @@ -1550,7 +1562,7 @@ def tz_localize(self, tz):
tz = tools._maybe_get_tz(tz)

# Convert to UTC
new_dates = tslib.tz_localize_to_utc(self.asi8, tz)
new_dates = tslib.tz_localize_to_utc(self.asi8, tz, infer_dst=infer_dst)
new_dates = new_dates.view(_NS_DTYPE)

return self._simple_new(new_dates, self.name, self.offset, tz)
Expand Down
26 changes: 26 additions & 0 deletions pandas/tseries/tests/test_timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,32 @@ def test_with_tz_ambiguous_times(self):
dr = date_range(datetime(2011, 3, 13), periods=48,
freq=datetools.Minute(30), tz=pytz.utc)

def test_infer_dst(self):
# Checks tz_localize(..., infer_dst=True) around the US/Eastern fall-back
# on November 6, 2011, where the 1 AM hour is repeated (the duplicated
# timestamp in the test data below is 01:00).
# With no repeated hours, we cannot infer the transition
tz = pytz.timezone('US/Eastern')
dr = date_range(datetime(2011, 11, 6, 0), periods=5,
freq=datetools.Hour())
self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize,
tz, infer_dst=True)

# With repeated hours, we can infer the transition: the naive index with
# 01:00 listed twice must localize to the same values as a tz-aware range
dr = date_range(datetime(2011, 11, 6, 0), periods=5,
freq=datetools.Hour(), tz=tz)
di = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00',
'11/06/2011 01:00', '11/06/2011 02:00',
'11/06/2011 03:00'])
localized = di.tz_localize(tz, infer_dst=True)
self.assert_(np.array_equal(dr, localized))

# When there is no dst transition, nothing special happens: results with
# and without infer_dst must match
dr = date_range(datetime(2011, 6, 1, 0), periods=10,
freq=datetools.Hour())
localized = dr.tz_localize(tz)
localized_infer = dr.tz_localize(tz, infer_dst=True)
self.assert_(np.array_equal(localized, localized_infer))


# test utility methods
def test_infer_tz(self):
eastern = pytz.timezone('US/Eastern')
Expand Down
55 changes: 51 additions & 4 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1630,7 +1630,7 @@ cpdef ndarray _unbox_utcoffsets(object transinfo):

@cython.boundscheck(False)
@cython.wraparound(False)
def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
def tz_localize_to_utc(ndarray[int64_t] vals, object tz, bint infer_dst=False):
"""
Localize tzinfo-naive DateRange to given time zone (using pytz). If
there are ambiguities in the values, raise AmbiguousTimeError.
Expand All @@ -1644,7 +1644,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
Py_ssize_t i, idx, pos, ntrans, n = len(vals)
int64_t *tdata
int64_t v, left, right
ndarray[int64_t] result, result_a, result_b
ndarray[int64_t] result, result_a, result_b, dst_hours
pandas_datetimestruct dts

# Vectorized version of DstTzInfo.localize
Expand Down Expand Up @@ -1701,6 +1701,48 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
# timestamp falls to the right side of the DST transition
if v + deltas[pos] == vals[i]:
result_b[i] = v


if infer_dst:
dst_hours = np.empty(n, dtype=np.int64)
dst_hours.fill(NPY_NAT)

# Get the ambiguous hours (given the above, these are the hours
# where result_a != result_b and neither of them are NAT)
both_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT)
both_eq = result_a == result_b
trans_idx = np.squeeze(np.nonzero(np.logical_and(both_nat, ~both_eq)))
if trans_idx.size == 1:
stamp = Timestamp(vals[trans_idx])
raise pytz.AmbiguousTimeError("Cannot infer dst time from %s as"
"there are no repeated times" % stamp)
# Split the array into contiguous chunks (where the difference between
# indices is 1). These are effectively dst transitions in different years
# which is useful for checking that there is not an ambiguous transition
# in an individual year.
if trans_idx.size > 0:
one_diff = np.where(np.diff(trans_idx)!=1)[0]+1
trans_grp = np.array_split(trans_idx, one_diff)

# Iterate through each day, if there are no hours where the delta is negative
# (indicates a repeat of hour) the switch cannot be inferred
for grp in trans_grp:

delta = np.diff(result_a[grp])
if grp.size == 1 or np.all(delta>0):
stamp = Timestamp(vals[grp[0]])
raise pytz.AmbiguousTimeError(stamp)

# Find the index for the switch and pull from a for dst and b for standard
switch_idx = (delta<=0).nonzero()[0]
if switch_idx.size > 1:
raise pytz.AmbiguousTimeError("There are %i dst switches "
"when there should only be 1."
% switch_idx.size)
switch_idx = switch_idx[0]+1 # Pull the only index and adjust
a_idx = grp[:switch_idx]
b_idx = grp[switch_idx:]
dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx]))

for i in range(n):
left = result_a[i]
Expand All @@ -1709,8 +1751,13 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
if left == right:
result[i] = left
else:
stamp = Timestamp(vals[i])
raise pytz.AmbiguousTimeError(stamp)
if infer_dst and dst_hours[i] != NPY_NAT:
result[i] = dst_hours[i]
else:
stamp = Timestamp(vals[i])
raise pytz.AmbiguousTimeError("Cannot infer dst time from %r, "\
"try using the 'infer_dst' argument"
% stamp)
elif left != NPY_NAT:
result[i] = left
elif right != NPY_NAT:
Expand Down
18 changes: 18 additions & 0 deletions vb_suite/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,21 @@ def date_range(start=None, end=None, periods=None, freq=None):

datetimeindex_unique = Benchmark('index.unique()', setup,
start_date=datetime(2012, 7, 1))

# Benchmark tz_localize with the infer_dst argument. The setup builds a
# per-second index for 10/29/2000 in which the 1:00:00-1:59:59 range is
# appended twice, as an attempt to emulate the results of read_csv with
# duplicated data. Not passing infer_dst will fail
setup = common_setup + """
dst_rng = date_range('10/29/2000 1:00:00',
'10/29/2000 1:59:59', freq='S')
index = date_range('10/29/2000', '10/29/2000 00:59:59', freq='S')
index = index.append(dst_rng)
index = index.append(dst_rng)
index = index.append(date_range('10/29/2000 2:00:00',
'10/29/2000 3:00:00', freq='S'))
"""

datetimeindex_infer_dst = \
Benchmark('index.tz_localize("US/Eastern", infer_dst=True)',
setup, start_date=datetime(2013, 9, 30))


0 comments on commit e5ea6c8

Please sign in to comment.