In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
from pandas.tseries.offsets import Hour, Minute, Day, MonthEnd

In [2]:
#frequencies in pandas consist of two components:
#1) base frequency
#2) multiplier
#'M' is for monthly, 'H' is for hourly.
#for each base frequency, there is an object referred to as a date offset.
#hours are represented by the "Hour" class
hour = Hour()

In [3]:
hour

<Hour>

In [4]:
#we can define a multiple of an offset by passing an integer
four_hours = Hour(4)

In [5]:
four_hours

<4 * Hours>

In [6]:
#in most applications we do not need to create these, we can use something like "H" or "4H"
pd.date_range('2000-01-01', '2000-01-03 23:59', freq='4H')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [7]:
#offsets can often be combined using addition
Hour(2) + Minute(30)

<150 * Minutes>

In [8]:
#the same range can be generated by passing the offset object itself as the frequency
pd.date_range('2000-01-01', '2000-01-03 23:59', freq=four_hours) #here we pass the "four_hours" offset object instead of the string "4H"

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [9]:
#we can pass frequency strings like "1h30min", which are parsed into the equivalent offset
pd.date_range('2000-01-01', periods=10, freq='1h30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

In [10]:
#some frequencies have points that are not evenly spaced
pd.date_range('2000-01-01', '2000-12-31', freq='QS-JAN')

DatetimeIndex(['2000-01-01', '2000-04-01', '2000-07-01', '2000-10-01'], dtype='datetime64[ns]', freq='QS-JAN')

In [11]:
#"week of month" frequencies: the 'WOM-3FRI' alias picks out
#the third Friday of every month inside the requested window
monthly_dates = pd.date_range(start='2012-01-01', end='2012-09-01', freq='WOM-3FRI')

In [12]:
list(monthly_dates)

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]

In [13]:
#shifting (leading/lagging) data
#shifting moves data forward or backward through time
#while leaving the index unchanged
ts = pd.Series(np.random.standard_normal(4),
              index=pd.date_range('2000-01-01', periods=4, freq='M'))

ts

2000-01-31   -0.616258
2000-02-29   -1.164947
2000-03-31    0.205786
2000-04-30   -1.092521
Freq: M, dtype: float64

In [14]:
ts.shift(2) #missing data introduced at the beginning

2000-01-31         NaN
2000-02-29         NaN
2000-03-31   -0.616258
2000-04-30   -1.164947
Freq: M, dtype: float64

In [15]:
ts.shift(-2) #missing data introduced at the end

2000-01-31    0.205786
2000-02-29   -1.092521
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

In [16]:
#one reason we use shift is to calculate consecutive percent changes in a time series or multiple time series
ts / ts.shift(1) - 1

2000-01-31         NaN
2000-02-29    0.890355
2000-03-31   -1.176648
2000-04-30   -6.309020
Freq: M, dtype: float64

In [17]:
#naive shifts leave the index unmodified, so some data is discarded; if we pass a frequency instead, the timestamps are advanced and the data is left unchanged
ts.shift(freq='M')

2000-02-29   -0.616258
2000-03-31   -1.164947
2000-04-30    0.205786
2000-05-31   -1.092521
Freq: M, dtype: float64

In [18]:
#other frequencies can be passed also
ts.shift(3, freq='D')

2000-02-03   -0.616258
2000-03-03   -1.164947
2000-04-03    0.205786
2000-05-03   -1.092521
dtype: float64

In [19]:
ts.shift(1, freq='90T') #Note: the freq parameter indicates the offset applies to the timestamps but does not modify the data.

2000-01-31 01:30:00   -0.616258
2000-02-29 01:30:00   -1.164947
2000-03-31 01:30:00    0.205786
2000-04-30 01:30:00   -1.092521
dtype: float64

In [20]:
#date offsets work with datetime or timestamp objects
now = datetime(2011, 11, 17)

In [21]:
now + 3 * Day()

Timestamp('2011-11-20 00:00:00')

In [22]:
#when we add an anchored offset like MonthEnd, the first increment "rolls forward" the date to the next date matching the frequency rule
now + MonthEnd()

Timestamp('2011-11-30 00:00:00')

In [23]:
now + MonthEnd(2)

Timestamp('2011-12-31 00:00:00')

In [24]:
#anchored offsets can explicitly roll dates forward or backward using their rollforward and rollback methods, respectively.
offset = MonthEnd()

In [25]:
offset.rollforward(now)

Timestamp('2011-11-30 00:00:00')

In [26]:
offset.rollback(now)

Timestamp('2011-10-31 00:00:00')

In [27]:
#creative use of date offsets: the roll methods can serve as groupby keys
#first build 20 timestamps spaced four days apart, then attach random values to them
four_day_index = pd.date_range('2001-01-15', periods=20, freq='4D')
ts = pd.Series(np.random.standard_normal(len(four_day_index)), index=four_day_index)

In [28]:
ts

2001-01-15   -1.601320
2001-01-19    0.130424
2001-01-23   -1.019238
2001-01-27    1.025176
2001-01-31   -0.836085
2001-02-04    0.935818
2001-02-08    0.905923
2001-02-12    0.258781
2001-02-16   -2.209227
2001-02-20    0.719798
2001-02-24    1.100150
2001-02-28   -1.124986
2001-03-04   -0.228383
2001-03-08    2.238445
2001-03-12   -0.852517
2001-03-16   -0.547619
2001-03-20   -0.096925
2001-03-24    0.458395
2001-03-28   -0.598085
2001-04-01    0.509224
Freq: 4D, dtype: float64

In [29]:
ts.groupby(MonthEnd().rollforward).mean()

2001-01-31   -0.460209
2001-02-28    0.083751
2001-03-31    0.053330
2001-04-30    0.509224
dtype: float64

In [30]:
#an easier and faster way to do this is to resample
ts.resample('M').mean()

2001-01-31   -0.460209
2001-02-28    0.083751
2001-03-31    0.053330
2001-04-30    0.509224
Freq: M, dtype: float64

In [31]:
#time zones
import pytz

In [32]:
pytz.common_timezones[-5:]

['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']

In [33]:
#pytz.timezone creates a time zone object
tz = pytz.timezone('America/New_York')

In [34]:
tz

<DstTzInfo 'America/New_York' LMT-1 day, 19:04:00 STD>

In [35]:
#time series in pandas are, by default, time zone naive
dates = pd.date_range('2012-03-09 9:30', periods=6)

In [36]:
ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)

In [37]:
ts

2012-03-09 09:30:00    0.215783
2012-03-10 09:30:00   -0.258104
2012-03-11 09:30:00    1.817215
2012-03-12 09:30:00   -0.090038
2012-03-13 09:30:00    0.249517
2012-03-14 09:30:00    1.246313
Freq: D, dtype: float64

In [38]:
#index tz field is None
print(ts.index.tz)

None


In [39]:
#generate date ranges using time zone preset
pd.date_range('2012-03-09 9:30', periods=10, tz='UTC')

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',
               '2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [40]:
#tz_localize method: We use this method to convert from naive to localized time
ts

2012-03-09 09:30:00    0.215783
2012-03-10 09:30:00   -0.258104
2012-03-11 09:30:00    1.817215
2012-03-12 09:30:00   -0.090038
2012-03-13 09:30:00    0.249517
2012-03-14 09:30:00    1.246313
Freq: D, dtype: float64

In [41]:
ts_utc = ts.tz_localize('UTC')

In [42]:
ts_utc

2012-03-09 09:30:00+00:00    0.215783
2012-03-10 09:30:00+00:00   -0.258104
2012-03-11 09:30:00+00:00    1.817215
2012-03-12 09:30:00+00:00   -0.090038
2012-03-13 09:30:00+00:00    0.249517
2012-03-14 09:30:00+00:00    1.246313
Freq: D, dtype: float64

In [43]:
ts_utc.index

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [44]:
#once a time series is localized, it can be converted to another time zone with tz_convert
ts_utc.tz_convert('America/New_York')

2012-03-09 04:30:00-05:00    0.215783
2012-03-10 04:30:00-05:00   -0.258104
2012-03-11 05:30:00-04:00    1.817215
2012-03-12 05:30:00-04:00   -0.090038
2012-03-13 05:30:00-04:00    0.249517
2012-03-14 05:30:00-04:00    1.246313
Freq: D, dtype: float64

In [45]:
#the original naive time series can also be localized to US Eastern time, and then converted to UTC or Berlin time
ts_eastern = ts.tz_localize('America/New_York')
ts_eastern

2012-03-09 09:30:00-05:00    0.215783
2012-03-10 09:30:00-05:00   -0.258104
2012-03-11 09:30:00-04:00    1.817215
2012-03-12 09:30:00-04:00   -0.090038
2012-03-13 09:30:00-04:00    0.249517
2012-03-14 09:30:00-04:00    1.246313
dtype: float64

In [46]:
ts_eastern.tz_convert('UTC')

2012-03-09 14:30:00+00:00    0.215783
2012-03-10 14:30:00+00:00   -0.258104
2012-03-11 13:30:00+00:00    1.817215
2012-03-12 13:30:00+00:00   -0.090038
2012-03-13 13:30:00+00:00    0.249517
2012-03-14 13:30:00+00:00    1.246313
dtype: float64

In [47]:
ts_eastern.tz_convert('Europe/Berlin')

2012-03-09 15:30:00+01:00    0.215783
2012-03-10 15:30:00+01:00   -0.258104
2012-03-11 14:30:00+01:00    1.817215
2012-03-12 14:30:00+01:00   -0.090038
2012-03-13 14:30:00+01:00    0.249517
2012-03-14 14:30:00+01:00    1.246313
dtype: float64

In [48]:
#tz_localize and tz_convert are also instance methods on DatetimeIndex
ts.index.tz_localize('Asia/Shanghai')

DatetimeIndex(['2012-03-09 09:30:00+08:00', '2012-03-10 09:30:00+08:00',
               '2012-03-11 09:30:00+08:00', '2012-03-12 09:30:00+08:00',
               '2012-03-13 09:30:00+08:00', '2012-03-14 09:30:00+08:00'],
              dtype='datetime64[ns, Asia/Shanghai]', freq=None)

In [49]:
#operations with time zone-aware Timestamp objects
#just like time series and date ranges, we can localize Timestamp objects from naive to time zone-aware
stamp = pd.Timestamp('2011-03-12 04:00')
stamp

Timestamp('2011-03-12 04:00:00')

In [50]:
stamp_utc = stamp.tz_localize('utc')

In [51]:
stamp_utc

Timestamp('2011-03-12 04:00:00+0000', tz='UTC')

In [52]:
stamp_utc.tz_convert('America/New_York')

Timestamp('2011-03-11 23:00:00-0500', tz='America/New_York')

In [53]:
#You can also pass a time zone when creating a Timestamp
stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')

In [54]:
stamp_moscow

Timestamp('2011-03-12 04:00:00+0300', tz='Europe/Moscow')

In [55]:
#time zone-aware Timestamp objects store a UTC timestamp value as nanoseconds since the Unix epoch (1970-01-01),
#so this value is unchanged by time zone conversions
stamp_utc.value

1299902400000000000

In [56]:
stamp_utc.tz_convert('America/New_York').value

1299902400000000000

In [57]:
#When we perform time arithmetic with DateOffset objects, pandas respects DST.
#Let's make stamps occurring right before DST transitions.
stamp = pd.Timestamp('2012-03-11 01:30', tz='US/Eastern')

In [58]:
stamp

Timestamp('2012-03-11 01:30:00-0500', tz='US/Eastern')

In [59]:
stamp + Hour() #one hour elapses, but the wall clock jumps two hours (01:30 -> 03:30) because DST begins

Timestamp('2012-03-11 03:30:00-0400', tz='US/Eastern')

In [60]:
#90 minutes before transitioning out of DST
stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')

In [61]:
stamp

Timestamp('2012-11-04 00:30:00-0400', tz='US/Eastern')

In [62]:
#`hour` is the Hour() offset created at the top of the notebook
#two hours elapse, but the wall clock only advances from 00:30 to 01:30 because DST ends (UTC offset changes from -04:00 to -05:00)
stamp + 2 * hour

Timestamp('2012-11-04 01:30:00-0500', tz='US/Eastern')

In [63]:
#operations between different time zones
#if two time series objects 