In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
from pandas.tseries.offsets import Hour, Minute, Day, MonthEnd

In [2]:
#frequencies in pandas consist of two components:
#1) a base frequency
#2) a multiplier
#base frequencies are referred to by a string alias: 'M' is for monthly, 'H' is for hourly.
#for each base frequency, there is a corresponding date offset object.
#hours are represented by the "Hour" class
hour = Hour()

In [3]:
hour

<Hour>

In [4]:
#we can define a multiple of an offset by passing an integer
four_hours = Hour(4)

In [5]:
four_hours

<4 * Hours>

In [6]:
#in most applications we do not need to create these, we can use something like "H" or "4H"
pd.date_range('2000-01-01', '2000-01-03 23:59', freq='4H')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [7]:
#offsets can often be combined using addition
Hour(2) + Minute(30)

<150 * Minutes>

In [8]:
#the same range as with the "4H" string alias, built by passing the offset object itself as freq
pd.date_range('2000-01-01', '2000-01-03 23:59', freq=four_hours) #here we use the "four_hours" offset object

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [9]:
#we can pass combined frequency strings like "1h30min"; pandas parses them into a single offset
pd.date_range('2000-01-01', periods=10, freq='1h30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

In [10]:
#some frequencies have points that are not evenly spaced
pd.date_range('2000-01-01', '2000-12-31', freq='QS-JAN')

DatetimeIndex(['2000-01-01', '2000-04-01', '2000-07-01', '2000-10-01'], dtype='datetime64[ns]', freq='QS-JAN')

In [11]:
#week of month dates
#we can get the third friday of each month
monthly_dates = pd.date_range('2012-01-01', '2012-09-01', freq='WOM-3FRI')

In [12]:
list(monthly_dates)

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]

In [13]:
#shifting (leading/lagging) data
#shifting moves data forward or backward through time
#without altering the index
ts = pd.Series(np.random.standard_normal(4),
              index=pd.date_range('2000-01-01', periods=4, freq='M'))

ts

2000-01-31    1.199935
2000-02-29    0.571806
2000-03-31    0.402750
2000-04-30    1.546524
Freq: M, dtype: float64

In [14]:
ts.shift(2) #missing data introduced at the beginning

2000-01-31         NaN
2000-02-29         NaN
2000-03-31    1.199935
2000-04-30    0.571806
Freq: M, dtype: float64

In [15]:
ts.shift(-2) #missing data introduced at the end

2000-01-31    0.402750
2000-02-29    1.546524
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

In [16]:
#one reason we use shift is to calculate consecutive percent changes in a time series or multiple time series
ts / ts.shift(1) - 1

2000-01-31         NaN
2000-02-29   -0.523469
2000-03-31   -0.295653
2000-04-30    2.839908
Freq: M, dtype: float64

In [17]:
#naive shifts leave the index unmodified, so some data is pushed off the end and replaced by NaN.
#if we pass the frequency, the timestamps are advanced instead of the data, so no values are lost.
ts.shift(freq='M')

2000-02-29    1.199935
2000-03-31    0.571806
2000-04-30    0.402750
2000-05-31    1.546524
Freq: M, dtype: float64

In [18]:
#other frequencies can be passed also
ts.shift(3, freq='D')

2000-02-03    1.199935
2000-03-03    0.571806
2000-04-03    0.402750
2000-05-03    1.546524
dtype: float64

In [19]:
ts.shift(1, freq='90T') #Note: the freq parameter indicates the offset applies to the timestamps but does not modify the data.

2000-01-31 01:30:00    1.199935
2000-02-29 01:30:00    0.571806
2000-03-31 01:30:00    0.402750
2000-04-30 01:30:00    1.546524
dtype: float64

In [20]:
#date offsets work with datetime or timestamp objects
now = datetime(2011, 11, 17)

In [21]:
now + 3 * Day()

Timestamp('2011-11-20 00:00:00')

In [22]:
#when adding an anchored offset like MonthEnd, the first increment will "roll forward" the date to the next month end
now + MonthEnd()

Timestamp('2011-11-30 00:00:00')

In [23]:
now + MonthEnd(2)

Timestamp('2011-12-31 00:00:00')

In [24]:
#anchored offsets can also explicitly roll dates forward or backward using their rollforward and rollback methods, respectively.
offset = MonthEnd()

In [25]:
offset.rollforward(now)

Timestamp('2011-11-30 00:00:00')

In [26]:
offset.rollback(now)

Timestamp('2011-10-31 00:00:00')

In [27]:
#rollforward/rollback pair well with groupby: build a series sampled every
#four days so that the observations spill across several month ends
ts = pd.Series(
    data=np.random.standard_normal(size=20),
    index=pd.date_range(start='2001-01-15', periods=20, freq='4D'),
)

In [28]:
ts

2001-01-15    1.692963
2001-01-19    0.138470
2001-01-23   -1.010685
2001-01-27    0.341818
2001-01-31   -2.829653
2001-02-04    0.402733
2001-02-08    0.376858
2001-02-12   -0.114628
2001-02-16    0.568906
2001-02-20   -1.229390
2001-02-24   -0.426231
2001-02-28    0.232337
2001-03-04    0.375309
2001-03-08    1.923097
2001-03-12    0.043976
2001-03-16    0.102062
2001-03-20    0.131272
2001-03-24    0.334917
2001-03-28    0.325291
2001-04-01    0.337263
Freq: 4D, dtype: float64

In [29]:
ts.groupby(MonthEnd().rollforward).mean()

2001-01-31   -0.333417
2001-02-28   -0.027059
2001-03-31    0.462275
2001-04-30    0.337263
dtype: float64

In [30]:
#an easier and faster way to do this is to resample
ts.resample('M').mean()

2001-01-31   -0.333417
2001-02-28   -0.027059
2001-03-31    0.462275
2001-04-30    0.337263
Freq: M, dtype: float64

In [31]:
#time zones
import pytz

In [32]:
pytz.common_timezones[-5:]

['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']

In [33]:
#pytz.timezone creates a time zone object from a time zone name
tz = pytz.timezone('America/New_York')

In [34]:
tz

<DstTzInfo 'America/New_York' LMT-1 day, 19:04:00 STD>

In [35]:
#time series in pandas are, by default, time zone naive
dates = pd.date_range('2012-03-09 9:30', periods=6)

In [36]:
ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)

In [37]:
ts

2012-03-09 09:30:00    0.044180
2012-03-10 09:30:00    0.757846
2012-03-11 09:30:00   -1.011174
2012-03-12 09:30:00    1.649797
2012-03-13 09:30:00    1.360590
2012-03-14 09:30:00    0.640470
Freq: D, dtype: float64

In [38]:
#index tz field is None
print(ts.index.tz)

None


In [39]:
#generate date ranges using time zone preset
pd.date_range('2012-03-09 9:30', periods=10, tz='UTC')

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',
               '2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [40]:
#tz_localize method: We use this method to convert from naive to localized time
ts

2012-03-09 09:30:00    0.044180
2012-03-10 09:30:00    0.757846
2012-03-11 09:30:00   -1.011174
2012-03-12 09:30:00    1.649797
2012-03-13 09:30:00    1.360590
2012-03-14 09:30:00    0.640470
Freq: D, dtype: float64

In [41]:
ts_utc = ts.tz_localize('UTC')

In [42]:
ts_utc

2012-03-09 09:30:00+00:00    0.044180
2012-03-10 09:30:00+00:00    0.757846
2012-03-11 09:30:00+00:00   -1.011174
2012-03-12 09:30:00+00:00    1.649797
2012-03-13 09:30:00+00:00    1.360590
2012-03-14 09:30:00+00:00    0.640470
Freq: D, dtype: float64

In [43]:
ts_utc.index

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [44]:
#once a time series is localized, it can be converted to another time zone with tz_convert
ts_utc.tz_convert('America/New_York')

2012-03-09 04:30:00-05:00    0.044180
2012-03-10 04:30:00-05:00    0.757846
2012-03-11 05:30:00-04:00   -1.011174
2012-03-12 05:30:00-04:00    1.649797
2012-03-13 05:30:00-04:00    1.360590
2012-03-14 05:30:00-04:00    0.640470
Freq: D, dtype: float64

In [45]:
#the preceding naive time series can also be localized directly to Eastern time (tz_localize, not tz_convert)
ts_eastern = ts.tz_localize('America/New_York')
ts_eastern

2012-03-09 09:30:00-05:00    0.044180
2012-03-10 09:30:00-05:00    0.757846
2012-03-11 09:30:00-04:00   -1.011174
2012-03-12 09:30:00-04:00    1.649797
2012-03-13 09:30:00-04:00    1.360590
2012-03-14 09:30:00-04:00    0.640470
dtype: float64

In [46]:
ts_eastern.tz_convert('UTC')

2012-03-09 14:30:00+00:00    0.044180
2012-03-10 14:30:00+00:00    0.757846
2012-03-11 13:30:00+00:00   -1.011174
2012-03-12 13:30:00+00:00    1.649797
2012-03-13 13:30:00+00:00    1.360590
2012-03-14 13:30:00+00:00    0.640470
dtype: float64

In [47]:
ts_eastern.tz_convert('Europe/Berlin')

2012-03-09 15:30:00+01:00    0.044180
2012-03-10 15:30:00+01:00    0.757846
2012-03-11 14:30:00+01:00   -1.011174
2012-03-12 14:30:00+01:00    1.649797
2012-03-13 14:30:00+01:00    1.360590
2012-03-14 14:30:00+01:00    0.640470
dtype: float64

In [48]:
#tz_localize and tz_convert are also instance methods on DatetimeIndex
ts.index.tz_localize('Asia/Shanghai')

DatetimeIndex(['2012-03-09 09:30:00+08:00', '2012-03-10 09:30:00+08:00',
               '2012-03-11 09:30:00+08:00', '2012-03-12 09:30:00+08:00',
               '2012-03-13 09:30:00+08:00', '2012-03-14 09:30:00+08:00'],
              dtype='datetime64[ns, Asia/Shanghai]', freq=None)

In [49]:
#operations with timezone-aware Timestamp objects
#just like time series and date ranges, we can localize individual Timestamp objects from naive to time zone-aware
stamp = pd.Timestamp('2011-03-12 04:00')
stamp

Timestamp('2011-03-12 04:00:00')

In [50]:
stamp_utc = stamp.tz_localize('utc')

In [51]:
stamp_utc

Timestamp('2011-03-12 04:00:00+0000', tz='UTC')

In [52]:
stamp_utc.tz_convert('America/New_York')

Timestamp('2011-03-11 23:00:00-0500', tz='America/New_York')

In [53]:
#You can also pass a time zone when creating a Timestamp
stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')

In [54]:
stamp_moscow

Timestamp('2011-03-12 04:00:00+0300', tz='Europe/Moscow')

In [55]:
#time-zone-aware Timestamp objects store a UTC timestamp value as nanoseconds since the Unix epoch (01-01-1970)
stamp_utc.value

1299902400000000000

In [56]:
stamp_utc.tz_convert('America/New_York').value

1299902400000000000

In [57]:
#When we perform time arithmetic with DateOffset objects, pandas respects DST.
#Let's make stamps occurring right before DST transitions.
stamp = pd.Timestamp('2012-03-11 01:30', tz='US/Eastern')

In [58]:
stamp

Timestamp('2012-03-11 01:30:00-0500', tz='US/Eastern')

In [59]:
stamp + Hour() #two hour difference because DST begins

Timestamp('2012-03-11 03:30:00-0400', tz='US/Eastern')

In [60]:
#90 minutes before transitioning out of DST
stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')

In [61]:
stamp

Timestamp('2012-11-04 00:30:00-0400', tz='US/Eastern')

In [62]:
stamp + 2 * hour

Timestamp('2012-11-04 01:30:00-0500', tz='US/Eastern')

In [63]:
#operations between different time zones
#if two time series with different time zones are combined, the result is in UTC.
#this does not require conversion, as the Timestamps store a value in UTC.
dates = pd.date_range('2012-03-07 09:30', periods=10, freq='B')

In [64]:
dates

DatetimeIndex(['2012-03-07 09:30:00', '2012-03-08 09:30:00',
               '2012-03-09 09:30:00', '2012-03-12 09:30:00',
               '2012-03-13 09:30:00', '2012-03-14 09:30:00',
               '2012-03-15 09:30:00', '2012-03-16 09:30:00',
               '2012-03-19 09:30:00', '2012-03-20 09:30:00'],
              dtype='datetime64[ns]', freq='B')

In [65]:
ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)

In [66]:
ts

2012-03-07 09:30:00   -1.073889
2012-03-08 09:30:00    0.250145
2012-03-09 09:30:00   -0.176619
2012-03-12 09:30:00    1.006232
2012-03-13 09:30:00    0.140329
2012-03-14 09:30:00   -0.127688
2012-03-15 09:30:00    0.182961
2012-03-16 09:30:00   -0.105899
2012-03-19 09:30:00    0.866445
2012-03-20 09:30:00    1.484315
Freq: B, dtype: float64

In [67]:
ts1 = ts[:7].tz_localize('Europe/London')

In [68]:
ts2 = ts1[2:].tz_convert('Europe/Moscow')

In [69]:
ts1

2012-03-07 09:30:00+00:00   -1.073889
2012-03-08 09:30:00+00:00    0.250145
2012-03-09 09:30:00+00:00   -0.176619
2012-03-12 09:30:00+00:00    1.006232
2012-03-13 09:30:00+00:00    0.140329
2012-03-14 09:30:00+00:00   -0.127688
2012-03-15 09:30:00+00:00    0.182961
dtype: float64

In [70]:
ts2

2012-03-09 13:30:00+04:00   -0.176619
2012-03-12 13:30:00+04:00    1.006232
2012-03-13 13:30:00+04:00    0.140329
2012-03-14 13:30:00+04:00   -0.127688
2012-03-15 13:30:00+04:00    0.182961
dtype: float64

In [71]:
result = ts1 + ts2

In [72]:
result

2012-03-07 09:30:00+00:00         NaN
2012-03-08 09:30:00+00:00         NaN
2012-03-09 09:30:00+00:00   -0.353239
2012-03-12 09:30:00+00:00    2.012464
2012-03-13 09:30:00+00:00    0.280658
2012-03-14 09:30:00+00:00   -0.255375
2012-03-15 09:30:00+00:00    0.365921
dtype: float64

In [73]:
result.index

DatetimeIndex(['2012-03-07 09:30:00+00:00', '2012-03-08 09:30:00+00:00',
               '2012-03-09 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq=None)

In [74]:
#periods and period arithmetic
#Periods represent timespans. The pd.Period class represents this datatype.
p = pd.Period('2011', freq='A-DEC')

In [75]:
p

Period('2011', 'A-DEC')

In [76]:
#This period represents the entire year of 2011. The first day is 2011-01-01 and the last is 2011-12-31.
#Adding or subtracting an integer shifts the period by that many units of its frequency.
p + 5

Period('2016', 'A-DEC')

In [77]:
p - 2

Period('2009', 'A-DEC')

In [78]:
#if two periods have the same frequency, their difference is the number of units between them as a date offset
pd.Period('2014', freq='A-DEC') - p

<3 * YearEnds: month=12>

In [79]:
#regular ranges of periods:
#period_range: construct regular ranges of periods
periods = pd.period_range('2000-01-01', '2000-06-30', freq='M')

In [80]:
periods

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]')

In [81]:
#the PeriodIndex stores a sequence of periods and can serve as an axis index in any Pandas data structure
pd.Series(np.random.standard_normal(6), index=periods)

2000-01   -0.715710
2000-02   -1.297893
2000-03    1.193392
2000-04   -0.651776
2000-05    0.197270
2000-06   -1.650755
Freq: M, dtype: float64

In [82]:
#if we have an Array of strings, we can use a PeriodIndex, which consists of period values
values = ['2001Q3', '2002Q2', '2003Q1']

In [83]:
index = pd.PeriodIndex(values, freq='Q-DEC')

In [84]:
index

PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]')

In [85]:
#period frequency conversion
#We can convert a Period or PeriodIndex from one frequency to another.
#Example: Converting an annual period to a monthly period.
p = pd.Period('2011', freq='A-DEC')

In [86]:
p

Period('2011', 'A-DEC')

In [87]:
p.asfreq('M', how='start')

Period('2011-01', 'M')

In [88]:
p.asfreq('M', how='end')

Period('2011-12', 'M')

In [89]:
#Periods for non-calendar fiscal years.
p = pd.Period('2011', freq='A-JUN')

In [90]:
p

Period('2011', 'A-JUN')

In [91]:
p.asfreq('M', how='start')

Period('2010-07', 'M')

In [92]:
p.asfreq('M', how='end')

Period('2011-06', 'M')

In [93]:
#High-frequency to low-frequency conversion: pandas determines the superperiod based on where
#the subperiod 'belongs'
#e.g. with the A-JUN frequency, 'Aug-2011' may appear to be part of 2011, but it is not. It is part of the 2012 period.
p = pd.Period('Aug-2011', 'M')

In [94]:
p.asfreq('A-JUN')

Period('2012', 'A-JUN')

In [95]:
#whole PeriodIndex objects or time series can be converted with these same semantics.
periods = pd.period_range('2006', '2009', freq='A-DEC')

In [96]:
periods

PeriodIndex(['2006', '2007', '2008', '2009'], dtype='period[A-DEC]')

In [97]:
ts = pd.Series(np.random.standard_normal(len(periods)),
              index=periods)

In [98]:
ts

2006    0.227203
2007    0.426532
2008   -0.267964
2009   -0.982436
Freq: A-DEC, dtype: float64

In [99]:
ts.asfreq('M', how='start')

2006-01    0.227203
2007-01    0.426532
2008-01   -0.267964
2009-01   -0.982436
Freq: M, dtype: float64

In [100]:
#Above, we use monthly periods starting with the first annual month in each annual period.
#Let's use the "B" frequency.
ts.asfreq('B', how='end')

2006-12-29    0.227203
2007-12-31    0.426532
2008-12-31   -0.267964
2009-12-31   -0.982436
Freq: B, dtype: float64

In [101]:
#quarterly period frequencies
#Frequently used in accounting, finance, and other fields.
#Many times, the quarterly data corresponds to a fiscal year end.
#2012Q4 means different things for different periods.
p = pd.Period('2012Q4', freq='Q-JAN')

In [102]:
p

Period('2012Q4', 'Q-JAN')

In [103]:
#If the fiscal year ends in January, then 2012Q4 runs from November 2011 through January 2012.
p.asfreq('D', how='start')

Period('2011-11-01', 'D')

In [104]:
p.asfreq('D', how='end')

Period('2012-01-31', 'D')

In [105]:
#Thus we can do period arithmetic, so we can get the timestamp at 4pm on the second-to-last business day.
p4pm = (p.asfreq('B', how='end') - 1).asfreq('T', how='start') + 16 * 6

In [106]:
p4pm

Period('2012-01-30 01:36', 'T')

In [107]:
p4pm.to_timestamp() #period.to_timestamp = returns the timestamp at the start of the priod

Timestamp('2012-01-30 01:36:00')

In [108]:
#Generating a quarterly range.
periods = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')

In [109]:
ts = pd.Series(np.arange(len(periods)), index=periods)

In [110]:
ts

2011Q3    0
2011Q4    1
2012Q1    2
2012Q2    3
2012Q3    4
2012Q4    5
Freq: Q-JAN, dtype: int64

In [111]:
new_periods = (periods.asfreq('B', 'end') - 1).asfreq('H', 'start') + 16

In [112]:
new_periods

PeriodIndex(['2010-10-28 16:00', '2011-01-28 16:00', '2011-04-28 16:00',
             '2011-07-28 16:00', '2011-10-28 16:00', '2012-01-30 16:00'],
            dtype='period[H]')

In [113]:
ts.index = new_periods.to_timestamp()

In [114]:
ts

2010-10-28 16:00:00    0
2011-01-28 16:00:00    1
2011-04-28 16:00:00    2
2011-07-28 16:00:00    3
2011-10-28 16:00:00    4
2012-01-30 16:00:00    5
dtype: int64