In [1]:
from datetime import datetime
import numpy as np
import pandas as pd

In [2]:
# Build a 100-point index starting on 2000-01-01; calendar-day spacing
# ('D') is date_range's default and is spelled out here for clarity.
dates = pd.date_range(start='2000-01-01', periods=100, freq='D')

In [3]:
# Preview the first ten timestamps of the daily index.
dates[:10]

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08',
               '2000-01-09', '2000-01-10'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Draw one standard-normal value per date from a seeded generator so that a
# fresh "Restart & Run All" reproduces the same series every time.
# NOTE: the values shown in the saved outputs came from an earlier unseeded
# run and will differ until the notebook is re-executed.
rng = np.random.default_rng(seed=12345)
ts = pd.Series(rng.standard_normal(len(dates)), index=dates)

In [5]:
# Display the daily series (pandas abbreviates the middle rows of long output).
ts

2000-01-01   -0.218817
2000-01-02    0.556600
2000-01-03   -0.158278
2000-01-04   -0.657281
2000-01-05    0.493075
                ...   
2000-04-05   -0.002421
2000-04-06    0.327136
2000-04-07    0.003186
2000-04-08    0.429938
2000-04-09    1.004624
Freq: D, Length: 100, dtype: float64

In [6]:
# pandas objects are equipped with a resample method.
# Aggregating to a lower frequency is downsampling; converting from a
# lower to a higher frequency is upsampling.
(
    ts
    .resample('M')  # group the daily values into month-end bins
    .mean()
)  # A

2000-01-31   -0.064335
2000-02-29   -0.274024
2000-03-31   -0.097005
2000-04-30    0.176277
Freq: M, dtype: float64

In [7]:
# Monthly means labelled by period ('2000-01') instead of month-end timestamp.
# The `kind='period'` keyword of resample is deprecated (pandas 2.2), so the
# month-end labels are converted to monthly periods explicitly; the values
# and the resulting PeriodIndex are the same.
ts.resample('M').mean().to_period('M')  # B

2000-01   -0.064335
2000-02   -0.274024
2000-03   -0.097005
2000-04    0.176277
Freq: M, dtype: float64

In [8]:
# Practice: same monthly mean as A, with the frequency passed through the
# `rule` keyword instead of positionally.
(ts
 .resample(rule='M')
 .mean())  # A2

2000-01-31   -0.064335
2000-02-29   -0.274024
2000-03-31   -0.097005
2000-04-30    0.176277
Freq: M, dtype: float64

In [9]:
# Practice: resample works along axis 0 (the index) by default. The explicit
# `axis=0` argument is omitted because the `axis` keyword of resample is
# deprecated in recent pandas releases (2.1+); the result is unchanged.
ts.resample(rule='M').mean()  # A3

2000-01-31   -0.064335
2000-02-29   -0.274024
2000-03-31   -0.097005
2000-04-30    0.176277
Freq: M, dtype: float64

In [10]:
# Downsampling: aggregating higher-frequency data to a lower frequency.
# Start with one-minute frequency data: twelve timestamps, one minute apart.
# The 'min' alias is used rather than the deprecated 'T' (pandas 2.2); it is
# accepted by older releases too and matches the '5min' rule used when
# resampling this data.
dates = pd.date_range('2000-01-01', periods=12, freq='min')

In [11]:
# Show the full one-minute DatetimeIndex.
dates

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
               '2000-01-01 00:02:00', '2000-01-01 00:03:00',
               '2000-01-01 00:04:00', '2000-01-01 00:05:00',
               '2000-01-01 00:06:00', '2000-01-01 00:07:00',
               '2000-01-01 00:08:00', '2000-01-01 00:09:00',
               '2000-01-01 00:10:00', '2000-01-01 00:11:00'],
              dtype='datetime64[ns]', freq='T')

In [12]:
# One consecutive integer (0, 1, 2, ...) per timestamp, so sums per bin are
# easy to check by hand.
ts = pd.Series(data=np.arange(dates.size), index=dates)

In [15]:
# Display the one-minute series.
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int64

In [16]:
# Suppose we want to aggregate the data into five-minute chunks (bars)
# by taking the sum of each group.
ts.resample(rule='5min').sum()

2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int64

In [17]:
# For this frequency the bins are closed on the left edge by default, so
# passing closed='left' explicitly gives the same result as the plain call.
(
    ts
    .resample('5min', closed='left')
    .sum()
)

2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int64

In [18]:
# With closed='right' the right bin edge is inclusive, so a timestamp lying
# exactly on an edge is counted in the interval that ends there rather than
# in the one that starts there.
# For example, '2000-01-01 00:00:00' falls in the bin labelled
# '1999-12-31 23:55:00'.
ts.resample(
    '5min',
    closed='right',
).sum()

1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int64

In [23]:
# The default for `closed` depends on the frequency; the defaults favour
# intuition over consistency. For example "M", "A", "Q", "BM", "BQ" and "W"
# default to closed='right'.
ts.resample(rule='BQ').sum()

2000-03-31    66
Freq: BQ-DEC, dtype: int64

In [24]:
# Spelling out closed='right' for 'BQ' to compare with the plain call.
ts.resample(
    'BQ',
    closed='right',
).sum()

2000-03-31    66
Freq: BQ-DEC, dtype: int64

In [26]:
# Month-end bins: the default here is closed='right'.
ts.resample(rule='M').sum()

2000-01-31    66
Freq: M, dtype: int64

In [27]:
# Same month-end sum with closed='right' written out.
(
    ts
    .resample('M', closed='right')
    .sum()
)

2000-01-31    66
Freq: M, dtype: int64

In [28]:
# The remaining frequencies, such as daily, default to closed='left'.
ts.resample(rule='D').sum()

2000-01-01    66
Freq: D, dtype: int64

In [29]:
# Daily sum with closed='left' written out.
ts.resample(
    'D',
    closed='left',
).sum()

2000-01-01    66
Freq: D, dtype: int64

In [31]:
# Overriding to closed='right' changes the result: the observation stamped
# exactly at midnight now falls in the bin labelled 1999-12-31.
(
    ts
    .resample('D', closed='right')
    .sum()
)

1999-12-31     0
2000-01-01    66
Freq: D, dtype: int64

In [33]:
#Downsampling: high to low frequency