In [1]:
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [2]:
DATA_PATH : Path = Path('data/Traffic_Violations.csv')

In [3]:
#resampling means taking a time series from one index to another
#Downsampling: Aggregating higher frequency to lower frequency.
#Upsampling: Converting lower frequency to higher frequency.
dates = pd.date_range('2000-01-02', periods=100)

In [4]:
dates

DatetimeIndex(['2000-01-02', '2000-01-03', '2000-01-04', '2000-01-05',
               '2000-01-06', '2000-01-07', '2000-01-08', '2000-01-09',
               '2000-01-10', '2000-01-11', '2000-01-12', '2000-01-13',
               '2000-01-14', '2000-01-15', '2000-01-16', '2000-01-17',
               '2000-01-18', '2000-01-19', '2000-01-20', '2000-01-21',
               '2000-01-22', '2000-01-23', '2000-01-24', '2000-01-25',
               '2000-01-26', '2000-01-27', '2000-01-28', '2000-01-29',
               '2000-01-30', '2000-01-31', '2000-02-01', '2000-02-02',
               '2000-02-03', '2000-02-04', '2000-02-05', '2000-02-06',
               '2000-02-07', '2000-02-08', '2000-02-09', '2000-02-10',
               '2000-02-11', '2000-02-12', '2000-02-13', '2000-02-14',
               '2000-02-15', '2000-02-16', '2000-02-17', '2000-02-18',
               '2000-02-19', '2000-02-20', '2000-02-21', '2000-02-22',
               '2000-02-23', '2000-02-24', '2000-02-25', '2000-02-26',
      

In [5]:
ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)

In [6]:
ts

2000-01-02   -0.322821
2000-01-03   -1.230934
2000-01-04   -0.085114
2000-01-05   -1.418302
2000-01-06    1.646228
                ...   
2000-04-06    1.008034
2000-04-07    1.438145
2000-04-08   -1.039647
2000-04-09    0.225117
2000-04-10   -0.607272
Freq: D, Length: 100, dtype: float64

In [7]:
ts.resample('M')

<pandas.core.resample.DatetimeIndexResampler object at 0x7fc2b1d6e110>

In [8]:
ts.resample('M').mean()

2000-01-31    0.570731
2000-02-29    0.332498
2000-03-31    0.240137
2000-04-30   -0.034738
Freq: M, dtype: float64

In [9]:
ts.resample(rule='M').mean()

2000-01-31    0.570731
2000-02-29    0.332498
2000-03-31    0.240137
2000-04-30   -0.034738
Freq: M, dtype: float64

In [10]:
ts.resample(rule='D').mean()

2000-01-02   -0.322821
2000-01-03   -1.230934
2000-01-04   -0.085114
2000-01-05   -1.418302
2000-01-06    1.646228
                ...   
2000-04-06    1.008034
2000-04-07    1.438145
2000-04-08   -1.039647
2000-04-09    0.225117
2000-04-10   -0.607272
Freq: D, Length: 100, dtype: float64

In [11]:
ts.resample(rule='H').mean()

2000-01-02 00:00:00   -0.322821
2000-01-02 01:00:00         NaN
2000-01-02 02:00:00         NaN
2000-01-02 03:00:00         NaN
2000-01-02 04:00:00         NaN
                         ...   
2000-04-09 20:00:00         NaN
2000-04-09 21:00:00         NaN
2000-04-09 22:00:00         NaN
2000-04-09 23:00:00         NaN
2000-04-10 00:00:00   -0.607272
Freq: H, Length: 2377, dtype: float64

In [12]:
ts.resample(rule='H').mean().fillna(method='ffill')

2000-01-02 00:00:00   -0.322821
2000-01-02 01:00:00   -0.322821
2000-01-02 02:00:00   -0.322821
2000-01-02 03:00:00   -0.322821
2000-01-02 04:00:00   -0.322821
                         ...   
2000-04-09 20:00:00    0.225117
2000-04-09 21:00:00    0.225117
2000-04-09 22:00:00    0.225117
2000-04-09 23:00:00    0.225117
2000-04-10 00:00:00   -0.607272
Freq: H, Length: 2377, dtype: float64

In [13]:
ts.resample(rule='H').mean().fillna(method='ffill')[:48]

2000-01-02 00:00:00   -0.322821
2000-01-02 01:00:00   -0.322821
2000-01-02 02:00:00   -0.322821
2000-01-02 03:00:00   -0.322821
2000-01-02 04:00:00   -0.322821
2000-01-02 05:00:00   -0.322821
2000-01-02 06:00:00   -0.322821
2000-01-02 07:00:00   -0.322821
2000-01-02 08:00:00   -0.322821
2000-01-02 09:00:00   -0.322821
2000-01-02 10:00:00   -0.322821
2000-01-02 11:00:00   -0.322821
2000-01-02 12:00:00   -0.322821
2000-01-02 13:00:00   -0.322821
2000-01-02 14:00:00   -0.322821
2000-01-02 15:00:00   -0.322821
2000-01-02 16:00:00   -0.322821
2000-01-02 17:00:00   -0.322821
2000-01-02 18:00:00   -0.322821
2000-01-02 19:00:00   -0.322821
2000-01-02 20:00:00   -0.322821
2000-01-02 21:00:00   -0.322821
2000-01-02 22:00:00   -0.322821
2000-01-02 23:00:00   -0.322821
2000-01-03 00:00:00   -1.230934
2000-01-03 01:00:00   -1.230934
2000-01-03 02:00:00   -1.230934
2000-01-03 03:00:00   -1.230934
2000-01-03 04:00:00   -1.230934
2000-01-03 05:00:00   -1.230934
2000-01-03 06:00:00   -1.230934
2000-01-

In [14]:
ts.resample(rule='H').mean().fillna(method='bfill')[:48]

2000-01-02 00:00:00   -0.322821
2000-01-02 01:00:00   -1.230934
2000-01-02 02:00:00   -1.230934
2000-01-02 03:00:00   -1.230934
2000-01-02 04:00:00   -1.230934
2000-01-02 05:00:00   -1.230934
2000-01-02 06:00:00   -1.230934
2000-01-02 07:00:00   -1.230934
2000-01-02 08:00:00   -1.230934
2000-01-02 09:00:00   -1.230934
2000-01-02 10:00:00   -1.230934
2000-01-02 11:00:00   -1.230934
2000-01-02 12:00:00   -1.230934
2000-01-02 13:00:00   -1.230934
2000-01-02 14:00:00   -1.230934
2000-01-02 15:00:00   -1.230934
2000-01-02 16:00:00   -1.230934
2000-01-02 17:00:00   -1.230934
2000-01-02 18:00:00   -1.230934
2000-01-02 19:00:00   -1.230934
2000-01-02 20:00:00   -1.230934
2000-01-02 21:00:00   -1.230934
2000-01-02 22:00:00   -1.230934
2000-01-02 23:00:00   -1.230934
2000-01-03 00:00:00   -1.230934
2000-01-03 01:00:00   -0.085114
2000-01-03 02:00:00   -0.085114
2000-01-03 03:00:00   -0.085114
2000-01-03 04:00:00   -0.085114
2000-01-03 05:00:00   -0.085114
2000-01-03 06:00:00   -0.085114
2000-01-

In [15]:
ts.resample(rule='H').mean().fillna(method='pad')[:48]

2000-01-02 00:00:00   -0.322821
2000-01-02 01:00:00   -0.322821
2000-01-02 02:00:00   -0.322821
2000-01-02 03:00:00   -0.322821
2000-01-02 04:00:00   -0.322821
2000-01-02 05:00:00   -0.322821
2000-01-02 06:00:00   -0.322821
2000-01-02 07:00:00   -0.322821
2000-01-02 08:00:00   -0.322821
2000-01-02 09:00:00   -0.322821
2000-01-02 10:00:00   -0.322821
2000-01-02 11:00:00   -0.322821
2000-01-02 12:00:00   -0.322821
2000-01-02 13:00:00   -0.322821
2000-01-02 14:00:00   -0.322821
2000-01-02 15:00:00   -0.322821
2000-01-02 16:00:00   -0.322821
2000-01-02 17:00:00   -0.322821
2000-01-02 18:00:00   -0.322821
2000-01-02 19:00:00   -0.322821
2000-01-02 20:00:00   -0.322821
2000-01-02 21:00:00   -0.322821
2000-01-02 22:00:00   -0.322821
2000-01-02 23:00:00   -0.322821
2000-01-03 00:00:00   -1.230934
2000-01-03 01:00:00   -1.230934
2000-01-03 02:00:00   -1.230934
2000-01-03 03:00:00   -1.230934
2000-01-03 04:00:00   -1.230934
2000-01-03 05:00:00   -1.230934
2000-01-03 06:00:00   -1.230934
2000-01-

In [16]:
test_frequency = ts.resample(rule='H').mean().fillna(method='pad')[:49]

In [17]:
first, second = test_frequency[0], test_frequency[24]

In [18]:
first_day_progression = np.linspace(first, second, 23)

In [19]:
first

-0.3228206893017752

In [20]:
second

-1.2309343369410157

In [21]:
third = test_frequency[48]

In [22]:
third

-0.08511368175044647

In [23]:
second_day_progression = np.linspace(second, third, 23)

In [24]:
test_frequency[:23] = first_day_progression
test_frequency

2000-01-02 00:00:00   -0.322821
2000-01-02 01:00:00   -0.364099
2000-01-02 02:00:00   -0.405376
2000-01-02 03:00:00   -0.446654
2000-01-02 04:00:00   -0.487932
2000-01-02 05:00:00   -0.529210
2000-01-02 06:00:00   -0.570488
2000-01-02 07:00:00   -0.611766
2000-01-02 08:00:00   -0.653044
2000-01-02 09:00:00   -0.694322
2000-01-02 10:00:00   -0.735600
2000-01-02 11:00:00   -0.776878
2000-01-02 12:00:00   -0.818155
2000-01-02 13:00:00   -0.859433
2000-01-02 14:00:00   -0.900711
2000-01-02 15:00:00   -0.941989
2000-01-02 16:00:00   -0.983267
2000-01-02 17:00:00   -1.024545
2000-01-02 18:00:00   -1.065823
2000-01-02 19:00:00   -1.107101
2000-01-02 20:00:00   -1.148379
2000-01-02 21:00:00   -1.189656
2000-01-02 22:00:00   -1.230934
2000-01-02 23:00:00   -0.322821
2000-01-03 00:00:00   -1.230934
2000-01-03 01:00:00   -1.230934
2000-01-03 02:00:00   -1.230934
2000-01-03 03:00:00   -1.230934
2000-01-03 04:00:00   -1.230934
2000-01-03 05:00:00   -1.230934
2000-01-03 06:00:00   -1.230934
2000-01-

In [25]:
test_frequency[23:46] = second_day_progression
test_frequency

2000-01-02 00:00:00   -0.322821
2000-01-02 01:00:00   -0.364099
2000-01-02 02:00:00   -0.405376
2000-01-02 03:00:00   -0.446654
2000-01-02 04:00:00   -0.487932
2000-01-02 05:00:00   -0.529210
2000-01-02 06:00:00   -0.570488
2000-01-02 07:00:00   -0.611766
2000-01-02 08:00:00   -0.653044
2000-01-02 09:00:00   -0.694322
2000-01-02 10:00:00   -0.735600
2000-01-02 11:00:00   -0.776878
2000-01-02 12:00:00   -0.818155
2000-01-02 13:00:00   -0.859433
2000-01-02 14:00:00   -0.900711
2000-01-02 15:00:00   -0.941989
2000-01-02 16:00:00   -0.983267
2000-01-02 17:00:00   -1.024545
2000-01-02 18:00:00   -1.065823
2000-01-02 19:00:00   -1.107101
2000-01-02 20:00:00   -1.148379
2000-01-02 21:00:00   -1.189656
2000-01-02 22:00:00   -1.230934
2000-01-02 23:00:00   -1.230934
2000-01-03 00:00:00   -1.178852
2000-01-03 01:00:00   -1.126769
2000-01-03 02:00:00   -1.074686
2000-01-03 03:00:00   -1.022603
2000-01-03 04:00:00   -0.970521
2000-01-03 05:00:00   -0.918438
2000-01-03 06:00:00   -0.866355
2000-01-

In [26]:
#Downsampling. Things to think about: which side of the interval is closed, and
#how to label each aggregated bin.
dates = pd.date_range('2000-01-01', periods=12, freq='T')

In [27]:
dates

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
               '2000-01-01 00:02:00', '2000-01-01 00:03:00',
               '2000-01-01 00:04:00', '2000-01-01 00:05:00',
               '2000-01-01 00:06:00', '2000-01-01 00:07:00',
               '2000-01-01 00:08:00', '2000-01-01 00:09:00',
               '2000-01-01 00:10:00', '2000-01-01 00:11:00'],
              dtype='datetime64[ns]', freq='T')

In [28]:
ts = pd.Series(np.arange(len(dates)), index=dates)

In [29]:
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int64

In [30]:
#Suppose we wanted to aggregate this data into five-minute chunks or bars by taking the sum of each group.
ts.resample('5min').sum()

2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int64

In [31]:
ts.resample('10min').sum() 

2000-01-01 00:00:00    45
2000-01-01 00:10:00    21
Freq: 10T, dtype: int64

In [32]:
#the frequency defines bin edges. 00:05 value is included in the interval by default 
#(for this frequency), and the 0:05 interval is excluded by default
ts.resample('5min', closed='right').sum()

1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int64

In [33]:
ts.resample('5min', closed='left').sum() #same as the default step above

2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int64

In [34]:
ts.resample('10min', closed='right').sum()

1999-12-31 23:50:00     0
2000-01-01 00:00:00    55
2000-01-01 00:10:00    11
Freq: 10T, dtype: int64

In [35]:
#the labels are from the left side of the index, but we can label them with the right bin edge
#by passing label='right'
ts.resample('5min').sum()

2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int64

In [36]:
#the labels are from the left side of the index, but we can label them with the right bin edge
#by passing label='right'
ts.resample('5min', closed='right').sum()

1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int64

In [37]:
#the labels are from the left side of the index, but we can label them with the right bin edge
#by passing label='right'
ts.resample('5min', closed='right', label='right').sum()

2000-01-01 00:00:00     0
2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    11
Freq: 5T, dtype: int64

In [38]:
#the labels are from the left side of the index, but we can label them with the right bin edge
#by passing label='right'
ts.resample('5min', closed='right', label='left').sum()

1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int64

In [39]:
#We might want to make the index more clear by shifting it by some default amount. 
from pandas.tseries.frequencies import to_offset

In [40]:
result = ts.resample('5min', closed='right', label='right').sum()

In [41]:
result

2000-01-01 00:00:00     0
2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    11
Freq: 5T, dtype: int64

In [42]:
result.index = result.index + to_offset('-1s')

In [43]:
result

1999-12-31 23:59:59     0
2000-01-01 00:04:59    15
2000-01-01 00:09:59    40
2000-01-01 00:14:59    11
Freq: 5T, dtype: int64

In [44]:
#Open-high-low-close (OHLC) formatting
#one way to aggregate financial time series
#computed in a single function call
ts = pd.Series(np.random.permutation(np.arange(len(dates))), index=dates)

In [45]:
ts

2000-01-01 00:00:00    11
2000-01-01 00:01:00     9
2000-01-01 00:02:00     2
2000-01-01 00:03:00     0
2000-01-01 00:04:00     8
2000-01-01 00:05:00     3
2000-01-01 00:06:00     6
2000-01-01 00:07:00    10
2000-01-01 00:08:00     7
2000-01-01 00:09:00     4
2000-01-01 00:10:00     1
2000-01-01 00:11:00     5
Freq: T, dtype: int64

In [46]:
ts.resample('5min').ohlc()

Unnamed: 0,open,high,low,close
2000-01-01 00:00:00,11,11,0,8
2000-01-01 00:05:00,3,10,3,4
2000-01-01 00:10:00,1,5,1,5


In [47]:
#Upsampling/Interpolation: upsamping = conversion from lower to higher frequency. No aggregation is needed.
#Example: a timeframe with weekly data.
pd.DataFrame(np.random.standard_normal((2, 4)))

Unnamed: 0,0,1,2,3
0,-1.031838,-0.463591,1.19615,0.099543
1,-0.306789,-0.814426,1.11756,0.406341


In [48]:
#Let's view the index separately here
pd.date_range('2000-01-01', periods=2)

DatetimeIndex(['2000-01-01', '2000-01-02'], dtype='datetime64[ns]', freq='D')

In [49]:
#Let's view the index separately here
pd.date_range('2000-01-01', periods=2, freq='W-WED')

DatetimeIndex(['2000-01-05', '2000-01-12'], dtype='datetime64[ns]', freq='W-WED')

In [50]:
pd.DataFrame(np.random.standard_normal((2, 4)),
            index=pd.date_range('2000-01-01', periods=2, freq='W-WED'))

Unnamed: 0,0,1,2,3
2000-01-05,0.204533,1.983053,-0.675003,-1.454214
2000-01-12,-0.364383,0.666435,-0.882011,1.552979


In [51]:
frame = pd.DataFrame(np.random.standard_normal((2, 4)),
            index=pd.date_range('2000-01-01', periods=2, freq='W-WED'),
            columns=['Colorado', 'Texas', 'New York', 'Ohio'])

In [52]:
frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.706398,0.223247,0.6601,0.814763
2000-01-12,-0.796032,0.187475,1.230669,0.530354


In [53]:
#we only have one value per group with this data
#missing values occur in the gaps
df_daily = frame.resample('D').asfreq()

In [54]:
df_daily

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.706398,0.223247,0.6601,0.814763
2000-01-06,,,,
2000-01-07,,,,
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,-0.796032,0.187475,1.230669,0.530354


In [55]:
#Same methods as fillna or reindex
frame.resample('D').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.706398,0.223247,0.6601,0.814763
2000-01-06,-0.706398,0.223247,0.6601,0.814763
2000-01-07,-0.706398,0.223247,0.6601,0.814763
2000-01-08,-0.706398,0.223247,0.6601,0.814763
2000-01-09,-0.706398,0.223247,0.6601,0.814763
2000-01-10,-0.706398,0.223247,0.6601,0.814763
2000-01-11,-0.706398,0.223247,0.6601,0.814763
2000-01-12,-0.796032,0.187475,1.230669,0.530354


In [56]:
#we can also fill only a certain number of periods forward
frame.resample('D').ffill(limit=2)

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.706398,0.223247,0.6601,0.814763
2000-01-06,-0.706398,0.223247,0.6601,0.814763
2000-01-07,-0.706398,0.223247,0.6601,0.814763
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,-0.796032,0.187475,1.230669,0.530354


In [57]:
#Interestingly, we do not need to have a date index that is similar to the old one at all
frame.resample('W-THU').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-06,-0.706398,0.223247,0.6601,0.814763
2000-01-13,-0.796032,0.187475,1.230669,0.530354


In [58]:
#practice
step0 = frame.resample('D').ffill(limit=2)

In [59]:
step0

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.706398,0.223247,0.6601,0.814763
2000-01-06,-0.706398,0.223247,0.6601,0.814763
2000-01-07,-0.706398,0.223247,0.6601,0.814763
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,-0.796032,0.187475,1.230669,0.530354


In [60]:
step1 = step0.bfill(limit=2)

In [61]:
step1

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.706398,0.223247,0.6601,0.814763
2000-01-06,-0.706398,0.223247,0.6601,0.814763
2000-01-07,-0.706398,0.223247,0.6601,0.814763
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,-0.796032,0.187475,1.230669,0.530354
2000-01-11,-0.796032,0.187475,1.230669,0.530354
2000-01-12,-0.796032,0.187475,1.230669,0.530354


In [62]:
step2 = step0.fillna(step0.mean(axis=0))

In [63]:
step2

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.706398,0.223247,0.6601,0.814763
2000-01-06,-0.706398,0.223247,0.6601,0.814763
2000-01-07,-0.706398,0.223247,0.6601,0.814763
2000-01-08,-0.728806,0.214304,0.802742,0.743661
2000-01-09,-0.728806,0.214304,0.802742,0.743661
2000-01-10,-0.728806,0.214304,0.802742,0.743661
2000-01-11,-0.728806,0.214304,0.802742,0.743661
2000-01-12,-0.796032,0.187475,1.230669,0.530354


In [64]:
#resampling with periods
#simlar to timestamps
frame = pd.DataFrame(np.random.standard_normal((24, 4)),
                    index=pd.period_range('1-2000', '12-2001',
                                         freq='M'),
                    columns=['Colorado', 'Texas', 'New York', 'Ohio'])

In [65]:
frame.head(n=5)

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01,-0.563938,0.485148,0.243539,0.877014
2000-02,-0.556545,2.117484,-0.032936,0.575301
2000-03,-1.564062,1.592204,-0.750089,0.14442
2000-04,-1.178436,-0.238127,-1.760505,-0.568869
2000-05,-1.128359,-1.340785,1.075025,-0.790068


In [66]:
annual_frame = frame.resample('A-DEC').mean()

In [67]:
annual_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000,-0.521683,-0.303638,-0.155024,-0.31589
2001,0.010259,-0.070632,0.182642,0.479801


In [68]:
#upsampling is more nuanced, as we must make the decision about which end in which to place
#the values. Default is start, but we can specify end.
#Quarterly, year ending in December
annual_frame.resample('Q-DEC').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q1,-0.521683,-0.303638,-0.155024,-0.31589
2000Q2,-0.521683,-0.303638,-0.155024,-0.31589
2000Q3,-0.521683,-0.303638,-0.155024,-0.31589
2000Q4,-0.521683,-0.303638,-0.155024,-0.31589
2001Q1,0.010259,-0.070632,0.182642,0.479801
2001Q2,0.010259,-0.070632,0.182642,0.479801
2001Q3,0.010259,-0.070632,0.182642,0.479801
2001Q4,0.010259,-0.070632,0.182642,0.479801


In [69]:
annual_frame.resample('Q-DEC', convention='start').asfreq()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q1,-0.521683,-0.303638,-0.155024,-0.31589
2000Q2,,,,
2000Q3,,,,
2000Q4,,,,
2001Q1,0.010259,-0.070632,0.182642,0.479801
2001Q2,,,,
2001Q3,,,,
2001Q4,,,,


In [70]:
annual_frame.resample('Q-DEC', convention='end').asfreq()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,-0.521683,-0.303638,-0.155024,-0.31589
2001Q1,,,,
2001Q2,,,,
2001Q3,,,,
2001Q4,0.010259,-0.070632,0.182642,0.479801


In [71]:
annual_frame.resample('Q-DEC', convention='end').asfreq()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,-0.521683,-0.303638,-0.155024,-0.31589
2001Q1,,,,
2001Q2,,,,
2001Q3,,,,
2001Q4,0.010259,-0.070632,0.182642,0.479801


In [72]:
#in resampling, a larger frequency must be a superperiod or subperiod of the original period range,
#depending on if we are downsampling or upsampling, respectively.
annual_frame.resample('Q-MAR').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,-0.521683,-0.303638,-0.155024,-0.31589
2001Q1,-0.521683,-0.303638,-0.155024,-0.31589
2001Q2,-0.521683,-0.303638,-0.155024,-0.31589
2001Q3,-0.521683,-0.303638,-0.155024,-0.31589
2001Q4,0.010259,-0.070632,0.182642,0.479801
2002Q1,0.010259,-0.070632,0.182642,0.479801
2002Q2,0.010259,-0.070632,0.182642,0.479801
2002Q3,0.010259,-0.070632,0.182642,0.479801


In [73]:
test0 = annual_frame.resample('Q-MAR').ffill()

In [74]:
test0

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,-0.521683,-0.303638,-0.155024,-0.31589
2001Q1,-0.521683,-0.303638,-0.155024,-0.31589
2001Q2,-0.521683,-0.303638,-0.155024,-0.31589
2001Q3,-0.521683,-0.303638,-0.155024,-0.31589
2001Q4,0.010259,-0.070632,0.182642,0.479801
2002Q1,0.010259,-0.070632,0.182642,0.479801
2002Q2,0.010259,-0.070632,0.182642,0.479801
2002Q3,0.010259,-0.070632,0.182642,0.479801


In [75]:
#Q-Mar only lines up with A-MAR, A-JUN, A-SEP, and A-DEC.
try:
    test0.resample('A-FEB').asfreq()
except Exception as e:
    print(e)

Reindexing only valid with uniquely valued Index objects


In [76]:
#grouped time resampling
#for time series data, resample is a group operation based on a time intervalization
#for example (pg. 394)
N = 15

In [77]:
times = pd.date_range('2017-05-20 00:00', freq='1min', periods=N)

In [78]:
times

DatetimeIndex(['2017-05-20 00:00:00', '2017-05-20 00:01:00',
               '2017-05-20 00:02:00', '2017-05-20 00:03:00',
               '2017-05-20 00:04:00', '2017-05-20 00:05:00',
               '2017-05-20 00:06:00', '2017-05-20 00:07:00',
               '2017-05-20 00:08:00', '2017-05-20 00:09:00',
               '2017-05-20 00:10:00', '2017-05-20 00:11:00',
               '2017-05-20 00:12:00', '2017-05-20 00:13:00',
               '2017-05-20 00:14:00'],
              dtype='datetime64[ns]', freq='T')

In [79]:
pd.DataFrame({'time': times})

Unnamed: 0,time
0,2017-05-20 00:00:00
1,2017-05-20 00:01:00
2,2017-05-20 00:02:00
3,2017-05-20 00:03:00
4,2017-05-20 00:04:00
5,2017-05-20 00:05:00
6,2017-05-20 00:06:00
7,2017-05-20 00:07:00
8,2017-05-20 00:08:00
9,2017-05-20 00:09:00


In [80]:
df = pd.DataFrame({'time': times,
             'value': np.arange(N)})

In [81]:
#we can index by "time" and resample
df.set_index('time')

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,0
2017-05-20 00:01:00,1
2017-05-20 00:02:00,2
2017-05-20 00:03:00,3
2017-05-20 00:04:00,4
2017-05-20 00:05:00,5
2017-05-20 00:06:00,6
2017-05-20 00:07:00,7
2017-05-20 00:08:00,8
2017-05-20 00:09:00,9


In [82]:
#we can index by "time" and resample
df.set_index('time').resample('5min').count()

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,5
2017-05-20 00:05:00,5
2017-05-20 00:10:00,5


In [83]:
#what if we have multiple time series (in same df), marked by an aditional group key column
pd.DataFrame({'time': times.repeat(3)})[:10]

Unnamed: 0,time
0,2017-05-20 00:00:00
1,2017-05-20 00:00:00
2,2017-05-20 00:00:00
3,2017-05-20 00:01:00
4,2017-05-20 00:01:00
5,2017-05-20 00:01:00
6,2017-05-20 00:02:00
7,2017-05-20 00:02:00
8,2017-05-20 00:02:00
9,2017-05-20 00:03:00


In [84]:
#what if we have multiple time series (in same df), marked by an aditional group key column
pd.DataFrame({'time': times.repeat(3),
             'key': np.tile(['a', 'b', 'c'], N)})[:10]

Unnamed: 0,time,key
0,2017-05-20 00:00:00,a
1,2017-05-20 00:00:00,b
2,2017-05-20 00:00:00,c
3,2017-05-20 00:01:00,a
4,2017-05-20 00:01:00,b
5,2017-05-20 00:01:00,c
6,2017-05-20 00:02:00,a
7,2017-05-20 00:02:00,b
8,2017-05-20 00:02:00,c
9,2017-05-20 00:03:00,a


In [85]:
#what if we have multiple time series (in same df), marked by an aditional group key column
pd.DataFrame({'time': times.repeat(3),
             'key': np.tile(['a', 'b', 'c'], N),
             })[:10]

Unnamed: 0,time,key
0,2017-05-20 00:00:00,a
1,2017-05-20 00:00:00,b
2,2017-05-20 00:00:00,c
3,2017-05-20 00:01:00,a
4,2017-05-20 00:01:00,b
5,2017-05-20 00:01:00,c
6,2017-05-20 00:02:00,a
7,2017-05-20 00:02:00,b
8,2017-05-20 00:02:00,c
9,2017-05-20 00:03:00,a


In [86]:
df2 = pd.DataFrame({'time': times.repeat(3),
             'key': np.tile(['a', 'b', 'c'], N),
             'value': np.arange(N * 3.)})

In [87]:
df2

Unnamed: 0,time,key,value
0,2017-05-20 00:00:00,a,0.0
1,2017-05-20 00:00:00,b,1.0
2,2017-05-20 00:00:00,c,2.0
3,2017-05-20 00:01:00,a,3.0
4,2017-05-20 00:01:00,b,4.0
5,2017-05-20 00:01:00,c,5.0
6,2017-05-20 00:02:00,a,6.0
7,2017-05-20 00:02:00,b,7.0
8,2017-05-20 00:02:00,c,8.0
9,2017-05-20 00:03:00,a,9.0


In [88]:
#to do the same resampling for each value of 'key' we introduce the pandas.grouper object
time_key = pd.Grouper(freq='5min')