In [13]:
import pandas as pd
import numpy as np

In [14]:
# Seed NumPy's global RNG so that "Restart Kernel -> Run All" regenerates the
# same random series (and the same random sub-samples drawn from it later).
SEED = 0
np.random.seed(SEED)

# 72 consecutive hourly timestamps starting at 2011-01-01 00:00.
# 'h' is the current spelling of the hourly alias; the upper-case 'H' is deprecated.
rng = pd.date_range('1/1/2011', periods=72, freq='h')
# One standard-normal draw per timestamp.
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts.head()

2011-01-01 00:00:00   -0.591308
2011-01-01 01:00:00   -0.113344
2011-01-01 02:00:00    0.178569
2011-01-01 03:00:00   -0.890350
2011-01-01 04:00:00    0.487800
Freq: H, dtype: float64

In [15]:
# Re-index the hourly series onto a 45-minute grid; method='pad' carries the
# most recent earlier observation forward into each new timestamp.
converted = ts.asfreq(freq='45Min', method='pad')
converted.head(5)

2011-01-01 00:00:00   -0.591308
2011-01-01 00:45:00   -0.591308
2011-01-01 01:30:00   -0.113344
2011-01-01 02:15:00    0.178569
2011-01-01 03:00:00   -0.890350
Freq: 45T, dtype: float64

In [16]:
# Does asfreq change the number of rows?
# Yes: the shapes below show that the row count changes (72 hourly rows become 95 rows at 45-minute spacing).
print(ts.shape, converted.shape)

(72,) (95,)


In [17]:
# With no fill method (method=None, the default), timestamps on the new
# 45-minute grid that have no matching observation are left as NaN.
converted_None = ts.asfreq(freq='45Min', method=None)
converted_None.head(5)

2011-01-01 00:00:00   -0.591308
2011-01-01 00:45:00         NaN
2011-01-01 01:30:00         NaN
2011-01-01 02:15:00         NaN
2011-01-01 03:00:00   -0.890350
Freq: 45T, dtype: float64

In [18]:
# What do the different methods do?
# method : {‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}

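**'pad'/'ffill' propagate the last valid observation forward into the new timestamps, until the next valid observation is reached.<br>**
**'bfill'/'backfill' use the next valid observation to fill the new timestamps that precede it.**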

In [19]:
# Might any of these methods have pitfalls from a logical point of view?

**When we use bfill, each filled timestamp receives a value that was only observed later, so we are assuming we know values from the future (look-ahead bias).**

In [20]:
# What's the difference between going to a higher frequency and a lower frequency?
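# Going to a higher frequency creates new timestamps that must be filled (or left as NaN).
# Going to a lower frequency with asfreq() just drops the observations between the new timestamps, so it is better to use resampling.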

In [21]:
# Re-index onto a 90-minute grid; timestamps with no matching observation
# (e.g. 01:30) take the value of the next available one (back-fill).
converted = ts.asfreq(freq='90Min', method='bfill')
converted.head(5)

2011-01-01 00:00:00   -0.591308
2011-01-01 01:30:00    0.178569
2011-01-01 03:00:00   -0.890350
2011-01-01 04:30:00    0.179774
2011-01-01 06:00:00    0.357252
Freq: 90T, dtype: float64

In [22]:
# What's different logically about going to a higher frequency vs a lower frequency? 
# What do you want to do when switching to a lower frequency that is not logical when switching to a higher frequency?

**We are likely to lose information/detail as we go to a lower frequency. It is therefore better to resample (aggregate the values in each new interval) rather than simply drop data points.**

In [23]:
# Downsample to 2-hour bins and average the hourly values inside each bin.
# '2h' replaces the deprecated upper-case '2H' alias, and .iloc makes it explicit
# that rows 1..9 are picked by position rather than by label.
ts.resample('2h').mean().iloc[1:10]

2011-01-01 02:00:00   -0.355891
2011-01-01 04:00:00    0.333787
2011-01-01 06:00:00   -0.533451
2011-01-01 08:00:00   -0.607210
2011-01-01 10:00:00    0.018203
2011-01-01 12:00:00    0.121104
2011-01-01 14:00:00   -0.846877
2011-01-01 16:00:00    0.106017
2011-01-01 18:00:00    0.702338
Freq: 2H, dtype: float64

In [24]:
# Downsample to daily bins and sum all observations that fall within each calendar day.
ts.resample('D').sum()

2011-01-01    1.355134
2011-01-02    0.759733
2011-01-03   -1.132304
Freq: D, dtype: float64

In [25]:
# What if you want to downsample and you don't want to ffill or bfill?

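**Just use `method=None` (the default, not the string `'None'`): no filling is done, and timestamps without an exact match are left as NaN.**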

In [None]:
# What is the difference between .resample() and .asfreq()?

**.resample() is a data aggregation and .asfreq() is a data selection. We can do a lot more with resampling (since it is an aggregation method, we can compute count, mean, etc.). For up-sampling, both asfreq() and resample() yield similar results, although resample() has more options. For down-sampling, resample() is preferred to asfreq(), which simply drops some values and loses information.**

In [None]:
# What are some special things you can do with .resample() you can't do with .asfreq()?


**resampling is routinely used to even out irregular time-series**

In [26]:
# Draw 10 distinct row positions at random (no replacement) to build an
# irregularly spaced, unsorted sub-series.
# .iloc is used because the positions are integers while the index holds
# timestamps; plain ts[...] with integer keys depends on a deprecated
# positional fallback. np.random.choice(n, ...) samples from range(n) directly.
irregular_ts = ts.iloc[np.random.choice(len(ts), size=10, replace=False)]

In [27]:
irregular_ts

2011-01-02 11:00:00   -0.153664
2011-01-02 12:00:00   -0.382771
2011-01-02 06:00:00    0.115786
2011-01-03 20:00:00   -0.262492
2011-01-02 10:00:00   -0.150956
2011-01-03 12:00:00   -0.306069
2011-01-03 19:00:00    0.633383
2011-01-03 15:00:00   -0.940674
2011-01-03 13:00:00    0.189094
2011-01-02 15:00:00    0.578121
dtype: float64

In [29]:
irregular_ts.asfreq('D')

2011-01-02 11:00:00   -0.153664
Freq: D, dtype: float64

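**The above does not work because the index is not sorted: asfreq() cannot build a sensible date range from an unordered index (here it only returns the first entry). Sort the index first.**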

In [30]:
irregular_ts = irregular_ts.sort_index()

In [31]:
irregular_ts

2011-01-02 06:00:00    0.115786
2011-01-02 10:00:00   -0.150956
2011-01-02 11:00:00   -0.153664
2011-01-02 12:00:00   -0.382771
2011-01-02 15:00:00    0.578121
2011-01-03 12:00:00   -0.306069
2011-01-03 13:00:00    0.189094
2011-01-03 15:00:00   -0.940674
2011-01-03 19:00:00    0.633383
2011-01-03 20:00:00   -0.262492
dtype: float64

In [32]:
irregular_ts.asfreq('D')

2011-01-02 06:00:00    0.115786
2011-01-03 06:00:00         NaN
Freq: D, dtype: float64

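**With freq = 'D', asfreq() generates one timestamp per day starting from the first index entry (06:00). Observations that do not fall exactly on those timestamps are dropped, and a generated timestamp with no observation (2011-01-03 06:00) becomes NaN. It is better to use resample.**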

In [33]:
# Count how many observations fall in each calendar day.
irregular_ts.resample('D').count()

2011-01-02    5
2011-01-03    5
Freq: D, dtype: int64

In [34]:
# Average the observations that fall in each calendar day.
irregular_ts.resample('D').mean()

2011-01-02    0.001303
2011-01-03   -0.137352
Freq: D, dtype: float64

In [35]:
# Variance of the observations that fall in each calendar day.
irregular_ts.resample('D').var()

2011-01-02    0.135134
2011-01-03    0.347513
Freq: D, dtype: float64

In [36]:
# How can I forward-fill only a limited number of consecutive missing periods?

In [37]:
# Draw another 10 distinct row positions at random (no replacement).
# .iloc selects by integer position; plain ts[...] with integer keys on a
# timestamp index depends on a deprecated positional fallback.
irre_ts = ts.iloc[np.random.choice(len(ts), size=10, replace=False)]

In [38]:
irre_ts = irre_ts.sort_index()

In [39]:
irre_ts

2011-01-01 23:00:00    0.826137
2011-01-02 02:00:00    0.265237
2011-01-02 06:00:00    0.115786
2011-01-02 08:00:00   -1.582702
2011-01-02 12:00:00   -0.382771
2011-01-02 17:00:00    2.211117
2011-01-02 23:00:00   -0.365309
2011-01-03 02:00:00   -0.355808
2011-01-03 03:00:00    0.610664
2011-01-03 09:00:00    0.705341
dtype: float64

In [42]:
# Put the irregular series on an hourly grid and forward-fill at most 3
# consecutive missing hours after each observation; hours further than that
# from the last observation stay NaN.
# Resampler.ffill(limit=...) replaces the deprecated Resampler.fillna(method='ffill'),
# and 'h' replaces the deprecated upper-case 'H' alias.
irre_ts.resample('h').ffill(limit=3)

2011-01-01 23:00:00    0.826137
2011-01-02 00:00:00    0.826137
2011-01-02 01:00:00    0.826137
2011-01-02 02:00:00    0.265237
2011-01-02 03:00:00    0.265237
2011-01-02 04:00:00    0.265237
2011-01-02 05:00:00    0.265237
2011-01-02 06:00:00    0.115786
2011-01-02 07:00:00    0.115786
2011-01-02 08:00:00   -1.582702
2011-01-02 09:00:00   -1.582702
2011-01-02 10:00:00   -1.582702
2011-01-02 11:00:00   -1.582702
2011-01-02 12:00:00   -0.382771
2011-01-02 13:00:00   -0.382771
2011-01-02 14:00:00   -0.382771
2011-01-02 15:00:00   -0.382771
2011-01-02 16:00:00         NaN
2011-01-02 17:00:00    2.211117
2011-01-02 18:00:00    2.211117
2011-01-02 19:00:00    2.211117
2011-01-02 20:00:00    2.211117
2011-01-02 21:00:00         NaN
2011-01-02 22:00:00         NaN
2011-01-02 23:00:00   -0.365309
2011-01-03 00:00:00   -0.365309
2011-01-03 01:00:00   -0.365309
2011-01-03 02:00:00   -0.355808
2011-01-03 03:00:00    0.610664
2011-01-03 04:00:00    0.610664
2011-01-03 05:00:00    0.610664
2011-01-