# 第九章 时间序列

In [1]:
import pandas as pd
import numpy as np

## 一、时序的创建
### 1.四类时间变量
#### 1)Date times（时间点/时刻） 描述特定日期或时间点  Timestamp  to_datetime或date_range
#### 2)Time spans（时间段/时期） 由时间点定义的一段时期  Period  Period或period_range
#### 3) Date offsets（相对时间差） 一段时间的相对大小（与夏/冬令时无关） DateOffset  DateOffset
#### 4) Time deltas (绝对时间差)  一段时间的绝对大小（与夏/冬令时有关） Timedelta  to_timedelta或timedelta_range


### 2.时间点的创建
#### a）to_datetime方法
#### Pandas在时间点建立的输入格式规定上给了很大的自由度，下面的语句都能正确建立同一时间点

In [2]:
pd.to_datetime('2020.1.1')
pd.to_datetime('2020 1.1')
pd.to_datetime('2020 1 1')
pd.to_datetime('2020 1-1')
pd.to_datetime('2020-1 1')
pd.to_datetime('2020-1-1')
pd.to_datetime('2020/1/1')
pd.to_datetime('1.1.2020')
pd.to_datetime('1.1 2020')
pd.to_datetime('1 1 2020')
pd.to_datetime('1 1-2020')
pd.to_datetime('1-1 2020')
pd.to_datetime('1-1-2020')
pd.to_datetime('1/1/2020')
pd.to_datetime('20200101')
pd.to_datetime('2020.0101')

Timestamp('2020-01-01 00:00:00')

#### 可利用format参数强制匹配

In [3]:
pd.to_datetime('2020\\1\\1',format='%Y\\%m\\%d')
pd.to_datetime('2020`1`1',format='%Y`%m`%d')
pd.to_datetime('2020.1 1',format='%Y.%m %d')
pd.to_datetime('1 1.2020',format='%d %m.%Y')

Timestamp('2020-01-01 00:00:00')

#### 同时，使用列表可以将其转为时间点索引

In [4]:
pd.Series(range(2),index=pd.to_datetime(['2020/1/1',
                                         '2020/1/2']))

2020-01-01    0
2020-01-02    1
dtype: int64

In [5]:
type(pd.to_datetime(['2020/1/1','2020/1/2']))

pandas.core.indexes.datetimes.DatetimeIndex

#### 对于DataFrame而言，如果列已经按照时间顺序排好，则利用to_datetime可自动转换

In [6]:
df = pd.DataFrame({'year': [2020, 2020],'month': [1, 1], 'day': [1, 2]})
pd.to_datetime(df)

0   2020-01-01
1   2020-01-02
dtype: datetime64[ns]

#### b）时间精度与范围限制

#### 事实上，Timestamp的精度远远不止day，可以最小到纳秒ns

In [7]:
pd.to_datetime('2020/1/1 00:00:00.123456789')

Timestamp('2020-01-01 00:00:00.123456789')

#### 同时，它带来范围的代价就是只有大约584年的时间点是可用的

In [9]:
pd.Timestamp.min

Timestamp('1677-09-21 00:12:43.145225')

In [10]:
pd.Timestamp.max

Timestamp('2262-04-11 23:47:16.854775807')

#### c）date_range方法

#### 一般来说，start/end/periods（时间点个数）/freq（间隔方法）是该方法最重要的参数，给定了其中的3个，剩下的一个就会被确定

In [12]:
pd.date_range(start='2020/1/1',end='2020/1/10',periods=3)

DatetimeIndex(['2020-01-01 00:00:00', '2020-01-05 12:00:00',
               '2020-01-10 00:00:00'],
              dtype='datetime64[ns]', freq=None)

In [14]:
pd.date_range(start='2020/1/1/12:00',end='2020/1/2/12:30',freq='S')

DatetimeIndex(['2020-01-01 12:00:00', '2020-01-01 12:00:01',
               '2020-01-01 12:00:02', '2020-01-01 12:00:03',
               '2020-01-01 12:00:04', '2020-01-01 12:00:05',
               '2020-01-01 12:00:06', '2020-01-01 12:00:07',
               '2020-01-01 12:00:08', '2020-01-01 12:00:09',
               ...
               '2020-01-02 12:29:51', '2020-01-02 12:29:52',
               '2020-01-02 12:29:53', '2020-01-02 12:29:54',
               '2020-01-02 12:29:55', '2020-01-02 12:29:56',
               '2020-01-02 12:29:57', '2020-01-02 12:29:58',
               '2020-01-02 12:29:59', '2020-01-02 12:30:00'],
              dtype='datetime64[ns]', length=88201, freq='S')

#### bdate_range是一个类似与date_range的方法，特点在于可以在自带的工作日间隔设置上，再选择weekmask参数和holidays参数
#### 它的freq中有一个特殊的'C'/'CBM'/'CBMS'选项，表示定制，需要联合weekmask参数和holidays参数使用
#### 例如现在需要将工作日中的周一、周二、周五3天保留，并将部分holidays剔除

In [15]:
weekmask = 'Mon Tue Fri'
holidays = [pd.Timestamp('2020/1/%s'%i) for i in range(7,13)]
#注意holidays
pd.bdate_range(start='2020-1-1',end='2020-1-15',freq='C',weekmask=weekmask,holidays=holidays)

DatetimeIndex(['2020-01-03', '2020-01-06', '2020-01-13', '2020-01-14'], dtype='datetime64[ns]', freq='C')

### 3. DateOffset对象

#### a）DataOffset与Timedelta的区别
#### Timedelta绝对时间差的特点指无论是冬令时还是夏令时，增减1day都只计算24小时
#### DataOffset相对时间差指，无论一天是23\24\25小时，增减1day都与当天相同的时间保持一致
#### 例如，英国当地时间 2020年03月29日，01:00:00 时钟向前调整 1 小时 变为 2020年03月29日，02:00:00，开始夏令时

In [16]:
ts = pd.Timestamp('2020-3-29 01:00:00', tz='Europe/Helsinki')
ts + pd.Timedelta(days=1)

Timestamp('2020-03-30 02:00:00+0300', tz='Europe/Helsinki')

In [17]:
ts + pd.DateOffset(days=1)

Timestamp('2020-03-30 01:00:00+0300', tz='Europe/Helsinki')

#### 这似乎有些令人头大，但只要把tz（time zone）去除就可以不用管它了，两者保持一致，除非要使用到时区变换

In [18]:
ts = pd.Timestamp('2020-3-29 01:00:00')
ts + pd.Timedelta(days=1)

Timestamp('2020-03-30 01:00:00')

In [19]:
ts + pd.DateOffset(days=1)

Timestamp('2020-03-30 01:00:00')

#### b)增减一段时间

#### DateOffset的可选参数包括years/months/weeks/days/hours/minutes/seconds

In [20]:
pd.Timestamp('2020-01-01') + pd.DateOffset(minutes=20) - pd.DateOffset(weeks=2)

Timestamp('2019-12-18 00:20:00')

In [21]:
pd.Timestamp('2020-01-01') + pd.offsets.Week(2)

Timestamp('2020-01-15 00:00:00')

In [22]:
pd.Timestamp('2020-01-01') + pd.offsets.BQuarterBegin(1)

Timestamp('2020-03-02 00:00:00')

#### d）序列的offset操作
#### 利用apply函数

In [23]:
# 有点懵逼
pd.Series(pd.offsets.BYearBegin(3).apply(i)
          for i in pd.date_range('20200101',periods=3,freq='Y'))

0   2023-01-02
1   2024-01-01
2   2025-01-01
dtype: datetime64[ns]

#### 直接使用对象加减

In [24]:
pd.date_range('20200101',periods=3,freq='Y')\
+ pd.offsets.BYearBegin(3)

DatetimeIndex(['2023-01-02', '2024-01-01', '2025-01-01'], dtype='datetime64[ns]', freq='A-DEC')

#### 定制offset，可以指定weekmask和holidays参数（思考为什么三个都是一个值）

In [25]:
pd.Series(pd.offsets.CDay(3,weekmask='Wed Fri',holidays='2020010').apply(i)
                                  for i in pd.date_range('20200105',periods=3,freq='D'))

0   2020-01-15
1   2020-01-15
2   2020-01-15
dtype: datetime64[ns]

## 二、时序的索引及属性
### 1. 索引切片
#### 这一部分几乎与第二章的规则完全一致

In [26]:
rng = pd.date_range('2020','2021', freq='W')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts.head()

2020-01-05   -0.019886
2020-01-12    0.246206
2020-01-19   -0.893988
2020-01-26   -0.529773
2020-02-02    1.029997
Freq: W-SUN, dtype: float64

In [27]:
ts['2020-01-26']

-0.5297731390666697

#### 合法字符自动转换为时间点

In [28]:
ts['2020-01-26':'20200726'].head()

2020-01-26   -0.529773
2020-02-02    1.029997
2020-02-09    0.365770
2020-02-16   -2.787935
2020-02-23    0.207220
Freq: W-SUN, dtype: float64

### 2. 子集索引

In [29]:
ts['2020-7'].head()

2020-07-05    0.049849
2020-07-12    0.525298
2020-07-19   -0.592458
2020-07-26    1.779098
Freq: W-SUN, dtype: float64

#### 支持混合形态索引

In [30]:
ts['2011-1':'20200726'].head()

2020-01-05   -0.019886
2020-01-12    0.246206
2020-01-19   -0.893988
2020-01-26   -0.529773
2020-02-02    1.029997
Freq: W-SUN, dtype: float64

### 3.时间点的属性
#### 采用dt对象可以轻松获得关于时间的信息

In [31]:
pd.Series(ts.index).dt.week.head()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [32]:
pd.Series(ts.index).dt.day.head()

0     5
1    12
2    19
3    26
4     2
dtype: int64

#### 利用strftime可重新修改时间格式

In [33]:
pd.Series(ts.index).dt.strftime('%Y-间隔1-%m-间隔2-%d').head()

0    2020-间隔1-01-间隔2-05
1    2020-间隔1-01-间隔2-12
2    2020-间隔1-01-间隔2-19
3    2020-间隔1-01-间隔2-26
4    2020-间隔1-02-间隔2-02
dtype: object

#### 对于datetime对象可以直接通过属性获取信息

In [34]:
pd.date_range('2020','2021', freq='W').month

Int64Index([ 1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,  3,  4,  4,  4,  4,
             5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,
             8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12,
            12],
           dtype='int64')

In [35]:
pd.date_range('2020','2021', freq='W').weekday

Int64Index([6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
            6, 6, 6, 6, 6, 6, 6, 6],
           dtype='int64')

## 三、重采样

#### 所谓重采样，就是指resample函数，它可以看做时序版本的groupby函数

### 1. resample对象的基本操作
#### 采样频率一般设置为上面提到的offset字符

In [42]:
df_r = pd.DataFrame(np.random.randn(1000, 3),
                    index=pd.date_range('1/1/2020', freq='S', 
                                        periods=1000),
                  columns=['A', 'B', 'C'])
df_r.head()

Unnamed: 0,A,B,C
2020-01-01 00:00:00,-0.710459,-1.242423,-0.399911
2020-01-01 00:00:01,0.998817,-1.231277,-1.707065
2020-01-01 00:00:02,-1.241261,0.218132,0.607488
2020-01-01 00:00:03,0.634646,-0.024115,-1.056445
2020-01-01 00:00:04,0.312386,-0.422934,-0.71532


In [39]:
r = df_r.resample('3min')
r

<pandas.core.resample.DatetimeIndexResampler object at 0x11f860240>

In [40]:
r.sum()

Unnamed: 0,A,B,C
2020-01-01 00:00:00,-19.90558,-1.147018,-12.081827
2020-01-01 00:03:00,8.76144,-2.277045,7.807917
2020-01-01 00:06:00,27.647427,-6.472897,-9.581858
2020-01-01 00:09:00,-4.149138,17.063382,-3.069133
2020-01-01 00:12:00,8.556679,18.573375,1.518345
2020-01-01 00:15:00,-0.648293,-4.13244,-7.276577


In [43]:
df_r2 = pd.DataFrame(np.random.randn(200, 3),index=pd.date_range('1/1/2020', freq='D', periods=200),
                  columns=['A', 'B', 'C'])
r = df_r2.resample('CBMS')
r.sum()

Unnamed: 0,A,B,C
2020-01-01,7.193769,-4.459487,-5.363618
2020-02-03,4.559114,3.802629,0.981003
2020-03-02,6.674487,4.37683,7.864737
2020-04-01,3.071808,12.011614,0.982141
2020-05-01,-4.322686,2.372165,-3.098655
2020-06-01,-0.628935,-5.098372,-4.214073
2020-07-01,-1.176535,-1.896276,-3.150601


### 2.采样聚合

In [44]:
r = df_r.resample('3T')

In [45]:
r['A'].mean()

2020-01-01 00:00:00    0.040173
2020-01-01 00:03:00    0.070012
2020-01-01 00:06:00   -0.111557
2020-01-01 00:09:00    0.081217
2020-01-01 00:12:00   -0.062303
2020-01-01 00:15:00   -0.102796
Freq: 3T, Name: A, dtype: float64

In [46]:
r['A'].agg([np.sum, np.mean, np.std])

Unnamed: 0,sum,mean,std
2020-01-01 00:00:00,7.23115,0.040173,0.988067
2020-01-01 00:03:00,12.602101,0.070012,1.069133
2020-01-01 00:06:00,-20.080282,-0.111557,1.056758
2020-01-01 00:09:00,14.619133,0.081217,1.004673
2020-01-01 00:12:00,-11.214508,-0.062303,1.057115
2020-01-01 00:15:00,-10.279648,-0.102796,1.084047


#### 类似地，可以使用函数/lambda表达式

In [47]:
r.agg({'A': np.sum,'B': lambda x: max(x)-min(x)})

Unnamed: 0,A,B
2020-01-01 00:00:00,7.23115,4.869375
2020-01-01 00:03:00,12.602101,5.285105
2020-01-01 00:06:00,-20.080282,6.063629
2020-01-01 00:09:00,14.619133,5.186871
2020-01-01 00:12:00,-11.214508,5.367948
2020-01-01 00:15:00,-10.279648,4.663849


### 3. 采样组的迭代
#### 采样组的迭代和groupby迭代完全类似，对于每一个组都可以分别做相应操作

In [49]:
small = pd.Series(range(6),index=pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 00:30:00'
                                                 , '2020-01-01 00:31:00','2020-01-01 01:00:00'
                                                 ,'2020-01-01 03:00:00','2020-01-01 03:05:00']))
resampled = small.resample('H')
for name, group in resampled:
    print("Group: ", name)
    print("-" * 27)
    print(group, end="\n\n")

Group:  2020-01-01 00:00:00
---------------------------
2020-01-01 00:00:00    0
2020-01-01 00:30:00    1
2020-01-01 00:31:00    2
dtype: int64

Group:  2020-01-01 01:00:00
---------------------------
2020-01-01 01:00:00    3
dtype: int64

Group:  2020-01-01 02:00:00
---------------------------
Series([], dtype: int64)

Group:  2020-01-01 03:00:00
---------------------------
2020-01-01 03:00:00    4
2020-01-01 03:05:00    5
dtype: int64



## 四、窗口函数

#### 下面主要介绍pandas中两类主要的窗口(window)函数:rolling/expanding

In [50]:
s = pd.Series(np.random.randn(1000),index=pd.date_range('1/1/2020', periods=1000))
s.head()

2020-01-01    0.696212
2020-01-02   -0.960595
2020-01-03   -0.109992
2020-01-04   -0.771961
2020-01-05   -0.898523
Freq: D, dtype: float64

### 1. Rolling

#### a）常用聚合

#### 所谓rolling方法，就是规定一个窗口，它和groupby对象一样，本身不会进行操作，需要配合聚合函数才能计算结果

In [52]:
s.rolling(window=50)

Rolling [window=50,center=False,axis=0]

In [53]:
s.rolling(window=50).mean()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04         NaN
2020-01-05         NaN
                ...   
2022-09-22    0.033686
2022-09-23    0.081512
2022-09-24    0.078890
2022-09-25    0.045592
2022-09-26    0.006374
Freq: D, Length: 1000, dtype: float64

#### min_periods参数是指需要的非缺失数据点数量阀值

In [54]:
s.rolling(window=50,min_periods=3).mean().head()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03   -0.124792
2020-01-04   -0.286584
2020-01-05   -0.408972
Freq: D, dtype: float64

#### count/sum/mean/median/min/max/std/var/skew/kurt/quantile/cov/corr都是常用的聚合函数

#### b）rolling的apply聚合

#### 使用apply聚合时，只需记住传入的是window大小的Series，输出的必须是标量即可，比如如下计算变异系数

In [55]:
s.rolling(window=50,min_periods=3).apply(lambda x:x.std()/x.mean()).head()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03   -6.639086
2020-01-04   -2.616610
2020-01-05   -1.723154
Freq: D, dtype: float64

#### c）基于时间的rolling

In [57]:
s.rolling('15D').mean().head()

2020-01-01    0.696212
2020-01-02   -0.132191
2020-01-03   -0.124792
2020-01-04   -0.286584
2020-01-05   -0.408972
Freq: D, dtype: float64

#### 可选closed='right'（默认）\'left'\'both'\'neither'参数，决定端点的包含情况

In [58]:
s.rolling('15D', closed='right').sum().head()

2020-01-01    0.696212
2020-01-02   -0.264383
2020-01-03   -0.374375
2020-01-04   -1.146336
2020-01-05   -2.044859
Freq: D, dtype: float64

### 2. Expanding

#### a）expanding函数

#### 普通的expanding函数等价与rolling(window=len(s),min_periods=1)，是对序列的累计计算

In [60]:
s.rolling(window=len(s),min_periods=1).sum().head()

2020-01-01    0.696212
2020-01-02   -0.264383
2020-01-03   -0.374375
2020-01-04   -1.146336
2020-01-05   -2.044859
Freq: D, dtype: float64

In [61]:
s.expanding().sum().head()

2020-01-01    0.696212
2020-01-02   -0.264383
2020-01-03   -0.374375
2020-01-04   -1.146336
2020-01-05   -2.044859
Freq: D, dtype: float64

In [62]:
# apply方法也是同样可用的
s.expanding().apply(lambda x:sum(x)).head()

2020-01-01    0.696212
2020-01-02   -0.264383
2020-01-03   -0.374375
2020-01-04   -1.146336
2020-01-05   -2.044859
Freq: D, dtype: float64

#### b）几个特别的Expanding类型函数
#### cumsum/cumprod/cummax/cummin都是特殊expanding累计计算方法

In [63]:
s.cumsum().head()

2020-01-01    0.696212
2020-01-02   -0.264383
2020-01-03   -0.374375
2020-01-04   -1.146336
2020-01-05   -2.044859
Freq: D, dtype: float64

#### shift/diff/pct_change都是涉及到了元素关系
#### shift是指序列索引不变，但值向后移动
#### diff是指前后元素的差，period参数表示间隔，默认为1，并且可以为负
#### pct_change是值前后元素的变化百分比，period参数与diff类似

In [64]:
s.shift(2).head()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03    0.696212
2020-01-04   -0.960595
2020-01-05   -0.109992
Freq: D, dtype: float64

In [65]:
s.diff(3).head()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04   -1.468173
2020-01-05    0.062072
Freq: D, dtype: float64

In [66]:
s.pct_change(3).head()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04   -2.108802
2020-01-05   -0.064618
Freq: D, dtype: float64