# Financial and Economic Data Applications
- 크로스섹션 : 고정된 한 시간 위치에 존재하는 데이터
    - 한 단면을 살펴봄으로써 문제점 확인

In [1]:
from __future__ import division
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
pd.options.display.max_rows = 12
np.set_printoptions(precision=4, suppress=True)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(12, 6))

In [None]:
%matplotlib inline

In [2]:
%pwd

'C:\\Users\\tmznq\\workspace\\ml_scratch\\seoul_coding_academy\\week5'

___
## 빈도가 다른 두 시계열 결합
- DatatimeIndex
    - 규칙적인 빈도를 갖는다면 : resample
    - 불규칙적인 빈도를 갖는다면 : reindex

- PeriodIndex
    - asfreq으로 Period 변환
    - reindex로 색인을 맞춤.

### 시간 데이터 추출
- 특정 시간 데이터 추출
    - ts[time(hour, min)]
    - ts[time(h1, m1):time(h2,m2)]

- 특정 시간 근접 데이터 추출
    - date_range 생성
    - ts.asof(date_range)

### pandas_datareader 주식 데이터 가져오기
- get_data_yahoo
- 종목코드인 name을 입력하는 방법:
    - 종목코드.KS
    - 종목코드.KQ

### 요인에 따른 수익 분석
- 요인
    - 주가를 결정하는 여러가지 요소
    - 환율, 금리, 반도체지수, 실업률 등등
    - 숨어 있는 결정적인 요소
    - 여러 변수들 중에서 결정적인 요소에 해당하는 변수
- 테마별/ 업종별 요인에 대한 스코어 도출
    - 회귀분석

## Data munging topics

### Time series and cross-section alignment
- 금융 데이터를 다룰 때, 가장많은 시간이 소요되는 것은 정렬이다.

In [7]:
close_px = pd.read_csv('C:/Users/tmznq/workspace/ml_scratch/seoul_coding_academy/pydata-book-1st-edition/ch11/stock_px.csv', parse_dates=True, index_col=0)
volume = pd.read_csv('C:/Users/tmznq/workspace/ml_scratch/seoul_coding_academy/pydata-book-1st-edition/ch11/volume.csv', parse_dates=True, index_col=0)
prices = close_px.loc['2011-09-05':'2011-09-14', ['AAPL', 'JNJ', 'SPX', 'XOM']]
volume = volume.loc['2011-09-05':'2011-09-12', ['AAPL', 'JNJ', 'XOM']]

In [5]:
prices.head() # 종가

Unnamed: 0,AAPL,JNJ,SPX,XOM
2011-09-06,379.74,64.64,1165.24,71.15
2011-09-07,383.93,65.43,1198.62,73.65
2011-09-08,384.14,64.95,1185.9,72.82
2011-09-09,377.48,63.64,1154.23,71.01
2011-09-12,379.94,63.59,1162.27,71.84


In [6]:
volume.head() # 거래량

Unnamed: 0,AAPL,JNJ,XOM
2011-09-06,18173500.0,15848300.0,25416300.0
2011-09-07,12492000.0,10759700.0,23108400.0
2011-09-08,14839800.0,15551500.0,22434800.0
2011-09-09,20171900.0,17008200.0,27969100.0
2011-09-12,16697300.0,13448200.0,26205800.0


In [8]:
prices * volume.head()

Unnamed: 0,AAPL,JNJ,SPX,XOM
2011-09-06,6901205000.0,1024434000.0,,1808370000.0
2011-09-07,4796054000.0,704007200.0,,1701934000.0
2011-09-08,5700561000.0,1010070000.0,,1633702000.0
2011-09-09,7614489000.0,1082402000.0,,1986086000.0
2011-09-12,6343972000.0,855171000.0,,1882625000.0
2011-09-13,,,,
2011-09-14,,,,


In [9]:
vwap = (prices * volume).sum() / volume.sum()

In [10]:
vwap

AAPL    380.655181
JNJ      64.394769
SPX            NaN
XOM      72.024288
dtype: float64

In [11]:
vwap.dropna()

AAPL    380.655181
JNJ      64.394769
XOM      72.024288
dtype: float64

In [12]:
prices.align(volume, join='inner')

(              AAPL    JNJ    XOM
 2011-09-06  379.74  64.64  71.15
 2011-09-07  383.93  65.43  73.65
 2011-09-08  384.14  64.95  72.82
 2011-09-09  377.48  63.64  71.01
 2011-09-12  379.94  63.59  71.84,
                   AAPL         JNJ         XOM
 2011-09-06  18173500.0  15848300.0  25416300.0
 2011-09-07  12492000.0  10759700.0  23108400.0
 2011-09-08  14839800.0  15551500.0  22434800.0
 2011-09-09  20171900.0  17008200.0  27969100.0
 2011-09-12  16697300.0  13448200.0  26205800.0)

In [15]:
pd.concat([prices, volume], axis=1, keys=['close', 'volume'], names=['stats', 'stocks'])

stats,close,close,close,close,volume,volume,volume
stocks,AAPL,JNJ,SPX,XOM,AAPL,JNJ,XOM
2011-09-06,379.74,64.64,1165.24,71.15,18173500.0,15848300.0,25416300.0
2011-09-07,383.93,65.43,1198.62,73.65,12492000.0,10759700.0,23108400.0
2011-09-08,384.14,64.95,1185.9,72.82,14839800.0,15551500.0,22434800.0
2011-09-09,377.48,63.64,1154.23,71.01,20171900.0,17008200.0,27969100.0
2011-09-12,379.94,63.59,1162.27,71.84,16697300.0,13448200.0,26205800.0
2011-09-13,384.62,63.61,1172.87,71.65,,,
2011-09-14,389.3,63.73,1188.68,72.64,,,


In [16]:
s1 = Series(range(3), index=['a', 'b', 'c'])
s2 = Series(range(4), index=['d', 'b', 'c', 'e'])
s3 = Series(range(3), index=['f', 'a', 'c'])
DataFrame({'one': s1, 'two': s2, 'three': s3})

Unnamed: 0,one,two,three
a,0.0,,1.0
b,1.0,1.0,
c,2.0,2.0,2.0
d,,0.0,
e,,3.0,
f,,,0.0


- 서로 다르게 색인되어 있어도 DataFrame으로 만들면 자동으로 색인을 합쳐서 만듦.

In [None]:
DataFrame({'one': s1, 'two': s2, 'three': s3}, index=list('face'))

- 특정 index만 추출해서 작업 가능 (face -> datetime으로 사용가능)

### Operations with time series of different frequencies
- 빈도가 다른 데이터일 경우
    - resample : 빈도 변환
    - reindex : allgn을 사용하기위해 색인 변환

In [17]:
ts1 = Series(np.random.randn(3),
             index=pd.date_range('2012-6-13', periods=3, freq='W-WED'))
ts1

2012-06-13    0.100373
2012-06-20    0.648141
2012-06-27    0.080296
Freq: W-WED, dtype: float64

In [18]:
ts1.resample('B')

DatetimeIndexResampler [freq=<BusinessDay>, axis=0, closed=left, label=left, convention=start, base=0]

In [21]:
ts1.resample('B').ffill()

2012-06-13    0.100373
2012-06-14    0.100373
2012-06-15    0.100373
2012-06-18    0.100373
2012-06-19    0.100373
2012-06-20    0.648141
2012-06-21    0.648141
2012-06-22    0.648141
2012-06-25    0.648141
2012-06-26    0.648141
2012-06-27    0.080296
Freq: B, dtype: float64

In [27]:
dates = pd.DatetimeIndex(['2012-6-12', '2012-6-17', '2012-6-18',
                          '2012-6-21', '2012-6-22', '2012-6-29'])
ts2 = Series(np.random.randn(6), index=dates)
ts2

2012-06-12   -0.656146
2012-06-17    1.384827
2012-06-18    0.133136
2012-06-21   -1.700712
2012-06-22   -1.377752
2012-06-29   -1.501914
dtype: float64

In [24]:
ts1.reindex(ts2.index).ffill()

2012-06-12   NaN
2012-06-17   NaN
2012-06-18   NaN
2012-06-21   NaN
2012-06-22   NaN
2012-06-29   NaN
dtype: float64

In [28]:
ts2 + ts1.reindex(ts2.index).ffill()

2012-06-12   NaN
2012-06-17   NaN
2012-06-18   NaN
2012-06-21   NaN
2012-06-22   NaN
2012-06-29   NaN
dtype: float64

#### pandas resampler의 다양한 interpolation 사용하기
Resampler.interpolate(method='linear', axis=0, limit=None, inplace=False, limit_direction='forward', downcast=None, \*\*kwargs):
- method : {‘linear’, ‘time’, ‘index’, ‘values’, ‘nearest’, ‘zero’,
‘slinear’, ‘quadratic’, ‘cubic’, ‘barycentric’, ‘krogh’, ‘polynomial’, ‘spline’, ‘piecewise_polynomial’, ‘from_derivatives’, ‘pchip’, ‘akima’}
    - linear: 데이터 간격에 대한 균등 선형 보간. BDay가 freq로 주어진 경우 time보다 더 적절.
    - time: Daily 이상의 상세 freq를 갖는 시계열에서 freq 단위로 균등 선형 보간. BDay가 아닌 Day로만 보간.
    - index, values: index의 실제 수치 값으로 보간. 시계열에서는 time과 동일한 결과.
    - nearest, zero, slinear quardratic, cubic, barycentric, polynomial: `scipy.interpolate.interp1d`로 전달된다.
        - polynomial과 spline은 모두 차수(int)를 지정해야 한다.
        - 예: df.interpolate(method='polynomial', order=4)
    - krogh, piecewise_polynomial spline pchip, akima: scipy 보간법의 유사명칭에 대한 wrapper이다.
        - index의 실제 수치값을 사용하며,
        - 자세한 내용은 [scipy 문서](https://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation)와 관련 [tutorials](https://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html)를 참조.
    - from_derivatives: scipy 0.18에서 `piecewise_polynomial`을 대체하는 `BPoly.from_derivatives`를 따른다.
- axis : {0, 1}, default 0
    - 0: fill column-by-column
    - 1: fill row-by-row
- limit : int, default None. 최대 연속 NaN. 이보다 큰 추가적인 NaN 보간하지 않는다.
- limit_direction : {'forward', 'backward', 'both'}, default 'forward'.
- kwargs : 보간 함수에 전달될 파라미터

현재는 Resampler.interpolate() 메소드로 갱신되었다:<br>
Resampler.interpolate(method='linear', axis=0, limit=None, inplace=False, limit_direction='forward', limit_area=None, downcast=None, **kwargs

In [29]:
DataFrame({'linear':ts1.resample('B').interpolate('linear'),
          'time': ts1.resample('B').interpolate('time'),
          'index':ts1.resample('B').interpolate('index'),
          'values':ts1.resample('B').interpolate('values'),
          'nearest':ts1.resample('B').interpolate('nearest'),
          'zero':ts1.resample('B').interpolate('zero'),
          'slinear':ts1.resample('B').interpolate('time'),
          'quadratic':ts1.resample('B').interpolate('time'),
          'cubic':ts1.resample('B').interpolate('time'),
          'barycentric':ts1.resample('B').interpolate('time'),
          'krogh':ts1.resample('B').interpolate('time'),
          'polynomial':ts1.resample('B').interpolate('time'),
          'spline':ts1.resample('B').interpolate('time'),
          })

Unnamed: 0,linear,time,index,values,nearest,zero,slinear,quadratic,cubic,barycentric,krogh,polynomial,spline
2012-06-13,0.100373,0.100373,0.100373,0.100373,0.100373,0.100373,0.100373,0.100373,0.100373,0.100373,0.100373,0.100373,0.100373
2012-06-14,0.209927,0.178626,0.178626,0.178626,0.100373,0.100373,0.178626,0.178626,0.178626,0.178626,0.178626,0.178626,0.178626
2012-06-15,0.31948,0.256878,0.256878,0.256878,0.100373,0.100373,0.256878,0.256878,0.256878,0.256878,0.256878,0.256878,0.256878
2012-06-18,0.429034,0.491636,0.491636,0.491636,0.648141,0.100373,0.491636,0.491636,0.491636,0.491636,0.491636,0.491636,0.491636
2012-06-19,0.538587,0.569888,0.569888,0.569888,0.648141,0.100373,0.569888,0.569888,0.569888,0.569888,0.569888,0.569888,0.569888
2012-06-20,0.648141,0.648141,0.648141,0.648141,0.648141,0.648141,0.648141,0.648141,0.648141,0.648141,0.648141,0.648141,0.648141
2012-06-21,0.534572,0.56702,0.56702,0.56702,0.648141,0.648141,0.56702,0.56702,0.56702,0.56702,0.56702,0.56702,0.56702
2012-06-22,0.421003,0.485899,0.485899,0.485899,0.648141,0.648141,0.485899,0.485899,0.485899,0.485899,0.485899,0.485899,0.485899
2012-06-25,0.307434,0.242538,0.242538,0.242538,0.080296,0.648141,0.242538,0.242538,0.242538,0.242538,0.242538,0.242538,0.242538
2012-06-26,0.193865,0.161417,0.161417,0.161417,0.080296,0.648141,0.161417,0.161417,0.161417,0.161417,0.161417,0.161417,0.161417


#### Using periods instead of timestamps
- 다양한 빈도의 시계열 데이터를 다루기 위해 사용.

In [30]:
gdp = Series([1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46],
             index=pd.period_range('1984Q2', periods=7, freq='Q-SEP'))
infl = Series([0.025, 0.045, 0.037, 0.04],
              index=pd.period_range('1982', periods=4, freq='A-DEC'))
gdp

1984Q2    1.78
1984Q3    1.94
1984Q4    2.08
1985Q1    2.01
1985Q2    2.15
1985Q3    2.31
1985Q4    2.46
Freq: Q-SEP, dtype: float64

In [31]:
infl

1982    0.025
1983    0.045
1984    0.037
1985    0.040
Freq: A-DEC, dtype: float64

- 위 예시처럼 서로 다른 빈도의 시계열일 때

In [36]:
infl_q = infl.asfreq('Q-SEP', how='end')

- 1982년이 'end'로 처리되고 1982년의 DEC로 생각함.
- start와 end로 각각 해보장

In [37]:
infl_q

1983Q1    0.025
1984Q1    0.045
1985Q1    0.037
1986Q1    0.040
Freq: Q-SEP, dtype: float64

In [39]:
infl_q.reindex(gdp.index).ffill()

1984Q2      NaN
1984Q3      NaN
1984Q4      NaN
1985Q1    0.037
1985Q2    0.037
1985Q3    0.037
1985Q4    0.037
Freq: Q-SEP, dtype: float64

- 빈도를 맞춘 infl_q를 reindex를 통해 보간을 해준다.
- 올바르지 않은 방법이므로 <- why? sol) interpolation의 linear(선형보간)를 사용해야 한다.

### Time of day and "as of" data selection

In [40]:
# Make an intraday date range and time series
rng = pd.date_range('2012-06-01 09:30', '2012-06-01 15:59', freq='T')
# Make a 5-day series of 9:30-15:59 values
rng = rng.append([rng + pd.offsets.BDay(i) for i in range(1, 4)])
ts = Series(np.arange(len(rng), dtype=float), index=rng)
ts

2012-06-01 09:30:00       0.0
2012-06-01 09:31:00       1.0
2012-06-01 09:32:00       2.0
2012-06-01 09:33:00       3.0
2012-06-01 09:34:00       4.0
2012-06-01 09:35:00       5.0
                        ...  
2012-06-06 15:54:00    1554.0
2012-06-06 15:55:00    1555.0
2012-06-06 15:56:00    1556.0
2012-06-06 15:57:00    1557.0
2012-06-06 15:58:00    1558.0
2012-06-06 15:59:00    1559.0
Length: 1560, dtype: float64

- datetime.time 객체를 통해 원하는 대인 10시 0분으로 indexing 할 수 있다.

In [41]:
from datetime import time
ts[time(10, 0)]

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64

- 모든 날짜의 10시 0분 데이터를 가져옴.

In [42]:
ts.at_time(time(10, 0)) # 똑같은 결과

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64

In [43]:
ts.between_time(time(10, 0), time(10, 1)) # 10시 ~ 10시 1분 사이 데이터

2012-06-01 10:00:00      30.0
2012-06-01 10:01:00      31.0
2012-06-04 10:00:00     420.0
2012-06-04 10:01:00     421.0
2012-06-05 10:00:00     810.0
2012-06-05 10:01:00     811.0
2012-06-06 10:00:00    1200.0
2012-06-06 10:01:00    1201.0
dtype: float64

In [44]:
ts[time(10, 0):time(10,1)] # 똑같은 결과. 이게 더 좋을 듯.

2012-06-01 10:00:00      30.0
2012-06-01 10:01:00      31.0
2012-06-04 10:00:00     420.0
2012-06-04 10:01:00     421.0
2012-06-05 10:00:00     810.0
2012-06-05 10:01:00     811.0
2012-06-06 10:00:00    1200.0
2012-06-06 10:01:00    1201.0
dtype: float64

In [45]:
np.random.seed(12346)

In [46]:
# Set most of the time series randomly to NA
indexer = np.sort(np.random.permutation(len(ts))[700:])
irr_ts = ts.copy()
irr_ts[indexer] = np.nan
irr_ts['2012-06-01 09:50':'2012-06-01 10:00']

2012-06-01 09:50:00    20.0
2012-06-01 09:51:00     NaN
2012-06-01 09:52:00    22.0
2012-06-01 09:53:00    23.0
2012-06-01 09:54:00     NaN
2012-06-01 09:55:00    25.0
2012-06-01 09:56:00     NaN
2012-06-01 09:57:00     NaN
2012-06-01 09:58:00     NaN
2012-06-01 09:59:00     NaN
2012-06-01 10:00:00     NaN
dtype: float64

- 원하는 10시의 데이터를 찾아보기 위함

In [48]:
selection = pd.date_range('2012-06-01 10:00', periods=4, freq='B')
selection

DatetimeIndex(['2012-06-01 10:00:00', '2012-06-04 10:00:00',
               '2012-06-05 10:00:00', '2012-06-06 10:00:00'],
              dtype='datetime64[ns]', freq='B')

In [47]:
irr_ts.asof(selection) # asof : 해당 값이 결측치일 때, 가장 근접한 데이터를 찾아주는 함수.

2012-06-01 10:00:00      25.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1197.0
Freq: B, dtype: float64

- ex) iot 데이터를 다룰 때, 해당 시간에 누락된 데이터가 있더라도 근처의 값을 찾아서 얻어낼 수 있음.

### Splicing together data sources
- 어떤 데이터 원본을 특점 시점의 데이터로 변환
- 데이터를 국가 또는 자산지표 등의 기준으로 분류하여 대체하기

- 여러 df를 단일 시계열로 묶는 방법으로 concat 사용

In [50]:
data1 = DataFrame(np.ones((6, 3), dtype=float),
                  columns=['a', 'b', 'c'],
                  index=pd.date_range('6/12/2012', periods=6))
data2 = DataFrame(np.ones((6, 3), dtype=float) * 2,
                  columns=['a', 'b', 'c'],
                  index=pd.date_range('6/13/2012', periods=6))
spliced = pd.concat([data1.loc[:'2012-06-14'], data2.loc['2012-06-15':]])
spliced

Unnamed: 0,a,b,c
2012-06-12,1.0,1.0,1.0
2012-06-13,1.0,1.0,1.0
2012-06-14,1.0,1.0,1.0
2012-06-15,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0


In [52]:
data2 = DataFrame(np.ones((6, 4), dtype=float) * 2,
                  columns=['a', 'b', 'c', 'd'],
                  index=pd.date_range('6/13/2012', periods=6))
spliced = pd.concat([data1.loc[:'2012-06-14'], data2.loc['2012-06-15':]])
spliced

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,
2012-06-14,1.0,1.0,1.0,
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


In [53]:
spliced_filled = spliced.combine_first(data2)
spliced_filled

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,2.0
2012-06-14,1.0,1.0,1.0,2.0
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


- df와 other의 두 df를 결합하는데 NaN값은 가져와
- 새로운 df 반환

In [54]:
spliced.update(data2, overwrite=False)

- NaN값만 갱신하는 update 함수
- df.update()
- df 객체의 값 자체를 바꿔준다(inplace). 새로운 df 반환 X

In [55]:
spliced

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,2.0
2012-06-14,1.0,1.0,1.0,2.0
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


In [None]:
cp_spliced = spliced.copy()
cp_spliced[['a', 'c']] = data1[['a', 'c']]
cp_spliced

- 값을 직접 바꿔주는 방법

### Return indexes and cumulative returns

In [61]:
import pandas_datareader as pdr
price = pdr.get_data_yahoo('AAPL', '2011-01-01')
price[-5:]

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-03-28,189.559998,187.529999,188.949997,188.720001,20780400.0,188.720001
2019-03-29,190.080002,188.539993,189.830002,189.949997,23564000.0,189.949997
2019-04-01,191.679993,188.380005,191.639999,191.240005,27862000.0,191.240005
2019-04-02,194.460007,191.050003,191.089996,194.020004,22765700.0,194.020004
2019-04-03,196.5,193.149994,193.25,195.350006,23249700.0,195.350006


- Adj Close : 배당한 값까지 합쳐진 주가

In [71]:
# 수익률
price.Close[1] / price.Close[0] - 1

0.02173234932691459

In [72]:
# 수익률
intrate = (price.Close / price.Close.shift(1)) - 1
intrate[:2]

Date
2010-12-31         NaN
2011-01-03    0.021732
Name: Close, dtype: float64

In [66]:
# 수익률 : 위와 동일
returns = price.pct_change()
returns[:2]

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-12-31,,,,,,
2011-01-03,0.02096,0.010986,0.008329,0.021732,1.300362,0.021732


- cumprod() : 누적곱, 첫 번째 값 대비 현재 값의 비율
- c2/c1, c3/c1, c4/c1
- 처음 상장된 이후 특정 기간 대비 얼마나 올랐나 알아보기위함. ex) 첫 상장일과 100일 후의 변화율

In [73]:
ret_index = (1 + returns).cumprod() # 1 + returns : 오늘의 수익률?
ret_index[0] = 1  # Set first value to 1
ret_index

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,0
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-12-31,,,,,,,1
2011-01-03,1.020960,1.010986,1.008329,1.021732,2.300362,1.021732,1
2011-01-04,1.027884,1.021288,1.029385,1.027065,1.597251,1.027064,1
2011-01-05,1.033572,1.025489,1.020437,1.035466,1.320460,1.035466,1
2011-01-06,1.036386,1.036071,1.036445,1.034629,1.552539,1.034629,1
2011-01-07,1.039786,1.032959,1.034185,1.042039,1.611981,1.042038,1
...,...,...,...,...,...,...,...
2019-03-27,4.106343,4.064144,4.091190,4.090061,0.616996,6.133835,1
2019-03-28,4.102015,4.085494,4.095525,4.095486,0.429551,6.141971,1
2019-03-29,4.113268,4.107497,4.114600,4.122179,0.487091,6.182002,1


In [75]:
m_returns = ret_index.resample('BM').last().pct_change()
m_returns['2012']

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,0
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-01-31,0.127892,0.122878,0.129067,0.127111,1.180114,0.127111,0.0
2012-02-29,0.195029,0.182378,0.1887,0.188311,1.430562,0.188311,0.0
2012-03-30,0.114954,0.116184,0.124104,0.105284,-0.232112,0.105284,0.0
2012-04-30,-0.019916,-0.024986,-0.01802,-0.02597,-0.307635,-0.025969,0.0
2012-05-31,-0.028242,-0.019794,-0.028538,-0.010702,-0.028589,-0.010702,0.0
2012-06-29,0.004299,0.004882,-0.004718,0.010853,-0.142724,0.010853,0.0
2012-07-31,0.047431,0.049578,0.043651,0.045822,0.096861,0.045822,0.0
2012-08-31,0.093019,0.090473,0.106129,0.0892,-0.268222,0.122802,0.0
2012-09-28,0.018711,0.014454,0.017235,0.002796,0.581665,0.002796,0.0
2012-10-31,-0.116207,-0.11856,-0.123565,-0.1076,-0.04692,-0.1076,0.0


- ex) ((c5/c1) / (c3/c1)) - 1 = c5/c3 - 1

In [76]:
price.Close.resample('BM').last().pct_change()['2012'] 

Date
2012-01-31    0.127111
2012-02-29    0.188311
2012-03-30    0.105284
2012-04-30   -0.025970
2012-05-31   -0.010702
2012-06-29    0.010853
2012-07-31    0.045822
2012-08-31    0.089200
2012-09-28    0.002796
2012-10-31   -0.107600
2012-11-30   -0.016865
2012-12-31   -0.090743
Freq: BM, Name: Close, dtype: float64

- WTF?

In [77]:
m_rets = (1 + returns).resample('M', how='prod', kind='period') - 1
m_rets['2012']

the new syntax is .resample(...).prod()
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01,0.127892,0.122878,0.129067,0.127111,1.180114,0.127111
2012-02,0.195029,0.182378,0.1887,0.188311,1.430562,0.188311
2012-03,0.114954,0.116184,0.124104,0.105284,-0.232112,0.105284
2012-04,-0.019916,-0.024986,-0.01802,-0.02597,-0.307635,-0.025969
2012-05,-0.028242,-0.019794,-0.028538,-0.010702,-0.028589,-0.010702
2012-06,0.004299,0.004882,-0.004718,0.010853,-0.142724,0.010853
2012-07,0.047431,0.049578,0.043651,0.045822,0.096861,0.045822
2012-08,0.093019,0.090473,0.106129,0.0892,-0.268222,0.122802
2012-09,0.018711,0.014454,0.017235,0.002796,0.581665,0.002796
2012-10,-0.116207,-0.11856,-0.123565,-0.1076,-0.04692,-0.1076


- 'M', 누적 prod, period data로 변환 후 -1
- 위 2개와 전부 결과 동일

#### 주식 데이터 가져오기
- import pandas_datareader.data as web

## Group transforms and analysis
- 9장 학습내용 응용
- 금융지식
    - 모멘텀을 가진 주에 투자하는 방법?
    - 여러 요인들의 값 == 모멘텀

In [79]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 10
np.random.seed(12345)

In [83]:
import random; random.seed(0)
import string

N = 1000
def rands(n):
    choices = string.ascii_uppercase # 문자를 ascii 코드로 바꾸고 대문자로!
    return ''.join([random.choice(choices) for _ in range(n)]) # null string으로 묶기.
tickers = np.array([rands(5) for _ in range(N)])

In [84]:
tickers[:5] # 주식 종목에 대한 임의 코드 생성

array(['MYNBI', 'QPMZJ', 'PLSGQ', 'EJEYD', 'TZIRW'], dtype='<U5')

In [86]:
# 종목별 표
M = 500
df = DataFrame({'Momentum' : np.random.randn(M) / 200 + 0.03,
                'Value' : np.random.randn(M) / 200 + 0.08,
                'ShortInterest' : np.random.randn(M) / 200 - 0.02}, # 단기 이자율
                index=tickers[:M])
df[:5]

Unnamed: 0,Momentum,Value,ShortInterest
MYNBI,0.028797,0.077106,-0.02466
QPMZJ,0.030815,0.089237,-0.010868
PLSGQ,0.027067,0.082269,-0.021151
EJEYD,0.032073,0.078488,-0.024193
TZIRW,0.032136,0.087013,-0.024851


In [92]:
ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N) # boolean indexing. 0 == False, 1 == True
industries = Series(ind_names[sampler], index=tickers,
                    name='industry')
industries[:5]

MYNBI    FINANCIAL
QPMZJ         TECH
PLSGQ    FINANCIAL
EJEYD    FINANCIAL
TZIRW    FINANCIAL
Name: industry, dtype: object

In [93]:
# 유망한 업종 판별
by_industry = df.groupby(industries)
by_industry.mean()

Unnamed: 0_level_0,Momentum,Value,ShortInterest
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FINANCIAL,0.0306,0.079825,-0.019672
TECH,0.03004,0.080096,-0.020085


In [94]:
by_industry.describe()

Unnamed: 0_level_0,Momentum,Momentum,Momentum,Momentum,Momentum,...,ShortInterest,ShortInterest,ShortInterest,ShortInterest,ShortInterest
Unnamed: 0_level_1,count,mean,std,min,25%,...,min,25%,50%,75%,max
industry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
FINANCIAL,253.0,0.0306,0.00504,0.014715,0.027611,...,-0.033259,-0.023597,-0.019568,-0.015864,-0.007475
TECH,247.0,0.03004,0.005267,0.011273,0.026649,...,-0.031289,-0.024108,-0.019899,-0.016701,-0.008714


In [96]:
# 테마별 표준화(정규화)
# Within-Industry Standardize
def zscore(group):
    return (group - group.mean()) / group.std() # 표준 정규분포화

df_stand = by_industry.apply(zscore)
df_stand[:5]

Unnamed: 0,Momentum,Value,ShortInterest
MYNBI,-0.357653,-0.529697,-0.959026
QPMZJ,0.147161,1.883457,1.901442
PLSGQ,-0.701008,0.47592,-0.284267
EJEYD,0.292329,-0.260533,-0.86917
TZIRW,0.3048,1.399974,-0.995678


In [97]:
df_stand.groupby(industries).agg(['mean', 'std'])

Unnamed: 0_level_0,Momentum,Momentum,Value,Value,ShortInterest,ShortInterest
Unnamed: 0_level_1,mean,std,mean,std,mean,std
industry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
FINANCIAL,2.976977e-15,1.0,1.218788e-14,1.0,-8.486843e-16,1.0
TECH,3.478998e-16,1.0,2.7387e-15,1.0,-2.002896e-15,1.0


- 분산은 1, 평균은 0이 되었다!

In [99]:
# 순위 확인 : monmentum을 확인해서 투자할 곳을 결정하기 위함.
# Within-industry rank descending
ind_rank = by_industry.rank(ascending=False)
ind_rank[:5]

Unnamed: 0,Momentum,Value,ShortInterest
MYNBI,165.0,182.0,213.0
QPMZJ,108.0,8.0,9.0
PLSGQ,200.0,77.0,150.0
EJEYD,97.0,154.0,206.0
TZIRW,94.0,20.0,215.0


In [100]:
ind_rank.groupby(industries).agg(['min', 'max'])

Unnamed: 0_level_0,Momentum,Momentum,Value,Value,ShortInterest,ShortInterest
Unnamed: 0_level_1,min,max,min,max,min,max
industry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
FINANCIAL,1.0,253.0,1.0,253.0,1.0,253.0
TECH,1.0,247.0,1.0,247.0,1.0,247.0


- 투자 우선순위로 253개는 financial, 247개는 tech의 주를 찾음

In [103]:
# 여러 지표들 중에 순위별로 찾아냄.
# Industry rank and standardize
by_industry.apply(lambda x: zscore(x.rank()))[:5] 

Unnamed: 0,Momentum,Value,ShortInterest
MYNBI,-0.519275,-0.751582,-1.1752
QPMZJ,0.223942,1.623582,1.609586
PLSGQ,-0.997554,0.683256,-0.314298
EJEYD,0.409954,-0.368958,-1.079545
TZIRW,0.450949,1.462168,-1.202531


### Group factor exposures
- 요인 분석 : 포트폴리오 관리 기법
- 아래는 임의로 만든 3가지 요인(표준화된 인지부하)과 가중치를 가지고 만든 가상의 포트폴리오임.

In [106]:
from numpy.random import rand
fac1, fac2, fac3 = np.random.rand(3, 1000)
fac1[:5]

array([0.1074, 0.9408, 0.7529, 0.7319, 0.048 ])

In [105]:
ticker_subset = tickers.take(np.random.permutation(N)[:1000])
ticker_subset[:5]

array(['DSUDQ', 'LAPUB', 'VYSHG', 'PHDVM', 'YQFAU'], dtype='<U5')

- 위에서는 ticker를 마구잡이로 섞고
- 아래에서는 각 ticker에 대한 요인 점수를 port / 포트폴리오
- 각 ticker에 대한 요인 값을 factors로하는 df 만듦.

In [110]:
# Weighted sum of factors plus noise
port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + rand(1000),
              index=ticker_subset)
factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                    index=ticker_subset)
factors[:5]

Unnamed: 0,f1,f2,f3
DSUDQ,0.10744,0.416501,0.675653
LAPUB,0.940784,0.835894,0.93388
VYSHG,0.752854,0.823489,0.232738
PHDVM,0.731861,0.345268,0.958379
YQFAU,0.047961,0.70752,0.707575


In [111]:
port.head() # port : 각 주의 momentum?

DSUDQ   -0.065111
LAPUB    0.837096
VYSHG    0.062473
PHDVM    1.041935
YQFAU   -0.360411
dtype: float64

- 요인 factor와 포트폴리오 port 간의 상관관계를 보자.

In [112]:
factors.corrwith(port)

f1    0.367878
f2   -0.691712
f3    0.166006
dtype: float64

정의한 값이 아래와 같으므로 위와 같은 결과가 나옴.
- fac1 : 0.7
- fac2 : -1.2
- fac3 : 0.3

- 가중치의 범위는 0과 1 사이, 따라서 그 절반 값들이 나올 수 밖에 없음.

- 요인을 밝히기 위해 회귀 분석을 통해 전체 종목에 대한 요인을 밝힐 수 있다.

In [116]:
!pip install statsmodels

Collecting statsmodels
  Downloading https://files.pythonhosted.org/packages/41/a9/a89fced784543d565b49f5f1e52d6b90ad53f498eae85e09d16e1c3581a8/statsmodels-0.9.0-cp37-none-win_amd64.whl (7.0MB)
Collecting patsy (from statsmodels)
  Downloading https://files.pythonhosted.org/packages/ea/0c/5f61f1a3d4385d6bf83b83ea495068857ff8dfb89e74824c6e9eb63286d8/patsy-0.5.1-py2.py3-none-any.whl (231kB)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.9.0


In [None]:
# optimize 
# 최소 제곱법

In [117]:
# 선형 회귀 모델을 만드는 틀
from statsmodels.formula.api import ols

In [128]:
fact_reg = ols(formula = 'port ~ f1 + f2 + f3', data=pd.concat([factors, port], axis = 1)).fit()
fact_reg.params

Intercept    0.510937
f1           0.662348
f2          -1.205092
f3           0.292889
dtype: float64

In [129]:
def beta_exposure(chunk, factors=None):
    return ols(formula = 'port ~ f1 + f2 + f3', data=pd.concat([factors, port], axis = 1)).fit().params

In [None]:
by_ind = port.groupby(industries)
exposures = by_ind.apply(beta_exposure, factors=factors)
exposures.unstack()

### Decile and quartile analysis
- rank에 의한 가중치를 이용해서 얼마나 투자할지를 결정하는 것.

In [None]:
import pandas.io.data as web
data = web.get_data_yahoo('SPY', '2006-01-01')
data.info()

In [None]:
px = data['Adj Close']
returns = px.pct_change()

def to_index(rets):
    index = (1 + rets).cumprod()
    first_loc = max(index.index.get_loc(index.idxmax()) - 1, 0)
    index.values[first_loc] = 1
    return index

def trend_signal(rets, lookback, lag):
    signal = pd.rolling_sum(rets, lookback, min_periods=lookback - 5)
    return signal.shift(lag)

In [None]:
signal = trend_signal(returns, 100, 3)
trade_friday = signal.resample('W-FRI').resample('B', fill_method='ffill')
trade_rets = trade_friday.shift(1) * returns
trade_rets = trade_rets[:len(returns)]

In [None]:
to_index(trade_rets).plot()

In [None]:
vol = pd.rolling_std(returns, 250, min_periods=200) * np.sqrt(250)

def sharpe(rets, ann=250):
    return rets.mean() / rets.std()  * np.sqrt(ann)

In [None]:
cats = pd.qcut(vol, 4)
print('cats: %d, trade_rets: %d, vol: %d' % (len(cats), len(trade_rets), len(vol)))

In [None]:
trade_rets.groupby(cats).agg(sharpe)

## More example applications

### Signal frontier analysis

In [None]:
names = ['AAPL', 'GOOG', 'MSFT', 'DELL', 'GS', 'MS', 'BAC', 'C']
def get_px(stock, start, end):
    return web.get_data_yahoo(stock, start, end)['Adj Close']
px = DataFrame({n: get_px(n, None, None) for n in names})

In [None]:
#px = pd.read_csv('ch11/stock_px.csv')

In [None]:
plt.close('all')

In [None]:
px = px.asfreq('B').fillna(method='pad')
rets = px.pct_change()
((1 + rets).cumprod() - 1).plot()

In [None]:
def calc_mom(price, lookback, lag):
    mom_ret = price.shift(lag).pct_change(lookback)
    ranks = mom_ret.rank(axis=1, ascending=False)
    demeaned = ranks.subtract(ranks.mean(axis=1), axis=0)
    return demeaned.divide(demeaned.std(axis=1), axis=0)

In [None]:
compound = lambda x : (1 + x).prod() - 1
daily_sr = lambda x: x.mean() / x.std()

def strat_sr(prices, lb, hold):
    # Compute portfolio weights
    freq = '%dB' % hold
    port = calc_mom(prices, lb, lag=1)

    daily_rets = prices.pct_change()

    # Compute portfolio returns
    port = port.shift(1).resample(freq, how='first')
    returns = daily_rets.resample(freq, how=compound)
    port_rets = (port * returns).sum(axis=1)

    return daily_sr(port_rets) * np.sqrt(252 / hold)

In [None]:
strat_sr(px, 70, 30)

In [None]:
from collections import defaultdict

lookbacks = range(20, 90, 5)
holdings = range(20, 90, 5)
dd = defaultdict(dict)
for lb in lookbacks:
    for hold in holdings:
        dd[lb][hold] = strat_sr(px, lb, hold)

ddf = DataFrame(dd)
ddf.index.name = 'Holding Period'
ddf.columns.name = 'Lookback Period'

In [None]:
import matplotlib.pyplot as plt

def heatmap(df, cmap=plt.cm.gray_r):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    axim = ax.imshow(df.values, cmap=cmap, interpolation='nearest')
    ax.set_xlabel(df.columns.name)
    ax.set_xticks(np.arange(len(df.columns)))
    ax.set_xticklabels(list(df.columns))
    ax.set_ylabel(df.index.name)
    ax.set_yticks(np.arange(len(df.index)))
    ax.set_yticklabels(list(df.index))
    plt.colorbar(axim)

In [None]:
heatmap(ddf)

### Future contract rolling

In [None]:
pd.options.display.max_rows = 10

In [None]:
import pandas.io.data as web
# Approximate price of S&P 500 index
px = web.get_data_yahoo('SPY')['Adj Close'] * 10
px

In [None]:
from datetime import datetime
expiry = {'ESU2': datetime(2012, 9, 21),
          'ESZ2': datetime(2012, 12, 21)}
expiry = Series(expiry).order()

In [None]:
expiry

In [None]:
np.random.seed(12347)
N = 200
walk = (np.random.randint(0, 200, size=N) - 100) * 0.25
perturb = (np.random.randint(0, 20, size=N) - 10) * 0.25
walk = walk.cumsum()

rng = pd.date_range(px.index[0], periods=len(px) + N, freq='B')
near = np.concatenate([px.values, px.values[-1] + walk])
far = np.concatenate([px.values, px.values[-1] + walk + perturb])
prices = DataFrame({'ESU2': near, 'ESZ2': far}, index=rng)

In [None]:
prices.tail()

In [None]:
def get_roll_weights(start, expiry, items, roll_periods=5):
    # start : first date to compute weighting DataFrame
    # expiry : Series of ticker -> expiration dates
    # items : sequence of contract names

    dates = pd.date_range(start, expiry[-1], freq='B')
    weights = DataFrame(np.zeros((len(dates), len(items))),
                        index=dates, columns=items)

    prev_date = weights.index[0]
    for i, (item, ex_date) in enumerate(expiry.iteritems()):
        if i < len(expiry) - 1:
            weights.ix[prev_date:ex_date - pd.offsets.BDay(), item] = 1
            roll_rng = pd.date_range(end=ex_date - pd.offsets.BDay(),
                                     periods=roll_periods + 1, freq='B')

            decay_weights = np.linspace(0, 1, roll_periods + 1)
            weights.ix[roll_rng, item] = 1 - decay_weights
            weights.ix[roll_rng, expiry.index[i + 1]] = decay_weights
        else:
            weights.ix[prev_date:, item] = 1

        prev_date = ex_date

    return weights

In [None]:
weights = get_roll_weights('6/1/2012', expiry, prices.columns)
weights.ix['2012-09-12':'2012-09-21']

In [None]:
rolled_returns = (prices.pct_change() * weights).sum(1)

### Rolling correlation and linear regression

In [None]:
aapl = web.get_data_yahoo('AAPL', '2000-01-01')['Adj Close']
msft = web.get_data_yahoo('MSFT', '2000-01-01')['Adj Close']

aapl_rets = aapl.pct_change()
msft_rets = msft.pct_change()

In [None]:
plt.figure()

In [None]:
pd.rolling_corr(aapl_rets, msft_rets, 250).plot()

In [None]:
plt.figure()

In [None]:
model = pd.ols(y=aapl_rets, x={'MSFT': msft_rets}, window=250)
model.beta

In [None]:
model.beta['MSFT'].plot()