In [34]:
import numpy as np
import pandas as pd

### Pandas資料建立
- `DataFrame`常用資料格式，類似Table格式
- 變數名稱, index, 變數內容

#### 最常見用法
- 利用 `np.random.randn`生成 100*4的矩陣
- 再轉換成DataFrame的資料型態
- 設定變數名稱 (A,B,C,D)

In [48]:
# Generate a 100x4 matrix of samples with np.random.randn,
# convert it to a DataFrame,
# and label the four columns A, B, C, D.
simulation = pd.DataFrame(
    np.random.randn(100, 4),
    columns=['A', 'B', 'C', 'D'],
)

Unnamed: 0,A,B,C,D
0,0.955164,0.569706,1.461796,-1.329101
1,-1.432139,-0.372836,1.670907,-1.168997
2,-1.786885,-1.494149,0.445907,-1.604200
3,-0.581999,1.634963,-0.336149,2.327238
4,0.120343,-0.962840,0.669944,1.105383
5,1.101188,0.171456,-0.487769,0.132233
6,0.953695,0.011830,0.038379,1.068288
7,-1.129431,-2.333244,0.882208,0.793748
8,0.306325,-0.012917,-0.704695,-0.214665
9,1.403254,-1.096972,1.293647,0.864897


- `pd.DataFrame({'key':['a','b','a','a','b','c'],'value': range(6)})`
- 兩個變數，key與value

In [23]:
# Two-column frame: a repeating 'key' label and a 'value' counter running 0..5.
left1 = pd.DataFrame(
    {
        'key': ['a', 'b', 'a', 'a', 'b', 'c'],
        'value': range(6),
    }
)

- `pd.DataFrame({'group_val': [3.5,7]},index=['a','b'])`
- 一個變數: group_val
- 設定index為 a,b

In [33]:
# Single-column frame whose row labels are set explicitly to 'a' and 'b'.
right1 = pd.DataFrame(
    {'group_val': [3.5, 7]},
    index=['a', 'b'],
)

In [28]:
# Show left1: a bare last expression is rendered as the cell's output.
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


### 生成一個資料庫內有`遺失值`
- `data.iloc[1,1] = np.nan `選擇特定位置給值(NaN)
- `.dropna(thresh=2)`只保留至少有2個**非**遺失值的資料列 (此例共3個變數，因此等同丟掉有2個以上遺失值的資料列)

In [81]:
# 7x3 frame of random samples (columns A, B, C) used below to demonstrate
# missing-value handling.
df = pd.DataFrame(
    np.random.randn(7, 3),
    columns=['A', 'B', 'C'],
)

In [88]:
# Inject missing values by position (this mutates df in place):
#   - column position 1 ('B') for the first four rows
#   - column position 2 ('C') for the first two rows
df.iloc[0:4, 1] = np.nan
df.iloc[0:2, 2] = np.nan
df

Unnamed: 0,A,B,C
0,-0.253754,,
1,-0.119045,,
2,-0.786578,,-0.611454
3,-0.536654,,-1.618277
4,1.312382,0.863454,-0.280453
5,0.052933,-1.050575,-0.120576
6,-0.010081,-0.711409,0.468316


In [90]:
# thresh=2 keeps only the rows that have at least 2 non-missing values.
# The result is a new frame; df itself is not modified.
df.dropna(thresh=2)

Unnamed: 0,A,B,C
2,-0.786578,,-0.611454
3,-0.536654,,-1.618277
4,1.312382,0.863454,-0.280453
5,0.052933,-1.050575,-0.120576
6,-0.010081,-0.711409,0.468316


### 處理遺失值
- `data.isnull()`判斷資料中的遺失值: 得到True/False
- `data.notnull()`判斷資料中**非**遺失值
- `data.dropna()`回傳非遺失值資料
- `data[data.notnull()]`意思同上 (適用於Series；若`data`為DataFrame，則不會刪除資料列，遺失值的位置仍為NaN)

### 資料合併
- `pd.merge(data1,data2,on='date')`
- 根據日期(date)合併 (也可選擇同時用兩個變數來合併) `on=['date','XX']`
- 若不設定，則自行判斷(相同變數名稱)
- `pd.merge(data1[['date','VIX1']],data2[['date','r1','r2']])`
- 只選擇特定幾個變數來合併

In [1]:
import pandas as pd

- 匯入資料

In [2]:
# Load the VIX file. header=0 marks the file's first row as its header so that
# the explicit column names passed via `names` replace it.
data1 = pd.read_csv(
    'vixts.csv',
    names=['date', 'VIX1', 'VIX2', 'VIX3', 'VIX6', 'VIX9', 'VIX12'],
    header=0,
)

In [3]:
# Load the straddle-return file. header=6 designates row 6 of the file as the
# header row (rows above it are skipped); `names` supplies the column labels
# used in its place.
data2 = pd.read_csv(
    'straddle_ret.csv',
    names=[
        'date',
        'r1', 'r2', 'r3', 'r6', 'r9', 'r12',
        'rs1', 'rs2', 'rs3', 'rs6', 'rs9', 'rs12',
    ],
    header=6,
)

- 合併資料範例

In [18]:
# Merge a small row slice of each frame. No `on=` is given, so pd.merge joins
# on the column name the two inputs have in common (here only 'date').
results = pd.merge(
    data1[['date', 'VIX1']].iloc[1:20],  # rows at positions 1..19, two columns
    data2.iloc[5:10],                    # rows at positions 5..9, all columns
)
results

Unnamed: 0,date,VIX1,r1,r2,r3,r6,r9,r12,rs1,rs2,rs3,rs6,rs9,rs12
0,19960112,0.01926,-1.134,-1.423,-3.053,-4.441,-1.237,-0.338,0.152,0.274,0.711,1.512,0.531,0.172
1,19960115,0.02199,-1.035,2.117,3.95,5.082,1.465,0.881,0.141,-0.406,-0.898,-1.662,-0.623,-0.447
2,19960116,0.01891,10.329,0.773,2.182,2.476,-0.704,0.194,-1.552,-0.156,-0.535,-0.887,0.307,-0.1
3,19960117,0.01908,-0.054,1.091,1.05,0.583,-0.306,-0.46,0.007,-0.206,-0.239,-0.195,0.131,0.235
4,19960118,0.01834,0.0,-0.011,-0.15,-0.304,-0.652,-1.144,0.0,0.002,0.035,0.103,0.28,0.584


- 若兩資料沒有相同變數名稱
- 也可進行合併，需指定變數名稱

In [20]:
# Reload both files, this time labelling the first VIX column 'Date' (capital D)
# so that the two frames no longer share a column name to join on.
data1 = pd.read_csv(
    'vixts.csv',
    names=['Date', 'VIX1', 'VIX2', 'VIX3', 'VIX6', 'VIX9', 'VIX12'],
    header=0,
)
data2 = pd.read_csv(
    'straddle_ret.csv',
    names=[
        'date',
        'r1', 'r2', 'r3', 'r6', 'r9', 'r12',
        'rs1', 'rs2', 'rs3', 'rs6', 'rs9', 'rs12',
    ],
    header=6,
)

In [22]:
# The key columns are named differently ('Date' vs 'date'), so name each side's
# key explicitly; both key columns are kept in the merged result.
results = data1.merge(data2, left_on='Date', right_on='date')
results.head()

Unnamed: 0,Date,VIX1,VIX2,VIX3,VIX6,VIX9,VIX12,date,r1,r2,r3,r6,r9,r12,rs1,rs2,rs3,rs6,rs9,rs12
0,19960105,0.01708,0.01674,0.01661,0.01828,0.02034,0.01993,19960105,-4.067,0.827,1.018,-0.307,-0.067,0.855,0.5,-0.145,-0.221,0.101,0.029,-0.428
1,19960108,0.0161,0.01589,0.01582,0.01739,0.01863,0.01955,19960108,-1.795,-0.642,-0.443,0.307,0.234,0.167,0.224,0.116,0.098,-0.101,-0.101,-0.085
2,19960109,0.02406,0.01904,0.01675,0.01858,0.01955,0.02088,19960109,34.233,9.657,4.109,-0.396,-0.324,4.148,-4.098,-1.743,-0.916,0.132,0.141,-2.104
3,19960110,0.02494,0.02151,0.02006,0.01978,0.02079,0.02184,19960110,0.968,-2.854,-1.956,0.723,-0.77,-5.758,-0.139,0.597,0.482,-0.246,0.34,3.095
4,19960111,0.01975,0.01932,0.01912,0.01829,0.0192,0.02071,19960111,-9.191,-6.238,-3.26,1.116,-1.824,-1.151,1.41,1.363,0.842,-0.404,0.835,0.594


### 主成分分析


In [None]:
from sklearn.decomposition import PCA 

In [None]:
# Select the six VIX columns that serve as the PCA input.
# The source frame is `data1` (loaded from 'vixts.csv'); the name `data` is never
# defined in this notebook, so reading from it raised NameError on a fresh run.
# Selecting with a list of column names already yields a new frame, so no
# trailing `[:]` slice is needed.
data_vix = data1[['VIX1', 'VIX2', 'VIX3', 'VIX6', 'VIX9', 'VIX12']]

In [None]:
# Fit a 2-component PCA on the six VIX columns and project the rows onto those
# two components in one step.
pca = PCA(n_components=2)
newData = pca.fit_transform(data_vix)
# Show the projected data (presumably one row per input row, two columns — TODO confirm).
newData


In [None]:
# Show the PCA input frame (bare last expression is rendered as the cell's output).
data_vix