#  <font color=red> Module_03_索引資料</font>

## 索引的重要性

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

np.random.seed(123456)
df = pd.DataFrame({'foo': np.random.random(10000), 'key': range(100, 10100)})
df

Unnamed: 0,foo,key
0,0.126970,100
1,0.966718,101
2,0.260476,102
3,0.897237,103
4,0.376750,104
...,...,...
9995,0.769913,10095
9996,0.752521,10096
9997,0.216083,10097
9998,0.448789,10098


In [2]:
df[df.key == 10099] # 直線搜尋所有資料，沒有效率

Unnamed: 0,foo,key
9999,0.272283,10099


In [3]:
%timeit df[df.key == 10099]

117 µs ± 2.19 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


---

In [4]:
df_with_index = df.set_index(['key'])
df_with_index

Unnamed: 0_level_0,foo
key,Unnamed: 1_level_1
100,0.126970
101,0.966718
102,0.260476
103,0.897237
104,0.376750
...,...
10095,0.769913
10096,0.752521
10097,0.216083
10098,0.448789


In [5]:
df_with_index.loc[10099] # 索引利用直接查詢而非搜尋的方式，較有效率 

foo    0.272283
Name: 10099, dtype: float64

In [6]:
%timeit df_with_index.loc[10099] # 缺點是需要花時間建立索引，也會消耗更多記憶體

32.6 µs ± 1.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## 基本型別的索引

In [7]:
temps = pd.DataFrame({['City']: ['Missoula', 'Philadelphia'],
                      'Temperature': [70, 80] })
temps

TypeError: unhashable type: 'list'

In [8]:
temps.columns # 基本型別，內含值須是要可雜湊的 (hashable)，可雜湊的可先想成不可變的資料型態

NameError: name 'temps' is not defined

## 整數索引標籤- Int64Index 及 RangeIndex

In [None]:
df_i64 = pd.DataFrame(np.arange(10, 20, 2), index = np.arange(0, 10, 2))
df_i64

In [None]:
df_i64.index

---

In [None]:
df_range = pd.DataFrame(np.arange(10, 15))
df_range

In [None]:
df_range.index # 預設索引 # 只要三個值，節省記憶體，且執行時間跟 Int64Index 同一等級

---

In [None]:
# 每種索引標籤都可用類似這樣的建立方法來建立，試試看!
index  = pd.RangeIndex(0, 10, 2)
df = pd.DataFrame(np.random.randn(len(index)), index = index, columns = ['Col1'])
df

In [None]:
df.index

## 浮點索引標籤- Float64Index

In [None]:
df_f64 = pd.DataFrame(np.arange(0, 1000, 5),
                     index = np.arange(0, 100, 0.5))
df_f64

In [9]:
df_f64[:5] # 因為不會搞混，所以這裡被當成使用索引標籤切割 # 有包含 5 

NameError: name 'df_f64' is not defined

In [None]:
df_f64.index

## 離散區間- IntervalIndex

In [None]:
df_inerval = pd.DataFrame({'A': [1, 2, 3, 4]},
                           index = pd.IntervalIndex.from_breaks([0, 0.5, 1.0, 1.5, 2.0])) 
df_inerval

In [None]:
df_inerval.index

---

In [None]:
index = pd.IntervalIndex.from_breaks([0, 10, 20, 30], closed = 'left') # 有 closed 參數可用
df = pd.DataFrame(np.random.randn(len(index)), index = index)
df

In [None]:
df.index

## 以類別值作為索引-CategoricalIndex

In [None]:
df_categorical = pd.DataFrame({'A': np.arange(6),
                               'B': list('aabbca')})
df = df_categorical.copy()
df

In [None]:
df = df.set_index('B')
df

In [None]:
df.index # 這樣是基本索引

In [None]:
df.loc['a']

---

In [None]:
df_categorical

In [None]:
df_categorical['B'] = df_categorical['B'].astype('category')
df_categorical

In [None]:
df_categorical = df_categorical.set_index('B')
df_categorical

In [None]:
df_categorical.index # 類別索引，之後會有更多討論

In [10]:
df_categorical.loc['a']

NameError: name 'df_categorical' is not defined

---

In [11]:
ages = np.random.randint(0, 80, 20)
bins = [0, 15, 45, 60, 80]
pieces = pd.cut(ages, bins = bins, right = False)
pieces

[[60, 80), [15, 45), [15, 45), [0, 15), [45, 60), ..., [15, 45), [60, 80), [60, 80), [15, 45), [0, 15)]
Length: 20
Categories (4, interval[int64, left]): [[0, 15) < [15, 45) < [45, 60) < [60, 80)]

In [12]:
s = pieces.value_counts()
s

[0, 15)     3
[15, 45)    8
[45, 60)    3
[60, 80)    6
dtype: int64

In [13]:
s.index

CategoricalIndex([[0, 15), [15, 45), [45, 60), [60, 80)], categories=[[0, 15), [15, 45), [45, 60), [60, 80)], ordered=True, dtype='category')

## 以日期及時間作為索引- DatetimeIndex

In [14]:
rng = pd.date_range('5/1/2017', periods = 5, freq = 'H')
rng

DatetimeIndex(['2017-05-01 00:00:00', '2017-05-01 01:00:00',
               '2017-05-01 02:00:00', '2017-05-01 03:00:00',
               '2017-05-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [15]:
ts = pd.Series(np.random.randn(len(rng)), index = rng)
ts

2017-05-01 00:00:00   -0.437051
2017-05-01 01:00:00    0.533249
2017-05-01 02:00:00   -0.819218
2017-05-01 03:00:00   -0.032955
2017-05-01 04:00:00   -0.639418
Freq: H, dtype: float64

In [16]:
ts.index # 用日期與時間的索引標籤查詢的效率高

DatetimeIndex(['2017-05-01 00:00:00', '2017-05-01 01:00:00',
               '2017-05-01 02:00:00', '2017-05-01 03:00:00',
               '2017-05-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

## 以時間期間作為索引- PeriodIndex

In [17]:
# 以月為期間
# 之後有一節會詳細介紹
periods = pd.PeriodIndex(['2017-1', '2017-2', '2017-3'], freq = 'M')
periods

PeriodIndex(['2017-01', '2017-02', '2017-03'], dtype='period[M]')

In [18]:
period_series = pd.Series(np.random.randn(len(periods)), index = periods)
period_series

2017-01   -0.607207
2017-02   -0.126246
2017-03   -1.644070
Freq: M, dtype: float64

In [19]:
period_series.index

PeriodIndex(['2017-01', '2017-02', '2017-03'], dtype='period[M]')

## 在序列或資料框中建立與使用索引

In [20]:
index = pd.date_range('5/1/2017', periods = 5, freq = 'H')
index

DatetimeIndex(['2017-05-01 00:00:00', '2017-05-01 01:00:00',
               '2017-05-01 02:00:00', '2017-05-01 03:00:00',
               '2017-05-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [21]:
df_date_times = pd.DataFrame(np.arange(len(index)), index = index)
df_date_times

Unnamed: 0,0
2017-05-01 00:00:00,0
2017-05-01 01:00:00,1
2017-05-01 02:00:00,2
2017-05-01 03:00:00,3
2017-05-01 04:00:00,4


---

In [22]:
# 也可用 .index 屬性設定
df_date_times.index = pd.date_range('2020-01-01', periods = 5, freq = 'D')
df_date_times

Unnamed: 0,0
2020-01-01,0
2020-01-02,1
2020-01-03,2
2020-01-04,3
2020-01-05,4


---

In [23]:
index = pd.DatetimeIndex(['2020-1-7 00:03:00', '2020-2-8', '2020-3-9'])
index

DatetimeIndex(['2020-01-07 00:03:00', '2020-02-08 00:00:00',
               '2020-03-09 00:00:00'],
              dtype='datetime64[ns]', freq=None)

In [24]:
df  = pd.DataFrame(np.arange(len(index)), index = index)
df

Unnamed: 0,0
2020-01-07 00:03:00,0
2020-02-08 00:00:00,1
2020-03-09 00:00:00,2


In [25]:
df.index

DatetimeIndex(['2020-01-07 00:03:00', '2020-02-08 00:00:00',
               '2020-03-09 00:00:00'],
              dtype='datetime64[ns]', freq=None)

## 利用索引選取資料

In [26]:
s = pd.Series(np.arange(0, 5), index = list('abcde'))
s

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [27]:
s['b']

1

In [28]:
s.loc['b']

1

In [29]:
s['b':'d']   

b    1
c    2
d    3
dtype: int32

In [30]:
s.loc[['a', 'c', 'e']]

a    0
c    2
e    4
dtype: int32

---

In [31]:
df = pd.DataFrame([np.arange(10, 12), np.arange(12, 14)], columns = list('ab'), index = list('vw') )
df

Unnamed: 0,a,b
v,10,11
w,12,13


In [32]:
df['a']

v    10
w    12
Name: a, dtype: int32

In [33]:
df.loc['w']

a    12
b    13
Name: w, dtype: int32

## 把資料移入及移出索引

In [34]:
sp500 = pd.read_csv('./mod01/sp500.csv', index_col = 'Symbol', usecols = [0, 2, 3, 7])
sp500.head()

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954
ACN,Information Technology,79.79,8.326
ACE,Financials,102.91,86.897


In [35]:
index_moved_to_col = sp500.reset_index()
index_moved_to_col[:5]

Unnamed: 0,Symbol,Sector,Price,Book Value
0,MMM,Industrials,141.14,26.668
1,ABT,Health Care,39.6,15.573
2,ABBV,Health Care,53.95,2.954
3,ACN,Information Technology,79.79,8.326
4,ACE,Financials,102.91,86.897


In [36]:
index_moved_to_col.set_index('Sector')[:5]

Unnamed: 0_level_0,Symbol,Price,Book Value
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Industrials,MMM,141.14,26.668
Health Care,ABT,39.6,15.573
Health Care,ABBV,53.95,2.954
Information Technology,ACN,79.79,8.326
Financials,ACE,102.91,86.897


---

In [37]:
frame = pd.DataFrame({'a': range(7),
                      'b': range(7, 0, -1),
                      'c': ['one','one','one','two','two','two','two'],
                      'd': [0, 1, 2, 0, 1, 2, 3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [38]:
frame2 = frame.set_index(['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [39]:
frame.set_index(['c', 'd'], drop = False) # 選擇留下欄位

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [40]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [41]:
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


---

In [42]:
df1 = pd.DataFrame(np.random.randn(4, 2))
df1

Unnamed: 0,0,1
0,-0.700213,-1.116503
1,-0.889854,-0.203993
2,-2.53003,0.92893
3,-2.060577,-0.696087


In [43]:
df2 = pd.DataFrame(np.random.randn(3, 2))
df2

Unnamed: 0,0,1
0,-0.506764,-0.650387
1,0.276013,-0.93353
2,-1.12335,0.996712


In [44]:
df3 = pd.concat([df1, df2])
df3

Unnamed: 0,0,1
0,-0.700213,-1.116503
1,-0.889854,-0.203993
2,-2.53003,0.92893
3,-2.060577,-0.696087
0,-0.506764,-0.650387
1,0.276013,-0.93353
2,-1.12335,0.996712


In [45]:
df3.reset_index()

Unnamed: 0,index,0,1
0,0,-0.700213,-1.116503
1,1,-0.889854,-0.203993
2,2,-2.53003,0.92893
3,3,-2.060577,-0.696087
4,0,-0.506764,-0.650387
5,1,0.276013,-0.93353
6,2,-1.12335,0.996712


In [46]:
# 把原來的索引標籤直接丟棄 # 就地就改
df3.reset_index(drop = True, inplace = True)
df3

Unnamed: 0,0,1
0,-0.700213,-1.116503
1,-0.889854,-0.203993
2,-2.53003,0.92893
3,-2.060577,-0.696087
4,-0.506764,-0.650387
5,0.276013,-0.93353
6,-1.12335,0.996712


## 重新索引 pandas 物件

In [47]:
sp500 = pd.read_csv('./mod01/sp500.csv', index_col = 'Symbol', usecols = [0, 2, 3, 7])
sp500.head()

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954
ACN,Information Technology,79.79,8.326
ACE,Financials,102.91,86.897


In [48]:
reindexed = sp500.reindex( index = ['MMM', 'ABBV', 'FOO'])
reindexed

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABBV,Health Care,53.95,2.954
FOO,,,


In [49]:
sp500.reindex(columns = ['Price', 'Book Value', 'NewCol'])[:5]

Unnamed: 0_level_0,Price,Book Value,NewCol
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,141.14,26.668,
ABT,39.6,15.573,
ABBV,53.95,2.954,
ACN,79.79,8.326,
ACE,102.91,86.897,


## 階層式索引

In [50]:
# 從外面開始是 leve1 = 0, level = 1,...
# 列表的第 0 個元素放在 level = 0，列表的第 1 個元素放在 level = 1
data = pd.Series(np.random.randn(9), 
                 index = [['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                          [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data

a  1    1.316213
   2    0.660913
   3    0.518064
b  1   -2.606460
   3    0.976238
c  1    1.499003
   2    1.017737
d  2   -0.517978
   3   -0.451503
dtype: float64

In [51]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [52]:
# 不是階層索引了
data['b']

1   -2.606460
3    0.976238
dtype: float64

In [53]:
# 還是階層索引
data['b':'c']

b  1   -2.606460
   3    0.976238
c  1    1.499003
   2    1.017737
dtype: float64

In [54]:
# 還是階層索引
data[['b', 'd']]

b  1   -2.606460
   3    0.976238
d  2   -0.517978
   3   -0.451503
dtype: float64

In [55]:
# 不是階層索引了
data[:, 2]

a    0.660913
c    1.017737
d   -0.517978
dtype: float64

---

In [56]:
data

a  1    1.316213
   2    0.660913
   3    0.518064
b  1   -2.606460
   3    0.976238
c  1    1.499003
   2    1.017737
d  2   -0.517978
   3   -0.451503
dtype: float64

In [57]:
# 後面章節會講解
# 不管是堆疊或解堆疊，都是某層級索引移至另一軸的最後一層索引
# not in-place
data.unstack(level = 1) 

Unnamed: 0,1,2,3
a,1.316213,0.660913,0.518064
b,-2.60646,,0.976238
c,1.499003,1.017737,
d,,-0.517978,-0.451503


In [58]:
data

a  1    1.316213
   2    0.660913
   3    0.518064
b  1   -2.606460
   3    0.976238
c  1    1.499003
   2    1.017737
d  2   -0.517978
   3   -0.451503
dtype: float64

In [59]:
data.unstack(level = 0)

Unnamed: 0,a,b,c,d
1,1.316213,-2.60646,1.499003,
2,0.660913,,1.017737,-0.517978
3,0.518064,0.976238,,-0.451503


In [60]:
# 預設 level = -1， 最裡面的 level
data.unstack()

Unnamed: 0,1,2,3
a,1.316213,0.660913,0.518064
b,-2.60646,,0.976238
c,1.499003,1.017737,
d,,-0.517978,-0.451503


In [61]:
data.unstack().stack()

a  1    1.316213
   2    0.660913
   3    0.518064
b  1   -2.606460
   3    0.976238
c  1    1.499003
   2    1.017737
d  2   -0.517978
   3   -0.451503
dtype: float64

In [62]:
data.unstack().unstack()

1  a    1.316213
   b   -2.606460
   c    1.499003
   d         NaN
2  a    0.660913
   b         NaN
   c    1.017737
   d   -0.517978
3  a    0.518064
   b    0.976238
   c         NaN
   d   -0.451503
dtype: float64

---

In [63]:
# 欄位的階層索引標籤類似 ，列表的第 0 個元素 放在最外面的 level = 0，列表的第 1 個元素  level = 1,...
frame = pd.DataFrame(np.arange(12).reshape(4, 3),
                     index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [64]:
frame.index.names = ['key1', 'key2']

In [65]:
frame.columns.names = ['state', 'color']

In [66]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [67]:
# 欄位的階層索引不見
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [68]:
# 也可以用 level 的方式來表示，例如 frame.swaplevel(0, 1)
# 預設 axis = 0
# not in-place 
frame.swaplevel('key1', 'key2') 

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [69]:
frame.swaplevel('state', 'color', axis = 1) # not in-place

Unnamed: 0_level_0,color,Green,Red,Green
Unnamed: 0_level_1,state,Ohio,Ohio,Colorado
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


---

In [70]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [71]:
frame.sort_index(level = 1) # 預設 axis = 0 # not in-place

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [72]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [73]:
frame.swaplevel(0, 1).sort_index(level = 0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


---

In [74]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [75]:
# 跟 MySQL 的 groupby 類似
# 預設 axis = 0
# 之後會有一談到
frame.groupby(level = 'key2').sum() 

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [76]:
frame.groupby(level = 'color', axis = 1).sum() 

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


---

In [77]:
sp500 = pd.read_csv('./mod01/sp500.csv', index_col = 'Symbol', usecols = [0, 2, 3, 7])
sp500.head()

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954
ACN,Information Technology,79.79,8.326
ACE,Financials,102.91,86.897


In [78]:
reindexed = sp500.reset_index()
reindexed[:5]

Unnamed: 0,Symbol,Sector,Price,Book Value
0,MMM,Industrials,141.14,26.668
1,ABT,Health Care,39.6,15.573
2,ABBV,Health Care,53.95,2.954
3,ACN,Information Technology,79.79,8.326
4,ACE,Financials,102.91,86.897


In [79]:
multi_fi = reindexed.set_index(['Sector', 'Symbol'])
multi_fi[:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Book Value
Sector,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
Industrials,MMM,141.14,26.668
Health Care,ABT,39.6,15.573
Health Care,ABBV,53.95,2.954
Information Technology,ACN,79.79,8.326
Financials,ACE,102.91,86.897


In [80]:
type(multi_fi.index)

pandas.core.indexes.multi.MultiIndex

In [81]:
multi_fi.index

MultiIndex([(           'Industrials',  'MMM'),
            (           'Health Care',  'ABT'),
            (           'Health Care', 'ABBV'),
            ('Information Technology',  'ACN'),
            (            'Financials',  'ACE'),
            (           'Health Care',  'ACT'),
            ('Information Technology', 'ADBE'),
            (             'Utilities',  'AES'),
            (           'Health Care',  'AET'),
            (            'Financials',  'AFL'),
            ...
            (             'Utilities',  'XEL'),
            ('Information Technology',  'XRX'),
            ('Information Technology', 'XLNX'),
            (            'Financials',   'XL'),
            (           'Industrials',  'XYL'),
            ('Information Technology', 'YHOO'),
            ('Consumer Discretionary',  'YUM'),
            (           'Health Care',  'ZMH'),
            (            'Financials', 'ZION'),
            (           'Health Care',  'ZTS')],
           names=['Sect

In [82]:
multi_fi.index.levels

FrozenList([['Consumer Discretionary', 'Consumer Discretionary ', 'Consumer Staples', 'Consumer Staples ', 'Energy', 'Financials', 'Health Care', 'Industrials', 'Industries', 'Information Technology', 'Materials', 'Telecommunications Services', 'Utilities'], ['A', 'AA', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACE', 'ACN', 'ACT', 'ADBE', 'ADI', 'ADM', 'ADP', 'ADS', 'ADSK', 'ADT', 'AEE', 'AEP', 'AES', 'AET', 'AFL', 'AGN', 'AIG', 'AIV', 'AIZ', 'AKAM', 'ALL', 'ALLE', 'ALTR', 'ALXN', 'AMAT', 'AME', 'AMGN', 'AMP', 'AMT', 'AMZN', 'AN', 'AON', 'APA', 'APC', 'APD', 'APH', 'ARG', 'ATI', 'AVB', 'AVP', 'AVY', 'AXP', 'AZO', 'BA', 'BAC', 'BAX', 'BBBY', 'BBT', 'BBY', 'BCR', 'BDX', 'BEAM', 'BEN', 'BF-B', 'BHI', 'BIIB', 'BK', 'BLK', 'BLL', 'BMS', 'BMY', 'BRCM', 'BRK-B', 'BSX', 'BTU', 'BWA', 'BXP', 'C', 'CA', 'CAG', 'CAH', 'CAM', 'CAT', 'CB', 'CBG', 'CBS', 'CCE', 'CCI', 'CCL', 'CELG', 'CERN', 'CF', 'CFN', 'CHK', 'CHRW', 'CI', 'CINF', 'CL', 'CLF', 'CLX', 'CMA', 'CMCSA', 'CME', 'CMG', ...]])

In [83]:
len(multi_fi.index.levels) # 看索引有幾個層級

2

In [84]:
multi_fi.index.levels[0] # 每個層級都是不同的 Index 物件

Index(['Consumer Discretionary', 'Consumer Discretionary ', 'Consumer Staples',
       'Consumer Staples ', 'Energy', 'Financials', 'Health Care',
       'Industrials', 'Industries', 'Information Technology', 'Materials',
       'Telecommunications Services', 'Utilities'],
      dtype='object', name='Sector')

In [85]:
multi_fi.index.levels[1] 

Index(['A', 'AA', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACE', 'ACN', 'ACT', 'ADBE',
       ...
       'XLNX', 'XOM', 'XRAY', 'XRX', 'XYL', 'YHOO', 'YUM', 'ZION', 'ZMH',
       'ZTS'],
      dtype='object', name='Symbol', length=500)

In [86]:
 multi_fi.index.get_level_values(level = 0) # 取得特定層級的索引值本身

Index(['Industrials', 'Health Care', 'Health Care', 'Information Technology',
       'Financials', 'Health Care', 'Information Technology', 'Utilities',
       'Health Care', 'Financials',
       ...
       'Utilities', 'Information Technology', 'Information Technology',
       'Financials', 'Industrials', 'Information Technology',
       'Consumer Discretionary', 'Health Care', 'Financials', 'Health Care'],
      dtype='object', name='Sector', length=500)

---

In [87]:
multi_fi

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Book Value
Sector,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
Industrials,MMM,141.14,26.668
Health Care,ABT,39.60,15.573
Health Care,ABBV,53.95,2.954
Information Technology,ACN,79.79,8.326
Financials,ACE,102.91,86.897
...,...,...,...
Information Technology,YHOO,35.02,12.768
Consumer Discretionary,YUM,74.77,5.147
Health Care,ZMH,101.84,37.181
Financials,ZION,28.43,30.191


In [88]:
# .xs() 方法用在階層式索引的好處是可以選 level
#  預設 axis = 0，level = 0
multi_fi.xs('Industrials')

Unnamed: 0_level_0,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,141.14,26.668
ALLE,52.46,0.000
APH,95.71,18.315
AVY,48.20,15.616
BA,132.41,19.870
...,...,...
UNP,196.26,46.957
UPS,102.73,6.790
UTX,115.54,35.252
WM,43.37,12.330


In [89]:
# 預設 level = 0 與 axis = 0，所以跟上面一樣結果
multi_fi.xs('Industrials', level = 0)

Unnamed: 0_level_0,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,141.14,26.668
ALLE,52.46,0.000
APH,95.71,18.315
AVY,48.20,15.616
BA,132.41,19.870
...,...,...
UNP,196.26,46.957
UPS,102.73,6.790
UTX,115.54,35.252
WM,43.37,12.330


In [90]:
multi_fi.loc['Industrials']

Unnamed: 0_level_0,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,141.14,26.668
ALLE,52.46,0.000
APH,95.71,18.315
AVY,48.20,15.616
BA,132.41,19.870
...,...,...
UNP,196.26,46.957
UPS,102.73,6.790
UTX,115.54,35.252
WM,43.37,12.330


---

In [91]:
multi_fi.xs('ABT', level = 1)

Unnamed: 0_level_0,Price,Book Value
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1
Health Care,39.6,15.573


In [92]:
# 預設是 drop_level = True，只要大家的索引都一樣，那一個 level 就會被 drop 掉 
multi_fi.xs('Industrials', drop_level= False) 

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Book Value
Sector,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
Industrials,MMM,141.14,26.668
Industrials,ALLE,52.46,0.000
Industrials,APH,95.71,18.315
Industrials,AVY,48.20,15.616
Industrials,BA,132.41,19.870
Industrials,...,...,...
Industrials,UNP,196.26,46.957
Industrials,UPS,102.73,6.790
Industrials,UTX,115.54,35.252
Industrials,WM,43.37,12.330


In [93]:
multi_fi.xs('Industrials').xs('AVY')

Price         48.200
Book Value    15.616
Name: AVY, dtype: float64

In [94]:
# 或者是傳 tuple
# .xs() 只能讀取，不能像 loc 和 iloc 那樣改變值
multi_fi.xs(('Industrials', 'AVY'))  

Price         48.200
Book Value    15.616
Name: (Industrials, AVY), dtype: float64

In [95]:
multi_fi.xs('Price', axis = 1)

Sector                  Symbol
Industrials             MMM       141.14
Health Care             ABT        39.60
                        ABBV       53.95
Information Technology  ACN        79.79
Financials              ACE       102.91
                                   ...  
Information Technology  YHOO       35.02
Consumer Discretionary  YUM        74.77
Health Care             ZMH       101.84
Financials              ZION       28.43
Health Care             ZTS        30.53
Name: Price, Length: 500, dtype: float64