#  <font color=red> Module_03_索引資料</font>

## 索引的重要性

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Fix the seed so the random 'foo' column is reproducible across runs.
np.random.seed(123456)
df = pd.DataFrame(
    data = {
        'foo': np.random.random(size = 10000),
        'key': range(100, 10100),
    }
)
df

In [None]:
df[df.key == 10099] # 直線搜尋所有資料，沒有效率

In [None]:
%timeit df[df.key == 10099]

---

In [None]:
df_with_index = df.set_index(['key'])
df_with_index

In [None]:
df_with_index.loc[10099] # 索引利用直接查詢而非搜尋的方式，較有效率 

In [None]:
%timeit df_with_index.loc[10099] # 缺點是需要花時間建立索引，也會消耗更多記憶體

## 基本型別的索引

In [None]:
temps = pd.DataFrame({'City': ['Missoula', 'Philadelphia'],
                      'Temperature': [70, 80] })
temps

In [None]:
temps.columns # 基本型別，內含值須是要可雜湊的 (hashable)，可雜湊的可先想成不可變的資料型態

## 整數索引標籤- Int64Index 及 RangeIndex（註：pandas 2.0 起 Int64Index 已併入 Index，會顯示為 dtype='int64' 的 Index）

In [None]:
# Values 10, 12, ..., 18 labelled with the even integers 0, 2, ..., 8.
# The labels are passed as an ndarray, so the index holds explicit int64 labels.
df_i64 = pd.DataFrame(
    data = np.arange(10, 20, 2),
    index = np.arange(0, 10, 2),
)
df_i64

In [None]:
df_i64.index

---

In [None]:
df_range = pd.DataFrame(np.arange(10, 15))
df_range

In [None]:
df_range.index # 預設索引 # 只要三個值，節省記憶體，且執行時間跟 Int64Index 同一等級

---

In [None]:
# 每種索引標籤都可用類似這樣的建立方法來建立，試試看!
index  = pd.RangeIndex(0, 10, 2)
df = pd.DataFrame(np.random.randn(len(index)), index = index, columns = ['Col1'])
df

In [None]:
df.index

## 浮點索引標籤- Float64Index（註：pandas 2.0 起 Float64Index 已併入 Index，會顯示為 dtype='float64' 的 Index）

In [None]:
df_f64 = pd.DataFrame(np.arange(0, 1000, 5),
                     index = np.arange(0, 100, 0.5))
df_f64

In [None]:
# Label-based slice: keeps every row whose label is <= 5, including 5.0 itself.
# .loc is used explicitly because plain df_f64[:5] on a float-dtype index is
# deprecated since pandas 2.0 and will be treated as positional in a future version.
df_f64.loc[:5]

In [None]:
df_f64.index

## 離散區間- IntervalIndex

In [None]:
df_inerval = pd.DataFrame({'A': [1, 2, 3, 4]},
                           index = pd.IntervalIndex.from_breaks([0, 0.5, 1.0, 1.5, 2.0])) 
df_inerval

In [None]:
df_inerval.index

---

In [None]:
index = pd.IntervalIndex.from_breaks([0, 10, 20, 30], closed = 'left') # 有 closed 參數可用
df = pd.DataFrame(np.random.randn(len(index)), index = index)
df

In [None]:
df.index

## 以類別值作為索引-CategoricalIndex

In [None]:
import pandas as pd
import numpy as np

In [None]:
df_categorical = pd.DataFrame({'A': np.arange(6),
                               'B': list('aabbca')})
df = df_categorical.copy()
df

In [None]:
df = df.set_index('B')
df

In [None]:
df.index # 這樣是基本索引

In [None]:
df.loc['a']

---

In [None]:
df_categorical

In [None]:
df_categorical['B'] = df_categorical['B'].astype('category')
df_categorical

In [None]:
df_categorical = df_categorical.set_index('B')
df_categorical

In [None]:
df_categorical.index # 類別索引，之後會有更多討論

In [None]:
df_categorical.loc['a']

---

In [None]:
ages = np.random.randint(0, 80, 20)
bins = [0, 15, 45, 60, 80]
pieces = pd.cut(ages, bins = bins, right = False)
pieces

In [None]:
s = pieces.value_counts()
s

In [None]:
s.index

## 以日期及時間作為索引- DatetimeIndex

In [None]:
# Five hourly timestamps starting at 2017-05-01 00:00.
# The lowercase 'h' alias is used because uppercase 'H' is deprecated since pandas 2.2.
rng = pd.date_range('5/1/2017', periods = 5, freq = 'h')
rng

In [None]:
ts = pd.Series(np.random.randn(len(rng)), index = rng)
ts

In [None]:
ts.index # 用日期與時間的索引標籤查詢的效率高

## 以時間期間作為索引- PeriodIndex

In [None]:
# 以月為期間
# 之後有一節會詳細介紹
periods = pd.PeriodIndex(['2017-1', '2017-2', '2017-3'], freq = 'M')
periods

In [None]:
period_series = pd.Series(np.random.randn(len(periods)), index = periods)
period_series

In [None]:
period_series.index

## 在序列或資料框中建立與使用索引

In [None]:
# Hourly DatetimeIndex with five entries, used as the row labels below.
# The lowercase 'h' alias is used because uppercase 'H' is deprecated since pandas 2.2.
index = pd.date_range('5/1/2017', periods = 5, freq = 'h')
index

In [None]:
df_date_times = pd.DataFrame(np.arange(len(index)), index = index)
df_date_times

---

In [None]:
# 也可用 .index 屬性設定
df_date_times.index = pd.date_range('2020-01-01', periods = 5, freq = 'D')
df_date_times

---

In [None]:
index = pd.DatetimeIndex(['2020-1-7 00:03:00', '2020-2-8', '2020-3-9'])
index

In [None]:
df  = pd.DataFrame(np.arange(len(index)), index = index)
df

In [None]:
df.index

## 利用索引選取資料

In [None]:
s = pd.Series(np.arange(0, 5), index = list('abcde'))
s

In [None]:
s['b']

In [None]:
s.loc['b']

In [None]:
s['b':'d']   

In [None]:
s.loc[['a', 'c', 'e']]

---

In [None]:
# Two rows ('v', 'w') by two columns ('a', 'b'), filled with 10..13 row by row.
df = pd.DataFrame(
    [np.arange(10, 12), np.arange(12, 14)],
    index = ['v', 'w'],
    columns = ['a', 'b'],
)
df

In [None]:
df['a']

In [None]:
df.loc['w']

## 把資料移入及移出索引

In [None]:
sp500 = pd.read_csv('./mod01/sp500.csv', index_col = 'Symbol', usecols = [0, 2, 3, 7])
sp500.head()

In [None]:
index_moved_to_col = sp500.reset_index()
index_moved_to_col[:5]

In [None]:
index_moved_to_col.set_index('Sector')[:5]

---

In [None]:
# Small demo table; columns 'c' and 'd' are moved into the index in the next cells.
frame = pd.DataFrame(
    {
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one'] * 3 + ['two'] * 4,
        'd': [0, 1, 2, 0, 1, 2, 3],
    }
)
frame

In [None]:
frame2 = frame.set_index(['c', 'd'])
frame2

In [None]:
frame.set_index(['c', 'd'], drop = False) # 選擇留下欄位

In [None]:
frame2

In [None]:
frame2.reset_index()

---

In [None]:
df1 = pd.DataFrame(np.random.randn(4, 2))
df1

In [None]:
df2 = pd.DataFrame(np.random.randn(3, 2))
df2

In [None]:
df3 = pd.concat([df1, df2])
df3

In [None]:
df3.reset_index()

In [None]:
# 把原來的索引標籤直接丟棄 # 就地就改
df3.reset_index(drop = True, inplace = True)
df3

## 重新索引 pandas 物件

In [None]:
sp500 = pd.read_csv('./mod01/sp500.csv', index_col = 'Symbol', usecols = [0, 2, 3, 7])
sp500.head()

In [None]:
reindexed = sp500.reindex( index = ['MMM', 'ABBV', 'FOO'])
reindexed

In [None]:
sp500.reindex(columns = ['Price', 'Book Value', 'NewCol'])[:5]

## 階層式索引

In [None]:
# Levels are numbered from the outside in: level = 0, level = 1, ...
# The first inner list supplies the level = 0 labels and the second inner list
# supplies the level = 1 labels.
data = pd.Series(np.random.randn(9), 
                 index = [['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                          [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data

In [None]:
data.index

In [None]:
# 不是階層索引了
data['b']

In [None]:
# 還是階層索引
data['b':'c']

In [None]:
# 還是階層索引
data[['b', 'd']]

In [None]:
# 不是階層索引了
data[:, 2]

---

In [None]:
data

In [None]:
# 後面章節會講解
# 不管是堆疊或解堆疊，都是某層級索引移至另一軸的最後一層索引
# not in-place
data.unstack(level = 1) 

In [None]:
data

In [None]:
data.unstack(level = 0)

In [None]:
# 預設 level = -1， 最裡面的 level
data.unstack()

In [None]:
data.unstack().stack()

In [None]:
data.unstack().unstack()

---

In [None]:
# Hierarchical column labels work like the row ones: the first inner list is the
# outermost level (level = 0), the second inner list is level = 1, and so on.
frame = pd.DataFrame(
    data = np.arange(12).reshape((4, 3)),
    columns = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
)
frame

In [None]:
frame.index.names = ['key1', 'key2']

In [None]:
frame.columns.names = ['state', 'color']

In [None]:
frame

In [None]:
# 欄位的階層索引不見
frame['Ohio']

In [None]:
# 也可以用 level 的方式來表示，例如 frame.swaplevel(0, 1)
# 預設 axis = 0
# not in-place 
frame.swaplevel('key1', 'key2') 

In [None]:
frame.swaplevel('state', 'color', axis = 1) # not in-place

---

In [None]:
frame

In [None]:
frame.sort_index(level = 1) # 預設 axis = 0 # not in-place

In [None]:
frame

In [None]:
frame.swaplevel(0, 1).sort_index(level = 0)

---

In [None]:
frame

In [None]:
# Works like GROUP BY in SQL (e.g. MySQL): rows sharing a 'key2' label are summed together.
# Grouping is along the rows (axis = 0) by default.
# groupby is covered in more detail in a later section.
frame.groupby(level = 'key2').sum() 

In [None]:
# Sum the columns that share the same 'color' label.
# groupby(axis = 1) is deprecated since pandas 2.1, so transpose, group the rows
# by the 'color' level, and transpose back to get the same table.
frame.T.groupby(level = 'color').sum().T

---

In [None]:
sp500 = pd.read_csv('./mod01/sp500.csv', index_col = 'Symbol', usecols = [0, 2, 3, 7])
sp500.head()

In [None]:
reindexed = sp500.reset_index()
reindexed[:5]

In [None]:
multi_fi = reindexed.set_index(['Sector', 'Symbol'])
multi_fi[:5]

In [None]:
type(multi_fi.index)

In [None]:
multi_fi.index

In [None]:
multi_fi.index.levels

In [None]:
len(multi_fi.index.levels) # 看索引有幾個層級

In [None]:
multi_fi.index.levels[0] # 每個層級都是不同的 Index 物件

In [None]:
multi_fi.index.levels[1] 

In [None]:
 multi_fi.index.get_level_values(level = 0) # 取得特定層級的索引值本身

---

In [None]:
multi_fi

In [None]:
# .xs() 方法用在階層式索引的好處是可以選 level
#  預設 axis = 0，level = 0
multi_fi.xs('Industrials')

In [None]:
# 預設 level = 0 與 axis = 0，所以跟上面一樣結果
multi_fi.xs('Industrials', level = 0)

In [None]:
multi_fi.loc['Industrials']

---

In [None]:
multi_fi.xs('ABT', level = 1)

In [None]:
# 預設是 drop_level = True，只要大家的索引都一樣，那一個 level 就會被 drop 掉 
multi_fi.xs('Industrials', drop_level= False) 

In [None]:
multi_fi.xs('Industrials').xs('AVY')

In [None]:
# 或者是傳 tuple
# .xs() 只能讀取，不能像 loc 和 iloc 那樣改變值
multi_fi.xs(('Industrials', 'AVY'))  

In [None]:
multi_fi.xs('Price', axis = 1)