In [45]:
import pandas as pd

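# The row index labels the rows of the data; each column label points to one Series.
# A DataFrame's row index is also the index shared by each of its column Series.
# Index labels are allowed to repeat, but in practice they are normally kept unique.
# Data with many row/column levels can end up with a multi-level (hierarchical) index.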

# 4.1.2 建立索引

In [68]:
# 1. Build the index while reading the data: index_col picks the column
#    whose values become the row labels.
df = pd.read_excel(io='team.xlsx', index_col='name')
df.head()

Unnamed: 0_level_0,team,Q1,Q2,Q3,Q4
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Liver,E,89.0,21,24.0,1
Arry,C,,37,37.0,2
Ack,A,57.0,60,,3
Eorge,C,93.0,4,71.0,4
Oah,D,93.0,49,,5


In [76]:
# 2. Set a single-level index.
# Note: set_index() returns a new frame and leaves the original variable
# untouched, so the result has to be kept -- either by assigning it (as done
# here) or by passing inplace=True, i.e. df2.set_index('name', inplace=True).
df2 = pd.read_excel('team.xlsx').set_index('name')
df2.head()

Unnamed: 0_level_0,team,Q1,Q2,Q3,Q4
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Liver,E,89.0,21,24.0,1
Arry,C,,37,37.0,2
Ack,A,57.0,60,,3
Eorge,C,93.0,4,71.0,4
Oah,D,93.0,49,,5


In [77]:
# 3. Set a multi-level index.
# Note: a column that is already the index cannot be passed to set_index()
# again. Calling df2.set_index(['team', 'name']) directly fails with
# "None of ['name'] are in the columns" because 'name' is currently the index.
# reset_index() first turns the index back into an ordinary column.
df2 = df2.reset_index().set_index(['team', 'name'])
df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Q1,Q2,Q3,Q4
team,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E,Liver,89.0,21,24.0,1
C,Arry,,37,37.0,2
A,Ack,57.0,60,,3
C,Eorge,93.0,4,71.0,4
D,Oah,93.0,49,,5


In [78]:
# 4. Multi-level index in practice: index by the first letter of the name and
#    by the full name.
# reset_index() moves the current index levels back into ordinary columns so
# that 'name' can be referenced as a column again.
df2 = df2.reset_index()
# The first-letter Series is derived from the 'name' column and would inherit
# the label 'name', producing two index levels with the same name. That makes
# by-name level lookup ambiguous and makes reset_index() on the result fail
# ("cannot insert name, already exists"), so the derived level gets its own
# label. The index values themselves are unchanged.
df2 = df2.set_index([df2['name'].str[0].rename('initial'), 'name'])
df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,team,Q1,Q2,Q3,Q4
name,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
L,Liver,E,89.0,21,24.0,1
A,Arry,C,,37,37.0,2
A,Ack,A,57.0,60,,3
E,Eorge,C,93.0,4,71.0,4
O,Oah,D,93.0,49,,5


In [90]:
# 5. Use a Series as the index.
df3 = pd.read_excel('team.xlsx')
# One integer label per row (0, 1, 2, ...). The length is taken from the frame
# instead of being hard-coded to 100, because set_index() raises on a length
# mismatch between the frame and the supplied Series.
s = pd.Series(range(len(df3)))
df3.set_index(s).head()

Unnamed: 0,name,team,Q1,Q2,Q3,Q4
0,Liver,E,89.0,21,24.0,1
1,Arry,C,,37,37.0,2
2,Ack,A,57.0,60,,3
3,Eorge,C,93.0,4,71.0,4
4,Oah,D,93.0,49,,5


In [91]:
# 6. Computed index: two levels, the Series itself and its element-wise square.
#    Only a preview is shown; df3 is not modified by this cell.
df3.set_index([s, s ** 2]).head()

Unnamed: 0,Unnamed: 1,name,team,Q1,Q2,Q3,Q4
0,0,Liver,E,89.0,21,24.0,1
1,1,Arry,C,,37,37.0,2
2,4,Ack,A,57.0,60,,3
3,9,Eorge,C,93.0,4,71.0,4
4,16,Oah,D,93.0,49,,5


In [106]:
# 7. Get a specific value???
# TODO: the heading above is unresolved -- this cell only stores the two-level
# index (s and its element-wise square) on df3 and previews the result; no
# value lookup is performed here.
df3 = df3.set_index([s, s ** 2])
df3.head()


Unnamed: 0,Unnamed: 1,name,team,Q1,Q2,Q3,Q4
0,0,Liver,E,89.0,21,24.0,1
1,1,Arry,C,,37,37.0,2
2,4,Ack,A,57.0,60,,3
3,9,Eorge,C,93.0,4,71.0,4
4,16,Oah,D,93.0,49,,5


In [109]:
# 8. Use Q1 as the index. With the default drop behaviour the Q1 column is
#    moved out of the columns and into the index.
df7 = pd.read_excel('team.xlsx').set_index('Q1')
df7.head()

Unnamed: 0_level_0,name,team,Q2,Q3,Q4
Q1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
89.0,Liver,E,21,24.0,1
,Arry,C,37,37.0,2
57.0,Ack,A,60,,3
93.0,Eorge,C,4,71.0,4
93.0,Oah,D,49,,5


In [110]:
# 9. Index by Q2 and keep Q2 as a regular column as well (drop=False).
# Note: the previous Q1 index is replaced, and because Q1 had already been
# moved out of the columns, its values are no longer in the frame at all.
df7 = df7.set_index('Q2', drop=False)
df7.head()

Unnamed: 0_level_0,name,team,Q2,Q3,Q4
Q2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21,Liver,E,21,24.0,1
37,Arry,C,37,37.0,2
60,Ack,A,60,,3
4,Eorge,C,4,71.0,4
49,Oah,D,49,,5


In [111]:
# 10. Index by Q3 while keeping the existing Q2 level (append=True).
# Q3 is moved out of the columns, so running this cell a second time on the
# result fails because 'Q3' is no longer a column.
df7 = df7.set_index('Q3', append=True)
df7.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,team,Q2,Q4
Q2,Q3,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21,24.0,Liver,E,21,1
37,37.0,Arry,C,37,2
60,,Ack,A,60,3
4,71.0,Eorge,C,4,4
49,,Oah,D,49,5


In [112]:
# 11. Index by Q4, keeping the existing Q2 and Q3 levels (append=True) and
#     keeping Q4 as a regular column as well (drop=False).
df7 = df7.set_index('Q4', append=True, drop=False)
df7.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name,team,Q2,Q4
Q2,Q3,Q4,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21,24.0,1,Liver,E,21,1
37,37.0,2,Arry,C,37,2
60,,3,Ack,A,60,3
4,71.0,4,Eorge,C,4,4
49,,5,Oah,D,49,5
