In [2]:
import numpy as np
import pandas as pd

In [6]:
# Column selection on a DataFrame
df = pd.read_csv(
    'data/learn_pandas.csv',
    usecols=['School', 'Grade', 'Name', 'Gender', 'Weight', 'Transfer'],
)

# A single column label selects one column (evaluated here, not displayed).
df['Name']

# A list of labels selects several columns, in the order requested.
df[['Gender', 'Name']].head()


Unnamed: 0,Gender,Name
0,Female,Gaopeng Yang
1,Male,Changqiang You
2,Male,Mei Sun
3,Female,Xiaojuan Sun
4,Male,Gaojuan You


In [70]:
# Series whose index repeats the label 'a'; 'b' and 'c' each occur once.
s = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=['a', 'b', 'a', 'a', 'a', 'c'],
)

# Label slice with a negative step: walk backwards from 'c' to 'b', two at a time.
s['c':'b':-2]

# After sorting the index, a label slice may also start at the repeated label.
s.sort_index()['a':'b']


# [2, 4, 6]

a    1
a    3
a    4
a    5
b    2
dtype: int64

In [63]:
# The loc indexer
# Load the reduced table and make the student name the row index in one chain.
df_demo = pd.read_csv(
    'data/learn_pandas.csv',
    usecols=['School', 'Grade', 'Name', 'Gender', 'Weight', 'Transfer'],
).set_index('Name')

# Row label and column label given together
df_demo.loc['Gaopeng Yang', 'School']

# loc with lists of labels
df_demo.loc[['Qiang Sun', 'Quan Zhao'], ['School', 'Gender']]

# loc with label slices
df_demo.loc['Gaojuan You':'Gaoqiang Qian', 'School':'Gender']

# loc with a boolean mask
df_demo.loc[df_demo['Weight'] > 70].head()

df_demo.loc[df_demo['Grade'].isin(['Freshman', 'Senior'])].head()

# Keep only the numeric columns
df_demo.select_dtypes(include=['number']).head()

# loc with a function
def condition(x):
    """Return a boolean mask over the rows of ``x``.

    A row is marked True when its ``School`` equals 'Peking University'
    or when its ``Weight > 80`` comparison is False.
    """
    is_peking = x.School == 'Peking University'
    over_80 = x.Weight > 80
    return ~over_80 | is_peking

# Pass the function itself to loc; its return value is used as the row selector
df_demo.loc[condition]

# Small frame used to show assignment through loc
df_chain = pd.DataFrame(
    data=[[0, 0], [1, 0], [-1, 0]],
    columns=['A', 'B'],
)

# Write 1 into every column of the rows where A is non-zero
df_chain.loc[df_chain['A'] != 0] = 1

df_chain

# The iloc indexer

# Second row, second column
df_demo.iloc[1, 1]

# First two rows, first two columns
df_demo.iloc[[0, 1], [0, 1]]

# Positional slices exclude the end position
df_demo.iloc[1:4, 2:4]

# A function whose return value is a slice
df_demo.iloc[lambda x: slice(1, 4)]

# Students heavier than 80 kg; the mask is passed as its underlying array
df_demo.iloc[(df_demo['Weight'] > 80).values].head()


Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mei Sun,A,Senior,Male,89.0,N
Qiang Zheng,A,Senior,Male,87.0,N
Qiang Han,B,Freshman,Male,87.0,N
Chengpeng Zhou,C,Senior,Male,81.0,N
Feng Han,A,Sophomore,Male,82.0,N


In [75]:
# The query method
df.query('Weight > 80')
df.query('Weight > Weight.mean()').head()
df.query(
    '(Grade not in ["Freshman", "Sophomore"]) and'
    '(Gender == "Male")'
).head()

# Comparing a column with a list, written with == and with in
df.query('Grade == ["Junior", "Senior"]').head()
df.query('Grade in ["Junior", "Senior"]').head()

# Python variables are referenced inside the expression with an @ prefix
low, high = 70, 80
df.query('(Weight >= @low) & (Weight <= @high)').head()

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Changqiang You,B,Freshman,Male,70.0,N
Gaojuan You,C,Sophomore,Male,74.0,N
Xiaopeng Zhou,A,Freshman,Male,74.0,N
Xiaofeng Sun,D,Senior,Male,71.0,N
Gaoli Zhao,B,Freshman,Male,78.0,N


In [79]:
# Random sampling
# Five rows; the last value (90) is far larger than the other four.
df_sample = pd.DataFrame(
    data={
        'id': ['a', 'b', 'c', 'd', 'e'],
        'value': [1, 2, 3, 4, 90],
    }
)

df_sample

Unnamed: 0,id,value
0,a,1
1,b,2
2,c,3
3,d,4
4,e,90


In [108]:
# Draw 3 rows with replacement, using the `value` column as relative weights.
# random_state pins the draw so that re-running the notebook shows the same rows.
df_sample.sample(n=3, replace=True, weights=df_sample.value, random_state=0)

Unnamed: 0,id,value
4,e,90
4,e,90
4,e,90


In [42]:
# Multi-level indexes and the structure of the resulting table

df = pd.read_csv('data/learn_pandas.csv')

# Row index: every (School, Gender) pair.
multi_index = pd.MultiIndex.from_product(
    [list('ABCD'), df.Gender.unique()],
    names=('School', 'Gender'),
)

# Column index: every (Indicator, Grade) pair. The DataFrame below passes this
# as `columns`, so it has to be defined in this cell; with the definition
# commented out the cell raises NameError on a fresh kernel.
multi_column = pd.MultiIndex.from_product(
    [['Height', 'Weight'], df.Grade.unique()],
    names=('Indicator', 'Grade'),
)

# Two 8x4 random blocks placed side by side: the left one is centred on 163,
# the right one on 65.
# NOTE(review): assumes the CSV holds 2 distinct Gender values and 4 distinct
# Grade values, so both indexes have 8 labels to match the 8x8 data — confirm.
df_multi = pd.DataFrame(
    np.c_[(np.random.randn(8, 4) * 5 + 163).tolist(),
          (np.random.randn(8, 4) * 5 + 65).tolist()],
    index=multi_index,
    columns=multi_column,
).round(1)

df_multi

Unnamed: 0_level_0,Indicator,Height,Height,Height,Height,Weight,Weight,Weight,Weight
Unnamed: 0_level_1,Grade,Freshman,Senior,Sophomore,Junior,Freshman,Senior,Sophomore,Junior
School,Gender,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
A,Female,167.1,158.9,166.5,170.4,64.6,65.1,64.0,69.1
A,Male,154.0,159.6,168.9,159.2,73.2,56.4,65.7,69.8
B,Female,159.9,161.7,159.6,164.5,61.7,60.2,72.4,65.0
B,Male,169.6,156.2,158.5,160.8,64.9,68.0,69.1,61.2
C,Female,174.8,171.2,163.1,169.3,61.2,62.2,55.5,66.5
C,Male,164.9,156.3,154.5,158.7,61.4,67.4,60.3,65.3
D,Female,161.5,162.1,168.8,169.8,65.4,63.7,61.8,73.1
D,Male,160.9,168.3,161.8,169.4,67.7,64.2,70.3,68.6


In [43]:
# Level names of the row index and of the column index
df_multi.index.names
df_multi.columns.names

# Underlying label values of the row index and of the column index
df_multi.index.values
df_multi.columns.values

# Last expression of the cell: the frame itself
df_multi

Unnamed: 0_level_0,Indicator,Height,Height,Height,Height,Weight,Weight,Weight,Weight
Unnamed: 0_level_1,Grade,Freshman,Senior,Sophomore,Junior,Freshman,Senior,Sophomore,Junior
School,Gender,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
A,Female,167.1,158.9,166.5,170.4,64.6,65.1,64.0,69.1
A,Male,154.0,159.6,168.9,159.2,73.2,56.4,65.7,69.8
B,Female,159.9,161.7,159.6,164.5,61.7,60.2,72.4,65.0
B,Male,169.6,156.2,158.5,160.8,64.9,68.0,69.1,61.2
C,Female,174.8,171.2,163.1,169.3,61.2,62.2,55.5,66.5
C,Male,164.9,156.3,154.5,158.7,61.4,67.4,60.3,65.3
D,Female,161.5,162.1,168.8,169.8,65.4,63.7,61.8,73.1
D,Male,160.9,168.3,161.8,169.4,67.7,64.2,70.3,68.6


In [95]:
# Reload the reduced table
df = pd.read_csv(
    'data/learn_pandas.csv',
    usecols=['School', 'Grade', 'Name', 'Gender', 'Weight', 'Transfer'],
)

# Turn two ordinary columns into a two-level row index (no sorting is applied)
df_multi = df.set_index(keys=['School', 'Grade'])
# df_sorted = df_multi.sort_index()

df_multi




Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Gender,Weight,Transfer
School,Grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,Freshman,Gaopeng Yang,Female,46.0,N
B,Freshman,Changqiang You,Male,70.0,N
A,Senior,Mei Sun,Male,89.0,N
C,Sophomore,Xiaojuan Sun,Female,41.0,N
C,Sophomore,Gaojuan You,Male,74.0,N
C,...,...,...,...,...
C,Junior,Xiaojuan Sun,Female,46.0,N
D,Senior,Li Zhao,Female,50.0,N
A,Senior,Chengqiang Chu,Female,45.0,N
A,Senior,Chengmei Shen,Male,71.0,N


In [126]:
# Display df as it is currently bound in the kernel
df

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
0,A,Freshman,Gaopeng Yang,Female,46.0,N
1,B,Freshman,Changqiang You,Male,70.0,N
2,A,Senior,Mei Sun,Male,89.0,N
3,C,Sophomore,Xiaojuan Sun,Female,41.0,N
4,C,Sophomore,Gaojuan You,Male,74.0,N
...,...,...,...,...,...,...
195,C,Junior,Xiaojuan Sun,Female,46.0,N
196,D,Senior,Li Zhao,Female,50.0,N
197,A,Senior,Chengqiang Chu,Female,45.0,N
198,A,Senior,Chengmei Shen,Male,71.0,N


In [130]:
# One row per (School, Grade) pair, indexed by that pair and sorted by it
(
    df
    .drop_duplicates(subset=['School', 'Grade'])
    .set_index(['School', 'Grade'])
    .sort_index()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Gender,Weight,Transfer
School,Grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,Freshman,Gaopeng Yang,Female,46.0,N
A,Junior,Feng Zheng,Female,51.0,N
A,Senior,Mei Sun,Male,89.0,N
A,Sophomore,Yanfeng Qian,Female,48.0,N
B,Freshman,Changqiang You,Male,70.0,N
B,Junior,Juan Xu,Female,,N
B,Senior,Changli Lv,Female,41.0,N
B,Sophomore,Changmei Xu,Female,43.0,N
C,Freshman,Changqiang Yang,Female,49.0,N
C,Junior,Yanli You,Female,48.0,N


In [163]:

# Seed NumPy's global RNG so the integers drawn below repeat on every run
np.random.seed(0)

# Row index: product of the Upper and Lower label lists
L1, L2 = ['A', 'B', 'C'], ['a', 'b', 'c']
mul_index1 = pd.MultiIndex.from_product(
    [L1, L2],
    names=('Upper', 'Lower'),
)

# Column index: product of the Big and Small label lists
L3, L4 = ['D', 'E', 'F'], ['d', 'e', 'f']
mul_index2 = pd.MultiIndex.from_product(
    [L3, L4],
    names=('Big', 'Small'),
)

# 9x9 integers from randint(-9, 10)
df_ex = pd.DataFrame(
    data=np.random.randint(-9, 10, size=(9, 9)),
    index=mul_index1,
    columns=mul_index2,
)

df_ex



Unnamed: 0_level_0,Big,D,D,D,E,E,E,F,F,F
Unnamed: 0_level_1,Small,d,e,f,d,e,f,d,e,f
Upper,Lower,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
A,a,3,6,-9,-6,-6,-2,0,9,-5
A,b,-3,3,-8,-3,-2,5,8,-4,4
A,c,-1,0,7,-4,6,6,-9,9,-6
B,a,8,5,-2,-9,-8,0,-9,1,-6
B,b,2,9,-7,-9,-9,-5,-4,-3,-1
B,c,8,6,-5,0,1,-8,-8,-2,0
C,a,-6,-3,2,5,9,-9,5,-6,3
C,b,1,2,-5,-3,-5,6,-6,3,-5
C,c,-1,5,6,-6,6,4,7,8,-4


In [152]:
# IndexSlice lets slice syntax be written inside a single loc key
idx = pd.IndexSlice

# Rows from 'C' onward, columns from ('D', 'f') onward
df_ex.loc[idx['C':, ('D', 'f'):]]


Unnamed: 0_level_0,Big,D,E,E,E,F,F,F
Unnamed: 0_level_1,Small,f,d,e,f,d,e,f
Upper,Lower,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
C,a,2,5,9,-9,5,-6,3
C,b,-5,-3,-5,6,-6,3,-5
C,c,6,-6,6,4,7,8,-4


In [165]:
# Constructing a MultiIndex

# from_tuples: one tuple per entry
my_tuple = [('a', 'cat'), ('a', 'dog'), ('b', 'cat'), ('b', 'dog')]
pd.MultiIndex.from_tuples(my_tuple, names=['First', 'Second'])

# from_arrays: one list per level
my_array = [['a', 'a', 'b', 'b'], ['cat', 'dog', 'cat', 'dog']]
pd.MultiIndex.from_arrays(my_array, names=['First', 'Second'])

# from_product: built from the Cartesian product of the given lists
my_list1 = ['a', 'b', 'c']
my_list2 = ['cat', 'dog', 'dqs']
pd.MultiIndex.from_product(
    [my_list1, my_list2],
    names=['First', 'Second'],
)

                           


MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('a', 'dqs'),
            ('b', 'cat'),
            ('b', 'dog'),
            ('b', 'dqs'),
            ('c', 'cat'),
            ('c', 'dog'),
            ('c', 'dqs')],
           names=['First', 'Second'])

In [183]:

# Reload the CSV without a usecols filter, rebinding df
df = pd.read_csv('data/learn_pandas.csv')

# Display the reloaded frame
df


# data

Unnamed: 0,School,Grade,Name,Gender,Height,Weight,Transfer,Test_Number,Test_Date,Time_Record
0,A,Freshman,Gaopeng Yang,Female,158.9,46.0,N,1,2019/10/5,0:04:34
1,B,Freshman,Changqiang You,Male,166.5,70.0,N,1,2019/9/4,0:04:20
2,A,Senior,Mei Sun,Male,188.9,89.0,N,2,2019/9/12,0:05:22
3,C,Sophomore,Xiaojuan Sun,Female,,41.0,N,2,2020/1/3,0:04:08
4,C,Sophomore,Gaojuan You,Male,174.0,74.0,N,2,2019/11/6,0:05:22
...,...,...,...,...,...,...,...,...,...,...
195,C,Junior,Xiaojuan Sun,Female,153.9,46.0,N,2,2019/10/17,0:04:31
196,D,Senior,Li Zhao,Female,160.9,50.0,N,3,2019/9/22,0:04:03
197,A,Senior,Chengqiang Chu,Female,153.9,45.0,N,1,2020/1/5,0:04:48
198,A,Senior,Chengmei Shen,Male,175.3,71.0,N,2,2020/1/7,0:04:58


In [200]:
# Row index: 2 x 4 label pairs under the level names School and Gender
multi_index = pd.MultiIndex.from_product(
    [['A', 'B'], ['a', 'b', 'c', 'd']],
    names=('School', 'Gender'),
)

# Column index over (Indicator, Grade); built here but not passed to the frame below
multi_column = pd.MultiIndex.from_product(
    [['Height', 'Weight'], df.Grade.unique()],
    names=('Indicator', 'Grade'),
)

# No `columns` argument is given, so only the row index is customised
df_multi = pd.DataFrame(
    data=np.c_[
        (np.random.randn(8, 4) * 5 + 163).tolist(),
        (np.random.randn(8, 4) * 5 + 65).tolist(),
    ],
    index=multi_index,
)

df_multi




Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7
School,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,a,165.536195,162.418851,158.262557,164.222217,66.797522,64.277166,63.192004,70.322926
A,b,170.006724,160.948091,165.644718,164.230739,60.310599,67.16554,62.970291,68.621843
A,c,167.317598,158.976231,174.733235,156.604194,71.926308,63.484509,67.205165,65.893964
A,d,161.172245,167.690463,164.483666,167.149931,61.002888,66.203938,66.445603,67.064354
B,a,160.519488,162.625975,163.06116,170.846298,64.008006,65.470962,59.261945,63.20943
B,b,166.452145,166.983361,159.71037,167.844413,67.779813,69.462369,62.888426,65.52357
B,c,164.127908,169.945727,173.070301,161.466171,66.140267,66.0074,67.703868,55.909612
B,d,160.968484,158.679775,162.282102,161.089873,64.75338,66.195168,59.998348,73.369929


In [280]:
# Three label lists for the rows
L1, L2, L3 = ['A', 'B'], ['a', 'b'], ['alpha', 'beta']
mul_index1 = pd.MultiIndex.from_product(
    [L1, L2, L3],
    names=('Upper', 'Lower', 'Extra'),
)

# Three label lists for the columns
L4, L5, L6 = ['C', 'D'], ['c', 'd'], ['cat', 'dog']
mul_index2 = pd.MultiIndex.from_product(
    [L4, L5, L6],
    names=('Big', 'Small', 'Other'),
)

# 8x8 integers from randint(-9, 10), labelled with three levels on each axis
df_ex = pd.DataFrame(
    data=np.random.randint(-9, 10, size=(8, 8)),
    index=mul_index1,
    columns=mul_index2,
)

df_ex



Unnamed: 0_level_0,Unnamed: 1_level_0,Big,C,C,C,C,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Other,cat,dog,cat,dog,cat,dog,cat,dog
Upper,Lower,Extra,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
A,a,alpha,2,1,-5,5,-2,-4,5,9
A,a,beta,5,6,-9,-3,0,-8,2,-3
A,b,alpha,-1,-5,3,-6,-4,3,9,-9
A,b,beta,-8,4,7,2,5,4,-1,-6
B,a,alpha,0,8,-4,2,4,0,4,4
B,a,beta,7,8,4,-2,8,-5,2,6
B,b,alpha,-1,-3,-2,2,-4,8,0,-9
B,b,beta,-3,-2,9,-3,-6,-4,0,-4


In [218]:
# Exchange column levels 0 and 2
df_ex.swaplevel(i=0, j=2, axis=1)



Unnamed: 0_level_0,Unnamed: 1_level_0,Other,cat,dog,cat,dog,cat,dog,cat,dog
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Big,C,C,C,C,D,D,D,D
Upper,Lower,Extra,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
A,a,alpha,-1,-6,7,2,9,6,-6,-5
A,a,beta,-2,-6,2,6,4,6,-8,2
A,b,alpha,-4,-5,-9,-4,2,4,-2,-1
A,b,beta,2,-8,5,-3,2,1,-9,9
B,a,alpha,-1,-3,2,-6,-5,-5,8,6
B,a,beta,-1,0,9,8,-2,-6,0,6
B,b,alpha,7,-2,-6,-2,-1,-4,-4,-8
B,b,beta,5,-3,8,-1,-8,-7,7,-7


In [287]:
# Put the three column levels in reverse order
df_ex.reorder_levels([2, 1, 0], axis=1).head()

# Rename one label inside column level 2
df_ex.rename(columns={'cat': 'not_cat'}, level=2).head()

# Upper-case the labels of row level 2
df_ex.rename(index=str.upper, level=2).head()

# Rename the levels themselves rather than their labels
df_ex.rename_axis(
    index={'Upper': 'Changed_row'},
    columns={'Other': 'Changed_col'},
).head()

# Work on a copy so that df_ex keeps its own index
df_temp = df_ex.copy()

# Join the three labels of every row entry into one '-'-separated string
new_idx = df_temp.index.map(lambda labels: '-'.join(labels))
df_temp.index = new_idx

df_temp

# Split each string on '-' again and map it back to a tuple
new_idx = df_temp.index.map(lambda x: tuple(x.split('-')))
df_temp.index = new_idx

df_temp



Unnamed: 0_level_0,Unnamed: 1_level_0,Big,C,C,C,C,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,Small,c,c,d,d,c,c,d,d
Unnamed: 0_level_2,Unnamed: 1_level_2,Other,cat,dog,cat,dog,cat,dog,cat,dog
A,a,alpha,2,1,-5,5,-2,-4,5,9
A,a,beta,5,6,-9,-3,0,-8,2,-3
A,b,alpha,-1,-5,3,-6,-4,3,9,-9
A,b,beta,-8,4,7,2,5,4,-1,-6
B,a,alpha,0,8,-4,2,4,0,4,4
B,a,beta,7,8,4,-2,8,-5,2,6
B,b,alpha,-1,-3,-2,2,-4,8,0,-9
B,b,beta,-3,-2,9,-3,-6,-4,0,-4


In [299]:
# Small frame for the set_index / reset_index examples
df_new = pd.DataFrame(
    data={
        'A': ['a', 'a', 'c', 'd'],
        'B': ['P', 'Q', 'R', 'T'],
        'C': [1, 2, 3, 4],
    }
)

df_new

# Use column A as the index
df_new.set_index('A')

# append=True keeps the current index and adds A to it
df_new.set_index('A', append=True)

# A named Series can be passed as an index level next to a column name
my_index = pd.Series(list('WXYZ'), name='D')

# Index by (A, D), rebinding the name instead of mutating in place ...
df_new = df_new.set_index(['A', my_index])

# ... then turn both index levels back into ordinary columns
df_new = df_new.reset_index()

df_new



Unnamed: 0,A,D,B,C
0,a,W,P,1
1,a,X,Q,2
2,c,Y,R,3
3,d,Z,T,4


In [320]:
# Frame for the reindex example.
# The index labels are stored out of order ('1003' before '1002'). That is the
# order shown in the recorded outputs, and it gives reindex rows to reorder;
# with the labels already sorted, reindexing would only append the missing one.
df_reindex = pd.DataFrame(
    {"Weight": [60, 70, 80],
     "Height": [176, 180, 179]},
    index=['1001', '1003', '1002'],
)

df_reindex

Unnamed: 0,Weight,Height
1001,60,176
1003,70,180
1002,80,179


In [323]:
# Request rows and columns by label; '1004' is not a label of df_reindex
df_reindex.reindex(
    index=['1001', '1002', '1003', '1004'],
    columns=['Weight', 'Height'],
)

Unnamed: 0,Weight,Height
1001,60.0,176.0
1002,80.0,179.0
1003,70.0,180.0
1004,,


In [5]:
# Frame whose index is named 'id1' and repeats the label 'a'
df_set_1 = pd.DataFrame(
    data=[[0, 1], [1, 2], [3, 4]],
    index=pd.Index(['a', 'b', 'a'], name='id1'),
)

df_set_1



Unnamed: 0_level_0,0,1
id1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,1
b,1,2
a,3,4


In [6]:
# Frame whose index is named 'id2' and repeats the label 'b'
df_set_2 = pd.DataFrame(
    data=[[4, 5], [2, 6], [7, 1]],
    index=pd.Index(['b', 'b', 'c'], name='id2'),
)

df_set_2

Unnamed: 0_level_0,0,1
id2,Unnamed: 1_level_1,Unnamed: 2_level_1
b,4,5
b,2,6
c,7,1


In [16]:
# De-duplicated labels of each frame's index
id1 = df_set_1.index.unique()
id2 = df_set_2.index.unique()

id1, id2


(Index(['a', 'b'], dtype='object', name='id1'),
 Index(['b', 'c'], dtype='object', name='id2'))

In [18]:
# Set operations on the two indexes: intersection and symmetric difference
(
    id1.intersection(id2),
    id1.symmetric_difference(id2),
)

(Index(['b'], dtype='object'), Index(['a', 'c'], dtype='object'))

In [29]:
# NOTE(review): `index` is not assigned in any cell visible in this notebook, so
# this call presumably raises NameError on a fresh kernel — confirm which labels
# were intended here.
# NOTE(review): df_set_1's index repeats the label 'a'; verify that reindex
# accepts an index with duplicate labels before relying on this cell.
df_set_1.reindex(index )

# df_set_2.reset_index()

Unnamed: 0_level_0,0,1
id1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,1
b,1,2
a,3,4
