# Pandas

## 一、准备工作

In [11]:
import numpy as np
import pandas as pd

## 二、生成对象

### 1. 使用列表创建一个 Series

In [12]:
# Build a Series from a plain list; np.nan is a float, so the result is float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

- 一个 series 是一个一维的标记数组，可以容纳任何数据类型（整数、字符串、浮点数、Python 对象）；
- 与 Python 列表不同，一个 series 总是包含相同类型的数据。

### 2. 使用列表创建一个 DataFrame

In [14]:
# A flat list of strings becomes a one-column DataFrame whose column label is 0.
lst = [
    'Gamer47',
    'Shox',
    'Simple',
]
df = pd.DataFrame(data=lst)
df

Unnamed: 0,0
0,Gamer47
1,Shox
2,Simple


### 3. 使用 Series 字典对象生成 DataFrame

In [17]:
# Each key becomes a column. Scalars (A, B, F) are repeated for every row, while
# C, D and E each supply four values and therefore fix the number of rows.
df2 = pd.DataFrame(dict(
    A=1,
    B=pd.Timestamp('20210219'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3, 3, 3, 3], dtype='int32'),
    E=pd.Categorical(['test', 'train', 'test', 'train']),
    F='foo',
))
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2021-02-19,1.0,3,test,foo
1,1,2021-02-19,1.0,3,train,foo
2,1,2021-02-19,1.0,3,test,foo
3,1,2021-02-19,1.0,3,train,foo


### 4. 创建一个空的 DataFrame

In [18]:
# Calling the constructor with no arguments creates a DataFrame with no rows and no columns.
df = pd.DataFrame()
df

## 三、查看数据

In [21]:
# Six consecutive days starting 2021-02-19 serve as the row labels of a
# 6x4 frame filled with standard-normal random numbers.
dates = pd.date_range(start='20210219', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    columns=['A', 'B', 'C', 'D'],
    index=dates,
)
df

Unnamed: 0,A,B,C,D
2021-02-19,-0.085578,-1.355016,-0.827,-0.689935
2021-02-20,-0.948585,-0.349289,-0.267233,0.844029
2021-02-21,-1.777166,0.739841,-0.819841,1.020413
2021-02-22,-0.717299,0.364915,2.610492,0.020046
2021-02-23,-0.110647,-0.755049,-0.182103,1.972917
2021-02-24,0.02409,1.309118,-0.184814,0.630202


### 1. 查看头部数据

In [22]:
# Show the first rows of df; with no argument head() returns five rows.
df.head()

Unnamed: 0,A,B,C,D
2021-02-19,-0.085578,-1.355016,-0.827,-0.689935
2021-02-20,-0.948585,-0.349289,-0.267233,0.844029
2021-02-21,-1.777166,0.739841,-0.819841,1.020413
2021-02-22,-0.717299,0.364915,2.610492,0.020046
2021-02-23,-0.110647,-0.755049,-0.182103,1.972917


### 2. 查看尾部数据

In [23]:
# Show the last rows of df; with no argument tail() returns five rows.
df.tail()

Unnamed: 0,A,B,C,D
2021-02-20,-0.948585,-0.349289,-0.267233,0.844029
2021-02-21,-1.777166,0.739841,-0.819841,1.020413
2021-02-22,-0.717299,0.364915,2.610492,0.020046
2021-02-23,-0.110647,-0.755049,-0.182103,1.972917
2021-02-24,0.02409,1.309118,-0.184814,0.630202


### 3. 查看数据的统计摘要

In [24]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.602531,-0.00758,0.054917,0.632945
std,0.694915,0.992184,1.28774,0.90729
min,-1.777166,-1.355016,-0.827,-0.689935
25%,-0.890763,-0.653609,-0.681689,0.172585
50%,-0.413973,0.007813,-0.226024,0.737116
75%,-0.091845,0.64611,-0.182781,0.976317
max,0.02409,1.309118,2.610492,1.972917


### 4. 查看索引和列名

In [25]:
# The row labels of df (here a DatetimeIndex of six consecutive days).
df.index

DatetimeIndex(['2021-02-19', '2021-02-20', '2021-02-21', '2021-02-22',
               '2021-02-23', '2021-02-24'],
              dtype='datetime64[ns]', freq='D')

In [26]:
# The column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

## 四、索引

### 1. 简述 Pandas Index

pandas 支持四种类型的多轴索引，它们统称索引器：

- DataFrame[] 称为索引运算符
- DataFrame.loc[] 用于标签
- DataFrame.iloc[] 用于基于位置和整数
- DataFrame.ix[] 用于基于标签和整数（注意：`.ix` 自 pandas 0.20 起已弃用，并在 pandas 1.0 中被移除，请改用 `.loc[]` 或 `.iloc[]`）

### 2. 定义重新索引 (Reindexing)

In [33]:
# Build a 20-row demo frame: a daily date column, an evenly spaced ramp,
# uniform noise, a random category label and normally distributed values.
N = 20
df = pd.DataFrame(dict(
    A=pd.date_range(start='20210219', periods=N, freq='D'),
    x=np.linspace(0, N - 1, N),
    y=np.random.rand(N),
    C=np.random.choice(['Low', 'Medium', 'High'], N).tolist(),
    D=np.random.normal(100, 10, size=N).tolist(),
))

# reindex keeps only the requested row/column labels. 'B' is not a column of
# df, so it is created and filled with NaN.
df_reindexed = df.reindex(columns=['A', 'C', 'B'], index=[0, 2, 5])
df_reindexed

Unnamed: 0,A,C,B
0,2021-02-19,Medium,
2,2021-02-21,Medium,
5,2021-02-24,Medium,


### 3. 设置索引

DataFrame.set_index() 用于将一个或多个现有的列（或者与 DataFrame 等长的列表、Series、Index）设置为 DataFrame 的索引。

**语法：** `DataFrame.set_index(keys, inplace=False)`

In [34]:
# Promote column 'A' to the index. The result is bound to a new name instead of
# using inplace=True: `df` stays intact, so this cell can be re-run without a
# KeyError (after an in-place call, 'A' would no longer be a column).
df_indexed = df.set_index('A')
df_indexed.head()

Unnamed: 0_level_0,x,y,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-02-19,0.0,0.19956,Medium,111.33336
2021-02-20,1.0,0.602096,High,83.369119
2021-02-21,2.0,0.575078,Medium,99.012966
2021-02-22,3.0,0.860347,Medium,105.831498
2021-02-23,4.0,0.445448,High,113.16763


### 4. 重置索引

pandas.Series.reset_index(): 生成一个新的 DataFrame 或带有重置索引的 Series.

In [35]:
# Attach the drink names as index labels at construction time.
index_ = ['Coca Cola', 'Sprite', 'Coke', 'Fanta', 'Dew', 'ThumbsUp']
sr = pd.Series([10, 25, 3, 11, 24, 6], index=index_)
sr

Coca Cola    10
Sprite       25
Coke          3
Fanta        11
Dew          24
ThumbsUp      6
dtype: int64

In [36]:
# reset_index() on a Series returns a DataFrame: the old labels move into a
# column named 'index', the values go into column 0, and rows are renumbered from 0.
result = sr.reset_index()
result

Unnamed: 0,index,0
0,Coca Cola,10
1,Sprite,25
2,Coke,3
3,Fanta,11
4,Dew,24
5,ThumbsUp,6


## 五、选择需要的数据

### 1. 获取数据

#### 1.1 创建数据

In [37]:
# Fresh 6x4 random frame, indexed by six consecutive days from 2021-02-20.
dates = pd.date_range(start='20210220', periods=6)
df = pd.DataFrame(
    np.random.randn(6, 4),
    columns=['A', 'B', 'C', 'D'],
    index=dates,
)
df

Unnamed: 0,A,B,C,D
2021-02-20,0.237543,-1.007727,2.068932,-2.940287
2021-02-21,-2.189762,-0.298827,-0.735149,0.232555
2021-02-22,0.280077,0.741312,0.520303,0.186196
2021-02-23,-1.196701,0.299779,1.731046,0.716443
2021-02-24,-0.372734,-0.208142,1.312494,0.443122
2021-02-25,0.364872,2.259915,0.398213,-1.40686


#### 1.2 选择单列，产生 Series

In [38]:
# Selecting one column by label returns a Series named after that column.
df['A']

2021-02-20    0.237543
2021-02-21   -2.189762
2021-02-22    0.280077
2021-02-23   -1.196701
2021-02-24   -0.372734
2021-02-25    0.364872
Freq: D, Name: A, dtype: float64

#### 1.3 用 [] 切片行

In [39]:
# An integer slice inside [] selects rows by position: rows 0, 1 and 2 (stop excluded).
df[0:3]

Unnamed: 0,A,B,C,D
2021-02-20,0.237543,-1.007727,2.068932,-2.940287
2021-02-21,-2.189762,-0.298827,-0.735149,0.232555
2021-02-22,0.280077,0.741312,0.520303,0.186196


In [41]:
# A label slice inside [] selects rows by index label; both endpoint dates are included.
df['2021-02-22': '2021-02-24']

Unnamed: 0,A,B,C,D
2021-02-22,0.280077,0.741312,0.520303,0.186196
2021-02-23,-1.196701,0.299779,1.731046,0.716443
2021-02-24,-0.372734,-0.208142,1.312494,0.443122


### 2. 按标签选择

#### 2.1 用标签提取一行数据

In [42]:
# Select the row labelled dates[0]; the result is a Series keyed by column name.
df.loc[dates[0]]

A    0.237543
B   -1.007727
C    2.068932
D   -2.940287
Name: 2021-02-20 00:00:00, dtype: float64

#### 2.2 用标签选择多列数据

In [43]:
# Rows from 2021-02-22 through 2021-02-24 (both endpoints included), columns A and B only.
df.loc['20210222': '20210224', ['A', 'B']]

Unnamed: 0,A,B
2021-02-22,0.280077,0.741312
2021-02-23,-1.196701,0.299779
2021-02-24,-0.372734,-0.208142


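#### 2.3 用标签切片，包含行与列结束点

与按位置切片不同，按标签切片同时包含起点和终点：例如 `df.loc['20210222': '20210224', ['A', 'B']]` 返回的是 2021-02-22 至 2021-02-24（含两端）的行。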

#### 2.4 返回对象降维

In [44]:
# A single row label combined with a list of columns yields a Series, not a DataFrame.
df.loc['20210222', ['A', 'B']]

A    0.280077
B    0.741312
Name: 2021-02-22 00:00:00, dtype: float64

#### 2.5 提取标量值

In [45]:
# One row label plus one column label returns a single scalar value.
df.loc[dates[0], 'A']

0.2375428883849271

#### 2.6 快速访问标量

In [46]:
# Fetch a single scalar by row label and column label through the .at accessor.
df.at[dates[0], 'A']

0.2375428883849271

### 3. 按位置选择

#### 3.1 用整数位置选择

In [47]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A   -1.196701
B    0.299779
C    1.731046
D    0.716443
Name: 2021-02-23 00:00:00, dtype: float64

#### 3.2 用整数切片

In [48]:
# Rows at positions 3-4 and columns at positions 0-1; the stop positions are excluded.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2021-02-23,-1.196701,0.299779
2021-02-24,-0.372734,-0.208142


#### 3.3 用整数列表按位置切片

In [49]:
# Pick rows at positions 1, 2, 4 and columns at positions 0, 2.
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2021-02-21,-2.189762,-0.735149
2021-02-22,0.280077,0.520303
2021-02-24,-0.372734,1.312494


#### 3.4 显式整行切片

In [50]:
# Rows at positions 1-2 with every column (':' selects the whole axis).
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2021-02-21,-2.189762,-0.298827,-0.735149,0.232555
2021-02-22,0.280077,0.741312,0.520303,0.186196


#### 3.5 显式整列切片

In [51]:
# Every row, columns at positions 1-2.
df.iloc[:, 1:3]

Unnamed: 0,B,C
2021-02-20,-1.007727,2.068932
2021-02-21,-0.298827,-0.735149
2021-02-22,0.741312,0.520303
2021-02-23,0.299779,1.731046
2021-02-24,-0.208142,1.312494
2021-02-25,2.259915,0.398213


#### 3.6 显式提取值

In [52]:
# Scalar at row position 1, column position 1.
df.iloc[1, 1]

-0.2988271051833723

#### 3.7 快速访问标量

In [53]:
# Scalar at row position 1, column position 1, read through the .iat accessor.
df.iat[1, 1]

-0.2988271051833723

### 4. 布尔索引

#### 4.1 用单列的值选择数据

In [54]:
# Keep only the rows whose value in column A is greater than 0.
df[df.A > 0]

Unnamed: 0,A,B,C,D
2021-02-20,0.237543,-1.007727,2.068932,-2.940287
2021-02-22,0.280077,0.741312,0.520303,0.186196
2021-02-25,0.364872,2.259915,0.398213,-1.40686


#### 4.2 选择 DataFrame 里满足条件的值

In [55]:
# Element-wise mask: values greater than 0 are kept, every other cell becomes NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2021-02-20,0.237543,,2.068932,
2021-02-21,,,,0.232555
2021-02-22,0.280077,0.741312,0.520303,0.186196
2021-02-23,,0.299779,1.731046,0.716443
2021-02-24,,,1.312494,0.443122
2021-02-25,0.364872,2.259915,0.398213,


#### 4.3 用 isin() 筛选

In [60]:
# Add a label column E to a copy of df, then keep only the rows whose
# label is one of the requested values.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2.loc[df2.E.isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2021-02-22,0.280077,0.741312,0.520303,0.186196,two
2021-02-24,-0.372734,-0.208142,1.312494,0.443122,four


### 5. 赋值

#### 5.1 用索引自动对齐新增列的数据

In [61]:
# Integers 1..6 labelled with six consecutive days starting 2021-02-20.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start='20210220', periods=6),
)
s1

2021-02-20    1
2021-02-21    2
2021-02-22    3
2021-02-23    4
2021-02-24    5
2021-02-25    6
Freq: D, dtype: int64

In [68]:
# Add column F from s1. Values are aligned on the index labels; s1 carries the
# same six dates as df, so every row receives a value.
df['F'] = s1

#### 5.2 按标签赋值

In [69]:
# Set the cell at row label dates[0], column 'A' to 0.
df.at[dates[0], 'A'] = 0

#### 5.3 按位置赋值

In [70]:
# Set the cell at row position 0, column position 1 (column B) to 0.
df.iat[0, 1] = 0

#### 5.4 按 NumPy 数组赋值

In [71]:
# Overwrite every row of column D with 5, using a NumPy array with one entry per row of df.
df.loc[:, 'D'] = np.array([5] * len(df))

In [72]:
# Display df after the assignments made in the preceding cells.
df

Unnamed: 0,A,B,C,D,F
2021-02-20,0.0,0.0,2.068932,5,1
2021-02-21,-2.189762,-0.298827,-0.735149,5,2
2021-02-22,0.280077,0.741312,0.520303,5,3
2021-02-23,-1.196701,0.299779,1.731046,5,4
2021-02-24,-0.372734,-0.208142,1.312494,5,5
2021-02-25,0.364872,2.259915,0.398213,5,6


#### 5.5 用 where 条件赋值

In [73]:
# Work on a copy so that assignments to df2 do not modify df.
df2 = df.copy()

In [74]:
# Wherever df2 holds a positive value, replace it with the matching value from
# -df2 (its negation); non-positive cells are left unchanged.
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2021-02-20,0.0,0.0,-2.068932,-5,-1
2021-02-21,-2.189762,-0.298827,-0.735149,-5,-2
2021-02-22,-0.280077,-0.741312,-0.520303,-5,-3
2021-02-23,-1.196701,-0.299779,-1.731046,-5,-4
2021-02-24,-0.372734,-0.208142,-1.312494,-5,-5
2021-02-25,-0.364872,-2.259915,-0.398213,-5,-6


## 六、运算

### 1. 如何得到一个数列的最小值、第25百分位、中值、第75百分位和最大值？

In [77]:
from numpy import percentile  # kept: the next cell calls percentile() by its bare name

# Draw 22 samples from a normal distribution (mean 14, std 6) using a dedicated
# RandomState seeded with 120, so the same values are produced on every run.
# The earlier unseeded draw that used to precede this was overwritten
# immediately and has been dropped.
state = np.random.RandomState(120)
p = pd.Series(state.normal(14, 6, 22))

In [78]:
# q=[0, 25, 50, 75, 100] gives the minimum, the quartiles (including the
# median) and the maximum in a single call.
np.percentile(p, q=[0, 25, 50, 75, 100])

array([ 4.61498692, 12.15572753, 14.67780756, 17.58054104, 33.24975515])

### 2. 如何获得 pandas DataFrame 中一个列的平均值？

In [79]:
# Same table written row by row: each inner list is one row (A, B, C, D).
df = pd.DataFrame(
    [
        [12, 5, 20, 14],
        [4, 2, 16, 3],
        [5, 54, 7, 17],
        [44, 3, 3, 2],
        [1, 2, 8, 6],
    ],
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
0,12,5,20,14
1,4,2,16,3
2,5,54,7,17
3,44,3,3,2
4,1,2,8,6


In [80]:
# axis=0 averages down the rows, producing one mean per column.
df.mean(axis=0)

A    13.2
B    13.2
C    10.8
D     8.4
dtype: float64

### 3. 如何将函数应用到 DataFrame 中的每一行数据？

In [82]:
def add(a, b, c):
    """Return the sum of the three arguments."""
    partial = a + b
    return partial + c


def main():
    """Build a 3x3 frame, append a row-wise 'add' column and print before/after."""
    df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
    print('Original DataFrame:\n', df)

    def row_total(row):
        # axis=1 hands each row to this function as a Series keyed by column name.
        return add(row['A'], row['B'], row['C'])

    df['add'] = df.apply(row_total, axis=1)
    print('\nAfter Applying Function: ')
    print(df)


if __name__ == '__main__':
    main()

Original DataFrame:
    A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

After Applying Function: 
   A  B  C  add
0  1  4  7   12
1  2  5  8   15
2  3  6  9   18


### 4. 如何在 pandas 中获得一个 DataFrame 的行数和列数？

In [84]:
# One record per student, stored column-wise.
raw_data = {
    'name': ['Willard Morris', 'Al Jennings', 'Omar Nullins', 'Spencer MacDaniel'],
    'age': [20, 19, 22, 21],
    'favorite_color': ['blue', 'red', 'yellow', 'green'],
    'grade': [88, 92, 95, 70],
}
# The column order is taken from the dict's own key order.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,name,age,favorite_color,grade
0,Willard Morris,20,blue,88
1,Al Jennings,19,red,92
2,Omar Nullins,22,yellow,95
3,Spencer MacDaniel,21,green,70


In [85]:
# shape is a (number of rows, number of columns) tuple.
df.shape

(4, 4)

### 5. 如何在 pandas DataFrame 中获得列值的总和？

In [87]:
# Sum each column (axis=0), skipping missing values. String columns are "summed"
# by concatenation, as the name and favorite_color entries of the output show.
# NOTE(review): if only numeric totals are wanted, consider passing
# numeric_only=True - confirm the intent before changing.
df.sum(axis=0, skipna=True)

name              Willard MorrisAl JenningsOmar NullinsSpencer M...
age                                                              82
favorite_color                                   blueredyellowgreen
grade                                                           345
dtype: object

## 七、合并

### 1. 如何将新行追加到 pandas DataFrame?

In [88]:
# Two small frames with the same columns; the second has one row fewer.
df1 = pd.DataFrame({
    'a': [1, 2, 3, 4],
    'b': [5, 6, 7, 8],
})
df = df1  # `df` is a second name bound to the very same object as df1
df2 = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [5, 6, 7],
})

In [89]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# stacks the rows of df2 under those of df1 and keeps each frame's original
# index labels, so labels 0-2 appear twice in the result.
pd.concat([df1, df2])

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8
0,1,5
1,2,6
2,3,7


In [90]:
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
# ignore_index=True discards the original labels and renumbers the rows from 0.
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8
4,1,5
5,2,6
6,3,7


## 八、分组 (Grouping)

In [91]:
# Two categorical key columns (A, B) plus two columns of standard-normal noise.
df = pd.DataFrame(dict(
    A='foo bar foo bar foo bar foo foo'.split(),
    B='one one two three two two one three'.split(),
    C=np.random.randn(8),
    D=np.random.randn(8),
))
df

Unnamed: 0,A,B,C,D
0,foo,one,1.440161,1.514855
1,bar,one,0.117114,1.029479
2,foo,two,0.892702,-1.129713
3,bar,three,0.492999,-0.74444
4,foo,two,-1.906507,2.964365
5,bar,two,0.995856,-0.697346
6,foo,one,-0.938861,-0.744972
7,foo,three,-0.090369,-1.441745


### 1. 先分组，再用 sum() 函数计算每组的汇总数据

In [92]:
# Sum the numeric columns (C and D) within each group of A.
# numeric_only=True is required on pandas 2.x: without it the string column B
# is also "summed" (concatenated per group), which does not match the output
# shown below.
df.groupby('A').sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.605968,-0.412307
foo,-0.602874,1.16279


### 2. 多列分组后，生成多层索引，也可以应用 sum() 函数

In [93]:
# Group by the (A, B) pair and sum within each group; the result is indexed by both keys.
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.117114,1.029479
bar,three,0.492999,-0.74444
bar,two,0.995856,-0.697346
foo,one,0.5013,0.769883
foo,three,-0.090369,-1.441745
foo,two,-1.013805,1.834652


## 九、重塑 (Reshaping)

### 1. 如何将 numpy 数组转换为给定形状的 DataFrame?

In [94]:
# Eight random integers from 1 to 6 (the upper bound 7 is exclusive).
p = pd.Series(np.random.randint(low=1, high=7, size=8))
p

0    4
1    4
2    3
3    5
4    2
5    2
6    1
7    4
dtype: int64

In [96]:
# Reshape the 8 underlying values into 2 rows x 4 columns and wrap them in a DataFrame.
info = pd.DataFrame(data=p.values.reshape((2, 4)))
info

Unnamed: 0,0,1,2,3
0,4,4,3,5
1,2,2,1,4


## 十、数据透视表

透视表是一种可以对数据动态排布并且分类汇总的表格格式，在 pandas 中它被称作 pivot_table。

`pivot_table(data, values=None, index=None, columns=None)`

In [97]:
# 12-row frame: three repeating categorical columns (A, B, C) and two
# columns of standard-normal noise (D, E).
df = pd.DataFrame(dict(
    A=['one', 'one', 'two', 'three'] * 3,
    B=list('ABC') * 4,
    C=['foo'] * 3 * 2 + ['bar'] * 3 * 2 if False else (['foo'] * 3 + ['bar'] * 3) * 2,
    D=np.random.randn(12),
    E=np.random.randn(12),
))
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-1.773447,-1.804661
1,one,B,foo,0.372909,-1.288278
2,two,C,foo,-0.328123,0.860557
3,three,A,bar,1.138231,-1.170637
4,one,B,bar,0.887873,-1.032552
5,one,C,bar,0.2514,-1.009143
6,two,A,foo,-1.099555,0.54803
7,three,B,foo,-0.392592,0.617226
8,one,C,foo,0.454995,-0.36085
9,one,A,bar,1.607073,-0.902033


In [99]:
# Pivot column D: rows are keyed by the (A, B) pair, with one output column per
# distinct value of C. (A, B, C) combinations absent from df show up as NaN.
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,1.607073,-1.773447
one,B,0.887873,0.372909
one,C,0.2514,0.454995
three,A,1.138231,
three,B,,-0.392592
three,C,0.239812,
two,A,,-1.099555
two,B,0.637209,
two,C,,-0.328123


## 十一、pandas DataFrame 里的操作

### 1. 如何将列添加到 pandas DataFrame?

In [101]:
# Column-wise records for four people.
data = dict(
    Name=['Jai', 'Princi', 'Gaurav', 'Anuj'],
    height=[5.1, 6.2, 5.1, 5.2],
    Qualification=['Msc', 'Ma', 'Msc', 'Msc'],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,height,Qualification
0,Jai,5.1,Msc
1,Princi,6.2,Ma
2,Gaurav,5.1,Msc
3,Anuj,5.2,Msc


In [102]:
# Assigning a list to a new column label appends that column to df in place;
# the list supplies one entry per row (four here).
address = ['Delhi', 'Bangalore', 'Chennai', 'Patna']
df['Address'] = address
df

Unnamed: 0,Name,height,Qualification,Address
0,Jai,5.1,Msc,Delhi
1,Princi,6.2,Ma,Bangalore
2,Gaurav,5.1,Msc,Chennai
3,Anuj,5.2,Msc,Patna


### 2. 如何向 pandas DataFrame 添加索引、行或列？

In [103]:
# Two employees, labelled Emp001 and Emp002.
employees = pd.DataFrame(
    {
        'Name': ['John Doe', 'William Spark'],
        'Occupation': ['Chemist', 'Statistician'],
        'Date of Join': ['2018-01-25', '2018-01-26'],
        'Age': [23, 24],
    },
    columns=['Name', 'Occupation', 'Date of Join', 'Age'],
    index=[f'Emp{number:03d}' for number in (1, 2)],
)
print('\n---------- BEFORE ----------\n')
print(employees)

# Assigning to a label that does not exist yet appends a new row under that label.
employees.loc['Emp003'] = ['Sunny', 'Programmer', '2018-01-25', 45]
print('\n---------- AFTER -----------\n')
print(employees)


---------- BEFORE ----------

                 Name    Occupation Date of Join  Age
Emp001       John Doe       Chemist   2018-01-25   23
Emp002  William Spark  Statistician   2018-01-26   24

---------- AFTER -----------

                 Name    Occupation Date of Join  Age
Emp001       John Doe       Chemist   2018-01-25   23
Emp002  William Spark  Statistician   2018-01-26   24
Emp003          Sunny    Programmer   2018-01-25   45


### 3. 如何在 pandas DataFrame 上进行迭代？

In [104]:
# Same three records, written column-wise.
df = pd.DataFrame({'c1': [10, 11, 12], 'c2': [100, 110, 120]})

# iterrows() yields (index label, row as Series) pairs, one per row.
for index, row in df.iterrows():
    print(f"{row['c1']} {row['c2']}")

10 100
11 110
12 120


### 4. 如何排序 DataFrame?

In [106]:
# Random 10x2 frame whose row labels are deliberately shuffled.
unsorted_df = pd.DataFrame(
    data=np.random.randn(10, 2),
    index=[1, 4, 6, 2, 3, 5, 9, 8, 0, 7],
)
# sort_index() orders the rows by label, ascending.
sorted_df = unsorted_df.sort_index(ascending=True)
sorted_df

Unnamed: 0,0,1
0,1.445322,0.043932
1,-0.80591,0.653677
2,-1.464479,0.832871
3,0.218437,1.901344
4,0.14534,0.42003
5,0.604572,0.27274
6,0.321421,-0.211071
7,2.630269,0.396296
8,0.120636,-0.59408
9,1.017284,0.688913


In [107]:
# Same shuffled row labels, now with named columns listed as col2, col1.
unsorted_df = pd.DataFrame(
    data=np.random.randn(10, 2),
    columns=['col2', 'col1'],
    index=[1, 4, 6, 2, 3, 5, 9, 8, 0, 7],
)
# ascending=False orders the row labels from largest to smallest.
sorted_df = unsorted_df.sort_index(ascending=False)
sorted_df

Unnamed: 0,col2,col1
9,0.727287,-0.065425
8,-0.127936,-0.603623
7,-0.633505,-0.756768
6,1.570698,0.626106
5,-0.97378,-0.423858
4,-0.158439,0.432988
3,0.366012,-0.320249
2,0.919731,-0.025178
1,0.204776,0.228481
0,-1.953844,-1.26998


In [108]:
# axis=1 sorts the column labels instead of the row labels (col1 now comes
# before col2); the row order of unsorted_df is left untouched.
sorted_df = unsorted_df.sort_index(axis=1)
sorted_df

Unnamed: 0,col1,col2
1,0.228481,0.204776
4,0.432988,-0.158439
6,0.626106,1.570698
2,-0.025178,0.919731
3,-0.320249,0.366012
5,-0.423858,-0.97378
9,-0.065425,0.727287
8,-0.603623,-0.127936
0,-1.26998,-1.953844
7,-0.756768,-0.633505


In [110]:
# Five rows of (A, B, C) values, given row by row.
data = [
    ['a', 2, 301],
    ['b', 1, 201],
    ['c', 2, 201],
    ['d', 1, 301],
    ['e', 2, 301],
]
df = pd.DataFrame(data=data, columns=list('ABC'))
df

Unnamed: 0,A,B,C
0,a,2,301
1,b,1,201
2,c,2,201
3,d,1,301
4,e,2,301


In [112]:
# Sort by C descending, breaking ties by B ascending. Rebinding the returned
# frame replaces inplace=True, which the pandas style guidance discourages;
# the displayed result is the same.
df = df.sort_values(by=['C', 'B'], ascending=[False, True])
df

Unnamed: 0,A,B,C
3,d,1,301
0,a,2,301
4,e,2,301
1,b,1,201
2,c,2,201


In [116]:
data = [[300, 2, 301], [2, 1, 201], [3, 300, 201], [100, 1, 301], [500, 2, 301]]
# With axis=1, sort_values reorders the COLUMNS, using the values found in the
# row labelled 0 (A=300, B=2, C=301), which gives the column order B, A, C.
# The sort is chained onto the constructor instead of using inplace=True, so
# the cell builds its result in one expression and is safe to re-run.
df = pd.DataFrame(data, columns=['A', 'B', 'C']).sort_values(by=0, axis=1)
df

Unnamed: 0,B,A,C
0,2,300,301
1,1,2,201
2,300,3,201
3,1,100,301
4,2,500,301


### 5. 如何删除 pandas DataFrame 中的行？

In [117]:
# Column-wise records; the row labels are supplied separately via `index`.
data = dict(
    name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    year=[2012, 2012, 2013, 2014, 2014],
    reports=[4, 24, 31, 2, 3],
)
df = pd.DataFrame(
    data=data,
    index=['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'],
)
df

Unnamed: 0,name,year,reports
Cochice,Jason,2012,4
Pima,Molly,2012,24
Santa Cruz,Tina,2013,31
Maricopa,Jake,2014,2
Yuma,Amy,2014,3


In [118]:
# Return a copy of df without the rows labelled 'Cochice' and 'Pima'; df itself is unchanged.
df.drop(index=['Cochice', 'Pima'])

Unnamed: 0,name,year,reports
Santa Cruz,Tina,2013,31
Maricopa,Jake,2014,2
Yuma,Amy,2014,3


### 6. 如何删除 pandas DataFrame 中的列？

In [120]:
# Six students with an id, a name and two exam scores.
test_dict = {
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['Alice', 'Bob', 'Cindy', 'Eric', 'Helen', 'Grace'],
    'math': [90, 89, 99, 78, 97, 93],
    'english': [89, 94, 80, 94, 94, 90],
}
test_dict_df = pd.DataFrame(test_dict)
test_dict_df

Unnamed: 0,id,name,math,english
0,1,Alice,90,89
1,2,Bob,89,94
2,3,Cindy,99,80
3,4,Eric,78,94
4,5,Helen,97,94
5,6,Grace,93,90


In [121]:
# Return a copy of the frame without the 'id' column; test_dict_df itself is unchanged.
test_dict_df.drop(columns=['id'])

Unnamed: 0,name,math,english
0,Alice,90,89
1,Bob,89,94
2,Cindy,99,80
3,Eric,78,94
4,Helen,97,94
5,Grace,93,90


**参照**：https://www.jianshu.com/p/4ab8720071dd