# Pandas的对齐运算 
- 对齐运算是数据清洗的重要过程：运算时按索引对齐，未对齐的位置结果为 NaN，也可以在运算时指定填充值来代替 NaN

In [1]:
import numpy as np
import pandas as pd

## Series的对齐运算  

### 1. Series 按行索引对齐

In [2]:
# Two Series whose labels only partially overlap ('a' and 'c' are shared).
s1 = pd.Series(data=np.arange(4), index=list('abcd'))
s2 = pd.Series(data=np.arange(5), index=list('acefg'))
# Print both Series separated by one blank line.
print(s1, s2, sep='\n\n')

a    0
b    1
c    2
d    3
dtype: int64

a    0
c    1
e    2
f    3
g    4
dtype: int64


### 2.Series的对齐运算 

In [3]:
print(s1 + s2)

a    0.0
b    NaN
c    3.0
d    NaN
e    NaN
f    NaN
g    NaN
dtype: float64


## DataFrame的对齐运算  

### 1.DataFrame按行、列索引对齐

In [4]:
# df1 has rows 0-3 and columns a, b, c; df2 has rows 2-4 and columns a, b, d,
# so the frames share only rows 2-3 and columns a-b.
df1 = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=range(4),
    columns=list('abc'),
)
df2 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=range(2, 5),
    columns=list('abd'),
)
print(df1, df2, sep='\n')

   a   b   c
0  0   1   2
1  3   4   5
2  6   7   8
3  9  10  11
   a  b  d
2  0  1  2
3  3  4  5
4  6  7  8


### 2.DataFrame的对齐运算 

In [5]:
df1 + df2

Unnamed: 0,a,b,c,d
0,,,,
1,,,,
2,6.0,8.0,,
3,12.0,14.0,,
4,,,,


## 填充未对齐的数据进行运算 
- fill_value 
- 使用 add , sub , div , mul 的同时, 
- 通过 fill_value 指定填充值，未对齐的数据将和填充值做运算

## 算术方法表:


        方法                    描述

        add，radd               加法（+） 
        sub，rsub               减法（-） 
        div，rdiv               除法（/） 
        floordiv，rfloordiv     整除（//） 
        mul，rmul               乘法（*） 
        pow，rpow               幂次方（**） 


In [6]:
s1

a    0
b    1
c    2
d    3
dtype: int64

In [7]:
s2

a    0
c    1
e    2
f    3
g    4
dtype: int64

In [8]:
s1.add(s2, fill_value=0)

a    0.0
b    1.0
c    3.0
d    3.0
e    2.0
f    3.0
g    4.0
dtype: float64

In [9]:
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [10]:
df2

Unnamed: 0,a,b,d
2,0,1,2
3,3,4,5
4,6,7,8


In [11]:
# Note: cells (0, d), (1, d) and (4, c) are missing in both df1 and df2,
# so they remain NaN even with fill_value=0.
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,
1,3.0,4.0,5.0,
2,6.0,8.0,8.0,2.0
3,12.0,14.0,11.0,5.0
4,6.0,7.0,,8.0


In [12]:
1/df1

Unnamed: 0,a,b,c
0,inf,1.0,0.5
1,0.333333,0.25,0.2
2,0.166667,0.142857,0.125
3,0.111111,0.1,0.090909


In [13]:
# rdiv 字母r开头，会翻转参数进行运算
df1.rdiv(1)

Unnamed: 0,a,b,c
0,inf,1.0,0.5
1,0.333333,0.25,0.2
2,0.166667,0.142857,0.125
3,0.111111,0.1,0.090909


In [14]:
df1.reindex(columns=df2.columns, fill_value=7)

Unnamed: 0,a,b,d
0,0,1,7
1,3,4,7
2,6,7,7
3,9,10,7


## DataFrame 和 Series 的混合运算

In [15]:
arr = np.arange(12).reshape(3,4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [16]:
print(arr[0])
arr-arr[0]# 注意是广播机制的运算

[0 1 2 3]


array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [17]:
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [18]:
s3 = df1.loc[0]
print(s3)

a    0
b    1
c    2
Name: 0, dtype: int64


In [19]:
df1 - s3

Unnamed: 0,a,b,c
0,0,0,0
1,3,3,3
2,6,6,6
3,9,9,9


In [57]:
s4 = df1['a']
df1.sub(s4, axis=0) # axis='index'

Unnamed: 0,a,b,c
0,0,1,2
1,0,1,2
2,0,1,2
3,0,1,2


# Pandas的函数应用 


In [21]:
import numpy as np
import pandas as pd

## apply 和 applymap 

### 1. 可直接使用NumPy的函数 

In [22]:
df = pd.DataFrame(np.random.randn(5,4))
df

Unnamed: 0,0,1,2,3
0,-0.537792,1.434582,-1.312054,-0.593177
1,-0.918974,-0.708173,-0.866134,-1.089843
2,0.051728,-0.032248,0.626609,-1.268617
3,0.508769,-0.270399,-0.752377,0.965508
4,0.430834,0.68272,0.602135,-1.070027


In [23]:
# 可以直接使用np的函数
np.abs(df)

Unnamed: 0,0,1,2,3
0,0.537792,1.434582,1.312054,0.593177
1,0.918974,0.708173,0.866134,1.089843
2,0.051728,0.032248,0.626609,1.268617
3,0.508769,0.270399,0.752377,0.965508
4,0.430834,0.68272,0.602135,1.070027


### 2. 通过apply将函数应用到列或行上 

In [24]:
# Apply a function to each row or each column with DataFrame.apply.
def f(x):
    """Return the maximum of the values in ``x`` (one row or one column)."""
    return np.max(x)
df.apply(f, axis=1)

0    1.434582
1   -0.708173
2    0.626609
3    0.965508
4    0.682720
dtype: float64

### 3. 通过applymap将函数应用到每个数据上 

In [25]:
# Apply a function to every individual element with applymap.
# NOTE(review): DataFrame.applymap is deprecated from pandas 2.1 in favour of
# DataFrame.map -- confirm the pandas version this notebook targets.
def f2(x):
    """Format a single number as a string with two decimal places."""
    return '%.2f' % x
df.applymap(f2)

Unnamed: 0,0,1,2,3
0,-0.54,1.43,-1.31,-0.59
1,-0.92,-0.71,-0.87,-1.09
2,0.05,-0.03,0.63,-1.27
3,0.51,-0.27,-0.75,0.97
4,0.43,0.68,0.6,-1.07


## 排序 

### 1. 索引排序 
- sort_index()
- 排序默认使用升序排序，ascending=False 为降序排序

In [26]:
s1 = pd.Series(np.arange(4), index=list('dbca'))
print(s1)

d    0
b    1
c    2
a    3
dtype: int64


In [27]:
s1.sort_index() # 默认升序排序

a    3
b    1
c    2
d    0
dtype: int64

In [28]:
s1.sort_index(ascending=False)

d    0
c    2
b    1
a    3
dtype: int64

In [29]:
pd1 = pd.DataFrame(np.arange(12).reshape(4,3), index=list("bdca"), columns=list("BAC"))
pd1

Unnamed: 0,B,A,C
b,0,1,2
d,3,4,5
c,6,7,8
a,9,10,11


In [30]:
# 按照行索引排序
pd1.sort_index()

Unnamed: 0,B,A,C
a,9,10,11
b,0,1,2
c,6,7,8
d,3,4,5


In [59]:
# 按照列索引排序
pd1.sort_index(axis=1)

Unnamed: 0,A,B,C
b,1,0,2
d,4,3,5
c,7,6,8
a,10,9,11


In [60]:
pd1.sort_index().sort_index(axis=1)

Unnamed: 0,A,B,C
a,10,9,11
b,1,0,2
c,7,6,8
d,4,3,5


### 2. 按值排序 
- sort_values(by='column name')
- 根据某个唯一的列名进行排序，如果有其他相同列名则报错。

In [32]:
# Select several labels with a list. A bare tuple key (s1['a', 'c']) is
# treated as a single tuple label / multi-axis key and is rejected by current
# pandas on a flat (non-MultiIndex) index.
s1[['a', 'c']] = np.nan  # assigning NaN upcasts the integer Series to float64
print(s1)

d    0.0
b    1.0
c    NaN
a    NaN
dtype: float64


In [33]:
s1.sort_values(ascending=False)# 当有缺失值时，默认排在最后

b    1.0
d    0.0
c    NaN
a    NaN
dtype: float64

In [34]:
pd1

Unnamed: 0,B,A,C
b,0,1,2
d,3,4,5
c,6,7,8
a,9,10,11


In [35]:
pd1.sort_values(by='A')

Unnamed: 0,B,A,C
b,0,1,2
d,3,4,5
c,6,7,8
a,9,10,11


In [36]:
# Small integer frame used by the sort_values examples below.
pd2 = pd.DataFrame(
    data={
        'a': [3, 5, 9, 1],
        'b': [-3, 0, 2, 4],
        'c': [1, 7, -2, 8],
    }
)
pd2

Unnamed: 0,a,b,c
0,3,-3,1
1,5,0,7
2,9,2,-2
3,1,4,8


In [37]:
pd2.sort_values(by='b')# 指定b列进行排序

Unnamed: 0,a,b,c
0,3,-3,1
1,5,0,7
2,9,2,-2
3,1,4,8


In [38]:
# Sort by several columns in descending order: rows are ordered by 'a' first,
# and 'c' is only consulted to break ties between equal 'a' values
# (the 'a' values here are all distinct, so 'c' has no visible effect).
pd2.sort_values(by=['a', 'c'], ascending=False)

Unnamed: 0,a,b,c
2,9,2,-2
1,5,0,7
0,3,-3,1
3,1,4,8


In [39]:
pd2.sort_values(by=[2], axis=1)

Unnamed: 0,c,b,a
0,1,-3,3
1,7,0,5
2,-2,2,9
3,8,4,1


## 唯一值和成员属性

In [40]:
s1 = pd.Series([2,6,8,9,8,3,8,6], index=['a', 'a', 'c', 'c', 'a', 'c', 'c', 'c'])
s1

a    2
a    6
c    8
c    9
a    8
c    3
c    8
c    6
dtype: int64

In [41]:
s1.unique() # 返回一个唯一值组成的数组

array([2, 6, 8, 9, 3])

In [42]:
# 判断对象的索引是否是唯一值
s1.index.is_unique

False

In [43]:
s1 = pd.Series([2,6,8,9,8,3,8,6])
s1

0    2
1    6
2    8
3    9
4    8
5    3
6    8
7    6
dtype: int64

In [44]:
# Return a Series that counts how many times each distinct value occurs.
s1.value_counts()

8    3
6    2
3    1
2    1
9    1
dtype: int64

In [45]:
# 判断值是否存在，返回bool类型
s1.isin([8])# 对s1里的值挨个进行判断，看其是否==8

0    False
1    False
2     True
3    False
4     True
5    False
6     True
7    False
dtype: bool

In [46]:
# 判断多个值
s1.isin([8, 2])# 对s1里的值挨个进行判断，看其 是否==8 or 是否==2

0     True
1    False
2     True
3    False
4     True
5    False
6     True
7    False
dtype: bool

In [47]:
data = pd.DataFrame({'a':[3,5,9,1], 'b':[-3,0,2,4], 'c':[1,7,-2,8]})
data

Unnamed: 0,a,b,c
0,3,-3,1
1,5,0,7
2,9,2,-2
3,1,4,8


In [48]:
data.abs()

Unnamed: 0,a,b,c
0,3,3,1
1,5,0,7
2,9,2,2
3,1,4,8


In [64]:
data.index.is_unique

True

In [50]:
data.isin([2,4])

Unnamed: 0,a,b,c
0,False,False,False
1,False,False,False
2,False,True,False
3,False,True,False


## 处理缺失数据 

In [51]:
# 4x3 frame for the missing-data examples: one random row followed by
# rows that contain NaN holes and one complete row.
df3 = pd.DataFrame(
    data=[
        np.random.randn(3),
        [1, 2, np.nan],
        [np.nan, 4, np.nan],
        [1, 2, 3],
    ]
)
df3

Unnamed: 0,0,1,2
0,1.457136,-0.153524,1.286275
1,1.0,2.0,
2,,4.0,
3,1.0,2.0,3.0


### 1. 判断是否存在缺失值：isnull() 

In [52]:
# 判断是否存在缺失值, 返回布尔类型
df3.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,True
2,True,False,True
3,False,False,False


### 2. 丢弃缺失数据：dropna() 

In [53]:
# Drop missing data: by default every row that contains at least one NaN is removed.
df3.dropna()

Unnamed: 0,0,1,2
0,1.457136,-0.153524,1.286275
3,1.0,2.0,3.0


In [54]:
# With axis=1, every column that contains at least one NaN is removed.
df3.dropna(axis=1)

Unnamed: 0,1
0,-0.153524
1,2.0
2,4.0
3,2.0


### 3. 填充缺失数据：fillna() 

In [55]:
# 填充缺失数据

In [56]:
df3.fillna(-100)

Unnamed: 0,0,1,2
0,1.457136,-0.153524,1.286275
1,1.0,2.0,-100.0
2,-100.0,4.0,-100.0
3,1.0,2.0,3.0
