### apply 和 applymap

#### 1. 可直接使用Numpy的函数

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Numpy ufunc 函数
df = pd.DataFrame(np.random.randn(5,4) - 1)

print(df)

print(np.abs(df))

          0         1         2         3
0  1.207270 -1.107191 -0.523128  0.523667
1 -1.211874 -2.424170 -0.656185 -0.108821
2 -1.515303 -2.061224 -1.399393 -0.426654
3 -0.313853 -2.266272 -0.728485 -2.778090
4 -2.677651 -0.217101  1.038653 -2.277234
          0         1         2         3
0  1.207270  1.107191  0.523128  0.523667
1  1.211874  2.424170  0.656185  0.108821
2  1.515303  2.061224  1.399393  0.426654
3  0.313853  2.266272  0.728485  2.778090
4  2.677651  0.217101  1.038653  2.277234


#### 2. 通过apply将函数应用到列或行上

In [3]:
print(df.apply(lambda x: x.max()))

0    1.207270
1   -0.217101
2    1.038653
3    0.523667
dtype: float64


In [4]:
print(df.apply(lambda x: x.max(), axis=1))

0    1.207270
1   -0.108821
2   -0.426654
3   -0.313853
4    1.038653
dtype: float64


#### 3. 通过applymap将函数应用到每个数据上

In [5]:
f2 = lambda x : "%.2f" %x
print(df.applymap(f2))

       0      1      2      3
0   1.21  -1.11  -0.52   0.52
1  -1.21  -2.42  -0.66  -0.11
2  -1.52  -2.06  -1.40  -0.43
3  -0.31  -2.27  -0.73  -2.78
4  -2.68  -0.22   1.04  -2.28


### 排序

#### 1. 索引排序

In [6]:
# sort_index()
# 排序默认使用升序排序， ascending=False为降序排列

In [8]:
s4 = pd.Series(range(10, 15), index=np.random.randint(5, size=5))
print("s4: \n", s4)

s4: 
 1    10
0    11
1    12
3    13
4    14
dtype: int64


In [9]:
# 索引排序
s4.sort_index()

0    11
1    10
1    12
3    13
4    14
dtype: int64

In [10]:
# DataFrame
df4 = pd.DataFrame(np.random.randn(3,5),
                  index=np.random.randint(3, size=3),
                  columns=np.random.randint(5, size=5))

print(df4)

df4_isort = df.sort_index(axis=1, ascending=False)
print(df4_isort)

          1         3         0         4         0
0 -1.578973  1.876969  0.528007  0.212646 -0.800617
2 -0.444042 -1.211058 -0.330680 -1.350462  0.163926
0 -1.029338  0.353062 -0.259603 -0.284637 -0.872490
          3         2         1         0
0  0.523667 -0.523128 -1.107191  1.207270
1 -0.108821 -0.656185 -2.424170 -1.211874
2 -0.426654 -1.399393 -2.061224 -1.515303
3 -2.778090 -0.728485 -2.266272 -0.313853
4 -2.277234  1.038653 -0.217101 -2.677651


#### 2. 按值排序

In [11]:
# sort_value(by="column name")
# 根据某个唯一的列名进行排序，如果有其他相同列名则报错

In [13]:
df4_vsort = df4.sort_values(by=1, ascending=False)
print(df4_vsort)

          1         3         0         4         0
2 -0.444042 -1.211058 -0.330680 -1.350462  0.163926
0 -1.029338  0.353062 -0.259603 -0.284637 -0.872490
0 -1.578973  1.876969  0.528007  0.212646 -0.800617


### 处理缺失数据

In [14]:
df_data = pd.DataFrame([np.random.randn(3), [1.,2.,np.nan],
                       [np.nan,4.,np.nan], [1.,2.,3]])
print(df_data.head())

      0         1         2
0 -0.07  2.136348 -0.325308
1  1.00  2.000000       NaN
2   NaN  4.000000       NaN
3  1.00  2.000000  3.000000


#### 1. 判断是否存在缺失值：isnull()

In [15]:
print(df_data.isnull())

       0      1      2
0  False  False  False
1  False  False   True
2   True  False   True
3  False  False  False


#### 2. 丢弃缺失数据：dropna()

In [17]:
print(df_data.dropna())

print(df_data.dropna(axis=1))

      0         1         2
0 -0.07  2.136348 -0.325308
3  1.00  2.000000  3.000000
          1
0  2.136348
1  2.000000
2  4.000000
3  2.000000


#### 3. 填充缺失值:fillna()

In [18]:
print(df_data.fillna(-100.0))

        0         1           2
0   -0.07  2.136348   -0.325308
1    1.00  2.000000 -100.000000
2 -100.00  4.000000 -100.000000
3    1.00  2.000000    3.000000
