# 資料整理和使用前準備
### 1 資料的先前準備工作:
- 載入(loading)
- 資料整理(cleaning)
- 資料轉換(transforming)
- 重新整理(rearrange)
  
    

## 處理遺失資料
- pandas 使用 NaN(Not a Number)表示遺失資料

In [4]:
import numpy as np
import pandas as pd

# Use np.nan to mark a missing value (NaN) inside a Series
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
'''
Out[2]: 
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
'''

# Python's built-in None is also treated as missing data:
# after the assignment, isnull() reports True for index 0 (None) and index 2 (np.nan)
string_data[0] = None
string_data.isnull()
'''
Out[5]: 
0     True
1    False
2     True
3    False
dtype: bool
'''


'\nOut[5]: \n0     True\n1    False\n2     True\n3    False\ndtype: bool\n'

處理遺失資料的方法:

- dropna()
- fillna()
- isnull()
- notnull()

### 清除NaN的資料

In [7]:
# Remove NaN entries from a Series
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data
'''
Out[6]: 
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
'''
# Use dropna() to drop the missing entries; the original index labels are kept
data.dropna()
'''
Out[7]: 
0    1.0
2    3.5
4    7.0
dtype: float64
'''

# Same result without dropna(): boolean indexing with the notnull() mask
data[data.notnull()]
'''
Out[8]: 
0    1.0
2    3.5
4    7.0
dtype: float64
'''

0    1.0
2    3.5
4    7.0
dtype: float64

In [17]:
# Dropping missing data from a DataFrame is a bit more involved
# By default dropna() drops every row that contains at least one NaN
data = pd.DataFrame([[1., 6.5, 3.],
                    [1., NA, NA],
                    [NA, NA, NA],
                    [NA, 6.5, 3.]])
data
'''
Out[9]: 
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
'''

data.dropna()
'''
Out[10]: 
     0    1    2
0  1.0  6.5  3.0
'''

# Pass how='all' to drop only the rows in which every value is NaN
data.dropna(how='all')
'''
Out[11]: 
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
'''

# Use axis=1 to drop columns instead of rows;
# an all-NaN column labelled 4 is added first so there is something to drop
data[4] = NA
data
'''
Out[12]: 
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
'''

data.dropna(axis=1, how='all')
'''
Out[14]: 
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
'''

# Build a 7x3 frame of random values, then set the first 4 rows of column 1
# and the first 2 rows of column 2 to NaN
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
'''
Out[15]: 
          0         1         2
0 -0.384156       NaN       NaN
1  2.457934       NaN       NaN
2  0.387037       NaN  1.516058
3  0.799194       NaN  1.019982
4 -0.523685  2.448814 -1.618673
5  0.139726 -0.483431  1.087822
6 -0.333721 -0.491657 -0.834928
'''

df.dropna()
'''
Out[16]: 
          0         1         2
4 -0.523685  2.448814 -1.618673
5  0.139726 -0.483431  1.087822
6 -0.333721 -0.491657 -0.834928
'''

# thresh=2 keeps only the rows that have at least 2 non-NaN values;
# rows 0 and 1 have a single non-NaN value each, so they are dropped
df.dropna(thresh=2)
'''
Out[17]: 
          0         1         2
2  0.387037       NaN  1.516058
3  0.799194       NaN  1.019982
4 -0.523685  2.448814 -1.618673
5  0.139726 -0.483431  1.087822
6 -0.333721 -0.491657 -0.834928
'''


Unnamed: 0,0,1,2
2,0.427143,,-1.388578
3,0.00609,,-1.693441
4,2.049151,0.332035,-1.458771
5,-0.333426,0.754723,2.430431
6,-0.175726,-0.822314,0.289279


### 填滿遺失資料

In [20]:
# Use fillna() to fill in missing data
# (same setup as before: random 7x3 frame with NaN in parts of columns 1 and 2)
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
'''
Out[18]: 
          0         1         2
0  0.570724       NaN       NaN
1 -0.939249       NaN       NaN
2 -0.549853       NaN  0.625596
3  0.001978       NaN -0.309461
4 -0.083375 -0.665350  1.838664
5 -0.049425  1.272762 -1.111465
6 -1.011749  0.335852  0.366487

'''

df.fillna(0)
'''
Out[19]: 
          0         1         2
0  0.570724  0.000000  0.000000
1 -0.939249  0.000000  0.000000
2 -0.549853  0.000000  0.625596
3  0.001978  0.000000 -0.309461
4 -0.083375 -0.665350  1.838664
5 -0.049425  1.272762 -1.111465
6 -1.011749  0.335852  0.366487
'''

# Passing a dict to fillna() fills each column with its own value
# (keys are column labels: column 1 gets 0.5, column 2 gets 0)
df.fillna({1:0.5, 2:0})
'''
Out[20]: 
          0         1         2
0  0.570724  0.500000  0.000000
1 -0.939249  0.500000  0.000000
2 -0.549853  0.500000  0.625596
3  0.001978  0.500000 -0.309461
4 -0.083375 -0.665350  1.838664
5 -0.049425  1.272762 -1.111465
6 -1.011749  0.335852  0.366487
'''

# fillna() returns a new object by default; with inplace=True the existing
# object is modified instead. The call's return value is discarded into `_`.
_ = df.fillna(0, inplace = True)
df
'''
Out[21]: 
          0         1         2
0  0.570724  0.000000  0.000000
1 -0.939249  0.000000  0.000000
2 -0.549853  0.000000  0.625596
3  0.001978  0.000000 -0.309461
4 -0.083375 -0.665350  1.838664
5 -0.049425  1.272762 -1.111465
6 -1.011749  0.335852  0.366487
'''



Unnamed: 0,0,1,2
0,0.259844,0.0,0.0
1,-0.092183,0.0,0.0
2,1.880992,0.0,0.721567
3,0.180372,0.0,1.649274
4,0.219948,0.176035,2.456585
5,0.738787,0.60978,0.078419
6,-0.866145,0.925364,-0.480117
