# 資料整理和使用前準備
### 1 資料的先前準備工作:
    - 載入(loading)
    - 資料整理(cleaning)
    - 資料轉換(transforming)
    - 重新安排(rearrange)
  
    

## 處理遺失資料
- pandas 使用NaN(Not a number)表示遺失資料

In [4]:
import numpy as np
import pandas as pd

#使用np.nan代表NaN
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
'''
Out[2]: 
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
'''

#使用python None代表Nan
string_data[0] = None
string_data.isnull()
'''
Out[5]: 
0     True
1    False
2     True
3    False
dtype: bool
'''


'\nOut[5]: \n0     True\n1    False\n2     True\n3    False\ndtype: bool\n'

處理遺失資料的方法
    - dropna()  
    - fillna()  
    - isnull()  
    - notnull()  

### 清除NaN的資料

In [7]:
# 清除NaN的資料
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data
'''
Out[6]: 
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
'''
#使用dropna()方法,清除遺失資料
data.dropna()
'''
Out[7]: 
0    1.0
2    3.5
4    7.0
dtype: float64
'''

#使用dropna()方法,清除遺失資料
data[data.notnull()]
'''
Out[8]: 
0    1.0
2    3.5
4    7.0
dtype: float64
'''

0    1.0
2    3.5
4    7.0
dtype: float64

In [17]:
#DataFrame的清除比較複雜一些
#drapna()將會清除整個row
data = pd.DataFrame([[1., 6.5, 3.],
                    [1., NA, NA],
                    [NA, NA, NA],
                    [NA, 6.5, 3.]])
data
'''
Out[9]: 
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
'''

data.dropna()
'''
Out[10]: 
     0    1    2
0  1.0  6.5  3.0
'''

#做用引數名稱how='all',只清除整列都是NaN的列
data.dropna(how='all')
'''
Out[11]: 
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
'''

#刪除整欄資料使用axis=1
data[4] = NA
data
'''
Out[12]: 
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
'''

data.dropna(axis=1, how='all')
'''
Out[14]: 
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
'''

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
'''
Out[15]: 
          0         1         2
0 -0.384156       NaN       NaN
1  2.457934       NaN       NaN
2  0.387037       NaN  1.516058
3  0.799194       NaN  1.019982
4 -0.523685  2.448814 -1.618673
5  0.139726 -0.483431  1.087822
6 -0.333721 -0.491657 -0.834928
'''

df.dropna()
'''
Out[16]: 
          0         1         2
4 -0.523685  2.448814 -1.618673
5  0.139726 -0.483431  1.087822
6 -0.333721 -0.491657 -0.834928
'''

#設立2個以上的才刪除row,使用引數名稱thresh=2
df.dropna(thresh=2)
'''
Out[17]: 
          0         1         2
2  0.387037       NaN  1.516058
3  0.799194       NaN  1.019982
4 -0.523685  2.448814 -1.618673
5  0.139726 -0.483431  1.087822
6 -0.333721 -0.491657 -0.834928
'''


Unnamed: 0,0,1,2
2,0.427143,,-1.388578
3,0.00609,,-1.693441
4,2.049151,0.332035,-1.458771
5,-0.333426,0.754723,2.430431
6,-0.175726,-0.822314,0.289279


### 填滿遺失資料

In [23]:
#使用fillna(),填滿遺失資料
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
'''
Out[18]: 
          0         1         2
0  0.570724       NaN       NaN
1 -0.939249       NaN       NaN
2 -0.549853       NaN  0.625596
3  0.001978       NaN -0.309461
4 -0.083375 -0.665350  1.838664
5 -0.049425  1.272762 -1.111465
6 -1.011749  0.335852  0.366487

'''

df.fillna(0)
'''
Out[19]: 
          0         1         2
0  0.570724  0.000000  0.000000
1 -0.939249  0.000000  0.000000
2 -0.549853  0.000000  0.625596
3  0.001978  0.000000 -0.309461
4 -0.083375 -0.665350  1.838664
5 -0.049425  1.272762 -1.111465
6 -1.011749  0.335852  0.366487
'''

#fillna()引數使用dictionary,可指定不同欄位,填滿不同值。
df.fillna({1:0.5, 2:0})
'''
Out[20]: 
          0         1         2
0  0.570724  0.500000  0.000000
1 -0.939249  0.500000  0.000000
2 -0.549853  0.500000  0.625596
3  0.001978  0.500000 -0.309461
4 -0.083375 -0.665350  1.838664
5 -0.049425  1.272762 -1.111465
6 -1.011749  0.335852  0.366487
'''

#fillna()方法會建立一個新的物件,使用引數名稱inplace=True,會在原來的資料物件內修改
_ = df.fillna(0, inplace = True)
df
'''
Out[21]: 
          0         1         2
0  0.570724  0.000000  0.000000
1 -0.939249  0.000000  0.000000
2 -0.549853  0.000000  0.625596
3  0.001978  0.000000 -0.309461
4 -0.083375 -0.665350  1.838664
5 -0.049425  1.272762 -1.111465
6 -1.011749  0.335852  0.366487
'''

df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:,1] = NA
df.iloc[4:,2] = NA
df
'''
Out[22]: 
          0         1         2
0  1.982813  1.060082 -1.239308
1  0.759934 -0.820196  1.159191
2 -0.439932       NaN  2.185100
3 -0.240808       NaN  1.468984
4 -1.097970       NaN       NaN
5  0.922204       NaN       NaN
'''

#使用向下填滿,做用引數名稱method='ffill'
df.fillna(method='ffill')
'''
Out[23]: 
          0         1         2
0  1.982813  1.060082 -1.239308
1  0.759934 -0.820196  1.159191
2 -0.439932 -0.820196  2.185100
3 -0.240808 -0.820196  1.468984
4 -1.097970 -0.820196  1.468984
5  0.922204 -0.820196  1.468984
'''

#向下填滿2個cell
df.fillna(method='ffill', limit=2)
'''
Out[24]: 
          0         1         2
0  1.982813  1.060082 -1.239308
1  0.759934 -0.820196  1.159191
2 -0.439932 -0.820196  2.185100
3 -0.240808 -0.820196  1.468984
4 -1.097970       NaN  1.468984
5  0.922204       NaN  1.468984
'''

#填滿平均值
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())
'''
Out[25]: 
0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64
'''


0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 資料轉換
    - 過濾資料
    - 整理資料

### 移除重複資料

In [9]:
#duplicated 回傳boolean Series
import numpy as np
import pandas as pd

data = pd.DataFrame({'k1':['one', 'two']*3 + ['two'],
                    'k2':[1, 1, 2, 3, 3, 4, 4]})
data
'''
Out[26]: 
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4
'''

#使用duplicated檢查重覆的資料
data.duplicated
'''
Out[3]: 
<bound method DataFrame.duplicated of     k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4>
'''

#drop_duplicates()移除重覆資料
data.drop_duplicates()
'''
Out[4]: 
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
'''

#移除欄位有重覆資料的列
data['v1'] = range(7)
data.drop_duplicates(['k1'])
'''
Out[5]: 
    k1  k2
0  one   1
1  two   1
'''

#drop_duplicates()保留第一筆,使用引數名稱keep='last',何留最後一筆
data.drop_duplicates(['k1', 'k2'], keep='last')
'''
Out[6]: 
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
6  two   4
'''

'\nOut[6]: \n    k1  k2\n0  one   1\n1  two   1\n2  one   2\n3  two   3\n4  one   3\n6  two   4\n'

### 使用function或map轉換資料

In [16]:
data = pd.DataFrame({'食物':['培根','豬肉','培根','熏牛肉','咸牛肉','培根','熏牛肉','蜂蜜火腿','熏鮭魚'],
                    '重量':[4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
'''
Out[7]: 
     食物    重量
0    培根   4.0
1    豬肉   3.0
2    培根  12.0
3   熏牛肉   6.0
4   咸牛肉   7.5
5    培根   8.0
6   熏牛肉   3.0
7  蜂蜜火腿   5.0
8   熏鮭魚   6.0
'''

#使用Series.map({})方法,並且傳入對應的dictionary,建立有對應資料新的Series
mean_to_animal = {
   '培根':'豬',
    '豬肉':'豬',
    '熏牛肉':'牛',
    '咸牛肉':'牛',
    '蜂蜜火腿':'豬',
    '熏鮭魚':'鮭魚'
}

data['動物'] = data['食物'].map(mean_to_animal)
data
'''
Out[8]: 
     食物    重量   動物
0    培根   4.0    豬
1    豬肉   3.0    豬
2    培根  12.0    豬
3   熏牛肉   6.0    牛
4   咸牛肉   7.5    牛
5    培根   8.0    豬
6   熏牛肉   3.0    牛
7  蜂蜜火腿   5.0    豬
8   熏鮭魚   6.0   鮭魚
'''

#也可以使用Series.map(lambda)
data['食物'].map(lambda x: mean_to_animal[x])
'''
Out[9]: 
0      豬
1      豬
2      豬
3      牛
4      牛
5      豬
6      牛
7      豬
8     鮭魚
Name: 食物, dtype: object
'''


'\nOut[9]: \n0      豬\n1      豬\n2      豬\n3      牛\n4      牛\n5      豬\n6      牛\n7      豬\n8    三文魚\nName: 食物, dtype: object\n'

### 置換值

In [21]:
#使用replace(),可以指定特定的值被置換
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
'''
Out[10]: 
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
'''

data.replace(-999,np.nan)
'''
Out[11]: 
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
'''

data.replace([-999, -1000], np.nan)
'''
Out[12]: 
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
'''

data.replace([-999, -1000],[np.nan, 0])
'''
Out[13]: 
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
'''

data.replace({-999: np.nan, -1000:0})
'''
Out[14]: 
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
'''

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 重新命名軸的索引名稱
- 使用function或mapping

In [31]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                   index=['台北','台中','台南'],
                   columns = ['one', 'two', 'three', 'four'])
data
'''
Out[15]: 
    one  two  three  four
台北    0    1      2     3
台中    4    5      6     7
台南    8    9     10    11
'''

#Series的index也有map()方法
transform = lambda x: x + '(台灣)'
data.index.map(transform)
#Out[16]: Index(['台北(台灣)', '台中(台灣)', '台南(台灣)'], dtype='object')

data.index = data.index.map(transform)
data
'''
Out[17]: 
        one  two  three  four
台北(台灣)    0    1      2     3
台中(台灣)    4    5      6     7
台南(台灣)    8    9     10    11
'''
#使用rename(),複制一個新的並重新命名
data.rename(index=str, columns=str.upper)
'''
Out[18]: 
        ONE  TWO  THREE  FOUR
台北(台灣)    0    1      2     3
台中(台灣)    4    5      6     7
台南(台灣)    8    9     10    11
'''

#使用rename(),並應用對應值來置換
data.rename(
    index={'台南(台灣)':'高雄(台灣)'},
    columns = {'one':'NUM'}
)
'''
Out[19]: 
        NUM  two  three  four
台北(台灣)    0    1      2     3
台中(台灣)    4    5      6     7
高雄(台灣)    8    9     10    11
'''

#使用引數名稱inplace=True,修改原始內容
data.rename(index={'台南(台灣)':'高雄(台灣)'}, inplace=True)
data
'''
Out[20]: 
        one  two  three  four
台北(台灣)    0    1      2     3
台中(台灣)    4    5      6     7
高雄(台灣)    8    9     10    11
'''

Unnamed: 0,one,two,three,four
台北(台灣),0,1,2,3
台中(台灣),4,5,6,7
高雄(台灣),8,9,10,11


### 分組

In [46]:
#使用pd.cut(),進行分類拆解
ages = [ 20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

#19~25,26~35,36~60,61~100
bins = [18, 25, 35, 60, 100]

cats = pd.cut(ages, bins)
cats
'''
Out[22]: 
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
'''

#將categories編號
cats.codes
#Out[23]: array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

cats.categories
'''
Out[24]: 
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')
'''

pd.value_counts(cats)
'''
Out[25]: 
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
'''
#(18, 25],「(」是open,代表不保含,「]」代表close,代表包含

#改變範圍
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
'''
Out[26]: 
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
'''

group_names = ['青少年','青年', '中年', '老年']
pd.cut(ages, bins, labels=group_names)
'''
Out[27]: 
[青少年, 青少年, 青少年, 青年, 青少年, ..., 青年, 老年, 中年, 中年, 青年]
Length: 12
Categories (4, object): [青少年 < 青年 < 中年 < 老年]
'''

#自動分組,並指定精準到幾位數
data = np.random.rand(20)
data
'''
Out[28]: 
array([0.28732327, 0.52430816, 0.10053197, 0.9762071 , 0.2709794 ,
       0.27909403, 0.63996528, 0.83536985, 0.47676844, 0.42801012,
       0.5275013 , 0.46877906, 0.93083594, 0.60846994, 0.74655765,
       0.05880551, 0.73799312, 0.58173775, 0.49410714, 0.27392977])
'''

#分成4組,值精準至小數2位
pd.cut(data,4, precision=2)
'''
Out[29]: 
[(0.058, 0.29], (0.52, 0.75], (0.058, 0.29], (0.75, 0.98], (0.058, 0.29], ..., (0.058, 0.29], (0.52, 0.75], (0.52, 0.75], (0.29, 0.52], (0.058, 0.29]]
Length: 20
Categories (4, interval[float64]): [(0.058, 0.29] < (0.29, 0.52] < (0.52, 0.75] < (0.75, 0.98]]
'''

#使用np.qcut()方法,分組成為相同數量的組
data = np.random.randn(1000)
cats = pd.qcut(data,4)
pd.value_counts(cats)
'''
Out[31]: 
(0.627, 2.808]                   250
(-0.051, 0.627]                  250
(-0.631, -0.051]                 250
(-3.1959999999999997, -0.631]    250
dtype: int64
'''

#依比例分組
cats = pd.qcut(data,[0, 0.1, 0.5, 0.9, 1.])
pd.value_counts(cats)
'''
Out[32]: 
(-0.051, 1.14]                  400
(-1.22, -0.051]                 400
(1.14, 2.808]                   100
(-3.1959999999999997, -1.22]    100
dtype: int64
'''

(0.069, 1.334]      400
(-1.279, 0.069]     400
(1.334, 3.151]      100
(-3.622, -1.279]    100
dtype: int64

### 搜尋和過濾

In [52]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
'''
Out[33]: 
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.031971    -0.063139     0.048191     0.002786
std       0.993219     1.034279     0.982753     0.973957
min      -3.362467    -3.700265    -3.185248    -3.262880
25%      -0.635766    -0.759439    -0.596552    -0.689184
50%       0.059680    -0.067288     0.020611     0.010695
75%       0.686105     0.632335     0.714625     0.667582
max       3.575304     3.637118     3.112954     3.025585
'''

#搜尋指定第3欄,絕對值超過3
col = data[2]
col[np.abs(col) > 3]

'''
Out[34]: 
103   -3.185248
685    3.033823
759    3.112954
775   -3.020949
Name: 2, dtype: float64
'''

#使用any(1)搜尋條件是一列只要有一個超過3或-3的值
data[(np.abs(data) > 3).any(1)]
'''
Out[35]: 
            0         1         2         3
103  0.293913 -1.916527 -3.185248 -1.060454
174  0.175875  3.637118 -0.506101 -1.659508
283 -1.716220 -3.235915 -0.759161 -0.817349
391 -0.888412 -0.506042  0.837620 -3.262880
413 -0.954607 -3.700265 -0.498898  0.776064
436  0.253111 -2.555901  0.384527 -3.086035
543  1.334808 -3.614929  1.073649  0.186973
578  3.575304 -0.529281 -0.892864 -0.171373
631 -0.271282 -1.017268 -0.072178  3.025585
685 -1.451157  2.889387  3.033823 -0.614973
748  0.251170  2.519471  1.255296 -3.185413
759  0.831235 -0.139263  3.112954  0.406874
760 -3.362467 -0.911349 -0.481441  0.010128
774  3.085326  0.469143 -0.999182 -0.110453
775 -0.428677  0.490717 -3.020949  0.443702
819 -3.055359 -0.807028  0.159213 -0.077878
'''

#np.sign(),負數就成為-1,正數就成為1
np.sign(data).head()
'''
Out[36]: 
     0    1    2    3
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0 -1.0  1.0 -1.0
3  1.0 -1.0 -1.0  1.0
4 -1.0  1.0 -1.0 -1.0
'''

data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
'''
Out[37]: 
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.031728    -0.062225     0.048250     0.003294
std       0.989722     1.027314     0.981646     0.972170
min      -3.000000    -3.000000    -3.000000    -3.000000
25%      -0.635766    -0.759439    -0.596552    -0.689184
50%       0.059680    -0.067288     0.020611     0.010695
75%       0.686105     0.632335     0.714625     0.667582
max       3.000000     3.000000     3.000000     3.000000
'''


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.011823,0.000723,0.044787,0.039283
std,1.018984,0.996059,1.044301,1.016401
min,-3.0,-3.0,-3.0,-3.0
25%,-0.703664,-0.656234,-0.60594,-0.64778
50%,-0.003733,-0.018756,0.025645,0.045662
75%,0.655627,0.626472,0.746904,0.745478
max,3.0,3.0,3.0,3.0


### 建立亂數排序

In [60]:
df = pd.DataFrame(np.arange(5*4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler
#Out[38]: array([4, 3, 2, 0, 1])

df.head()
'''
Out[39]: 
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
'''

#使用take()方法,更改索引排序
df.take(sampler)
'''
Out[40]: 
    0   1   2   3
4  16  17  18  19
3  12  13  14  15
2   8   9  10  11
0   0   1   2   3
1   4   5   6   7
'''

#使用sample()取出部份資料集
df.sample(n=3)
'''
Out[41]: 
    0   1   2   3
3  12  13  14  15
0   0   1   2   3
2   8   9  10  11
'''

#使用引數名稱replace=True,建立更多的sample列
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws
'''
Out[42]: 
1    7
0    5
2   -1
1    7
4    4
0    5
1    7
4    4
3    6
0    5
dtype: int64
'''

3    6
3    6
2   -1
0    5
1    7
1    7
0    5
4    4
4    4
1    7
dtype: int64