# D10 Pandas 資料索引操作 (資料過濾、選擇與合併)

## 範例目標:
Pandas資料的索引、操作、選擇、過濾、合併與排序。

## 範例重點:

1. 資料過濾與操作資料不同，過濾出來的資料將是新資料集，不會動到原本的資料。
2. 合併資料時，合併欄位 (key) 可以是多個欄位；遇到相同欄位名稱時，merge 會自動加上字尾 (預設為 `_x`、`_y`)，join 則不會自動加上，必須以 lsuffix／rsuffix 指定字尾，否則會拋出錯誤。



### 讀取csv檔

In [3]:
import pandas as pd
# Read boston1.csv, keeping only the four columns this walkthrough uses.
boston_data = pd.read_csv(
    'boston1.csv',
    usecols=['CRIM', 'ZN', 'key', 'INDUS'],
)
boston_data

Unnamed: 0,key,CRIM,ZN,INDUS
0,1,0.02731,0.0,7.07
1,2,0.02729,0.0,7.07
2,3,0.03237,0.0,2.18
3,4,0.06905,0.0,2.18
4,5,0.02985,0.0,2.18
...,...,...,...,...
500,501,0.06263,0.0,11.93
501,502,0.04527,0.0,11.93
502,503,0.06076,0.0,11.93
503,504,0.10959,0.0,11.93


### 指定欄位為索引值

In [4]:
# Promote the 'key' column to the row index. The result is bound to a new
# name; boston_data itself is reused unchanged by later cells.
boston_data_index = boston_data.set_index(keys='key')
boston_data_index

Unnamed: 0_level_0,CRIM,ZN,INDUS
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.02731,0.0,7.07
2,0.02729,0.0,7.07
3,0.03237,0.0,2.18
4,0.06905,0.0,2.18
5,0.02985,0.0,2.18
...,...,...,...
501,0.06263,0.0,11.93
502,0.04527,0.0,11.93
503,0.06076,0.0,11.93
504,0.10959,0.0,11.93


### 查看索引的資訊

In [5]:
# Inspect the index of the DataFrame that uses 'key' as its index
boston_data_index.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            496, 497, 498, 499, 500, 501, 502, 503, 504, 505],
           dtype='int64', name='key', length=505)

## 階層式索引

In [6]:
# Build a hierarchical (multi-level) index from two columns:
# 'key' becomes the outer level and 'INDUS' the inner level.
boston_data_index2 = boston_data.set_index(keys=['key', 'INDUS'])
boston_data_index2

Unnamed: 0_level_0,Unnamed: 1_level_0,CRIM,ZN
key,INDUS,Unnamed: 2_level_1,Unnamed: 3_level_1
1,7.07,0.02731,0.0
2,7.07,0.02729,0.0
3,2.18,0.03237,0.0
4,2.18,0.06905,0.0
5,2.18,0.02985,0.0
...,...,...,...
501,11.93,0.06263,0.0
502,11.93,0.04527,0.0
503,11.93,0.06076,0.0
504,11.93,0.10959,0.0


In [7]:
# Inspect the hierarchical index built from 'key' and 'INDUS'
boston_data_index2.index

MultiIndex([(  1,  7.07),
            (  2,  7.07),
            (  3,  2.18),
            (  4,  2.18),
            (  5,  2.18),
            (  6,  7.87),
            (  7,  7.87),
            (  8,  7.87),
            (  9,  7.87),
            ( 10,  7.87),
            ...
            (496,  9.69),
            (497,  9.69),
            (498,  9.69),
            (499,  9.69),
            (500,  9.69),
            (501, 11.93),
            (502, 11.93),
            (503, 11.93),
            (504, 11.93),
            (505, 11.93)],
           names=['key', 'INDUS'], length=505)

## 操作資料

### 1. 重新命名欄位名稱

利用 .rename() 重新命名欄位名稱；.rename() 會回傳新的 DataFrame，原本的資料不會被更動。

In [8]:
# Show the DataFrame as it looks before any column is renamed
boston_data

Unnamed: 0,key,CRIM,ZN,INDUS
0,1,0.02731,0.0,7.07
1,2,0.02729,0.0,7.07
2,3,0.03237,0.0,2.18
3,4,0.06905,0.0,2.18
4,5,0.02985,0.0,2.18
...,...,...,...,...
500,501,0.06263,0.0,11.93
501,502,0.04527,0.0,11.93
502,503,0.06076,0.0,11.93
503,504,0.10959,0.0,11.93


In [10]:
# Rename the CRIM column to feature1. The renamed frame is bound to a new
# variable, so boston_data keeps its original column names.
new_boston_data = boston_data.rename({'CRIM': 'feature1'}, axis='columns')
new_boston_data

Unnamed: 0,key,feature1,ZN,INDUS
0,1,0.02731,0.0,7.07
1,2,0.02729,0.0,7.07
2,3,0.03237,0.0,2.18
3,4,0.06905,0.0,2.18
4,5,0.02985,0.0,2.18
...,...,...,...,...
500,501,0.06263,0.0,11.93
501,502,0.04527,0.0,11.93
502,503,0.06076,0.0,11.93
503,504,0.10959,0.0,11.93


### 2. 增加欄位

#### 方法一、使用 []


In [13]:
# Add a column holding INDUS rounded to the nearest whole number.

# Work on a copy so that boston_data is left untouched.
copy1 = boston_data.copy()

# Bracket assignment appends the new column at the right-hand end.
copy1['round_INDUS'] = copy1['INDUS'].round()

# Display the resulting DataFrame.
copy1

Unnamed: 0,key,CRIM,ZN,INDUS,round_INDUS
0,1,0.02731,0.0,7.07,7.0
1,2,0.02729,0.0,7.07,7.0
2,3,0.03237,0.0,2.18,2.0
3,4,0.06905,0.0,2.18,2.0
4,5,0.02985,0.0,2.18,2.0
...,...,...,...,...,...
500,501,0.06263,0.0,11.93,12.0
501,502,0.04527,0.0,11.93,12.0
502,503,0.06076,0.0,11.93,12.0
503,504,0.10959,0.0,11.93,12.0


#### 方法二、使用 .insert()

- DataFrame.insert(loc, column, value, allow_duplicates=False)

In [34]:
# Add a column holding INDUS rounded to the nearest whole number.

# Work on a copy so that boston_data is left untouched.
copy2 = boston_data.copy()

# insert() modifies copy2 in place and lets us choose the position:
# loc=1 places the new column right after 'key'.
copy2.insert(loc=1, column='round_INDUS', value=copy2['INDUS'].round())

# Display the resulting DataFrame.
copy2

Unnamed: 0,key,round_INDUS,CRIM,ZN,INDUS
0,1,7.0,0.02731,0.0,7.07
1,2,7.0,0.02729,0.0,7.07
2,3,2.0,0.03237,0.0,2.18
3,4,2.0,0.06905,0.0,2.18
4,5,2.0,0.02985,0.0,2.18
...,...,...,...,...,...
500,501,12.0,0.06263,0.0,11.93
501,502,12.0,0.04527,0.0,11.93
502,503,12.0,0.06076,0.0,11.93
503,504,12.0,0.10959,0.0,11.93


#### 新增一行四捨五入後的 INDUS 欄位，但欄位名稱重複 (可允許重複)

In [31]:
# Insert the rounded INDUS values under a column name that already exists
# (duplicate column names explicitly allowed).

# Work on a copy so that boston_data is left untouched.
copy2 = boston_data.copy()

# allow_duplicates=True permits a second column that is also called 'INDUS'.
copy2.insert(
    loc=1,
    column='INDUS',
    value=copy2['INDUS'].round(),
    allow_duplicates=True,
)

# Display the resulting DataFrame.
copy2

Unnamed: 0,key,INDUS,CRIM,ZN,INDUS.1
0,1,7.0,0.02731,0.0,7.07
1,2,7.0,0.02729,0.0,7.07
2,3,2.0,0.03237,0.0,2.18
3,4,2.0,0.06905,0.0,2.18
4,5,2.0,0.02985,0.0,2.18
...,...,...,...,...,...
500,501,12.0,0.06263,0.0,11.93
501,502,12.0,0.04527,0.0,11.93
502,503,12.0,0.06076,0.0,11.93
503,504,12.0,0.10959,0.0,11.93


#### 新增一行四捨五入後的 INDUS 欄位，但欄位名稱重複 (不允許重複)

In [32]:
# Insert the rounded INDUS values under a column name that already exists
# (duplicate column names NOT allowed).

# Work on a copy so that boston_data is left untouched.
copy2 = boston_data.copy()

# This call is expected to fail: with allow_duplicates=False, inserting a
# second 'INDUS' column raises
# "ValueError: cannot insert INDUS, already exists".
copy2.insert(
    loc=1,
    column='INDUS',
    value=copy2['INDUS'].round(),
    allow_duplicates=False,
)

# Not reached when the insert above raises.
copy2

ValueError: cannot insert INDUS, already exists

### 3. 刪除欄位


del、.pop()、.drop() 三種方法，每個方法有所不同
- del：刪除原 DataFrame 裡的欄位
- .pop()：刪除原 DataFrame 裡的欄位並且回傳被刪除的欄位
- .drop()：回傳刪除後的新資料框，原 DataFrame 不會被更動


#### del：刪除原 DataFrame 裡的欄位

In [37]:
# Rebuild copy2 so we can see what it looks like before the deletion.

# Work on a copy so that boston_data is left untouched.
copy2 = boston_data.copy()

# Add the rounded INDUS values as a new column right after 'key'.
copy2.insert(loc=1, column='round_INDUS', value=copy2['INDUS'].round())

# Display the resulting DataFrame.
copy2

Unnamed: 0,key,round_INDUS,CRIM,ZN,INDUS
0,1,7.0,0.02731,0.0,7.07
1,2,7.0,0.02729,0.0,7.07
2,3,2.0,0.03237,0.0,2.18
3,4,2.0,0.06905,0.0,2.18
4,5,2.0,0.02985,0.0,2.18
...,...,...,...,...,...
500,501,12.0,0.06263,0.0,11.93
501,502,12.0,0.04527,0.0,11.93
502,503,12.0,0.06076,0.0,11.93
503,504,12.0,0.10959,0.0,11.93


In [36]:
# del removes the 'round_INDUS' column from copy2 itself (in place)
del copy2['round_INDUS']
copy2

Unnamed: 0,key,CRIM,ZN,INDUS
0,1,0.02731,0.0,7.07
1,2,0.02729,0.0,7.07
2,3,0.03237,0.0,2.18
3,4,0.06905,0.0,2.18
4,5,0.02985,0.0,2.18
...,...,...,...,...
500,501,0.06263,0.0,11.93
501,502,0.04527,0.0,11.93
502,503,0.06076,0.0,11.93
503,504,0.10959,0.0,11.93


#### .pop()：刪除原 DataFrame 裡的欄位並且回傳被刪除的欄位

In [47]:
# Rebuild copy1 so we can see what it looks like before the pop.

# Work on a copy so that boston_data is left untouched.
copy1 = boston_data.copy()

# Add the rounded INDUS values as a new right-most column.
copy1['round_INDUS'] = copy1['INDUS'].round()

# Display the resulting DataFrame.
copy1

Unnamed: 0,key,CRIM,ZN,INDUS,round_INDUS
0,1,0.02731,0.0,7.07,7.0
1,2,0.02729,0.0,7.07,7.0
2,3,0.03237,0.0,2.18,2.0
3,4,0.06905,0.0,2.18,2.0
4,5,0.02985,0.0,2.18,2.0
...,...,...,...,...,...
500,501,0.06263,0.0,11.93,12.0
501,502,0.04527,0.0,11.93,12.0
502,503,0.06076,0.0,11.93,12.0
503,504,0.10959,0.0,11.93,12.0


In [48]:
# .pop() removes the column from copy1 in place and hands back the removed
# column as a Series.
removed_column = copy1.pop('round_INDUS')
print(removed_column)

# copy1 no longer contains 'round_INDUS'.
print('新的 copy1 樣貌 ： \n', copy1)

0       7.0
1       7.0
2       2.0
3       2.0
4       2.0
       ... 
500    12.0
501    12.0
502    12.0
503    12.0
504    12.0
Name: round_INDUS, Length: 505, dtype: float64
新的 copy1 樣貌 ： 
      key     CRIM   ZN  INDUS
0      1  0.02731  0.0   7.07
1      2  0.02729  0.0   7.07
2      3  0.03237  0.0   2.18
3      4  0.06905  0.0   2.18
4      5  0.02985  0.0   2.18
..   ...      ...  ...    ...
500  501  0.06263  0.0  11.93
501  502  0.04527  0.0  11.93
502  503  0.06076  0.0  11.93
503  504  0.10959  0.0  11.93
504  505  0.04741  0.0  11.93

[505 rows x 4 columns]


####  .drop()：回傳刪除後的新資料框

In [52]:
# Take a fresh copy of boston_data as copy3 and show it before dropping a column
copy3 = boston_data.copy()
copy3

Unnamed: 0,key,CRIM,ZN,INDUS
0,1,0.02731,0.0,7.07
1,2,0.02729,0.0,7.07
2,3,0.03237,0.0,2.18
3,4,0.06905,0.0,2.18
4,5,0.02985,0.0,2.18
...,...,...,...,...
500,501,0.06263,0.0,11.93
501,502,0.04527,0.0,11.93
502,503,0.06076,0.0,11.93
503,504,0.10959,0.0,11.93


In [50]:
# .drop() returns a new DataFrame without the 'CRIM' column; the result is
# only printed here, not assigned back to copy3.
print(copy3.drop(columns='CRIM'))

     key   ZN  INDUS
0      1  0.0   7.07
1      2  0.0   7.07
2      3  0.0   2.18
3      4  0.0   2.18
4      5  0.0   2.18
..   ...  ...    ...
500  501  0.0  11.93
501  502  0.0  11.93
502  503  0.0  11.93
503  504  0.0  11.93
504  505  0.0  11.93

[505 rows x 3 columns]


In [54]:
# copy3 still contains 'CRIM': drop() returned a new DataFrame and left copy3 unchanged
copy3


Unnamed: 0,key,CRIM,ZN,INDUS
0,1,0.02731,0.0,7.07
1,2,0.02729,0.0,7.07
2,3,0.03237,0.0,2.18
3,4,0.06905,0.0,2.18
4,5,0.02985,0.0,2.18
...,...,...,...,...
500,501,0.06263,0.0,11.93
501,502,0.04527,0.0,11.93
502,503,0.06076,0.0,11.93
503,504,0.10959,0.0,11.93
