# D09 使用 Pandas 讀寫各種常用的檔案格式
## 讀寫 csv

In [1]:
import pandas as pd
# Load the CSV file 'iris.csv' (relative to the working directory) into a DataFrame.
iris = pd.read_csv('iris.csv')

In [2]:
# Print the DataFrame; the output shown below is abbreviated to the first and last rows.
print(iris)

     sepal length  sepal width  petal length  petal width  target
0             5.1          3.5           1.4          0.2       0
1             4.9          3.0           1.4          0.2       0
2             4.7          3.2           1.3          0.2       0
3             4.6          3.1           1.5          0.2       0
4             5.0          3.6           1.4          0.2       0
..            ...          ...           ...          ...     ...
145           6.7          3.0           5.2          2.3       2
146           6.3          2.5           5.0          1.9       2
147           6.5          3.0           5.2          2.0       2
148           6.2          3.4           5.4          2.3       2
149           5.9          3.0           5.1          1.8       2

[150 rows x 5 columns]


In [18]:
# Use the usecols parameter to read only the named columns.
iris_2 = pd.read_csv(
    'iris.csv',
    usecols=['sepal width', 'target'],
)
iris_2

Unnamed: 0,sepal width,target
0,3.5,0
1,3.0,0
2,3.2,0
3,3.1,0
4,3.6,0
...,...,...
145,3.0,2
146,2.5,2
147,3.0,2
148,3.4,2


在資料讀取中利用 names 參數指定行名稱(column name)，因為有指定行名稱所以必須以 header=0 參數跳過檔案裡放置行名稱的列。

In [15]:
# Replace the column names stored in the file with custom ones. header=0 marks
# the file's first row as the header so it is dropped instead of read as data.
iris_withtitle = pd.read_csv(
    'iris.csv',
    header=0,
    names=['feature1', 'feature2', 'feature3', 'feature4', 'target'],
)
iris_withtitle

Unnamed: 0,feature1,feature2,feature3,feature4,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [16]:
# Because the columns were renamed through names, usecols must refer to the
# newly defined names rather than the ones stored in the file.
iris_withtitle = pd.read_csv(
    'iris.csv',
    header=0,
    names=['feature1', 'feature2', 'feature3', 'feature4', 'target'],
    usecols=['feature4', 'target'],
)
iris_withtitle

Unnamed: 0,feature4,target
0,0.2,0
1,0.2,0
2,0.2,0
3,0.2,0
4,0.2,0
...,...,...
145,2.3,2
146,1.9,2
147,2.0,2
148,2.3,2


### 輸出csv檔

In [14]:
# Write the DataFrame to 'iris_withtitle.csv'; with pandas' default settings the
# row index is written as the first column.
iris_withtitle.to_csv('iris_withtitle.csv')  

## 讀寫 excel

In [26]:
# Read an Excel workbook (only the first worksheet is read by default).
# NOTE(review): the variable is named boston_data, but the output shown below has
# the iris columns, so the first sheet appears to hold the iris data — confirm.
boston_data = pd.read_excel('data.xls')
boston_data

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [20]:
# Read the workbook again, this time selecting a worksheet by name.
boston_data2 = pd.read_excel(
    'data.xls',
    sheet_name='boston',
)
boston_data2

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


pd.read_excel 和 pd.read_csv 一樣有 usecols、header、names 可以使用

In [23]:
# Read the 'boston' worksheet and keep only the selected columns.
bos = pd.read_excel(
    'data.xls',
    sheet_name='boston',
    usecols=['TAX', 'PTRATIO', 'B', 'LSTAT'],
    header=0,
)
bos

Unnamed: 0,TAX,PTRATIO,B,LSTAT
0,296,15.3,396.90,4.98
1,242,17.8,396.90,9.14
2,242,17.8,392.83,4.03
3,222,18.7,394.63,2.94
4,222,18.7,396.90,5.33
...,...,...,...,...
501,273,21.0,391.99,9.67
502,273,21.0,396.90,9.08
503,273,21.0,396.90,5.64
504,273,21.0,393.45,6.48


### 輸出 excel 檔

In [31]:
# Write the DataFrame to an Excel file, into a worksheet named 'bos'.
# NOTE(review): writing the legacy .xls format depends on the xlwt engine, which
# recent pandas releases appear to have dropped — confirm the pandas version in
# use, or switch the target to .xlsx.
bos.to_excel('new_boston.xls', sheet_name='bos')

## 讀取 json 檔

In [32]:
# Read 'boston.json' into a DataFrame; the result is only displayed, not assigned.
pd.read_json('boston.json')

Unnamed: 0,TAX,PTRATIO,B,LSTAT
0,296,15.3,396.90,4.98
1,242,17.8,396.90,9.14
2,242,17.8,392.83,4.03
3,222,18.7,394.63,2.94
4,222,18.7,396.90,5.33
...,...,...,...,...
501,273,21.0,391.99,9.67
502,273,21.0,396.90,9.08
503,273,21.0,396.90,5.64
504,273,21.0,393.45,6.48


### 輸出 json 檔

In [33]:
# Serialize the bos DataFrame to the JSON file 'bos.json'.
bos.to_json('bos.json')

## 讀取 SQL 資料庫

任何 SQL 資料庫如果支援遵守 Python DB-API 都可以被 Pandas 讀取。

以下先將 data.xls 中 boston 工作表的資料寫入 SQLite3 資料庫，並將資料表命名為 boston。

sqlite3.connect 所指定的資料庫檔案如果不存在會立即被建立；
if_exists 參數則決定資料表已經存在時的處理方式：if_exists='replace' 將會取代掉原本資料，

if_exists='append' 將會繼續寫在原有資料下（預設值 'fail' 則會拋出錯誤）。

In [35]:
# Load the sqlite3 package.
import sqlite3

# Read the selected columns of the 'boston' worksheet.
boston_data = pd.read_excel(
    'data.xls',
    sheet_name='boston',
    header=0,
    usecols=['TAX', 'PTRATIO', 'B', 'LSTAT'],
)

# Connect to the SQLite database file (sqlite3 creates it if it does not exist).
# The file name must match the one opened when reading the data back
# ('sql_db.sqlite'); the previous 'sq;_db.sqlite' was a typo that wrote the
# table into a different file.
connection = sqlite3.connect('sql_db.sqlite')
try:
    # if_exists controls what happens when the *table* already exists:
    # 'replace' discards the existing table and writes the new data.
    boston_data.to_sql('boston', connection, if_exists='replace')
    connection.commit()
finally:
    # Always release the connection, even if the write fails.
    connection.close()


讀取 SQLite3 資料庫可以使用 pd.io.sql.read_sql，可以直接下 SQL 指令對 sql_db 中的 boston 做搜尋。

In [36]:
# Connect to the database.
connection = sqlite3.connect('sql_db.sqlite')

try:
    # Run a SQL query directly against the boston table in sql_db and load
    # the result into a DataFrame.
    boston_data_sql = pd.io.sql.read_sql("select * from boston", connection)
finally:
    # Close the database connection even if the query fails.
    connection.close()

# Display the data.
boston_data_sql

Unnamed: 0,index,TAX,PTRATIO,B,LSTAT
0,0,296,15.3,396.90,4.98
1,1,242,17.8,396.90,9.14
2,2,242,17.8,392.83,4.03
3,3,222,18.7,394.63,2.94
4,4,222,18.7,396.90,5.33
...,...,...,...,...,...
501,501,273,21.0,391.99,9.67
502,502,273,21.0,396.90,9.08
503,503,273,21.0,396.90,5.64
504,504,273,21.0,393.45,6.48
