# 使用pandas讀寫資料
## 純文字資料格式
### 讀取純文字的部份方法

- read_csv()  
- read_table() 
- read_excel()  
- read_html()
- read_json()  
- read_sas()  
- read_sql() 

#### 常用的引數功能
    1 indexing
    2 資料類型的推測和資料類型的轉換
    3 日期時間的解析
    4 對大型檔案提供多次讀取
    5 清理資料,省略讀取不要的列,header,footer,註解

In [3]:
#使用指令檢視內容
#!cat -> mac
#!type -> windows

!cat ex1.csv

'''
a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
'''

#如果是使用「,」分隔,可以使用read_csv
import pandas as pd
dataFrame = pd.read_csv('ex1.csv')
dataFrame

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
#read_table(),加上指定的分隔符號
pd.read_table('ex1.csv',sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
#沒有欄位名稱(標題列)的csv
!cat ex2.csv

'''
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
'''

pd.read_csv('ex2.csv',header=None)

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [7]:
#指定欄位名稱
pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [8]:
#指定哪一個欄位成為索引
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ex2.csv', names = names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [11]:
#指定多重索引
!cat csv_mindex.csv

pd.read_csv('csv_mindex.csv', index_col=['key1', 'key2'])

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [14]:
#沒有固定的分隔符號(欄位之間以不定數量的空白分隔)
#使用read_table()
#sep使用正規表示式字串
!cat ex3.txt

# Read a whitespace-delimited text file: the separator is a regular
# expression matching one or more whitespace characters. It is written as a
# raw string because '\s' in a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
pd.read_table('ex3.txt', sep=r'\s+')

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [16]:
#排除列
#使用引數名稱skiprows
!cat ex4.csv
pd.read_csv('ex4.csv', skiprows=[0, 2, 3])

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [21]:
#處理遺失值
!cat ex5.csv
result = pd.read_csv('ex5.csv')

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [22]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [24]:
result = pd.read_csv('ex5.csv', na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [25]:
sentinels = {'message':['foo', 'NA'], 'something':['two']}
pd.read_csv('ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


In [39]:
#實際案例
#選擇想要的欄位

result = pd.read_csv('個股日成交資訊.csv',usecols=['證券代號','成交金額','收盤價','成交筆數'])


dtype('O')