#  <font color=red> Module_05_資料的載入與儲存</font>

## 處理CSV及文字、表格格式的資料

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import sys
import json 
import pickle

msft = pd.read_csv('./mod05/msft.csv')
msft[:5]

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,7/21/2014,83.46,83.53,81.81,81.93,2359300
1,7/18/2014,83.3,83.4,82.52,83.35,4020800
2,7/17/2014,84.35,84.63,83.33,83.63,1974000
3,7/16/2014,83.77,84.91,83.66,84.91,1755600
4,7/15/2014,84.3,84.38,83.2,83.58,1874700


In [2]:
# 指定索引行是第幾行，從 0 開始算起
# 當然也可以讀進來後再用 .set_index() 方法
msft = pd.read_csv('./mod05/msft.csv', index_col = 0) 
msft[:5]

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7/21/2014,83.46,83.53,81.81,81.93,2359300
7/18/2014,83.3,83.4,82.52,83.35,4020800
7/17/2014,84.35,84.63,83.33,83.63,1974000
7/16/2014,83.77,84.91,83.66,84.91,1755600
7/15/2014,84.3,84.38,83.2,83.58,1874700


In [3]:
msft.dtypes

Open      float64
High      float64
Low       float64
Close     float64
Volume      int64
dtype: object

In [4]:
# dtype 可能不是我們要的資料型態
msft.index

Index(['7/21/2014', '7/18/2014', '7/17/2014', '7/16/2014', '7/15/2014',
       '7/14/2014', '7/11/2014', '7/10/2014', '7/9/2014', '7/8/2014',
       ...
       '1/14/2000', '1/13/2000', '1/12/2000', '1/11/2000', '1/10/2000',
       '1/7/2000', '1/6/2000', '1/5/2000', '1/4/2000', '1/3/2000'],
      dtype='object', name='Date', length=3767)

In [5]:
# 可以利用 dtype 參數來決定每一欄的資料型態，寫成字典
# 字典的值通常寫 np.float64 或字串 'float' 都能過成功
msft = pd.read_csv('./mod05/msft.csv', dtype = {'Volume': np.float64}) 
msft.dtypes

Date       object
Open      float64
High      float64
Low       float64
Close     float64
Volume    float64
dtype: object

In [6]:
# 預設是 header = 'infer'
# 試試看改成 header = None，header = 0，header = 1 的差別
df = pd.read_csv('./mod05/msft.csv', 
                 header = 0, 
                 names = ['date', 'open', 'high', 'low', 'close', 'volume']) 
df[:5]

Unnamed: 0,date,open,high,low,close,volume
0,7/21/2014,83.46,83.53,81.81,81.93,2359300
1,7/18/2014,83.3,83.4,82.52,83.35,4020800
2,7/17/2014,84.35,84.63,83.33,83.63,1974000
3,7/16/2014,83.77,84.91,83.66,84.91,1755600
4,7/15/2014,84.3,84.38,83.2,83.58,1874700


In [7]:
# 用 userclos 參數指定載入特定的行
# 這裡一定要載入 Date，因為我們要把他設定成索引標籤
df2 = pd.read_csv('./mod05/msft.csv', 
                    usecols = ['Date', 'Close'],
                    index_col = ['Date']) 
df2[:5]

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
7/21/2014,81.93
7/18/2014,83.35
7/17/2014,83.63
7/16/2014,84.91
7/15/2014,83.58


In [8]:
# 用 .to_csv() 方法把 dataFrame 存成 csv
# 如果索引標籤沒有名字，可以用參數 index_label 給索引標籤取名
df2.to_csv('./mod05/msft_modified.csv', index_label = 'Date') 

---

In [9]:
# 可以用參數 sep 來指定資料是用什麼符號來分隔欄位
# 也可以用 pd.read_table('./mod05/msft.csv', sep =',')，一模一樣的效果
df = pd.read_csv('./mod05/msft.csv', sep = ',') 
df 

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,7/21/2014,83.46,83.53,81.81,81.93,2359300
1,7/18/2014,83.30,83.40,82.52,83.35,4020800
2,7/17/2014,84.35,84.63,83.33,83.63,1974000
3,7/16/2014,83.77,84.91,83.66,84.91,1755600
4,7/15/2014,84.30,84.38,83.20,83.58,1874700
...,...,...,...,...,...,...
3762,1/7/2000,48.55,50.35,47.80,50.00,4621200
3763,1/6/2000,46.78,48.35,46.28,48.03,3306100
3764,1/5/2000,46.94,47.50,45.92,46.75,4809900
3765,1/4/2000,49.80,49.80,47.72,47.85,4489500


In [10]:
# 用 .to_csv() 方法把 dataFrame 存成用 | 隔開的文字檔
# 用參數 index 來決定要不要存 index
df.to_csv('./mod05/msft.piped.txt', sep = '|', index = False)

In [11]:
# 再讀回來
df = pd.read_csv('./mod05/msft.piped.txt', sep = '|')
df[:5]

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,7/21/2014,83.46,83.53,81.81,81.93,2359300
1,7/18/2014,83.3,83.4,82.52,83.35,4020800
2,7/17/2014,84.35,84.63,83.33,83.63,1974000
3,7/16/2014,83.77,84.91,83.66,84.91,1755600
4,7/15/2014,84.3,84.38,83.2,83.58,1874700


---

In [12]:
# 使用參數 skiprows 來排除額外的頁眉
df = pd.read_csv('./mod05/msft2.csv', skiprows = [0, 2, 3])
df

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,7/21/2014,83.46,83.53,81.81,81.93,2359300
1,7/18/2014,83.3,83.4,82.52,83.35,4020800
2,7/17/2014,84.35,84.63,83.33,83.63,1974000
3,7/16/2014,83.77,84.91,83.66,84.91,1755600
4,7/15/2014,84.3,84.38,83.2,83.58,1874700
5,7/14/2014,83.66,84.64,83.11,84.4,1432100
6,7/11/2014,83.55,83.98,82.85,83.35,2001400
7,7/10/2014,85.2,85.57,83.36,83.42,2713300
8,7/9/2014,84.83,85.79,84.76,85.5,1540700


In [13]:
# 用參數 skipfooter 來排除額外的檔尾，skipfooter = 2 表示最後兩列要被排除
# 要加 engine = 'python'
df = pd.read_csv('./mod05/msft_with_footer.csv', 
                 skipfooter = 2,
                 engine = 'python')   
df

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,7/21/2014,83.46,83.53,81.81,81.93,2359300
1,7/18/2014,83.3,83.4,82.52,83.35,4020800


In [14]:
# 有時檔案太大，只想讀前面幾列
# 用參數 nrows 來決定讀取前幾列
pd.read_csv('./mod05/msft.csv', nrows = 3) 

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,7/21/2014,83.46,83.53,81.81,81.93,2359300
1,7/18/2014,83.3,83.4,82.52,83.35,4020800
2,7/17/2014,84.35,84.63,83.33,83.63,1974000


In [15]:
pd.read_csv('./mod05/msft.csv', 
            skiprows = 2, 
            nrows = 3,
            header = 0,
            names = ['date', 'open', 'high', 'low', 'close', 'volume']) # 注意流程有沒有跟妳想的一樣

Unnamed: 0,date,open,high,low,close,volume
0,7/17/2014,84.35,84.63,83.33,83.63,1974000
1,7/16/2014,83.77,84.91,83.66,84.91,1755600
2,7/15/2014,84.3,84.38,83.2,83.58,1874700


---

In [16]:
df = pd.read_csv('./mod05/ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [17]:
# pd.read_table() 函式預設的 sep 是空格
# 可以把 sep 拿掉看看差別
df = pd.read_table('./mod05/ex1.csv', sep = ',') 
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [18]:
df = pd.read_table('./mod05/ex1.csv') 
df

Unnamed: 0,"a,b,c,d,message"
0,"1,2,3,4,hello"
1,"5,6,7,8,world"
2,"9,10,11,12,foo"


In [19]:
df = pd.read_csv('./mod05/ex2.csv', header = None) # 說明檔案沒有標頭
df

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [20]:
df = pd.read_csv('./mod05/ex2.csv', header = None, names = ['a', 'b', 'c', 'd', 'message']) 
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [21]:
df =pd.read_csv('./mod05/csv_mindex.csv', index_col = ['key1', 'key2']) # 階層式索引
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [22]:
# 使用正規表達式
# 試試看 sep = ' ' 會如何
# 因為第 0 欄沒名字被當成索引
df = pd.read_table('./mod05/ex3.txt', sep = '\s+') 
df

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [23]:
df = pd.read_table('./mod05/ex3.txt', sep = ' ') 
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,B,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,C
0,aaa,-0.264438,-1.026059,-0.6195,,,,,,,...,,,,,,,,,,
1,bbb,,0.927272,,0.302904,-0.032399,,,,,...,,,,,,,,,,
2,ccc,-0.264273,-0.386314,-0.217601,,,,,,,...,,,,,,,,,,
3,ddd,-0.871858,-0.348382,,1.100491,,,,,,...,,,,,,,,,,


---

In [24]:
# pandas 對遺失值的標記
# 如果是 NA 或是都沒輸入會顯示 NaN
# 注意空格會被認為是空格字串，而不是遺失值
df = pd.read_csv('./mod05/ex5.csv') 
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [25]:
df.isnull()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [26]:
# 可以用 na_values 參數來自定義那些值也是遺失值
df = pd.read_csv('./mod05/ex5.csv', na_values = ['NULL', 'foo']) # NULL 和 foo 也要當遺失值
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,


In [27]:
 # 對每個欄指定不同的 NA 標記值
df = pd.read_csv('./mod05/ex5.csv', na_values = {'message': ['foo', 'NA'], 'something': ['two']})
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


---

In [28]:
# 如果數據超大，可用 chunksize 分段讀取
chunker = pd.read_csv('./mod05/ex6.csv', chunksize = 1000) # 分段讀取檔案
chunker

<pandas.io.parsers.readers.TextFileReader at 0x1e2cf8372e0>

In [29]:
tot = pd.Series([], dtype = 'float64')
tot

Series([], dtype: float64)

In [30]:
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value = 0)
    
tot = tot.sort_values(ascending = False) 
tot

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
V    328.0
I    327.0
U    326.0
P    324.0
D    320.0
A    320.0
R    318.0
Y    314.0
G    308.0
S    308.0
N    306.0
W    305.0
T    304.0
B    302.0
Z    288.0
C    286.0
4    171.0
6    166.0
7    164.0
8    162.0
3    162.0
5    157.0
2    152.0
0    151.0
9    150.0
1    146.0
dtype: float64

---

In [31]:
data = pd.read_csv('./mod05/ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [32]:
data.to_csv(sys.stdout, sep = '|') # 寫到 sys.stdout，這樣會把文字改為顯示在終端機上

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [33]:
# 把遺失值用 'Null' 輸出
# na_rep 就是 A string representation of a missing value 
data.to_csv(sys.stdout, sep = '|', na_rep = 'Null')

|something|a|b|c|d|message
0|one|1|2|3.0|4|Null
1|two|5|6|Null|8|world
2|three|9|10|11.0|12|foo


In [34]:
data.to_csv(sys.stdout, index = False, header = False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [35]:
data.to_csv(sys.stdout, index = False, columns = ['a', 'b', 'c']) # 只寫出部分的欄位

a,b,c
1,2,3.0
5,6,
9,10,11.0


---

In [36]:
dates = pd.date_range('2000/1/1', periods = 7)
ts = pd.Series(np.arange(7), index = dates)
ts

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int32

In [37]:
ts.to_csv('./mod05/tseries.csv') # 序列一樣有 .to_csv() 方法可用

## 讀寫 Excel 格式資料

In [38]:
# 需要安裝 xlrd、openpyxl 套件來讀 XLS 及 XLSX 檔案 
# XLSX 檔案是比較新的格式，通常都會把 Excel 存成 .xlsx 檔
df = pd.read_excel('./mod05/stocks.xlsx') 
df 

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2014-07-21,83.46,83.53,81.81,81.93,2359300
1,2014-07-18,83.30,83.40,82.52,83.35,4020800
2,2014-07-17,84.35,84.63,83.33,83.63,1974000
3,2014-07-16,83.77,84.91,83.66,84.91,1755600
4,2014-07-15,84.30,84.38,83.20,83.58,1874700
...,...,...,...,...,...,...
3762,2000-01-07,48.55,50.35,47.80,50.00,4621200
3763,2000-01-06,46.78,48.35,46.28,48.03,3306100
3764,2000-01-05,46.94,47.50,45.92,46.75,4809900
3765,2000-01-04,49.80,49.80,47.72,47.85,4489500


In [39]:
# 很多參數跟 pd.read_csv() 一樣，就不重複
aapl = pd.read_excel('./mod05/stocks.xlsx', sheet_name = 'aapl') # 讀取其他頁面 
aapl[:5]

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2014-07-21,94.99,95.0,93.72,93.94,38887700
1,2014-07-18,93.62,94.74,93.02,94.43,49898600
2,2014-07-17,95.03,95.28,92.57,93.09,57152000
3,2014-07-16,96.97,97.1,94.74,94.78,53396300
4,2014-07-15,96.8,96.85,95.03,95.32,45477900


---

In [40]:
# 寫出需要 pip install XLWT
# 盡量不要存成 xls
# 存成 .xlsx 檔用 notepad++ 打開會亂碼
df.to_excel('./mod05/stock2.xlsx')   

In [41]:
df.to_excel('./mod05/stock_msft.xlsx', sheet_name = 'MSFT', index = False)

In [42]:
# 寫入多頁面
with pd.ExcelWriter('./mod05/all_stocks.xlsx') as writer:
    aapl.to_excel(writer, sheet_name = 'AAPL')
    df.to_excel(writer, sheet_name = 'MSFT')

## 讀寫 JSON 檔案

In [43]:
df = pd.read_excel('./mod05/stocks.xlsx')
df[:5]

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2014-07-21,83.46,83.53,81.81,81.93,2359300
1,2014-07-18,83.3,83.4,82.52,83.35,4020800
2,2014-07-17,84.35,84.63,83.33,83.63,1974000
3,2014-07-16,83.77,84.91,83.66,84.91,1755600
4,2014-07-15,84.3,84.38,83.2,83.58,1874700


In [44]:
# 注意存成 json 檔後的長相
df.to_json('./mod05/stocks.json')

In [45]:
df_from_json = pd.read_json('./mod05/stocks.json') # json 具有跨平台與程式語言的可攜性
df_from_json[:5]

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2014-07-21,83.46,83.53,81.81,81.93,2359300
1,2014-07-18,83.3,83.4,82.52,83.35,4020800
2,2014-07-17,84.35,84.63,83.33,83.63,1974000
3,2014-07-16,83.77,84.91,83.66,84.91,1755600
4,2014-07-15,84.3,84.38,83.2,83.58,1874700


---

In [46]:
# 注意讀出來資料框的長相
data = pd.read_json('./mod05/example.json')
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [47]:
data.to_json('./mod05/data.json')

In [48]:
# 設定 orient = 'records'，會輸出另一種 json 格式
data.to_json('./mod05/data1.json', orient = 'records')

---

In [49]:
df = pd.DataFrame(np.array([[1, 2], [3, 4]]), 
                  index = ['a', 'b'],
                  columns = ['col1', 'col2'])
df

Unnamed: 0,col1,col2
a,1,2
b,3,4


In [50]:
df.to_json('./mod05/test.json')

In [51]:
# orient = 'records' 不會存入索引標籤
df.to_json('./mod05/test1.json', orient = 'records')

---

In [52]:
# All arrays must be of the same length
# 不是 json 檔就一定可以讀成 Dataframe
pd.read_json('./mod05/cv.json')

ValueError: All arrays must be of the same length

In [None]:
# 利用 json 套件來處理
with open('./mod05/cv.json', mode = 'r', encoding = 'utf-8') as f:
    content = f.read()
    
my_dict = json.loads(content)
my_dict

In [None]:
type(my_dict)

In [None]:
# 每個欄位不一樣長的話沒辦法一次讀取成 dataframe
# 要分開讀成 dataFrame，再用 pd.concat() 或 pd.merge() 函式等等來合併
siblings = pd.DataFrame(my_dict['siblings']) 
siblings

---

In [None]:
# 常見的應用，把個人檔案建成資料框
cv_json = []
cvs = ['cv', 'cv1', 'cv2']
for each in cvs:
    with open(f'./mod05/{each}.json', mode = 'r', encoding='utf-8') as f:
        content = f.read()
        content = json.loads(content)
        cv_json.append(content)

In [None]:
cv_json

In [None]:
for i, each in enumerate(cv_json):
    each['id'] = i 

In [None]:
cv_json

In [None]:
info_col = ['id', 'name', 'places_lived', 'pet']
person = pd.DataFrame(cv_json, columns = info_col)
person

In [None]:
person.pet = person.pet.fillna(0)
person

In [None]:
pieces = []
for each in cv_json:
    sib = pd.DataFrame(each['siblings'])
    sib['id'] = each['id']
    pieces.append(sib)

In [None]:
pieces

In [None]:
siblings = pd.concat(pieces)
siblings

In [None]:
siblings.columns = ['sib_name', 'sib_age', 'sib_pet', 'id']
siblings

In [None]:
# 就類似資料庫的 MySQL
pd.merge(person, siblings)

## 從網站讀取 HTML 資料

In [None]:
# 需要 lxml、html5lib、BeautifulSoup4 套件
# 可以抓取 url 上的列表，但不是每個網站都能成功
# 不行只能爬蟲
url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/"
banks = pd.read_html(url)

In [None]:
len(banks) # 回傳一至多個 Dataframe 物件，傳回 Dataframe 的列表 (依照 Html 裡面有多少表格而定)

In [None]:
banks

In [None]:
failures = banks[0]
failures

In [None]:
failures.dtypes

In [None]:
 # 也可以用 pd.to_datetime() 函式，後面會教到
close_timestamps = failures['Closing DateClosing'].astype('datetime64[ns]')
close_timestamps

In [None]:
close_timestamps.dt.year

In [None]:
# 每年的破產銀行個數
close_timestamps.dt.year.value_counts()

---

In [None]:
df = pd.read_excel('./mod05/stocks.xlsx')
df[:5]

In [None]:
df[:5].to_html('./mod05/stocks.html') # 會為資料產生只擁有 <table> 標籤的檔案

## 讀寫 Pickle 格式檔案

In [None]:
# 是一種儲存二進位格式資料 (也稱為序列化 serialization)
frame = pd.read_csv('./mod05/ex1.csv')
frame

In [None]:
frame.to_pickle('./mod05/frame_pickle')

In [None]:
pd.read_pickle('./mod05/frame_pickle') # 也可以用之前學過的方式，用內建的 pickle 讀取檔案中任何被 pickle 的物件

---

In [None]:
# + 表示可讀寫的意思
file_pickle = open('./mod05/frame_pickle', mode = 'rb+')
frame_from_pickle = pickle.load(file_pickle)
file_pickle.close()

In [None]:
frame_from_pickle

## 讀寫 HDF5 格式檔案

In [None]:
# HDF5 也是常見的一種二進位檔案格式，另一個是 MessagePack
np.random.seed(123456)
df= pd.DataFrame(np.random.randn(8, 3), 
                 index = pd.date_range('1/1/2020', periods = 8),
                 columns = ['A', 'B', 'C'])
df

In [None]:
# 需要用管理者開啟 CMD 安裝 tables 套件
# 把右邊的資料框 df 存進 store 裡，key 為 df 
store = pd.HDFStore('./mod05/store.h5')
store['df'] = df

In [None]:
store.keys()

---

In [None]:
store = pd.HDFStore('./mod05/store.h5')
store.keys()

In [None]:
frame = store['df']
frame[:5]

In [None]:
# 把 frame 做點修改再存進去 store
frame.iloc[0]['A'] = 1
frame

In [None]:
store['df'] = frame

---

In [None]:
# 再從 store 取出看有沒有更新
# 快速的寫法
pd.HDFStore('./mod05/store.h5')['df'][:5]

In [None]:
# 最後要把 store 關閉
store.close()

---

In [None]:
frame = pd.DataFrame({'a': np.random.randn(100)})
frame[:5]

In [None]:
store = pd.HDFStore('./mod05/mydata.h5')

In [None]:
store

In [None]:
# 小心本來有檔案在裡面，是有可能重複存的
store.keys()

In [None]:
# 這是資料框
store['obj1'] = frame

In [None]:
# 這是序列
store['obj1_col'] = frame['a']

In [None]:
store.keys()

In [None]:
store['obj1']

In [None]:
del store['obj1']

In [None]:
store.keys()

In [None]:
# 是 store['obj1'] = frame 的顯式寫法，可以讓我們指定像是儲存格式等參數
# HDFStore 支援 'fixed' 與 'table' 兩種儲存方式，後者慢一點，但它支援特殊的查詢語法
# 把之前的資料框存進 store，key 為 obj1
store.put('obj1', frame, format = 'table')  

In [None]:
store.keys()

In [None]:
store.select('obj1', where = ['index >= 10 and index <= 15'])

In [None]:
store.close()

---

In [None]:
# 上面的更精簡寫法
frame.to_hdf('./mod05/mydata.h5', 'obj2', format = 'table')

In [None]:
pd.read_hdf('./mod05/mydata.h5', 'obj2', where = ['index < 5'] )

## 存取網站上的 CSV 資料

In [53]:
# 用 Python 來抓取政府公開資料(CSV)
# 不是一定會成功
frame = pd.read_csv("https://data.nhi.gov.tw/Datasets/Download.ashx?rid=A21030000I-D50001-001&l=https://data.nhi.gov.tw/resource/mask/maskdata.csv")
frame[:5]

Unnamed: 0,醫事機構代碼,醫事機構名稱,醫事機構地址,醫事機構電話,成人口罩剩餘數,兒童口罩剩餘數,來源資料時間
0,145080011,衛生福利部花蓮醫院豐濱原住民分院,花蓮縣豐濱鄉豐濱村光豐路４１號,8358141,2200,870,2021/10/18 18:26:25
1,291010010,連江縣立醫院,連江縣南竿鄉復興村２１７號,623995,8000,1150,2021/10/18 18:26:25
2,2312010014,新竹市東區衛生所,新竹市東區民族路４０之２號,(03)5236158,2640,640,2021/10/18 18:26:25
3,2312041028,新竹市北區衛生所,新竹市北區國華街六十九號一樓,(03)5353969,550,280,2021/10/18 18:26:25
4,2312050018,新竹市香山衛生所,新竹市香山區牛埔里育德街１８８號２樓,(03)5388109,1190,1050,2021/10/18 18:26:25


## 從遠端資料服務讀取資料

In [54]:
import pandas_datareader as pdr
start = datetime(2017, 4, 1)
end = datetime(2017, 4, 30)
df_msft = pdr.data.DataReader("MSFT", data_source = "yahoo", start = start, end = end)

In [55]:
df_msft

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-03-31,66.190002,65.449997,65.650002,65.860001,21040300,61.719135
2017-04-03,65.940002,65.190002,65.809998,65.550003,20400900,61.428635
2017-04-04,65.809998,65.279999,65.389999,65.730003,12997400,61.597309
2017-04-05,66.349998,65.440002,66.300003,65.559998,21448600,61.438
2017-04-06,66.059998,65.480003,65.599998,65.730003,18103500,61.597309
2017-04-07,65.959999,65.440002,65.849998,65.68,14108500,61.550449
2017-04-10,65.82,65.360001,65.610001,65.529999,17952700,61.409885
2017-04-11,65.610001,64.849998,65.599998,65.480003,18791500,61.363022
2017-04-12,65.510002,65.110001,65.419998,65.230003,17108500,61.12875
2017-04-13,65.860001,64.949997,65.290001,64.949997,17896500,60.866352


In [56]:
start = datetime(2010, 1, 1)
df_2330 = pdr.data.DataReader("2330.TW", "yahoo", start = start)

In [57]:
df_2330

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,65.000000,64.000000,65.000000,64.900002,39407000.0,43.280872
2010-01-05,65.099998,63.900002,65.000000,64.500000,37138000.0,43.014118
2010-01-06,64.900002,63.700001,64.500000,64.900002,49261000.0,43.280872
2010-01-07,65.000000,64.199997,64.900002,64.199997,42134000.0,42.814049
2010-01-08,64.300003,63.500000,63.500000,64.000000,46076000.0,42.680676
...,...,...,...,...,...,...
2021-10-12,575.000000,564.000000,570.000000,575.000000,26522907.0,575.000000
2021-10-13,575.000000,570.000000,572.000000,571.000000,18914374.0,571.000000
2021-10-14,579.000000,573.000000,579.000000,573.000000,13916927.0,573.000000
2021-10-15,600.000000,586.000000,592.000000,600.000000,53150216.0,600.000000


## 存取 MySQL 資料庫

In [58]:
from sqlalchemy import create_engine

# 先安裝套件 sqlalchemy
# 連接資訊務必要打對!
# 'mysql+module://username:password@ip:port/dbname?charset=utf8mb4'
connect_info = 'mysql+pymysql://root:妳的密碼@localhost:3306/testdb?charset=utf8mb4'

# 使用 sqlalchemy 去建立連接引擎 (link-engine)
engine = create_engine(connect_info) 


sql = "SELECT * FROM staff" # SQL 語法
df = pd.read_sql(sql = sql, 
                 con = engine, 
                 parse_dates = 'recordDt')

UnicodeEncodeError: 'latin-1' codec can't encode characters in position 0-3: ordinal not in range(256)

In [59]:
df

Unnamed: 0,col1,col2
a,1,2
b,3,4


In [60]:
# 讀進資料框的每欄位的資料型態是 pandas 觀察出的，不一定是你要的資料型態
df.dtypes

col1    int32
col2    int32
dtype: object

In [61]:
df['ID'] = df['ID'].astype('int')
df['Name'] = df['Name'].astype('S10')
df['DeptId'] = df['DeptId'].astype('int')

KeyError: 'ID'

In [62]:
df.dtypes

col1    int32
col2    int32
dtype: object

---

In [63]:
from sqlalchemy import create_engine
from sqlalchemy.types import NVARCHAR, Float, Integer

# 'mysql+module://username:password@ip:port/dbname?charset=utf8mb4'
connect_info = 'mysql+pymysql://root:妳的密碼@localhost:3306/testdb?charset=utf8mb4'
engine = create_engine(connect_info) # 使用 sqlalchemy 去建立連接引擎 (link-engine)

In [64]:
# 隨便建一個資料框，準備存入 MySQL
df = pd.DataFrame([['a', 1, 2.0, datetime.now(), True]], 
                   columns = ['str', 'int', 'float', 'datetime', 'boolean'])
df

Unnamed: 0,str,int,float,datetime,boolean
0,a,1,2.0,2021-10-18 18:28:42.758958,True


In [65]:
# 把資料框寫入 MySQL 
# name 是寫入的表格名
# 不管是寫出寫入都要注意每個欄位資料型態的改變
# 指定欄位的資料型態
df.to_sql(name = 'data_from_df',  
           con = engine,
           if_exists = 'append',
           index = False,
           dtype= {'str': NVARCHAR(length = 255),
                   'int': Integer(),
                   'float':Float()
                    })  

UnicodeEncodeError: 'latin-1' codec can't encode characters in position 0-3: ordinal not in range(256)