## 使用內建功能讀取 txt 檔

# [教學目標]
- 示範 Pandas 各種 讀取 / 寫入 檔案的方式

# [範例重點]
- 讀取 txt 檔 (In[58], Out[58])
- 存取 json 檔 (In[61], In[62], In[64], In[65])
- 存取 npy 檔 (numpy專用檔, In[69], In[71])
- 存取 Pickle 檔 (In[114], In[115])

In [58]:
# Read the text file into a list of strings, one per line; each line keeps
# its trailing newline character.
with open("Part01/example.txt", mode='r') as f:
    data = f.readlines()
    # Other ways to read from the same file object:
    #   f.readline() -> a single line, returned as one string
    #   f.read()     -> the whole file, returned as one string
print(data)

['id,sex,age,score\n', '001,F,20,77\n', '002,F,25,90\n', '003,M,22,80\n', '004,F,30,66\n', '005,M,40,60\n', '006,M,29,87']


## 將 txt 轉成 pandas dataframe

In [59]:
import pandas as pd

# Parse the comma-separated text file by hand: one list of fields per line.
with open("Part01/example.txt", 'r') as f:
    # strip() removes the trailing '\n' (plus any surrounding whitespace)
    # before the line is split on commas. An alternative for the same job is
    # raw_line.replace('\n', '').split(',').
    data = [raw_line.strip().split(',') for raw_line in f]
data

[['id', 'sex', 'age', 'score'],
 ['001', 'F', '20', '77'],
 ['002', 'F', '25', '90'],
 ['003', 'M', '22', '80'],
 ['004', 'F', '30', '66'],
 ['005', 'M', '40', '60'],
 ['006', 'M', '29', '87']]

In [60]:
# The first parsed row holds the column names; the remaining rows are records.
header, rows = data[0], data[1:]
df = pd.DataFrame(rows)
df.columns = header
df

Unnamed: 0,id,sex,age,score
0,1,F,20,77
1,2,F,25,90
2,3,M,22,80
3,4,F,30,66
4,5,M,40,60
5,6,M,29,87


## 將資料轉成 json 檔後輸出
將 json 讀回來後，是否與我們原本想要存入的方式一樣? (以 id 為 key)

In [61]:
import json

# Write the DataFrame to a JSON file using to_json's default layout.
df.to_json(path_or_buf='Part01/example01.json')

In [62]:
# The file written above is nested by column: column names are the outer keys
# and row labels are the inner keys.
with open('Part01/example01.json', 'r') as f:
    j1 = json.loads(f.read())
j1

{'id': {'0': '001',
  '1': '002',
  '2': '003',
  '3': '004',
  '4': '005',
  '5': '006'},
 'sex': {'0': 'F', '1': 'F', '2': 'M', '3': 'F', '4': 'M', '5': 'M'},
 'age': {'0': '20', '1': '25', '2': '22', '3': '30', '4': '40', '5': '29'},
 'score': {'0': '77', '1': '90', '2': '80', '3': '66', '4': '60', '5': '87'}}

In [63]:
# Promote the 'id' column to the row index, modifying df in place.
df.set_index(keys='id', inplace=True)
df

Unnamed: 0_level_0,sex,age,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,F,20,77
2,F,25,90
3,M,22,80
4,F,30,66
5,M,40,60
6,M,29,87


In [64]:
# orient='index' nests the JSON by row label first, then by column name.
df.to_json(path_or_buf='Part01/example02.json', orient='index')

In [65]:
# Read the index-oriented JSON file back into a plain dict.
with open('Part01/example02.json', 'r') as f:
    j2 = json.loads(f.read())
j2

{'001': {'sex': 'F', 'age': '20', 'score': '77'},
 '002': {'sex': 'F', 'age': '25', 'score': '90'},
 '003': {'sex': 'M', 'age': '22', 'score': '80'},
 '004': {'sex': 'F', 'age': '30', 'score': '66'},
 '005': {'sex': 'M', 'age': '40', 'score': '60'},
 '006': {'sex': 'M', 'age': '29', 'score': '87'}}

In [66]:
# Build a DataFrame from the nested dict: each outer key becomes a column.
df2 = pd.DataFrame.from_dict(j2)
df2

Unnamed: 0,001,002,003,004,005,006
age,20,25,22,30,40,29
score,77,90,80,66,60,87
sex,F,F,M,F,M,M


In [67]:
# Swap rows and columns so that each outer key of j2 becomes a row.
df2 = df2.transpose()
df2

Unnamed: 0,age,score,sex
1,20,77,F
2,25,90,F
3,22,80,M
4,30,66,F
5,40,60,M
6,29,87,M


## 將檔案存為 npy 檔
一個專門儲存 numpy array 的檔案格式
使用 npy 通常可以讓你更快讀取資料喔!  
[建議閱讀](https://towardsdatascience.com/why-you-should-start-using-npy-file-more-often-df2a13cc0161)

In [68]:
import numpy as np

# Convert the data rows (everything after the header row) to a NumPy array.
rows_without_header = data[1:]
array = np.array(rows_without_header)
array

array([['001', 'F', '20', '77'],
       ['002', 'F', '25', '90'],
       ['003', 'M', '22', '80'],
       ['004', 'F', '30', '66'],
       ['005', 'M', '40', '60'],
       ['006', 'M', '29', '87']], dtype='<U3')

In [69]:
# Persist the array to disk in NumPy's .npy format.
np.save('Part01/example.npy', array)

In [71]:
# Load the saved .npy file back into an array.
array_back = np.load(file='Part01/example.npy')
array_back

array([['001', 'F', '20', '77'],
       ['002', 'F', '25', '90'],
       ['003', 'M', '22', '80'],
       ['004', 'F', '30', '66'],
       ['005', 'M', '40', '60'],
       ['006', 'M', '29', '87']], dtype='<U3')

In [113]:
# Scratch practice: handling missing values in a DataFrame
import numpy as np
import pandas as pd

groups = ["Modern Web", "DevOps", np.nan, "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, np.nan]
ironmen_dict = dict(groups=groups, ironmen=ironmen)

# Build the DataFrame
ironmen_df = pd.DataFrame(ironmen_dict)
print(ironmen_df)

separator = "---"  # divider printed between the results

# Drop every row that contains a missing value
ironmen_df_na_dropped = ironmen_df.dropna()
print(ironmen_df_na_dropped)
print(separator)

# Replace every missing value with 0
ironmen_df_na_filled = ironmen_df.fillna(0)
print(ironmen_df_na_filled)
print(separator)

# Replace missing values column by column
per_column_fill = {"groups": "Cloud", "ironmen": 71}
ironmen_df_na_filled = ironmen_df.fillna(per_column_fill)
print(ironmen_df_na_filled)

       groups  ironmen
0  Modern Web     59.0
1      DevOps      9.0
2         NaN     19.0
3    Big Data     14.0
4    Security      6.0
5       自我挑戰組      NaN
       groups  ironmen
0  Modern Web     59.0
1      DevOps      9.0
3    Big Data     14.0
4    Security      6.0
---
       groups  ironmen
0  Modern Web     59.0
1      DevOps      9.0
2           0     19.0
3    Big Data     14.0
4    Security      6.0
5       自我挑戰組      0.0
---
       groups  ironmen
0  Modern Web     59.0
1      DevOps      9.0
2       Cloud     19.0
3    Big Data     14.0
4    Security      6.0
5       自我挑戰組     71.0


## Pickle
存成 pickle 檔  
什麼都包，什麼都不奇怪的 [Pickle](https://docs.python.org/3/library/pickle.html)  
比如說 [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) 的資料集就是用 pickle 包的喔!

In [114]:
import pickle

# Serialize the object currently bound to `data` into a binary pickle file.
with open('Part01/example.pkl', 'wb') as f:
    pickle.dump(data, f)

In [115]:
# Load the pickled object back from disk. Only unpickle files from a trusted
# source: pickle can execute arbitrary code while loading.
with open('Part01/example.pkl', 'rb') as f:
    pkl_data = pickle.loads(f.read())
pkl_data

{'G': ['Modern Web', 'DevOps', 'Cloud', 'Big Data', 'Security', '自我挑戰組'],
 'I': [59, 9, 19, 14, 6, 77]}