<img width="150" alt="NumPy logo" src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/1a/NumPy_logo.svg/200px-NumPy_logo.svg.png">

# Day-09 Pandas 物件的定義與屬性

* 教學目標：
  * 知道 Pandas 的特性與貢獻
  * 能夠使用 DataFrame 與 Series 當中的屬性
  * 初步理解 Series、DataFrame 與 ndarray 的比較
  * 能夠使用不同的方法初始化一個陣列
  * 知道固定大小對於陣列的意義
  * 了解不同的亂數陣列有什麼差異
  * 讀寫csv、excel
  * 讀寫json
  * 讀寫資料庫SQL
* 範例重點：
  * 讀寫不同的檔案格式需使用對應的函式：csv(read_csv、to_csv)、excel(read_excel、to_excel)、json(read_json、to_json)、SQL 資料庫(io.sql.read_sql、to_sql)
  * 讀取進來後都是DataFrame的型態

## 匯入套件

In [None]:
# Load the NumPy and Pandas packages under their conventional aliases.
import numpy as np
import pandas as pd

# Confirm both packages imported correctly: show each module object
# followed by its version string, one item per line.
print(np, np.__version__, pd, pd.__version__, sep='\n')

<module 'numpy' from 'D:\\anaconda3\\lib\\site-packages\\numpy\\__init__.py'>
1.19.2
<module 'pandas' from 'D:\\anaconda3\\lib\\site-packages\\pandas\\__init__.py'>
1.1.3


## Series

In [None]:
# Build a one-dimensional Series from a plain list; no index is given,
# so the values are labelled 0, 1, 2.
s = pd.Series(data=[1, 2, 3])

# Show the Series itself, then the class of the object.
print(s, type(s), sep='\n')

0    1
1    2
2    3
dtype: int64
<class 'pandas.core.series.Series'>


### 常用屬性

In [None]:
# Common Series attributes, printed one per line:
#   shape -> (3,)    size -> 3    dtype -> int64
print(s.shape, s.size, s.dtype, sep='\n')

(3,)
3
int64


### 範例

In [None]:
# Example: pass index= to label each value explicitly instead of relying
# on the default integer labels.
s = pd.Series(
    data=[1, 2, 3],
    index=['Amy', 'Bob', 'Tom'],
)
print(s)
# A bare expression on the last line of a notebook cell is displayed too.
s

Amy    1
Bob    2
Tom    3
dtype: int64


Amy    1
Bob    2
Tom    3
dtype: int64

## DataFrame

In [None]:
# Build a DataFrame from a flat list: the result has three rows and a
# single column, both labelled with default integers.
df = pd.DataFrame(data=[1, 2, 3])

# Show the DataFrame itself, then the class of the object.
print(df, type(df), sep='\n')

   0
0  1
1  2
2  3
<class 'pandas.core.frame.DataFrame'>


### 常用屬性

In [None]:
# Common DataFrame attributes, printed one per line:
#   shape  -> (3, 1)
#   size   -> 3
#   dtypes -> one dtype entry per column
print(df.shape, df.size, df.dtypes, sep='\n')

(3, 1)
3
0    int64
dtype: object


In [None]:
# Give the rows and the single column explicit labels through index= and
# columns=.
df = pd.DataFrame(
    data=[1, 2, 3],
    index=['a', 'b', 'c'],
    columns=['No'],
)
print(df)
# A bare expression on the last line of a notebook cell is displayed too.
df

   No
a   1
b   2
c   3


Unnamed: 0,No
a,1
b,2
c,3


### 二維

In [None]:
# Two-dimensional input: each inner list becomes one row, labelled by
# index=, and columns= names the three columns.
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
    ],
    index=['a', 'b'],
    columns=['A', 'B', 'C'],
)
print(df)
# A bare expression on the last line of a notebook cell is displayed too.
df

   A  B  C
a  1  2  3
b  4  5  6


Unnamed: 0,A,B,C
a,1,2,3
b,4,5,6


### 範例

In [None]:
# Example: build a DataFrame from a dict that maps each column name to the
# list of values stored in that column.
df = pd.DataFrame({
    'Name': ['Alice', 'Bob'],
    'Age': [18, 20],
})
print(df)
# A bare expression on the last line of a notebook cell is displayed too.
df

    Name  Age
0  Alice   18
1    Bob   20


Unnamed: 0,Name,Age
0,Alice,18
1,Bob,20


In [None]:
# Build a DataFrame from a list of row records: each dict is one row and
# its keys become the column names.
df = pd.DataFrame(data=[
    {'Name': 'Alice', 'Age': 18},
    {'Name': 'Bob', 'Age': 20},
])
print(df)
# A bare expression on the last line of a notebook cell is displayed too.
df

   Age   Name
0   18  Alice
1   20    Bob


Unnamed: 0,Age,Name
0,18,Alice
1,20,Bob


### DataFrame 是由 Series 組成的

In [None]:
df = pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6]],
    index=['a', 'b'],
    columns=['A', 'B', 'C'],
)
# Selecting a single column by its label yields a Series: print the
# column, then its class.
print(df['B'], type(df['B']), sep='\n')

a    2
b    5
Name: B, dtype: int64
<class 'pandas.core.series.Series'>


In [None]:
# Structural attributes of the DataFrame, printed one per line:
# dimensions, element count, row labels, column labels and the raw values.
print(
    df.shape,
    df.size,
    df.index,
    df.columns,
    df.values,
    sep='\n',
)

(2, 3)
6
Index(['a', 'b'], dtype='object')
Index(['A', 'B', 'C'], dtype='object')
[[1 2 3]
 [4 5 6]]


In [None]:
# Quick ways to inspect a DataFrame.
print(df.head())      # leading rows
print(df.tail())      # trailing rows
print(df.describe())  # per-column summary statistics
# info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()

   A  B  C
a  1  2  3
b  4  5  6
   A  B  C
a  1  2  3
b  4  5  6
             A        B        C
count  2.00000  2.00000  2.00000
mean   2.50000  3.50000  4.50000
std    2.12132  2.12132  2.12132
min    1.00000  2.00000  3.00000
25%    1.75000  2.75000  3.75000
50%    2.50000  3.50000  4.50000
75%    3.25000  4.25000  5.25000
max    4.00000  5.00000  6.00000
<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, a to b
Data columns (total 3 columns):
A    2 non-null int64
B    2 non-null int64
C    2 non-null int64
dtypes: int64(3)
memory usage: 64.0+ bytes
None


## 安裝 Package

In [None]:
!pip install openpyxl
!pip install XLRD



### CSV

In [None]:
# Read the whole CSV file into a DataFrame using the default options.
iris_data = pd.read_csv('iris.csv')
# Bare expression: the notebook renders the DataFrame as the cell output.
iris_data

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [None]:
# usecols= restricts the read to the listed columns; the remaining
# columns of the file are not loaded.
iris_data = pd.read_csv(
    'iris.csv',
    usecols=['petal length', 'petal width', 'target'],
)
iris_data

Unnamed: 0,petal length,petal width,target
0,1.4,0.2,0
1,1.4,0.2,0
2,1.3,0.2,0
3,1.5,0.2,0
4,1.4,0.2,0
...,...,...,...
145,5.2,2.3,2
146,5.0,1.9,2
147,5.2,2.0,2
148,5.4,2.3,2


In [None]:
# header=0 marks the first line of the file as the existing header, and
# names= supplies the column labels that replace it.
# NOTE(review): 'featrue' looks like a typo for 'feature'; left unchanged
# because these labels are written out to my_iris.csv by a later cell.
iris_data = pd.read_csv(
    'iris.csv',
    header=0,
    names=['featrue1', 'featrue2', 'featrue3', 'featrue4', 'target'],
)
iris_data

Unnamed: 0,featrue1,featrue2,featrue3,featrue4,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [None]:
# Write the DataFrame to a CSV file. By default the row index is written
# as the first column (pass index=False to leave it out).
iris_data.to_csv('my_iris.csv')

### EXCEL

In [None]:
# Read an Excel workbook into a DataFrame; with no sheet_name given,
# the first worksheet is loaded.
data = pd.read_excel('data.xls')
# Bare expression: the notebook renders the DataFrame as the cell output.
data

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [None]:
# sheet_name= selects which worksheet to load, here the one named 'boston'.
boston_data = pd.read_excel('data.xls', sheet_name='boston')
# Bare expression: the notebook renders the DataFrame as the cell output.
boston_data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


In [None]:
# Load only four columns of the 'boston' worksheet; header=0 takes the
# column labels from the first row of the sheet.
boston_data = pd.read_excel(
    'data.xls',
    sheet_name='boston',
    header=0,
    usecols=['TAX', 'PTRATIO', 'B', 'LSTAT'],
)
boston_data

Unnamed: 0,TAX,PTRATIO,B,LSTAT
0,296,15.3,396.90,4.98
1,242,17.8,396.90,9.14
2,242,17.8,392.83,4.03
3,222,18.7,394.63,2.94
4,222,18.7,396.90,5.33
...,...,...,...,...
501,273,21.0,391.99,9.67
502,273,21.0,396.90,9.08
503,273,21.0,396.90,5.64
504,273,21.0,393.45,6.48


In [None]:
# Write the DataFrame to an .xlsx workbook, on a worksheet named 'boston'.
boston_data.to_excel('my_boston.xlsx',sheet_name='boston')

In [None]:
# Slice off the first five rows for a quick look at the data.
boston_data[:5]

Unnamed: 0,TAX,PTRATIO,B,LSTAT
0,296,15.3,396.9,4.98
1,242,17.8,396.9,9.14
2,242,17.8,392.83,4.03
3,222,18.7,394.63,2.94
4,222,18.7,396.9,5.33


### JSON

In [None]:
import json

In [None]:
# Load a JSON file into a DataFrame.
# NOTE(review): boston.json must already exist when this cell runs, but the
# cell that writes it with to_json comes later in the notebook; confirm the
# intended execution order.
boston_data = pd.read_json('boston.json')
# Bare expression: the notebook renders the DataFrame as the cell output.
boston_data

Unnamed: 0,TAX,PTRATIO,B,LSTAT
0,296,15.3,396.90,4.98
1,242,17.8,396.90,9.14
2,242,17.8,392.83,4.03
3,222,18.7,394.63,2.94
4,222,18.7,396.90,5.33
...,...,...,...,...
501,273,21.0,391.99,9.67
502,273,21.0,396.90,9.08
503,273,21.0,396.90,5.64
504,273,21.0,393.45,6.48


In [None]:
# Serialize the DataFrame to a JSON file.
boston_data.to_json('boston.json')

### SQL

In [None]:
import sqlite3

In [None]:
# Load four columns of the 'boston' worksheet, then store them as the
# 'boston' table of a SQLite database file.
boston_data = pd.read_excel(
    'data.xls',
    sheet_name='boston',
    header=0,
    usecols=['TAX', 'PTRATIO', 'B', 'LSTAT'],
)
connection = sqlite3.connect('sql_db.sqlite')
try:
    # if_exists='replace' drops any existing 'boston' table first, so the
    # cell can be re-run without failing on the second attempt.
    boston_data.to_sql('boston', connection, if_exists='replace')
    connection.commit()
finally:
    # Close the connection even when the write fails, so the database
    # file handle is never left open.
    connection.close()

In [None]:
# Read the whole 'boston' table back from the SQLite file into a DataFrame.
connection = sqlite3.connect('sql_db.sqlite')
try:
    boston_data_sql = pd.io.sql.read_sql("select * from boston", connection)
finally:
    # Close the connection even when the query fails (e.g. missing table).
    connection.close()
# Bare expression: the notebook renders the DataFrame as the cell output.
boston_data_sql

Unnamed: 0,index,TAX,PTRATIO,B,LSTAT
0,0,296,15.3,396.90,4.98
1,1,242,17.8,396.90,9.14
2,2,242,17.8,392.83,4.03
3,3,222,18.7,394.63,2.94
4,4,222,18.7,396.90,5.33
...,...,...,...,...,...
501,501,273,21.0,391.99,9.67
502,502,273,21.0,396.90,9.08
503,503,273,21.0,396.90,5.64
504,504,273,21.0,393.45,6.48


In [None]:
# Display the DataFrame loaded from SQLite; its 'index' column is the
# DataFrame index that to_sql stored alongside the data.
boston_data_sql

Unnamed: 0,index,TAX,PTRATIO,B,LSTAT
0,0,296,15.3,396.90,4.98
1,1,242,17.8,396.90,9.14
2,2,242,17.8,392.83,4.03
3,3,222,18.7,394.63,2.94
4,4,222,18.7,396.90,5.33
...,...,...,...,...,...
501,501,273,21.0,391.99,9.67
502,502,273,21.0,396.90,9.08
503,503,273,21.0,396.90,5.64
504,504,273,21.0,393.45,6.48
