# <font color="red">Module_00_建立序列與序列的基本操作</font>

## 建立序列(Series)

In [1]:
import pandas as pd
import numpy as np

# Pass a Python list as `data`.
# `int64` in the output is the dtype of the elements.
# No index is given, so the default integer index labels are used.
a = pd.Series([10, 11, 12, 13, 14])
a 

0    10
1    11
2    12
3    13
4    14
dtype: int64

In [2]:
a[3] # this lookup goes by index label

13

In [3]:
b = pd.Series(['Mike', 'Marcia', 'Mikael', 'Bleu'])
b

0      Mike
1    Marcia
2    Mikael
3      Bleu
dtype: object

In [4]:
c = pd.Series([2]*5)
c

0    2
1    2
2    2
3    2
4    2
dtype: int64

In [5]:
d = pd.Series(list('abcde'))
d

0    a
1    b
2    c
3    d
4    e
dtype: object

---

In [6]:
# Pass a dict as `data`.
# The dict keys become the index labels.
e = pd.Series({'Mike': 'Dad',
               'Marcia': 'Mom',
               'Mikael': 'Son',
               'Bleu': 'Best doggies ever'})
e

Mike                    Dad
Marcia                  Mom
Mikael                  Son
Bleu      Best doggies ever
dtype: object

---

In [7]:
# Pass a NumPy ndarray as `data`.
# NOTE(review): the dtype printed below (int32) presumably comes from NumPy's
# default integer type on the machine that produced the output — confirm; it may
# be int64 on other platforms.
f = pd.Series(np.arange(4, 9))
f

0    4
1    5
2    6
3    7
4    8
dtype: int32

In [8]:
g = pd.Series(np.linspace(0, 9, 5))
g

0    0.00
1    2.25
2    4.50
3    6.75
4    9.00
dtype: float64

In [9]:
# np.random.normal() with its default arguments draws from a normal distribution
# with mean 0 and standard deviation 1 (the bell curve).
# Probability of falling within one standard deviation: about 68%
# Probability of falling within two standard deviations: about 95%
# Probability of falling within three standard deviations: about 99.7%
# The seed is fixed first so the printed values are reproducible.
np.random.seed(12345)
h = pd.Series(np.random.normal(size = 5))
h

0   -0.204708
1    0.478943
2   -0.519439
3   -0.555730
4    1.965781
dtype: float64

---

In [10]:
# Pass a single scalar as `data`; the result is a one-element Series.
i = pd.Series(2)
i

0    2
dtype: int64

## 序列的屬性

In [11]:
a = pd.Series([1, 2, 3])
a

0    1
1    2
2    3
dtype: int64

In [12]:
a.values

array([1, 2, 3], dtype=int64)

In [13]:
type(a.values)

numpy.ndarray

In [14]:
a.index

RangeIndex(start=0, stop=3, step=1)

In [15]:
# The same number can also be obtained with the built-in len().
a.size

3

In [16]:
len(a)

3

In [17]:
a.shape

(3,)

---

In [18]:
b = pd.Series({'Mike': 'Dad',
               'Marcia': 'Mom',
               'Mikael': 'Son',
               'Bleu': 'Best doggies ever'}, name = 'family')
b

Mike                    Dad
Marcia                  Mom
Mikael                  Son
Bleu      Best doggies ever
Name: family, dtype: object

In [19]:
b.values

array(['Dad', 'Mom', 'Son', 'Best doggies ever'], dtype=object)

In [20]:
b.index

Index(['Mike', 'Marcia', 'Mikael', 'Bleu'], dtype='object')

In [21]:
b.size

4

In [22]:
b.shape

(4,)

In [23]:
b.name

'family'

In [24]:
b.index.name = 'people' # the index itself can be given a name too

In [25]:
b

people
Mike                    Dad
Marcia                  Mom
Mikael                  Son
Bleu      Best doggies ever
Name: family, dtype: object

## 在序列建立時指定索引

In [26]:
people = ['Mike', 'Marcia', 'Mikael', 'Bleu']
role = ['Dad', 'Mom', 'Son', 'Dog']
a = pd.Series(people, index = role)
a

Dad      Mike
Mom    Marcia
Son    Mikael
Dog      Bleu
dtype: object

In [27]:
a.index

Index(['Dad', 'Mom', 'Son', 'Dog'], dtype='object')

In [28]:
a['Dad']

'Mike'

## 使用 Series.head()、Series.tail() 與 Series.take() 來查詢

In [29]:
a = pd.Series(np.arange(1, 10), index = list('abcdefghi'))
a

a    1
b    2
c    3
d    4
e    5
f    6
g    7
h    8
i    9
dtype: int32

In [30]:
a.head()

a    1
b    2
c    3
d    4
e    5
dtype: int32

In [31]:
a.head(3) # 也可以寫 a.head(n = 3)

a    1
b    2
c    3
dtype: int32

In [32]:
a.tail()

e    5
f    6
g    7
h    8
i    9
dtype: int32

In [33]:
a.tail(3)

g    7
h    8
i    9
dtype: int32

In [34]:
a.take([1, 5, 8]) # note: take() selects by position, not by label

b    2
f    6
i    9
dtype: int32

## 利用 [ ] 運算子來查詢

In [35]:
a = pd.Series(np.arange(10, 15), index = list('abcde'))
a

a    10
b    11
c    12
d    13
e    14
dtype: int32

In [36]:
# Look up by index label.
# A single label returns the scalar value.
a['b']

11

In [37]:
# Several labels must be wrapped in a list.
# Looking up two or more labels returns a Series.
a[['d', 'e']] 

d    13
e    14
dtype: int32

In [38]:
# Look up by position. On a Series with non-integer labels, plain `a[0]` relies on
# the positional fallback of `[]`, which pandas has deprecated (integer keys are
# slated to always be treated as labels), so the position is requested explicitly.
a.iloc[0]

10

In [39]:
# Several positions must be wrapped in a list. `.iloc` is used instead of plain
# `a[[1, 3]]` because treating integer keys as positions in `[]` on a
# non-integer index is deprecated in pandas.
a.iloc[[1, 3]]

b    11
d    13
dtype: int32

---

In [40]:
# Integer index labels that deliberately differ from the 0-based positions.
b = pd.Series(data=[10, 20, 30, 40], index=[2, 3, 4, 5])
b

2    10
3    20
4    30
5    40
dtype: int64

In [41]:
b[2] # note: when the index labels are integers, [] looks up by label rather than by position

10

In [42]:
b[[3, 2]]

3    20
2    10
dtype: int64

## 以 .iloc[ ] 指明位置查詢

In [43]:
a = pd.Series(np.arange(10, 15), index = list('abcde'))
a

a    10
b    11
c    12
d    13
e    14
dtype: int32

In [44]:
a.iloc[1]

11

In [45]:
a.iloc[[2, 3]]

c    12
d    13
dtype: int32

In [46]:
# Position 8 does not exist in this 5-element Series, so .iloc raises IndexError.
# The exception is caught and printed so that "Run All" can continue past this cell.
try:
    a.iloc[[2, 8]]
except IndexError as exc:
    print('IndexError:', exc)

IndexError: positional indexers are out-of-bounds

In [48]:
# .iloc accepts positions only; passing a label raises TypeError.
# The exception is caught and printed so that "Run All" can continue past this cell.
try:
    a.iloc['a']
except TypeError as exc:
    print('TypeError:', exc)

TypeError: Cannot index by location index with a non-integer key

## 以 .loc[ ] 指明索引標籤查詢

In [50]:
s = pd.Series(np.arange(10, 15), index = list('abcde'))
s

a    10
b    11
c    12
d    13
e    14
dtype: int32

In [51]:
t = pd.Series([10 ,20, 30, 40], index = [2, 3, 4, 5])
t

2    10
3    20
4    30
5    40
dtype: int64

In [52]:
s.loc['b']

11

In [53]:
s.loc[['c', 'd']]

c    12
d    13
dtype: int32

In [54]:
t.loc[[5, 2]]

5    40
2    10
dtype: int64

In [55]:
# 'f' is not one of the index labels, so .loc raises KeyError.
# The exception is caught and printed so that "Run All" can continue past this cell.
try:
    s.loc[['a', 'f']]
except KeyError as exc:
    print('KeyError:', exc)

KeyError: "['f'] not in index"

In [None]:
# `s` is labelled 'a'..'e', so the integer 1 is not one of its labels and .loc
# raises KeyError (.loc does not fall back to positions).
# The exception is caught and printed so that "Run All" can continue past this cell.
try:
    s.loc[1]
except KeyError as exc:
    print('KeyError:', exc)

## 把序列切割(slicing)成子集合

In [56]:
a = pd.Series(np.arange(100, 115), index = np.arange(10, 25))
a

10    100
11    101
12    102
13    103
14    104
15    105
16    106
17    107
18    108
19    109
20    110
21    111
22    112
23    113
24    114
dtype: int32

In [57]:
# Slicing with [] works on positions (not labels), even though the labels here are integers.
# Uncomment any line below to try it.
# print(a[10:12])
# print(a[1:6])
# print(a[1:6:2])
# print(a[:5])
# print(a[4:])
# print(a[:5:2])
# print(a[4::2])
# print(a[::-1])
# print(a[4::-2])
# print(a[-4:])
# print(a[:-4])
# print(a[-4:-1])

In [58]:
# print(a.iloc[1:6])
# print(a.iloc[1:6:2])
# print(a.iloc[:5])
# print(a.iloc[4:])
# print(a.iloc[:5:2])
# print(a.iloc[4::2])
# print(a.iloc[::-1])
# print(a.iloc[4::-2])
# print(a.iloc[-4:])
# print(a.iloc[:-4])
# print(a.iloc[-4:-1])

In [59]:
# Label-based slices with .loc include the end label.
# print(a.loc[10:12])
# print(a.loc[10:18:2])
print(a.loc[:19])

10    100
11    101
12    102
13    103
14    104
15    105
16    106
17    107
18    108
19    109
dtype: int32


---

In [60]:
a = pd.Series(np.arange(100, 110), index = np.arange(10, 20))
a

10    100
11    101
12    102
13    103
14    104
15    105
16    106
17    107
18    108
19    109
dtype: int32

In [61]:
# A slice is a view of the original data, so writing through it also changes `a`.
# NOTE(review): this relies on pandas' legacy view semantics; with Copy-on-Write
# enabled the slice behaves like a copy and `a` would stay unchanged — confirm
# against the pandas version in use.
b = a[1:4] 
b

11    101
12    102
13    103
dtype: int32

In [62]:
b[11] = 0
b

11      0
12    102
13    103
dtype: int32

In [63]:
a 

10    100
11      0
12    102
13    103
14    104
15    105
16    106
17    107
18    108
19    109
dtype: int32

---

In [64]:
a = pd.Series(np.arange(100, 110), index = np.arange(10, 20))
a

10    100
11    101
12    102
13    103
14    104
15    105
16    106
17    107
18    108
19    109
dtype: int32

In [65]:
# NOTE(review): view behaviour depends on pandas' legacy (non Copy-on-Write) semantics — confirm.
b = a.iloc[1:4] # this is a slice as well, so it is also a view
b

11    101
12    102
13    103
dtype: int32

In [66]:
b[11] = 0
b

11      0
12    102
13    103
dtype: int32

In [67]:
a

10    100
11      0
12    102
13    103
14    104
15    105
16    106
17    107
18    108
19    109
dtype: int32

---

In [68]:
a = pd.Series(np.arange(100, 110), index = np.arange(10, 20))
a

10    100
11    101
12    102
13    103
14    104
15    105
16    106
17    107
18    108
19    109
dtype: int32

In [69]:
b = a.iloc[[1, 2, 3]] # selecting with a list of positions returns a copy
b

11    101
12    102
13    103
dtype: int32

In [70]:
b[11] = 0
b

11      0
12    102
13    103
dtype: int32

In [71]:
a

10    100
11    101
12    102
13    103
14    104
15    105
16    106
17    107
18    108
19    109
dtype: int32

---

In [72]:
c = pd.Series(np.arange(0,5), index = ['a', 'b', 'c', 'd', 'e'])
c

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [73]:
c[1:3]

b    1
c    2
dtype: int32

In [74]:
c['a':'c'] # slicing by index label includes the end label

a    0
b    1
c    2
dtype: int32

In [75]:
c.loc['a':'c']

a    0
b    1
c    2
dtype: int32

## 利用索引標籤實現對齊

In [76]:
a = pd.Series([1, 2], index = ['a', 'b'] )
a

a    1
b    2
dtype: int64

In [77]:
b = pd.Series([4, 3], index = ['b', 'a'] )
b

b    4
a    3
dtype: int64

In [78]:
c = a + b
c

a    4
b    6
dtype: int64

In [79]:
# Similar to broadcasting:
# conceptually the scalar is first expanded to pd.Series(2, a.index) and then multiplied by a.
d = a*2 
d

a    2
b    4
dtype: int64

---

In [80]:
e = pd.Series([5, 6], index = ['b', 'c'])
e

b    5
c    6
dtype: int64

In [81]:
a + e # labels that cannot be aligned produce NaN

a    NaN
b    7.0
c    NaN
dtype: float64

---

In [82]:
# Index labels do not have to be unique.
f = pd.Series([1.0, 2.0, 3.0], index = ['a', 'a', 'c'])
f

a    1.0
a    2.0
c    3.0
dtype: float64

In [83]:
g = pd.Series([4.0, 5.0, 6.0, 7.0], index = ['a', 'a', 'c', 'a'])
g

a    4.0
a    5.0
c    6.0
a    7.0
dtype: float64

In [84]:
# Duplicate labels are aligned as a Cartesian product:
# if f has n 'a' labels and g has m 'a' labels, the result has n*m 'a' rows.
f + g 

a    5.0
a    6.0
a    8.0
a    6.0
a    7.0
a    9.0
c    9.0
dtype: float64

## 布林選擇

In [85]:
a = pd.Series(np.arange(0, 5), index = list('abcde'))
a

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [86]:
logical_results = a >= 3
logical_results

a    False
b    False
c    False
d     True
e     True
dtype: bool

In [87]:
a[logical_results]

d    3
e    4
dtype: int32

In [88]:
a[a > 1]

c    2
d    3
e    4
dtype: int32

In [89]:
# Python's `and` / `or` need one truth value per operand, but a boolean Series
# holds one value per element, so pandas raises ValueError here.
# The exception is caught and printed so that "Run All" can continue past this cell.
try:
    a[(a >= 2) and (a < 5)]
except ValueError as exc:
    print('ValueError:', exc)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [90]:
a[(a >= 2) & (a < 5)] # use the element-wise operators & and | instead

c    2
d    3
e    4
dtype: int32

In [91]:
(a >= 0).all()

True

In [92]:
(a < 2).any()

True

In [93]:
(a < 2).sum()

2

## 將序列重新索引

In [94]:
np.random.seed(123456)
a = pd.Series(np.random.randn(5))
a

0    0.469112
1   -0.282863
2   -1.509059
3   -1.135632
4    1.212112
dtype: float64

In [95]:
# In-place change.
# Assigning to .index only relabels the rows; the values stay in their positions.
a.index = [4, 3, 2, 1, 0]
a

4    0.469112
3   -0.282863
2   -1.509059
1   -1.135632
0    1.212112
dtype: float64

In [96]:
a.index = ['a', 'b', 'c', 'd', 'e'] 
a

a    0.469112
b   -0.282863
c   -1.509059
d   -1.135632
e    1.212112
dtype: float64

---

In [97]:
# Application example: `d` has integer labels while `e` has string labels.
d = pd.Series([0, 1, 2], index = [0, 1, 2])
e = pd.Series([3, 4, 5], index = ['0', '1', '2'])
d

0    0
1    1
2    2
dtype: int64

In [98]:
e

0    3
1    4
2    5
dtype: int64

In [99]:
d + e

0   NaN
1   NaN
2   NaN
0   NaN
1   NaN
2   NaN
dtype: float64

In [100]:
# Convert the string labels of `e` to integers so that they align with `d`.
# e.index = e.index.astype('int') can be used as well.
e.index = e.index.values.astype('int')
d + e

0    3
1    5
2    7
dtype: int64

---

In [101]:
b = pd.Series(np.random.randn(4), index = ['a', 'b', 'c', 'd'])
b

a   -0.173215
b    0.119209
c   -1.044236
d   -0.861849
dtype: float64

In [102]:
c = b.reindex(['a', 'c', 'g']) # reindex() is not in-place; the result is assigned to a new name
c

a   -0.173215
c   -1.044236
g         NaN
dtype: float64

In [103]:
d = pd.Series(np.random.randn(5), index = list('abced'))
d

a   -2.104569
b   -0.494929
c    1.071804
e    0.721555
d   -0.706771
dtype: float64

In [104]:
e = d.reindex(['a', 'f'], fill_value = 0)
e

a   -2.104569
f    0.000000
dtype: float64

In [105]:
f = pd.Series(['red', 'green', 'blue'], index = [0, 3, 5])
f

0      red
3    green
5     blue
dtype: object

In [106]:
g = f.reindex(np.arange(0, 7), method = 'ffill')
g

0      red
1      red
2      red
3    green
4    green
5     blue
6     blue
dtype: object

In [107]:
h = f.reindex(np.arange(0, 7), method = 'bfill')
h

0      red
1    green
2    green
3    green
4     blue
5     blue
6      NaN
dtype: object

## 原地修改序列

In [108]:
np.random.seed(123456)
a = pd.Series(np.random.randn(3), index = ['a', 'b', 'c'])
a

a    0.469112
b   -0.282863
c   -1.509059
dtype: float64

In [109]:
a['d'] = 100 # in-place
a

a      0.469112
b     -0.282863
c     -1.509059
d    100.000000
dtype: float64

In [110]:
a['d'] = -100 # in-place
a

a      0.469112
b     -0.282863
c     -1.509059
d   -100.000000
dtype: float64

In [111]:
del a['a'] # in-place
a

b     -0.282863
c     -1.509059
d   -100.000000
dtype: float64

---

In [112]:
b = a.copy()
b

b     -0.282863
c     -1.509059
d   -100.000000
dtype: float64

In [113]:
c = b[:2] # view
c

b   -0.282863
c   -1.509059
dtype: float64

In [114]:
c['b'] = 0
c

b    0.000000
c   -1.509059
dtype: float64

In [115]:
b 

b      0.000000
c     -1.509059
d   -100.000000
dtype: float64

In [116]:
a

b     -0.282863
c     -1.509059
d   -100.000000
dtype: float64

## 綜合應用

In [117]:
dates = pd.date_range('2016-04-01', '2016-04-06')
dates

DatetimeIndex(['2016-04-01', '2016-04-02', '2016-04-03', '2016-04-04',
               '2016-04-05', '2016-04-06'],
              dtype='datetime64[ns]', freq='D')

In [118]:
temps1 = pd.Series([80, 82, 85, 90, 83, 87], index = dates)
temps1

2016-04-01    80
2016-04-02    82
2016-04-03    85
2016-04-04    90
2016-04-05    83
2016-04-06    87
Freq: D, dtype: int64

In [119]:
temps1['2016-04-04']

90

---

In [120]:
temps2 = pd.Series([70, 75, 69, 83, 79, 77], index = dates)
temps2

2016-04-01    70
2016-04-02    75
2016-04-03    69
2016-04-04    83
2016-04-05    79
2016-04-06    77
Freq: D, dtype: int64

In [121]:
# The two Series are aligned on their index labels before subtracting.
temp_diffs = temps1 - temps2
temp_diffs

2016-04-01    10
2016-04-02     7
2016-04-03    16
2016-04-04     7
2016-04-05     4
2016-04-06    10
Freq: D, dtype: int64

In [122]:
# Third element by position. The index holds dates, so plain `temp_diffs[2]` would
# rely on the deprecated positional fallback of `[]`; .iloc states the intent explicitly.
temp_diffs.iloc[2]

16

In [123]:
temp_diffs.mean()

9.0

---

In [124]:
# Mapping of state name to an integer value; the keys become the index labels.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
a = pd.Series(data=sdata)
a

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [125]:
'Ohio' in a 

True

In [126]:
'california' in a

False

In [127]:
# Building a Series from another Series with an explicit index aligns on the labels;
# labels missing from `a` come out as NaN.
states = ['california', 'Ohio', 'Oregon', 'Texas']
b = pd.Series(a, index = states)
b

california        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [128]:
pd.isnull(b) # can also be written as the method call b.isnull()

california     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [129]:
pd.notnull(b)

california    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [130]:
a + b 

Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
california         NaN
dtype: float64

In [131]:
b.name = 'population'
b

california        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [132]:
b.index.name = 'state'
b

state
california        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64