# D1：NumPy陣列定義與屬性

In [2]:
import numpy as np
a = np.arange(15).reshape(3, 5)
print(a)
# array([[ 0,  1,  2,  3,  4],
#        [ 5,  6,  7,  8,  9],
#        [10, 11, 12, 13, 14]])

print(type(a)) # <type 'numpy.ndarray'>

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]
<class 'numpy.ndarray'>


In [3]:
print(a.ndim) # 2
print(a.shape) # (3,5)
print(a.size) # 15
print(a.dtype) # int32
print(a.itemsize) # 4
print(a.data) # <memory at 0x00000296AB488828> 

2
(3, 5)
15
int32
4
<memory at 0x0000020382455908>


In [4]:
list(a)
# [array([0, 1, 2, 3, 4]), array([5, 6, 7, 8, 9]), array([10, 11, 12, 13, 14])]

[array([0, 1, 2, 3, 4]), array([5, 6, 7, 8, 9]), array([10, 11, 12, 13, 14])]

In [5]:
a.tolist()
# [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14]]

[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14]]

# D2：NumPy陣列資料型態

In [7]:
print(a.dtype == 'int32') # True
print(a.dtype is 'int32') # False
print(a.dtype is np.dtype('int32')) # True

True
False
True


In [8]:
print(a.dtype == 'int') 
print(a.dtype == np.int) 
print(a.dtype == np.dtype('int')) 

True
True
True


# D3：NumPy陣列的初始化

In [10]:
print(np.arange( 10, 30, 5 ))
# array([10, 15, 20, 25])
print(np.linspace( 0, 2, 3 ))
# array([0. 1. 2.])
print(np.logspace( 0, 2, 3 ))
# array([1. 10. 100.])

[10 15 20 25]
[0. 1. 2.]
[  1.  10. 100.]


In [19]:
from numpy.random import default_rng
rng = default_rng()

normal = rng.standard_normal((3,2)) # 3*2 array，元素值取3*2次符合Normal(mu=0,sigma=1)的值
random = rng.random((3,2)) # 3*2 array，元素值取3*2次[0,1)中的float值(均勻)
integers = rng.integers(0, 10, size=(3,2)) # 3*2 array，元素值取3*2次[0,10)中的值

print(normal)
print(random)
print(integers)

[[ 1.2361904   1.15169374]
 [ 0.08386839 -1.21398297]
 [ 0.38800389  2.1422208 ]]
[[0.34783311 0.07435547]
 [0.38373747 0.57392032]
 [0.39060422 0.59626724]]
[[3 6]
 [2 8]
 [3 1]]


In [23]:
dt = np.dtype({'names':('Name', 'num1', 'num2', 'True'), 'formats':((np.str_, 5), np.int32, int, 'U3')})
c = np.zeros(3, dtype=dt)
c

array([('', 0, 0, ''), ('', 0, 0, ''), ('', 0, 0, '')],
      dtype=[('Name', '<U5'), ('num1', '<i4'), ('num2', '<i4'), ('True', '<U3')])

In [24]:
name = ['Chloe', 'Charlotte', 'Clara']
num_1 = [11, 12, 13]
num_2 = [14, 15, 16]
check = ['Y', 'Y', 'N']

c['Name'] = name
c['num1'] = num_1
c['num2'] = num_2
c['True'] = check
print(c)

[('Chloe', 11, 14, 'Y') ('Charl', 12, 15, 'Y') ('Clara', 13, 16, 'N')]


In [25]:
c_rec = c.view(np.recarray)
c_rec

rec.array([('Chloe', 11, 14, 'Y'), ('Charl', 12, 15, 'Y'),
           ('Clara', 13, 16, 'N')],
          dtype=[('Name', '<U5'), ('num1', '<i4'), ('num2', '<i4'), ('True', '<U3')])

In [26]:
c_rec.Name

array(['Chloe', 'Charl', 'Clara'], dtype='<U5')

# D4：NumPy陣列的算數運算

In [2]:
import numpy as np

a = np.array([[1.0, 2.0], [3.0, 4.0]])
# [[1. 2.]
#  [3. 4.]]
y = np.array([[5.], [7.]])
# [[5.]]
#  [7.]]

print(a.transpose()) # 非永久改變a
# array([[1., 3.],
#        [2., 4.]])
print(np.linalg.inv(a)) # inverse of a
# array([[-2. ,  1. ],
#        [ 1.5, -0.5]])
print(np.trace(a))
# 2.0
print(np.linalg.solve(a, y))  # ax = y 的解
# array([[-3.],
#        [ 4.]])

[[1. 3.]
 [2. 4.]]
[[1. 2.]
 [3. 4.]]
[[-2.   1. ]
 [ 1.5 -0.5]]
5.0
[[-3.]
 [ 4.]]


In [3]:
a = np.array([[1.0, 2.0], [3.0, 4.0]])
# [[1. 2.]
#  [3. 4.]] 

a_inv = np.linalg.inv(a)
# array([[-2.  ,  1. ],
#   [ 1.5, -0.5]])

I = np.dot(a,a_inv)

print(I)
# [[1.0000000e+00 0.0000000e+00]
#  [8.8817842e-16 1.0000000e+00]]

print(np.around(I))
#[[1. 0.]
# [0. 1.]]
print(I.astype(np.int64))
#[[1 0]
# [0 0]]

[[1.00000000e+00 1.11022302e-16]
 [0.00000000e+00 1.00000000e+00]]
[[1. 0.]
 [0. 1.]]
[[0 0]
 [0 1]]


# D5：NumPy陣列的邏輯運算

In [5]:
import numpy as np
a = np.array( [20,30,40,50] )
b = np.arange( 4 )
# 比較運算
print(a > b) # [ True  True  True  True]
print(a < b) # [False False False False]
print(a == b) # [False False False False]
print(a != b) # [ True  True  True  True]

a = np.array( [True, True, False, False] )
b = np.array( [True, False, True, False]  )
# 邏輯運算
#print(a and b) # ValueError

# 位元運算
print(a & b) # [ True False False False]
print(a | b) # [ True  True  True False]

[ True  True  True  True]
[False False False False]
[False False False False]
[ True  True  True  True]
[ True False False False]
[ True  True  True False]


In [6]:
import numpy as np

a = np.array( [10, 20, 30, 40] )

print(a[ [True, True, True, True] ])
# [10 20 30 40]
print(a[ [True, False, True, False] ])
# [10 30]
print(a[ [False, False, False, False] ])
# []

[10 20 30 40]
[10 30]
[]


In [10]:
import numpy as np

a = np.array( [10, 20, 30, 40] )
print(a > 20) # 單純印出bool值
# [False False  True  True]
print(a[ [False, False, True, True] ]) # 將bool值與a一一結合，只印出被定義為True的element
# [30 40] 
print(a[ a > 20 ]) # 結合bool值，只印True的element
# [30 40] 

# python容器做法
b = []
for i in a:
  if i > 20:
    b.append(i)
print(b) # [30, 40]

[False False  True  True]
[30 40]
[30 40]
[30, 40]


In [11]:
import numpy as np

print(np.any([True, True, True])) # True
print(np.any([True, False, False])) # True
print(np.any([False, False, False])) # False

print(np.all([True, True, True])) # True
print(np.all([True, False, False])) # False
print(np.all([False, False, False])) # False

True
True
False
True
False
False


# D6：NumPy中常見的陣列方法與函式

In [19]:
import numpy as np 

print(np.searchsorted([1,2,3,4,5], 3))# 2
print(np.searchsorted(
    [1, 2, 5, 4, 3],
    [-10, 10, 3, 5]
))# [0, 5, 2, 5]
print(np.searchsorted(
    [1, 2, 3, 4, 5],
    [-10, 10, 3, 5]
))# [0, 5, 2, 4]

2
[0 5 2 5]
[0 5 2 4]


In [26]:
a = np.arange(10)
print(np.where(a, 1, -1)) # [-1  1  1  1  1  1  1  1  1  1]
print(np.where(a > 5, 1, -1)) # [-1 -1 -1 -1 -1 -1  1  1  1  1]
print(np.where([[True,False], [True,True]],    # 第一個True對應x,y的1和9，因為要True，所以選1，其他以此類推
			 [[1,2], [3,4]],
             [[9,8], [7,6]]))
# [[1, 8],
#  [3, 4]]

a = np.array([2, 4, 6, 8, 10])
print(np.where(a>5)) # (array([2, 3, 4], dtype=int64),)
print(a[np.where(a>5)]) # [6 8 10]

print(np.where([[0, 1], [1, 0]]))  # 會return True的是value=1的element，座標分別是(0,1),(1,0)
# (array([0, 1], dtype=int64), array([1, 0], dtype=int64))

[-1  1  1  1  1  1  1  1  1  1]
[-1 -1 -1 -1 -1 -1  1  1  1  1]
[[1 8]
 [3 4]]
(array([2, 3, 4], dtype=int64),)
[ 6  8 10]
(array([0, 1], dtype=int64), array([1, 0], dtype=int64))


In [41]:
import numpy as np 

a = np.arange(6) # [0, 1, 2, 3, 4, 5]
print(a.reshape(3, -1)) # 新形狀可模糊指定為-1，numpy會自動計算
print(a.reshape(3, 2))
# [[0 1]
#  [2 3]
#  [4 5]]
a.reshape(3,2)[0,0] = 100
print(a) # [100, 1, 2, 3, 4, 5] 本身的值也被改變了

a = np.arange(6) # [0, 1, 2, 3, 4, 5]
print(a.resize((3, 2))) # None
print(a)
# [[0 1]
#  [2 3]
#  [4 5]]
a.resize(3, 1)
print(a)
# [[0]
# [1]
# [2]]
a.resize(3, 3)
print(a)
# [[0 1 2]
# [0 0 0]
# [0 0 0]]

[[0 1]
 [2 3]
 [4 5]]
[[0 1]
 [2 3]
 [4 5]]
[100   1   2   3   4   5]
None
[[0 1]
 [2 3]
 [4 5]]
[[0]
 [1]
 [2]]
[[0 1 2]
 [0 0 0]
 [0 0 0]]


In [35]:
a = np.arange(6).reshape((3, 2))
print(a.ravel()) # [0 1 2 3 4 5]
a.ravel()[1] = 100
print(a.flatten()) # [  0 100   2   3   4   5] a被改動了
print(a.flat) # <numpy.flatiter object at 0x0000022788329B50>
print(a.flat[2]) # 2

[0 1 2 3 4 5]
[  0 100   2   3   4   5]
<numpy.flatiter object at 0x0000022788329B50>
2


# D7：NumPy陣列的索引、切片與迭代

In [1]:
import numpy as np 

a = np.arange(6).reshape(3, 2)

for d in a.flat:
    print(d)

0
1
2
3
4
5


In [6]:
Z = np.random.randint(0, 100, 1000000).reshape(100, 100, 100)
%timeit -n 10 a = 2
for i in np.nditer(Z):
    i = i+1

129 ns ± 54.4 ns per loop (mean ± std. dev. of 7 runs, 10 loops each)


# D8：Pandas物件定義與屬性

In [7]:
import pandas as pd

s = pd.Series([1, 2, 3])
print(s.shape) # (3, )
print(s.size) # 3
print(s.dtype) # int64

(3,)
3
int64


In [10]:
df = pd.DataFrame([1, 2, 3])
print(df.shape) # (3, 1)
print(df.size) # 3
print(df.dtypes) 
# 0    int64
# dtype: object
print(df.columns)
print(df.index)
print(df.values)

(3, 1)
3
0    int64
dtype: object
RangeIndex(start=0, stop=1, step=1)
RangeIndex(start=0, stop=3, step=1)
[[1]
 [2]
 [3]]


# D9：使用Pandas Dataframe的初始化

In [1]:
import pandas as pd

s = pd.Series([1,2,3])

print(s)
# 0    1
# 1    2
# 2    3
# dtype: int64

s = pd.Series([1,2,3],  index=['Amy', 'Bob', 'Tom'])
print(s)
# Amy    1
# Bob    2
# Tom    3
# dtype: int64

0    1
1    2
2    3
dtype: int64
Amy    1
Bob    2
Tom    3
dtype: int64


In [2]:
import pandas as pd

df = pd.DataFrame([1, 2, 3])
print(df)
#    0
# 0  1
# 1  2
# 2  3

df = pd.DataFrame([1, 2, 3], index=['a', 'b', 'c'], columns=['No'])
print(df)
#    No
# a   1
# b   2
# c   3

   0
0  1
1  2
2  3
   No
a   1
b   2
c   3


In [3]:
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
print(df)
#    0  1  2
# 0  1  2  3
# 1  4  5  6

df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=['a', 'b'], columns=['A', 'B', 'C'])
print(df)
#    A  B  C
# a  1  2  3
# b  4  5  6

   0  1  2
0  1  2  3
1  4  5  6
   A  B  C
a  1  2  3
b  4  5  6


# D10：Pandas Dataframe的資料選取

In [5]:
import pandas as pd

df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=['a', 'b'], columns=['A', 'B', 'C'])

print(df.loc['a', 'A']) # 1
print(df.loc['a', ['A', 'B']])
print(df.loc[['a', 'b'], 'A'])
print(df.loc[['a', 'b'], ['A', 'B']])
print(df.loc[['a', 'b']])

1
A    1
B    2
Name: a, dtype: int64
a    1
b    4
Name: A, dtype: int64
   A  B
a  1  2
b  4  5
   A  B  C
a  1  2  3
b  4  5  6


In [6]:
import pandas as pd

df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=['a', 'b'], columns=['A', 'B', 'C'])

print(df.iloc[0, 0]) # 1
print(df.iloc[0, [0, 1]])
print(df.iloc[[0, 1], 0])
print(df.iloc[[0, 1], [0, 1]])


1
A    1
B    2
Name: a, dtype: int64
a    1
b    4
Name: A, dtype: int64
   A  B
a  1  2
b  4  5


In [7]:
import pandas as pd

df = pd.DataFrame([[1, 2, 'KK'], [4, 5, 'DD']], index=['a', 'b'], columns=['A', 'B', 'C'])

print(df.ix[0, 'A']) # 1
print(df.ix['a', [0, 1]])
print(df.ix[['a', 'b'], 0])
print(df.ix[[0, 1], ['A', 'B']])
# .ix() is depreciated

AttributeError: 'DataFrame' object has no attribute 'ix'

In [14]:
print(df.at['a', 'A']) # 1
print(df.iat[0, 1]) # 2

1
2


# D11：Pandas中的算術運算特性

In [1]:
import pandas as pd

# 同col的對齊運算
df1 = pd.DataFrame([[1, 2, 3]])
df2 = pd.DataFrame([[1, 1, 1]])
print(df1 + df2)
#    0  1  2
# 0  2  3  4

   0  1  2
0  2  3  4


In [2]:
import pandas as pd

# 不同col的運算
df1 = pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c'])
df2 = pd.DataFrame([[1, 1, 1]], columns=['c', 'd', 'e'])
print(df1 + df2)
#     a   b     c   d   e
# 0 NaN NaN     4 NaN NaN

    a   b  c   d   e
0 NaN NaN  4 NaN NaN


In [3]:
import pandas as pd

df1 = pd.DataFrame([[1, 2, 3]])
print(df1 + 1)
#    0  1  2
# 0  2  3  4

   0  1  2
0  2  3  4


In [4]:
import pandas as pd

df = pd.DataFrame([[1, 2, 3]])
print(df + 1)
#    0  1  2
# 0  2  3  4
print(df + pd.DataFrame([1]))
#    0   1   2
# 0  2 NaN NaN

   0  1  2
0  2  3  4
   0   1   2
0  2 NaN NaN


In [5]:
import numpy as np

a = np.array([[1, 2, 3]])
print(a + 1)
# [[2 3 4]]
print(a + np.array([1]))
# [[2 3 4]]

[[2 3 4]]
[[2 3 4]]


In [9]:
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index = ['a','b'], columns = ['A', 'B', 'C'])
print(df > 2) #        A      B     C
              # a  False  False  True
              # b   True   True  True
print(df[df > 2]) #      A    B  C
                  # a  NaN  NaN  3
                  # b  4.0  5.0  6
print(df['A'] > 2) # a    False
                   # b     True
                   # Name: A, dtype: bool
print(df[df['A'] > 2]) #    A  B  C
                       # b  4  5  6

       A      B     C
a  False  False  True
b   True   True  True
     A    B  C
a  NaN  NaN  3
b  4.0  5.0  6
a    False
b     True
Name: A, dtype: bool
   A  B  C
b  4  5  6


In [15]:
import pandas as pd

df = pd.DataFrame({
    'col1': ['A', 'a', 'B', 'b'],
    'col2': [2, 1, 9, 8],
})
print(df)
print(df.sort_values(by=['col1'])) # df.sort_values()不改變df本身
print(df)

  col1  col2
0    A     2
1    a     1
2    B     9
3    b     8
  col1  col2
0    A     2
2    B     9
1    a     1
3    b     8
  col1  col2
0    A     2
1    a     1
2    B     9
3    b     8


In [16]:
import pandas as pd

df = pd.DataFrame({
    'col1': ['A', 'a', 'B', 'b'],
    'col2': [2, 1, 9, 8],
})
print(df)

print(df.sort_values(by=['col1', 'col2']))
print(df.sort_values(by='col2', ascending=False))
print(df)

  col1  col2
0    A     2
1    a     1
2    B     9
3    b     8
  col1  col2
0    A     2
2    B     9
1    a     1
3    b     8
  col1  col2
2    B     9
3    b     8
0    A     2
1    a     1
  col1  col2
0    A     2
1    a     1
2    B     9
3    b     8


# D12：Pandas迭代與重複操作

In [20]:
import pandas as pd

df = pd.DataFrame({
  'name': ['Alice', 'Bob'],
  'age': [20, 32]
})

for c in df:
  print(c)
print('\n')
for i in range(len(df)):
  print(df.iloc[i])

name
age


name    Alice
age        20
Name: 0, dtype: object
name    Bob
age      32
Name: 1, dtype: object


In [22]:
for d in df.iteritems():
  print(d)
# ('name', 0  Alice  1  Bob Name: name, dtype: object)
# ('age', 0  20  1  32  Name: age, dtype: int64)
for d in df.iterrows():
  print(d)
# (0, name Alice age 20 Name: 0, dtype: object)
# (1, name Bob age 32 Name: 1, dtype: object)
for d in df.itertuples():
  print(d)
# Pandas(Index=0, name='Alice', age=20)
# Pandas(Index=1, name='Bob', age=32)

('name', 0    Alice
1      Bob
Name: name, dtype: object)
('age', 0    20
1    32
Name: age, dtype: int64)


(0, name    Alice
age        20
Name: 0, dtype: object)
(1, name    Bob
age      32
Name: 1, dtype: object)


Pandas(Index=0, name='Alice', age=20)
Pandas(Index=1, name='Bob', age=32)


In [24]:
import numpy as np
import pandas as pd

df = pd.DataFrame({
  'score': [98, 67, 85],
  'age': [20, 32, 28]
})

print(df.apply(np.max))
# score    98
# age      32
# dtype: int64

print(df.apply(np.min))
# score    67
# age      20
# dtype: int64

print(df.apply(lambda x: x.max() - x.min()))
# score    31
# age      12
# dtype: int64

score    98
age      32
dtype: int64
score    67
age      20
dtype: int64
score    31
age      12
dtype: int64


In [27]:
print(df['age'].map(lambda x: -x))
# 0   -20
# 1   -32
# 2   -28
# Name: age, dtype: int64

0   -20
1   -32
2   -28
Name: age, dtype: int64


In [28]:
print(df.applymap(lambda x: -x))
#    score  age
# 0    -98  -20
# 1    -67  -32
# 2    -85  -28

   score  age
0    -98  -20
1    -67  -32
2    -85  -28


# D13：Pandas Dataframe的新增與刪除

In [2]:
import pandas as pd

df = pd.DataFrame([[1, 2]], columns = ['a', 'b'])   #    a  b
print(df)                                           # 0  1  2

df = df.append(pd.DataFrame([[3, 4]], columns = ['a', 'b'])) #    a  b
print(df)                                                    # 0  1  2
                                                             # 0  3  4

df = df.append(pd.DataFrame([[3, 4]], columns = ['a', 'b']))
df = df.reset_index(drop=True)
print(df)
#    a  b
# 0  1  2
# 1  3  4
# 2  3  4

   a  b
0  1  2
   a  b
0  1  2
0  3  4
   a  b
0  1  2
1  3  4
2  3  4


In [4]:
import pandas as pd

df = pd.DataFrame([[1, 2, 3]], columns = ['a', 'b', 'c'])  #    a  b  c
print(df)                                                  # 0  1  2  3

del df['a']
df.pop('c')
print(df) #    b
          # 0  2

   a  b  c
0  1  2  3
   b
0  2


In [5]:
import pandas as pd

df = pd.DataFrame([[1], [2]], columns = ['a']) #    a
print(df)                                      # 0  1
                                               # 1  2
df = df.drop(1) #    a
print(df)       # 0  1

   a
0  1
1  2
   a
0  1


In [6]:
one = pd.DataFrame({
    'id':[1, 2],
    'Name': ['Alex', 'Amy'],
})
two = pd.DataFrame({
    'id':[1, 2],
    'Name': ['Bob', 'Tom']
})

pd.concat([one, two]).reset_index(drop=True) # 修正index，避免重複
#   id  Name
# 0  1  Alex
# 1  2   Amy
# 2  1   Bob
# 3  2   Tom

Unnamed: 0,id,Name
0,1,Alex
1,2,Amy
2,1,Bob
3,2,Tom


In [7]:
one = pd.DataFrame({
    'id':[1, 2],
    'Name': ['Alex', 'Amy'],
})
two = pd.DataFrame({
    'id':[1, 2],
    'Score': [98, 60]
})

pd.merge(one, two, on='id')

Unnamed: 0,id,Name,Score
0,1,Alex,98
1,2,Amy,60


In [8]:
one = pd.DataFrame({
    'Name': ['Alex', 'Amy'],})
two = pd.DataFrame({
    'Score': [98, 60]})

one.join(two)
#    Name  Score
# 0  Alex     98
# 1   Amy     60

Unnamed: 0,Name,Score
0,Alex,98
1,Amy,60


In [11]:
df = pd.DataFrame({
  'A' : ['foo', 'bar', 'foo', 'bar'],
  'B' : ['one', 'one', 'two', 'three'],
  'C' : [1,2,3,4],
  'D' : [10, 20, 30, 40]
})
print(df)
#      A      B  C   D
# 0  foo    one  1  10
# 1  bar    one  2  20
# 2  foo    two  3  30
# 3  bar  three  4  40

print(df.groupby('A').sum())
# A    C   D    
# bar  6  60
# foo  4  40
print(df.groupby('A').agg(sum))
# A    C   D    
# bar  6  60
# foo  4  40
print(df.groupby(['A','B']).sum())
# A   B      C   D     
# bar one    2  20
#     three  4  40
# foo one    1  10
#     two    3  30

     A      B  C   D
0  foo    one  1  10
1  bar    one  2  20
2  foo    two  3  30
3  bar  three  4  40
     C   D
A         
bar  6  60
foo  4  40
     C   D
A         
bar  6  60
foo  4  40
           C   D
A   B           
bar one    2  20
    three  4  40
foo one    1  10
    two    3  30
