# 개요

1. numpy 
2. pandas
3. torch로 vector, matrix 등 만들어보기

## 참고자료
* Official NumPy reference: https://numpy.org/doc/stable/reference/
* Stanford Univ. CS231n tutorial: https://cs231n.github.io/python-numpy-tutorial/

## 1. numpy, array, matrix 등

In [1]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings, so outdated API usage will not be reported.
warnings.filterwarnings("ignore")

In [2]:
import numpy as np

### 리스트, 행렬 선언

In [3]:
# Syntax: list_name (any variable name) = ["item-1", "item-2", ... , "item-n"]

a = [1, 2, 3]
b = ["a", "b", "c"]
# A list may hold other lists, and the inner lists may differ in length
c = [
    ["a"],
    ["a", "b"],
    ["a", "c"],
    ["a", "b", "c"],
]

# One call, one list per output line
print(a, b, c, sep="\n")

[1, 2, 3]
['a', 'b', 'c']
[['a'], ['a', 'b'], ['a', 'c'], ['a', 'b', 'c']]


In [4]:
# Add items to a list with append()
# append() adds its argument as ONE element: a string stays a string,
# and a list becomes a nested list.
c.append('new_item')
c.append(['new_item_list'])

print(c)

[['a'], ['a', 'b'], ['a', 'c'], ['a', 'b', 'c'], 'new_item', ['new_item_list']]


In [5]:
# Build NumPy arrays from Python lists
a = np.array([1, 2, 3])
b = np.array(["a", "b", "c"])
# The sublists have different lengths, so they cannot form a regular 2-D array.
# dtype=object must be given explicitly: NumPy >= 1.24 raises ValueError for
# ragged input otherwise. The result is a 1-D array holding four list objects.
c = np.array([["a"], ["a", "b"], ["a", "c"], ["a", "b", "c"]], dtype=object)

print(a)
print(b)
print(c)

[1 2 3]
['a' 'b' 'c']
[list(['a']) list(['a', 'b']) list(['a', 'c']) list(['a', 'b', 'c'])]


In [6]:
# Fill an array (or matrix) with ones; the argument is the shape (rows, columns)
print(np.ones((1, 5)))
print(np.ones((3, 2)))

[[1. 1. 1. 1. 1.]]
[[1. 1.]
 [1. 1.]
 [1. 1.]]


In [7]:
# Fill an array (or matrix) with zeros; the argument is the shape (rows, columns)
print(np.zeros((1, 5)))
print(np.zeros((3, 2)))

[[0. 0. 0. 0. 0.]]
[[0. 0.]
 [0. 0.]
 [0. 0.]]


In [8]:
# Build a sequence that increases step by step within a given range
# np.arange([start,] stop, [step, ] dtype=None)
# The stop value itself is excluded from the result.

print(np.arange(10))
# The np.float alias was removed in NumPy 1.24; the builtin float is equivalent
print(np.arange(3,7, dtype=float))
print(np.arange(3,10,2))

[0 1 2 3 4 5 6 7 8 9]
[3. 4. 5. 6.]
[3 5 7 9]


In [9]:
# Declare a matrix (2 rows x 3 columns) from nested lists
mat1 = np.array([[1,2,3],[4,5,6]])

# Build matrices from random values
# randint: integers drawn from [low, high), i.e. 1..9 here
mat2 = np.random.randint(low=1, high=10, size=(3,2))
# rand: floats drawn uniformly from [0, 1)
mat3 = np.random.rand(3,2)

print(mat1)
print(mat2)
print(mat3)

[[1 2 3]
 [4 5 6]]
[[4 8]
 [4 5]
 [3 7]]
[[0.68930371 0.42660358]
 [0.25388768 0.21712119]
 [0.26092414 0.54278774]]


### 리스트 인덱싱(Indexing) & 슬라이싱(Slicing)

In [10]:
# Indexing: positions start at 0, and negative positions count from the end
a = [1, 3, 5, 7, 9, 11]
print(*(a[position] for position in (2, 5, -1)))

5 11 11


In [11]:
# Slicing: [start:stop] with either bound optional; stop is excluded
b = [2, 4, 6, 8, 10]
# From index 2 to the end, the first two items, and a full copy, one per line
print(b[2:], b[:2], b[:], sep="\n")

[6, 8, 10]
[2, 4]
[2, 4, 6, 8, 10]


### numpy의 reshape (PyTorch의 view와 비교)

In [12]:
# The arguments are (row, column): a 6 x 3 matrix of random floats in [0, 1)
mat1 = np.random.rand(6,3)
print(mat1)

[[0.05951402 0.55543042 0.76262914]
 [0.94078109 0.31138448 0.38126506]
 [0.17211228 0.04254331 0.27131842]
 [0.7077833  0.65864431 0.1832303 ]
 [0.30508765 0.26701813 0.1302911 ]
 [0.65444581 0.1304787  0.13441195]]


In [13]:
# -1 asks NumPy to infer that dimension from the total number of elements,
# so (1, -1) gives a single row and (-1, 1) gives a single column.
print(mat1.reshape(1, -1).shape)
print(mat1.reshape(1, -1))
print("=====")
print(mat1.reshape(-1, 1).shape)
print(mat1.reshape(-1, 1))

(1, 18)
[[0.05951402 0.55543042 0.76262914 0.94078109 0.31138448 0.38126506
  0.17211228 0.04254331 0.27131842 0.7077833  0.65864431 0.1832303
  0.30508765 0.26701813 0.1302911  0.65444581 0.1304787  0.13441195]]
=====
(18, 1)
[[0.05951402]
 [0.55543042]
 [0.76262914]
 [0.94078109]
 [0.31138448]
 [0.38126506]
 [0.17211228]
 [0.04254331]
 [0.27131842]
 [0.7077833 ]
 [0.65864431]
 [0.1832303 ]
 [0.30508765]
 [0.26701813]
 [0.1302911 ]
 [0.65444581]
 [0.1304787 ]
 [0.13441195]]


In [14]:
# NOTE: the first line reports the shape of a (2, 9) reshape, while the second
# line prints the array reshaped to (9, 2); both shapes hold 18 elements.
print(mat1.reshape(2, 9).shape)
print(mat1.reshape(9, 2))

(2, 9)
[[0.05951402 0.55543042]
 [0.76262914 0.94078109]
 [0.31138448 0.38126506]
 [0.17211228 0.04254331]
 [0.27131842 0.7077833 ]
 [0.65864431 0.1832303 ]
 [0.30508765 0.26701813]
 [0.1302911  0.65444581]
 [0.1304787  0.13441195]]


In [15]:
# Expected to fail: a (2, 5) shape holds 10 elements, which does not match
# the number of elements in mat1, so reshape raises ValueError.
mat1.reshape(2,5)

ValueError: cannot reshape array of size 18 into shape (2,5)

In [17]:
# Represent the data as a tensor: a 3-D array of shape (3, 2, 3)
print(mat1.reshape(3,2,3).shape)
print(mat1.reshape(3,2,3))

(3, 2, 3)
[[[0.05951402 0.55543042 0.76262914]
  [0.94078109 0.31138448 0.38126506]]

 [[0.17211228 0.04254331 0.27131842]
  [0.7077833  0.65864431 0.1832303 ]]

 [[0.30508765 0.26701813 0.1302911 ]
  [0.65444581 0.1304787  0.13441195]]]


### matrix 또는 tensor 형태에서도 slicing이 가능합니다!

In [18]:
# The integers 0..23 arranged in rows of 4; -1 lets NumPy infer the row count
mat2 = np.arange(24).reshape(-1, 4)
print(mat2)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]]


In [19]:
# Rows before index 1 (the first row) and the first three columns;
# slicing both axes keeps the result two-dimensional.
mat2[:1,:3]

array([[0, 1, 2]])

In [20]:
# Row 3, columns 0 and 1; the integer row index drops that axis,
# so the result is one-dimensional.
mat2[3, 0:2]

array([12, 13])

### Math Arithmetic Operations(사칙연산)

In [21]:
# Two 3 x 3 integer matrices: x holds odd numbers, y the following even numbers
x = np.array([
    [1, 3, 5],
    [7, 9, 11],
    [13, 15, 17],
])
y = np.array([
    [2, 4, 6],
    [8, 10, 12],
    [14, 16, 18],
])

# Print both matrices, one after the other
print(x, y, sep="\n")

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[[ 2  4  6]
 [ 8 10 12]
 [14 16 18]]


In [22]:
# Element-wise addition
x + y

array([[ 3,  7, 11],
       [15, 19, 23],
       [27, 31, 35]])

In [23]:
# Element-wise subtraction
x - y

array([[-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1]])

In [24]:
# Element-wise multiplication (this is not a matrix product)
x * y

array([[  2,  12,  30],
       [ 56,  90, 132],
       [182, 240, 306]])

In [25]:
# Element-wise true division; the result is floating point
x / y

array([[0.5       , 0.75      , 0.83333333],
       [0.875     , 0.9       , 0.91666667],
       [0.92857143, 0.9375    , 0.94444444]])

In [26]:
# Element-wise square, written two equivalent ways
print(x ** 2)
print(np.power(x, 2))

[[  1   9  25]
 [ 49  81 121]
 [169 225 289]]
[[  1   9  25]
 [ 49  81 121]
 [169 225 289]]


In [27]:
# Matrix product of x and y (rows of x times columns of y)
np.dot(x, y)

array([[ 96, 114, 132],
       [240, 294, 348],
       [384, 474, 564]])

In [28]:
# Element-wise square root; the result is floating point
np.sqrt(x)

array([[1.        , 1.73205081, 2.23606798],
       [2.64575131, 3.        , 3.31662479],
       [3.60555128, 3.87298335, 4.12310563]])

In [29]:
# More on matrix operation
z1 = np.array([[2,2,2]])  # shape (1, 3)
z2 = np.array([[2,2]])    # shape (1, 2)

# Broadcasting: the single row of z1 is multiplied against every row of x
x * z1

array([[ 2,  6, 10],
       [14, 18, 22],
       [26, 30, 34]])

In [30]:
# Expected to fail: the shapes printed by the first two lines cannot be
# broadcast together, so the multiplication raises ValueError.
print(z2.shape)
print(x.shape)
print(x * z2)

(1, 2)
(3, 3)


ValueError: operands could not be broadcast together with shapes (3,3) (1,2) 

In [31]:
# Sum
print(x)
print(x.sum(axis = 0))  # axis=0: sum down each column
print(x.sum(axis = 1))  # axis=1: sum across each row

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[21 27 33]
[ 9 27 45]


In [32]:
# Mean
print(x)
print(x.mean(axis = 0))  # axis=0: mean of each column
print(x.mean(axis = 1))  # axis=1: mean of each row

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[ 7.  9. 11.]
[ 3.  9. 15.]


In [33]:
# Standard deviation (NumPy's default is the population form, ddof=0)
print(x)
print(x.std(axis = 0))  # axis=0: standard deviation of each column
print(x.std(axis = 1))  # axis=1: standard deviation of each row

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[4.89897949 4.89897949 4.89897949]
[1.63299316 1.63299316 1.63299316]


In [34]:
# .T is the transpose: rows and columns are swapped
print(x.T)
# Matrix product of x with the transpose of z1, which yields one column
print(np.dot(x, z1.T))

[[ 1  7 13]
 [ 3  9 15]
 [ 5 11 17]]
[[18]
 [54]
 [90]]


### Other operations

In [35]:
# A 1-D array of 15 random floats drawn uniformly from [0, 1)
xx = np.random.rand(15)
print(xx)

[0.9005869  0.90855213 0.27563754 0.73270184 0.6232615  0.31074651
 0.11984763 0.86232908 0.56457185 0.91063535 0.0366558  0.88809047
 0.70573287 0.19198639 0.3947756 ]


In [36]:
print(xx)
print(xx.argsort()) # Indices that would sort xx; with the axis argument this also works on a matrix.
# sort() reorders xx in place, in ascending order
xx.sort()
print(xx)

[0.9005869  0.90855213 0.27563754 0.73270184 0.6232615  0.31074651
 0.11984763 0.86232908 0.56457185 0.91063535 0.0366558  0.88809047
 0.70573287 0.19198639 0.3947756 ]
[10  6 13  2  5 14  8  4 12  3  7 11  0  1  9]
[0.0366558  0.11984763 0.19198639 0.27563754 0.31074651 0.3947756
 0.56457185 0.6232615  0.70573287 0.73270184 0.86232908 0.88809047
 0.9005869  0.90855213 0.91063535]


## 2. Pandas

* https://pandas.pydata.org/pandas-docs/stable/

In [37]:
import pandas as pd

### Pandas Series 만들기

* pandas series는 1차원 데이터 집합

In [38]:
# Build a Series whose index is the given string labels, paired in order with data
pd_series = pd.Series(index = ['a','b','c','d','e'], data=[1,2,3,4,5])
pd_series

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [39]:
# Basic information about the pandas Series:
# number of dimensions, shape, and total number of elements
print('차원:', pd_series.ndim)
print('형태: ', pd_series.shape)
print('총 원소의 수:', pd_series.size)

# The underlying values and the index labels
print('값:', pd_series.values)
print('인덱스:', pd_series.index)

차원: 1
형태:  (5,)
총 원소의 수: 5
값: [1 2 3 4 5]
인덱스: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


### index를 활용한 데이터 탐색

In [40]:
# loc (access by index label), iloc (integer location)

# A single label or position returns one value
print(pd_series.loc['a'])
print(pd_series.iloc[0])

# A list of labels or positions returns a Series
print(pd_series.loc[['a','c']])
print(pd_series.iloc[[0,2]])

1
1
a    1
c    3
dtype: int64
a    1
c    3
dtype: int64


### 데이터 삭제

In [41]:
# drop() returns a new Series without label 'b'; pd_series itself is unchanged
pd_series.drop('b')

a    1
c    3
d    4
e    5
dtype: int64

In [42]:
# Print before and after: inplace=True removes 'b' from pd_series itself
print(pd_series)
pd_series.drop('b', inplace=True)
print(pd_series)

a    1
b    2
c    3
d    4
e    5
dtype: int64
a    1
c    3
d    4
e    5
dtype: int64


### Pandas Dataframe 만들기

* 2차원 데이터 집합. 행렬과 비슷하게 row와 column을 갖고 있다

In [43]:
# Column name -> 15 values for that column
data = {
    'A': np.arange(15),  # the integers 0..14
    'B': np.random.randint(low=0, high=15, size=(15)),  # 15 random integers in [0, 15); (15) is just the int 15
    'C': np.random.rand(15)  # 15 random floats in [0, 1)
}

# Each dictionary key becomes a column of the DataFrame
data_df = pd.DataFrame(data)

In [44]:
# First rows of the DataFrame (5 by default)
data_df.head()

Unnamed: 0,A,B,C
0,0,0,0.323804
1,1,8,0.423098
2,2,4,0.433722
3,3,12,0.544142
4,4,0,0.130687


In [45]:
# Last rows of the DataFrame (5 by default)
data_df.tail()

Unnamed: 0,A,B,C
10,10,4,0.067978
11,11,8,0.342925
12,12,2,0.887297
13,13,14,0.750324
14,14,10,0.269248


In [46]:
# (number of rows, number of columns)
data_df.shape

(15, 3)

### Indexing and Slicing

In [47]:
# A plain slice on a DataFrame selects rows by position: rows 1 and 2 (3 is excluded)
data_df[1:3]

Unnamed: 0,A,B,C
1,1,8,0.423098
2,2,4,0.433722


In [48]:
# The row with index label 1, returned as a Series keyed by column name
data_df.loc[1]

A    1.000000
B    8.000000
C    0.423098
Name: 1, dtype: float64

In [49]:
# Select row label 1 and column 'C' in a single .loc call instead of chaining
# two lookups (.loc[1] then ['C']), which builds an intermediate row Series.
data_df.loc[1, 'C']

0.423098345797852

### Add, Remove and etc

In [50]:
# Add a boolean column D that is True where column A is at least 5
data_df['D'] = data_df['A'] >= 5
data_df

Unnamed: 0,A,B,C,D
0,0,0,0.323804,False
1,1,8,0.423098,False
2,2,4,0.433722,False
3,3,12,0.544142,False
4,4,0,0.130687,False
5,5,9,0.397488,True
6,6,13,0.26809,True
7,7,11,0.958373,True
8,8,1,0.486016,True
9,9,8,0.396873,True


In [51]:
# Remove column D (axis=1 means columns) from data_df itself (inplace=True)
data_df.drop('D', axis=1, inplace=True)

In [52]:
# Rows ordered by index label, largest first; the sorted result is returned,
# not stored back into data_df.
data_df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C
14,14,10,0.269248
13,13,14,0.750324
12,12,2,0.887297
11,11,8,0.342925
10,10,4,0.067978
9,9,8,0.396873
8,8,1,0.486016
7,7,11,0.958373
6,6,13,0.26809
5,5,9,0.397488
