# 1. Numpy, Pandas 기초 정리

# 1.1. Numpy

## numpy 기본 배열
- `np.array()`로 넘파이 배열을 만들 수 있다.

In [1]:
import numpy as np
v1 = np.array([1,2,3,4])
v1

array([1, 2, 3, 4])

## numpy arange
- `np.arange(start, stop, step, dtype=int or float 등)` or `np.arange(stop)`: `start`부터 `stop` 직전까지(`stop`은 포함되지 않음) `step` 간격으로 배열 생성

In [2]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [3]:
np.arange(1, 10, 2)

array([1, 3, 5, 7, 9])

In [4]:
np.arange(1, 10, 2, dtype=float)

array([1., 3., 5., 7., 9.])

## numpy 행렬 만들기
- `np.array([[1,2],[3,4]])`
- `np.array([1,2,3,4]).reshape(차원, 행, 열, order='C' or 'F')`: `C`는 행부터 채우기(기본값), `F`는 열부터 채우기

In [5]:
np.array([[1,2,3,4,5,6],[7,8,9,10,11,12]])

array([[ 1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12]])

In [6]:
np.arange(12).reshape(2,6)

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11]])

In [7]:
np.arange(12).reshape(2,6, order='F')

array([[ 0,  2,  4,  6,  8, 10],
       [ 1,  3,  5,  7,  9, 11]])

## numpy 연산
- `np.add(변수명, 변수명)`: 덧셈
- `np.subtract(변수명, 변수명)`: 뺄셈
- `np.multiply(변수명, 변수명)`: 곱셈
- `np.dot(변수명, 변수명)`: 행렬곱 (1차원 배열끼리는 내적, 아래 예시는 1*6 + 2*7 + 3*8 + 4*9 = 80)

In [8]:
v1 = np.array([1,2,3,4])
v2 = np.array([6,7,8,9])
np.add(v1, v2)

array([ 7,  9, 11, 13])

In [9]:
np.subtract(v1, v2)

array([-5, -5, -5, -5])

In [10]:
np.multiply(v1, v2)

array([ 6, 14, 24, 36])

In [11]:
np.dot(v1, v2)

80

## numpy 최대값, 최소값, 데이터 타입, 차원 구하기
- `np.amax(변수명)`: 최대값
- `np.amin(변수명)`: 최소값
- `변수명.dtype`: 데이터 타입
- `변수명.shape`: 각 차원의 크기 확인 (2차원 배열이면 (행, 열))

In [12]:
v1 = np.arange(20).reshape(2, -1)
v1

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]])

In [13]:
np.amax(v1)

19

In [14]:
v1.max()

19

In [15]:
np.amin(v1)

0

In [16]:
v1.min()

0

In [17]:
v1.dtype

dtype('int64')

In [18]:
v1.shape

(2, 10)

# 1.2. Pandas

## Series
- `index`와 `value` 형태를 가지는 pandas 자료구조
- `index`는 따로 지정하지 않으면 0,1,2,3... 순으로 자동 생성됨 (`index=` 인자로 직접 지정 가능)

In [19]:
import pandas as pd
from pandas import Series

In [20]:
a = Series([1,3,5,7])
a

0    1
1    3
2    5
3    7
dtype: int64

In [21]:
a.values

array([1, 3, 5, 7])

In [22]:
a.index

RangeIndex(start=0, stop=4, step=1)

In [23]:
a2 = pd.Series([1,3,5,7], index=['a', 'b', 'c', 'd'])
a2

a    1
b    3
c    5
d    7
dtype: int64

## DataFrame
- 2차원 행렬구조
- 엑셀 및 CSV 데이터를 처리하기에 적합한 구조

In [24]:
import pandas as pd
from pandas import DataFrame

In [25]:
df = pd.read_csv("data/EX_GrapeData.csv")
df

Unnamed: 0,continent,brand,size,period,price
0,2,2,10.7,47.65,144
1,2,3,14.0,63.13,215
2,2,2,9.0,58.76,105
3,1,1,8.0,34.88,69
4,2,2,10.0,55.53,134
...,...,...,...,...,...
58,1,1,5.0,16.66,21.5
59,2,1,21.0,43.00,
60,2,2,5.0,12.00,
61,2,3,13.0,20.00,


In [26]:
# openpyxl 설치하면 엑셀도 가능
df = pd.read_excel("data/EX_GrapeData.xlsx")
df

Unnamed: 0,continent,brand,size,period,price
0,2.0,2.0,10.7,47.65,144.0
1,2.0,3.0,14.0,63.13,215.0
2,2.0,2.0,9.0,58.76,105.0
3,1.0,1.0,8.0,34.88,69.0
4,2.0,2.0,10.0,55.53,134.0
...,...,...,...,...,...
58,1.0,1.0,5.0,16.66,21.5
59,2.0,1.0,21.0,43.00,
60,2.0,2.0,5.0,12.00,
61,2.0,3.0,13.0,20.00,


### `df.head()`로 DataFrame 앞부분 내용 확인하기

In [27]:
df.head()

Unnamed: 0,continent,brand,size,period,price
0,2.0,2.0,10.7,47.65,144.0
1,2.0,3.0,14.0,63.13,215.0
2,2.0,2.0,9.0,58.76,105.0
3,1.0,1.0,8.0,34.88,69.0
4,2.0,2.0,10.0,55.53,134.0


### `df.tail()`로 DataFrame 뒷부분 내용 확인하기

In [28]:
df.tail()

Unnamed: 0,continent,brand,size,period,price
58,1.0,1.0,5.0,16.66,21.5
59,2.0,1.0,21.0,43.0,
60,2.0,2.0,5.0,12.0,
61,2.0,3.0,13.0,20.0,
62,2.0,3.0,31.0,19.0,


## DataFrame 행 범위 다루기
- `df[시작:종료]`: 시작 지점부터 종료-1까지 가져옴, 기존 배열 slicing과 같음
- `df[:종료]`: 종료-1 까지 가져옴
- `df[시작:]`: 시작 지점부터 끝까지 가져옴

In [29]:
df[1:5]

Unnamed: 0,continent,brand,size,period,price
1,2.0,3.0,14.0,63.13,215.0
2,2.0,2.0,9.0,58.76,105.0
3,1.0,1.0,8.0,34.88,69.0
4,2.0,2.0,10.0,55.53,134.0


In [30]:
df[:3]

Unnamed: 0,continent,brand,size,period,price
0,2.0,2.0,10.7,47.65,144.0
1,2.0,3.0,14.0,63.13,215.0
2,2.0,2.0,9.0,58.76,105.0


In [31]:
df[60:]

Unnamed: 0,continent,brand,size,period,price
60,2.0,2.0,5.0,12.0,
61,2.0,3.0,13.0,20.0,
62,2.0,3.0,31.0,19.0,


### DataFrame 열 범위 다루기
- `df[['열 이름']]`
- `df[df.columns[[열 번호]]]`
- `df.loc[:, 첫 열 이름: 끝 열 이름]`: 끝 열도 포함됨

In [32]:
df[['price']]

Unnamed: 0,price
0,144.0
1,215.0
2,105.0
3,69.0
4,134.0
...,...
58,21.5
59,
60,
61,


In [33]:
# df['price'] 는 Series를 가져옴
df['price']

0     144.0
1     215.0
2     105.0
3      69.0
4     134.0
      ...  
58     21.5
59      NaN
60      NaN
61      NaN
62      NaN
Name: price, Length: 63, dtype: float64

In [34]:
print(df.columns[[0,2,4]])  # 컬럼명 확인
df[df.columns[[0,2,4]]]  # df[['continent', 'size', 'price']]

Index(['continent', 'size', 'price'], dtype='object')


Unnamed: 0,continent,size,price
0,2.0,10.7,144.0
1,2.0,14.0,215.0
2,2.0,9.0,105.0
3,1.0,8.0,69.0
4,2.0,10.0,134.0
...,...,...,...
58,1.0,5.0,21.5
59,2.0,21.0,
60,2.0,5.0,
61,2.0,13.0,


In [35]:
df.loc[:, 'size':'price'] # loc selects by label; the end label ('price') is included in the slice

Unnamed: 0,size,period,price
0,10.7,47.65,144.0
1,14.0,63.13,215.0
2,9.0,58.76,105.0
3,8.0,34.88,69.0
4,10.0,55.53,134.0
...,...,...,...
58,5.0,16.66,21.5
59,21.0,43.00,
60,5.0,12.00,
61,13.0,20.00,


In [36]:
df.iloc[:, 2:5]  # iloc selects by integer position; the end position (5) is excluded

Unnamed: 0,size,period,price
0,10.7,47.65,144.0
1,14.0,63.13,215.0
2,9.0,58.76,105.0
3,8.0,34.88,69.0
4,10.0,55.53,134.0
...,...,...,...
58,5.0,16.66,21.5
59,21.0,43.00,
60,5.0,12.00,
61,13.0,20.00,


#### 하나의 값을 지정해서 가져오기

In [37]:
df.at[5, 'price']  # 5행(6번째값)의 price

129.0

## Pandas 데이터 변환

### 복사, 추가, 삭제
- `df_new = df.copy()`: DataFrame을 복사
- `df.columns`: DataFrame의 컬럼을 Index 형태로 반환
- `df[['변수명1', '변수명2', '변수명3']]`: 특정 컬럼만 필터링하여 새로운 DataFrame 반환
- `df.rename(columns={'기존변수명': '새로운변수명'}, inplace=True)`: 기존 변수명을 새로운 변수명으로 변경, inplace는 원본을 직접 수정할지 여부
- `df['새로운변수명'] = 값`: 새로운 컬럼 생성 및 데이터 입력
- `del df['변수명']`: 특정 열을 삭제

In [38]:
df_new = df.copy()
df_new

Unnamed: 0,continent,brand,size,period,price
0,2.0,2.0,10.7,47.65,144.0
1,2.0,3.0,14.0,63.13,215.0
2,2.0,2.0,9.0,58.76,105.0
3,1.0,1.0,8.0,34.88,69.0
4,2.0,2.0,10.0,55.53,134.0
...,...,...,...,...,...
58,1.0,1.0,5.0,16.66,21.5
59,2.0,1.0,21.0,43.00,
60,2.0,2.0,5.0,12.00,
61,2.0,3.0,13.0,20.00,


In [39]:
df_new.columns

Index(['continent', 'brand', 'size', 'period', 'price'], dtype='object')

In [40]:
df_new[['brand', 'size']]

Unnamed: 0,brand,size
0,2.0,10.7
1,3.0,14.0
2,2.0,9.0
3,1.0,8.0
4,2.0,10.0
...,...,...
58,1.0,5.0
59,1.0,21.0
60,2.0,5.0
61,3.0,13.0


In [41]:
df_new.rename(columns={'period': 'time'}, inplace=True)
df_new.columns

Index(['continent', 'brand', 'size', 'time', 'price'], dtype='object')

In [42]:
df_new['growth'] = df_new['size']/df_new['time']
df_new.head()

Unnamed: 0,continent,brand,size,time,price,growth
0,2.0,2.0,10.7,47.65,144.0,0.224554
1,2.0,3.0,14.0,63.13,215.0,0.221765
2,2.0,2.0,9.0,58.76,105.0,0.153165
3,1.0,1.0,8.0,34.88,69.0,0.229358
4,2.0,2.0,10.0,55.53,134.0,0.180083


In [43]:
del df_new['growth']
df_new.head()

Unnamed: 0,continent,brand,size,time,price
0,2.0,2.0,10.7,47.65,144.0
1,2.0,3.0,14.0,63.13,215.0
2,2.0,2.0,9.0,58.76,105.0
3,1.0,1.0,8.0,34.88,69.0
4,2.0,2.0,10.0,55.53,134.0


### 조건에 의한 데이터 추출
- `AND` 조건은 `&`, `OR` 조건은 `|`로 표현

In [44]:
df[['continent', 'brand']].head()

Unnamed: 0,continent,brand
0,2.0,2.0
1,2.0,3.0
2,2.0,2.0
3,1.0,1.0
4,2.0,2.0


In [45]:
df_continent_brand = df[(df['continent']==1) & (df['brand']==1)]
df_continent_brand.head()

Unnamed: 0,continent,brand,size,period,price
3,1.0,1.0,8.0,34.88,69.0
11,1.0,1.0,10.4,17.67,54.0
12,1.0,1.0,7.4,16.41,39.0
13,1.0,1.0,5.4,12.02,29.5
16,1.0,1.0,6.0,23.21,42.0


- 특정 기준값 이상, 이하 비교 등은 `>=`, `<=`, `>`, `<` 등을 이용함

In [46]:
df_over_size_period = df[(df['size']>=10) & (df['period']>=30)]
df_over_size_period.head()

Unnamed: 0,continent,brand,size,period,price
0,2.0,2.0,10.7,47.65,144.0
1,2.0,3.0,14.0,63.13,215.0
4,2.0,2.0,10.0,55.53,134.0
5,2.0,2.0,10.5,43.14,129.0
6,2.0,2.0,16.0,54.86,155.0


### 값 변경하기
- replace 이용하기
- 함수로 변경하기

In [47]:
# 데이터 범주 확인
df['brand'].value_counts()

2.0    24
1.0    23
3.0    16
Name: brand, dtype: int64

### replace 활용하기

In [48]:
recode_brand = {"brand": {1: 1, 2: 1, 3: 2}} # 1->1, 2->1, 3->2로 치환하기 위함
df_recode1 = df.replace(recode_brand)
df_recode1.head()

Unnamed: 0,continent,brand,size,period,price
0,2.0,1.0,10.7,47.65,144.0
1,2.0,2.0,14.0,63.13,215.0
2,2.0,1.0,9.0,58.76,105.0
3,1.0,1.0,8.0,34.88,69.0
4,2.0,1.0,10.0,55.53,134.0


In [49]:
df_recode1['brand'].value_counts()

1.0    47
2.0    16
Name: brand, dtype: int64

### 함수 활용하기

In [50]:
# 함수 정의
def brand_groups(series):
    """Recode one brand code into its merged group code.

    Despite the parameter name, ``series`` is compared directly against
    scalar values, so it is expected to be a single element rather than a
    whole Series.

    Mapping: 1 -> 1, 2 -> 1, 3 -> 2. Any other value yields ``None``.
    """
    # Brands 1 and 2 collapse into group 1 (tested in that order).
    if series in (1, 2):
        return 1
    # Brand 3 becomes group 2; anything unmatched falls through to None.
    return 2 if series == 3 else None
df['re_brand'] = df['brand'].apply(brand_groups)
df.head()

Unnamed: 0,continent,brand,size,period,price,re_brand
0,2.0,2.0,10.7,47.65,144.0,1
1,2.0,3.0,14.0,63.13,215.0,2
2,2.0,2.0,9.0,58.76,105.0,1
3,1.0,1.0,8.0,34.88,69.0,1
4,2.0,2.0,10.0,55.53,134.0,1


## Numpy <--> Pandas 전환

In [51]:
import pandas as pd
df = pd.read_csv('data/EX_GrapeData.csv')
df.head()

Unnamed: 0,continent,brand,size,period,price
0,2,2,10.7,47.65,144
1,2,3,14.0,63.13,215
2,2,2,9.0,58.76,105
3,1,1,8.0,34.88,69
4,2,2,10.0,55.53,134


In [52]:
# Pandas DataFrame to Numpy
df_num = df.to_numpy()
df_num

array([[2, 2, 10.7, 47.65, '144'],
       [2, 3, 14.0, 63.13, '215'],
       [2, 2, 9.0, 58.76, '105'],
       [1, 1, 8.0, 34.88, '69'],
       [2, 2, 10.0, 55.53, '134'],
       [2, 2, 10.5, 43.14, '129'],
       [2, 2, 16.0, 54.86, '155'],
       [2, 1, 15.0, 44.14, '99'],
       [2, 1, 6.5, 17.46, '38.5'],
       [2, 1, 5.0, 21.04, '36.5'],
       [2, 2, 25.0, 109.38, '260'],
       [1, 1, 10.4, 17.67, '54'],
       [1, 1, 7.4, 16.41, '39'],
       [1, 1, 5.4, 12.02, '29.5'],
       [2, 2, 15.4, 49.48, '109'],
       [2, 1, 12.4, 48.74, '89.5'],
       [1, 1, 6.0, 23.21, '42'],
       [1, 1, 9.0, 28.64, '65'],
       [1, 3, 9.0, 44.95, '115'],
       [1, 1, 12.4, 23.77, '49.5'],
       [1, 1, 7.5, 20.21, '36.5'],
       [1, 3, 14.0, 32.62, '109'],
       [1, 1, 7.0, 17.84, '45'],
       [1, 2, 9.0, 22.82, '58'],
       [1, 2, 12.0, 29.48, '89'],
       [1, 1, 5.5, 15.61, '30'],
       [1, 2, 6.0, 13.25, '31'],
       [1, 3, 12.0, 45.78, '119'],
       [2, 1, 5.5, 26.53, '22'],
     

In [53]:
# Numpy to Pandas DataFrame
df_pd = pd.DataFrame(df_num)
df_pd.head()

Unnamed: 0,0,1,2,3,4
0,2,2,10.7,47.65,144
1,2,3,14.0,63.13,215
2,2,2,9.0,58.76,105
3,1,1,8.0,34.88,69
4,2,2,10.0,55.53,134


In [54]:
# Column names can be supplied through the `columns` argument
df_pd2 = pd.DataFrame(df_num, columns=["continent", "brand", "size", "period", "price"])
df_pd2.head()

Unnamed: 0,continent,brand,size,period,price
0,2,2,10.7,47.65,144
1,2,3,14.0,63.13,215
2,2,2,9.0,58.76,105
3,1,1,8.0,34.88,69
4,2,2,10.0,55.53,134
