In [4]:
import pandas as pd

## pandas의 Data Structure

### 1. Series
* python의 List랑 거의 유사    
* list를 pandas의 Series로 바꿔줘야 pandas의 기능 지원을 받을 수 있다.

In [5]:
# python의 List
odd = [1,3,5,7,9]
odd

[1, 3, 5, 7, 9]

In [6]:
# pandas의 Series
pd_odd = pd.Series(odd)
pd_odd

0    1
1    3
2    5
3    7
4    9
dtype: int64

In [7]:
odd.mean()  # python에는 'mean' 기능이 없음

AttributeError: 'list' object has no attribute 'mean'

In [8]:
pd_odd.mean()  # 'mean' 기능을 쓰기 위해서는 list → Series로 바꿔줘야 함

5.0

### 2. DataFrame
* 행렬, 엑셀과 비슷
* DataFrame은 여러개의 Series로 이루어져 있다.
* Series는 vector 같고, DataFrame은 matrix 같다.

In [9]:
numbers = [
    [1,2,3],
    [4,5,6],
    [7,8,9]
]
numbers = pd.DataFrame(numbers)
numbers

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [11]:
type(numbers)

pandas.core.frame.DataFrame

In [13]:
type(numbers[0])  # DataFrame 중 하나의 Column의 type은 Series

pandas.core.series.Series

In [15]:
type(numbers.loc[0])  # DataFrame 중 하나의 row의 type은 Series

pandas.core.series.Series

### 3. Data Type

In [16]:
# int == integer == 정수형
type(1)

int

In [17]:
# float == 실수형
type(1.0)

float

In [18]:
# str == string == 문자열
type('Hello World!')

str

In [21]:
odd = [1,3,5,7,9]  # list

odd = pd.Series(odd)  # Series

odd.dtypes  # Series 안의 데이터들의 데이터 타입은?

dtype('int64')

In [22]:
odd = [1,3,5,7.0,9]  # list → 하나만 float여도 float

odd = pd.Series(odd)  # Series

odd.dtypes  # Series 안의 데이터들의 데이터 타입은?

dtype('float64')

In [23]:
odd = [1,3,5,7.0,9, 'hello']  # list → 하나만 문자열이어도 문자열

odd = pd.Series(odd)  # Series

# O == object == str == string
odd.dtypes  # Series 안의 데이터들의 데이터 타입은?

dtype('O')

### 4. NaN (Not a Number)
* NaN 끼리는 비교할 수 없다.

In [32]:
import numpy as np

odd = [1,np.nan,5,7,9]  # np.nan == NaN == 값이 비어있다  == float

odd = pd.Series(odd)

odd.dtypes

dtype('float64')

In [33]:
odd.mean()

5.5

In [39]:
odd = [1,-1,5,7,9]  # 만약 NaN 값을 -1로 처리하면?

odd = pd.Series(odd)

odd.mean()  # 통계치가 달라지기 때문에 데이터 분석 측면에서는 NaN 처리가 중요하다!

4.2

In [40]:
1 == 1

True

In [41]:
np.nan == np.nan  # NaN의 특이한 특징, NaN 끼리는 비교할 수 없다.

False

* `isnull`    
: 비어 있으면 True, 비어있지 않으면 False
* `notnull`    
: 비어 있으면 False, 비어있지 않으면 True

In [42]:
value = np.nan
pd.isnull(value)

True

In [43]:
pd.notnull(value)

False

## Data Frame

### 1. 생성하기
* 이중 list로 DataFrame 생성    
: column을 명시할 수 없어서 column값을 별도로 세팅해줘야 함

In [50]:
order = [
    ['2017-01-01', 500, 'confirmed'],
    ['2017-01-03', 700, 'confirmed'],
    ['2017-01-10', 200, 'canceled'],
]

columns = ['date', 'price', 'state']  # 개수가 맞지 않으면 에러남

order = pd.DataFrame(order, columns = columns)

order

Unnamed: 0,date,price,state
0,2017-01-01,500,confirmed
1,2017-01-03,700,confirmed
2,2017-01-10,200,canceled


* Dict를 만들고 그 안에 list 넣어서 DataFrame 생성    
: data와 columns이 같이 다녀서 편리

In [51]:
order = {
    'date' : ['2017-01-01', '2017-01-03', '2017-01-10'],
    'price' : [500, 700, 200],
    'state' : ['confiremd', 'confirmed', 'canceled']
}

order = pd.DataFrame(order)

order

Unnamed: 0,date,price,state
0,2017-01-01,500,confiremd
1,2017-01-03,700,confirmed
2,2017-01-10,200,canceled


* list를 만들고 그 안에 Dict을 넣어서 DataFrame 생성    
: 보통 데이터가 들어올 때 이 방식으로 많이 들어오기 때문에 많이 사용

In [55]:
order = [
    {'date':'2017-01-01', 'price':500, 'state':'confirmed'},
    {'date':'2017-01-03', 'price':700, 'state':'confirmed'},
    {'date':'2017-01-10', 'price':200, 'state':'canceled'},
]

order = pd.DataFrame(order)

order

Unnamed: 0,date,price,state
0,2017-01-01,500,confirmed
1,2017-01-03,700,confirmed
2,2017-01-10,200,canceled


### 2. DataFrame의 세가지 구성요소    
* column
* index : row 한 줄을 상징하고 가져오는 기준(유니크한 column을 index로 쓸 수 있음)    `index_col`, `set_index`    
* value : 값


In [87]:
order_url = 'http://bit.ly/dsa-01-order'

order = pd.read_csv(order_url)

order

Unnamed: 0,id,user_id,product_id,date,price,address,state
0,1,3,9,2017-01-01,500,Seoul,confirmed
1,2,1,7,2017-01-03,700,Seoul,confirmed
2,3,3,8,2017-01-03,900,Daejeon,confirmed
3,4,4,2,2017-01-07,500,,canceled
4,5,7,3,2017-01-09,700,Incheon,confirmed
5,6,5,7,2017-01-09,600,Busan,canceled
6,7,2,5,2017-01-10,200,,canceled


In [88]:
# 유니크한 column을 index로 사용하기
# 1. index_col
# order = pd.read_csv(order_url, index_col = 'id')

# 2. set_index
order = pd.read_csv(order_url)
order = order.set_index('id')
order

Unnamed: 0_level_0,user_id,product_id,date,price,address,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirmed
2,1,7,2017-01-03,700,Seoul,confirmed
3,3,8,2017-01-03,900,Daejeon,confirmed
4,4,2,2017-01-07,500,,canceled
5,7,3,2017-01-09,700,Incheon,confirmed
6,5,7,2017-01-09,600,Busan,canceled
7,2,5,2017-01-10,200,,canceled


In [89]:
order.index

Int64Index([1, 2, 3, 4, 5, 6, 7], dtype='int64', name='id')

In [90]:
order.columns

Index(['user_id', 'product_id', 'date', 'price', 'address', 'state'], dtype='object')

In [91]:
order.values

array([[3, 9, '2017-01-01', 500, 'Seoul', 'confirmed'],
       [1, 7, '2017-01-03', 700, 'Seoul', 'confirmed'],
       [3, 8, '2017-01-03', 900, 'Daejeon', 'confirmed'],
       [4, 2, '2017-01-07', 500, nan, 'canceled'],
       [7, 3, '2017-01-09', 700, 'Incheon', 'confirmed'],
       [5, 7, '2017-01-09', 600, 'Busan', 'canceled'],
       [2, 5, '2017-01-10', 200, nan, 'canceled']], dtype=object)

In [92]:
type(order.index)  # Int64Index == Series

pandas.core.indexes.numeric.Int64Index

In [93]:
# columns 이름 바꾸기
order.columns = ['user_id', 'product_id', 'date', 'amount', 'address', 'result']
order

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirmed
2,1,7,2017-01-03,700,Seoul,confirmed
3,3,8,2017-01-03,900,Daejeon,confirmed
4,4,2,2017-01-07,500,,canceled
5,7,3,2017-01-09,700,Incheon,confirmed
6,5,7,2017-01-09,600,Busan,canceled
7,2,5,2017-01-10,200,,canceled


In [94]:
order.head()  # 기본값은 5줄 출력

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirmed
2,1,7,2017-01-03,700,Seoul,confirmed
3,3,8,2017-01-03,900,Daejeon,confirmed
4,4,2,2017-01-07,500,,canceled
5,7,3,2017-01-09,700,Incheon,confirmed


In [95]:
order.tail()  # 기본값은 5줄 출력

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,3,8,2017-01-03,900,Daejeon,confirmed
4,4,2,2017-01-07,500,,canceled
5,7,3,2017-01-09,700,Incheon,confirmed
6,5,7,2017-01-09,600,Busan,canceled
7,2,5,2017-01-10,200,,canceled


### 3. 기본 연산

In [96]:
order['amount'].mean()

585.7142857142857

In [97]:
order['amount'].min()

200

In [98]:
order['amount'].max()

900

In [99]:
order['amount'].describe()

count      7.000000
mean     585.714286
std      219.306266
min      200.000000
25%      500.000000
50%      600.000000
75%      700.000000
max      900.000000
Name: amount, dtype: float64

* `unique`     
: 데이터의 중복 제거

In [122]:
order['result'].unique()

array(['confirm', 'cancel'], dtype=object)

* `value_counts()`    
: 각 종류별 데이터 개수    
: `normalize = True` 비율로 보여주는 옵션

In [101]:
order['result'].value_counts()

confirmed    4
canceled     3
Name: result, dtype: int64

In [102]:
order['result'].value_counts(normalize = True)

confirmed    0.571429
canceled     0.428571
Name: result, dtype: float64

* `replace`    
: 데이터 값 변경    
: 원래 값에 다시 넣어줘야 원상복귀 안됨

In [103]:
order['result'].replace('confirmed', 'confirm').replace('canceled', 'cancel')

id
1    confirm
2    confirm
3    confirm
4     cancel
5    confirm
6     cancel
7     cancel
Name: result, dtype: object

In [104]:
order  # 다시 원래 데이터로 돌아옴

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirmed
2,1,7,2017-01-03,700,Seoul,confirmed
3,3,8,2017-01-03,900,Daejeon,confirmed
4,4,2,2017-01-07,500,,canceled
5,7,3,2017-01-09,700,Incheon,confirmed
6,5,7,2017-01-09,600,Busan,canceled
7,2,5,2017-01-10,200,,canceled


In [105]:
# 바뀐 값을 order['result']에 다시 넣어줘야 원상복귀가 안됨
order['result'] = order['result'].replace('confirmed', 'confirm').replace('canceled', 'cancel')
order

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirm
2,1,7,2017-01-03,700,Seoul,confirm
3,3,8,2017-01-03,900,Daejeon,confirm
4,4,2,2017-01-07,500,,cancel
5,7,3,2017-01-09,700,Incheon,confirm
6,5,7,2017-01-09,600,Busan,cancel
7,2,5,2017-01-10,200,,cancel


* `to_datetime`    
: pandas는 object(문자열)로 인식    
: 날짜인지 구별 못해서 날짜 컬럼인 걸 지정해주면 연도/월/일을 편하게 불러올 수 있다.    
: `dt.year[month,day]`

In [107]:
order['date'].dt.year

AttributeError: Can only use .dt accessor with datetimelike values

In [108]:
order['date'] = pd.to_datetime(order['date'])  # 날짜인거 인식
order['date'].dt.year

id
1    2017
2    2017
3    2017
4    2017
5    2017
6    2017
7    2017
Name: date, dtype: int64

## 행렬 검색하기

### 1. column 가져오기

In [110]:
# column 하나 가져오기
# [ ]
order['date']

id
1   2017-01-01
2   2017-01-03
3   2017-01-03
4   2017-01-07
5   2017-01-09
6   2017-01-09
7   2017-01-10
Name: date, dtype: datetime64[ns]

In [111]:
# column 여러개 가져오기
# [ ] 안에 list를 넣기 때문에 [[ ]]
order[['user_id', 'date', 'amount']]

Unnamed: 0_level_0,user_id,date,amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,2017-01-01,500
2,1,2017-01-03,700
3,3,2017-01-03,900
4,4,2017-01-07,500
5,7,2017-01-09,700
6,5,2017-01-09,600
7,2,2017-01-10,200


In [112]:
columns = ['user_id', 'date', 'amount']  # python list
columns

['user_id', 'date', 'amount']

In [113]:
order[columns]

Unnamed: 0_level_0,user_id,date,amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,2017-01-01,500
2,1,2017-01-03,700
3,3,2017-01-03,900
4,4,2017-01-07,500
5,7,2017-01-09,700
6,5,2017-01-09,600
7,2,2017-01-10,200


In [115]:
type(order['date'])  # [ ] : Series

pandas.core.series.Series

In [116]:
type(order[['user_id', 'date', 'amount']])  # [[ ]] : DataFrame

pandas.core.frame.DataFrame

### 2. row 가져오기
* loc == locate

In [117]:
# row 하나 가져오기
order.loc[1]

user_id                         3
product_id                      9
date          2017-01-01 00:00:00
amount                        500
address                     Seoul
result                    confirm
Name: 1, dtype: object

In [118]:
# row 여러개 가져오기
order.loc[[1,3,7]]

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirm
3,3,8,2017-01-03,900,Daejeon,confirm
7,2,5,2017-01-10,200,,cancel


In [119]:
order_ids = [1,3,7]
order.loc[order_ids]

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirm
3,3,8,2017-01-03,900,Daejeon,confirm
7,2,5,2017-01-10,200,,cancel


In [120]:
type(order.loc[1])  # [ ] : Series

pandas.core.series.Series

In [121]:
type(order.loc[[1,3,7]])  # [[ ]] : DataFrame

pandas.core.frame.DataFrame

### 3. column, row 같이 가져오기
`.loc[index, column]`

In [124]:
order.loc[1]['date']  # [ ]를 몇번 열고 닫았는지가 실행 횟수, 이건 2번실행, 권장 x

Timestamp('2017-01-01 00:00:00')

In [126]:
order.loc[1, 'date']  # 위 방법보다 이 방법을 더 많이 쓴다.

Timestamp('2017-01-01 00:00:00')

In [128]:
# 실제 속도 차이
%timeit order.loc[1]['date']

222 µs ± 8.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [129]:
# 실제 속도 차이
%timeit order.loc[1, 'date']

11.5 µs ± 887 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [130]:
# 실제 속도 차이
%timeit order.at[1, 'date']  # .loc보다 속도는 빠르지만, row, column 모두 하나씩만 접근 가능

10.2 µs ± 4.51 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [132]:
# .loc vs .at

# order.at[[1,3,7], 'date']
# ValueError: At based indexing on an integer index can only have integer indexers

order.loc[[1,3,7], 'date']

id
1   2017-01-01
3   2017-01-03
7   2017-01-10
Name: date, dtype: datetime64[ns]

In [134]:
# order.at[1, ['date', 'amount', 'result']]
# TypeError: unhashable type: 'list'

order.loc[1, ['date', 'amount', 'result']]

date      2017-01-01 00:00:00
amount                    500
result                confirm
Name: 1, dtype: object

### 4. 색인

* 조건 1개 검색

In [138]:
# order['date'] == '2017-01-03'  # 이거만 하면 맞으면 True, 아니면 False
order[order['date'] == '2017-01-03']  # True인 데이터만 보여줘

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,7,2017-01-03,700,Seoul,confirm
3,3,8,2017-01-03,900,Daejeon,confirm


In [139]:
order[order['date'] != '2017-01-03']

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirm
4,4,2,2017-01-07,500,,cancel
5,7,3,2017-01-09,700,Incheon,confirm
6,5,7,2017-01-09,600,Busan,cancel
7,2,5,2017-01-10,200,,cancel


In [140]:
order[order['amount'] >= 500]

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirm
2,1,7,2017-01-03,700,Seoul,confirm
3,3,8,2017-01-03,900,Daejeon,confirm
4,4,2,2017-01-07,500,,cancel
5,7,3,2017-01-09,700,Incheon,confirm
6,5,7,2017-01-09,600,Busan,cancel


In [147]:
date_candidates = ['2017-01-01', '2017-01-05', '2017-01-09']

order[order['date'].isin(date_candidates)]  # 'date_candidates'에 해당되는 값만 나옴

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirm
5,7,3,2017-01-09,700,Incheon,confirm
6,5,7,2017-01-09,600,Busan,cancel


In [149]:
order[order['address'].isnull()]  # NaN으로 비어있으면 True

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,4,2,2017-01-07,500,,cancel
7,2,5,2017-01-10,200,,cancel


In [151]:
order[~order['address'].isnull()]  # 안비어 있으면 True, '~'은 다른데에서도 쓸 수 있음

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirm
2,1,7,2017-01-03,700,Seoul,confirm
3,3,8,2017-01-03,900,Daejeon,confirm
5,7,3,2017-01-09,700,Incheon,confirm
6,5,7,2017-01-09,600,Busan,cancel


In [152]:
order[order['address'].notnull()]  # 안비어 있으면 True

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirm
2,1,7,2017-01-03,700,Seoul,confirm
3,3,8,2017-01-03,900,Daejeon,confirm
5,7,3,2017-01-09,700,Incheon,confirm
6,5,7,2017-01-09,600,Busan,cancel


* 조건 2개 이상    
: `&` (and)   
: `|` (or)

In [160]:
# amount가 500이상, result가 confirm인 조건 검색

# python의 'and'는 pandas에서 지원 안함!!!
# order['amount'] >= 500 and order['result'] == 'confirm'

# & (pandas의 and)
# order[(order['amount'] >= 500) & (order['result'] == 'confirm')]
# 조건이 많아지면 가로로 길어지니까, 조건을 변수로 빼서 하기!

high = (order['amount'] >= 500)
confirm = (order['result'] == 'confirm')

order[high & confirm]

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirm
2,1,7,2017-01-03,700,Seoul,confirm
3,3,8,2017-01-03,900,Daejeon,confirm
5,7,3,2017-01-09,700,Incheon,confirm


In [161]:
# | (pandas의 or)
order[high | confirm]

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,9,2017-01-01,500,Seoul,confirm
2,1,7,2017-01-03,700,Seoul,confirm
3,3,8,2017-01-03,900,Daejeon,confirm
4,4,2,2017-01-07,500,,cancel
5,7,3,2017-01-09,700,Incheon,confirm
6,5,7,2017-01-09,600,Busan,cancel


* 색인한 다음에 컬럼 검색하기

In [163]:
# 색인한 다음에 
# order[order['date'] == '2017-01-09']

# amount 검색
order[order['date'] == '2017-01-09']['amount']

# but, 위의 행렬 검색했던 것처럼 [ ] 두번 열고 닫아서 좋지 않음 -> loc 사용하기

id
5    700
6    600
Name: amount, dtype: int64

`.loc[검색 조건, column]`

In [164]:
order.loc[order['date'] == '2017-01-09', 'amount']

id
5    700
6    600
Name: amount, dtype: int64

In [165]:
# 여러개 하고 싶을 때는 리스트!!!
order.loc[order['date'] == '2017-01-09', ['date', 'amount', 'result']]

Unnamed: 0_level_0,date,amount,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,2017-01-09,700,confirm
6,2017-01-09,600,cancel


### 5. 컬럼 추가 & 수정하기

In [171]:
# 컬럼 추가할 때 그 안에 있는 값이 전부 같다.
order['card-holder'] = 'KB Card'  # 추가와 수정이 같다
order

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card
2,1,7,2017-01-03,700,Seoul,confirm,KB Card
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card
4,4,2,2017-01-07,500,,cancel,KB Card
5,7,3,2017-01-09,700,Incheon,confirm,KB Card
6,5,7,2017-01-09,600,Busan,cancel,KB Card
7,2,5,2017-01-10,200,,cancel,KB Card


In [173]:
# 컬럼 추가할 때 그 안에 있는 값이 다르다.
# 개수만 맞으면 들어간다!
order['order'] = [1,2,3,4,5,6,7]
order

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3
4,4,2,2017-01-07,500,,cancel,KB Card,4
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6
7,2,5,2017-01-10,200,,cancel,KB Card,7


* True, False

In [175]:
# amount > 500 이상이면 VIP에 Ture, 아니면 False
order['VIP'] = order['amount'] >= 500
order

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order,VIP
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1,True
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2,True
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3,True
4,4,2,2017-01-07,500,,cancel,KB Card,4,True
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5,True
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6,True
7,2,5,2017-01-10,200,,cancel,KB Card,7,False


In [176]:
# amount > 500 이상이고, result가 confirm이면 VIP에 Ture, 아니면 False
order['VIP'] = (order['amount'] >= 500) & (order['result'] == 'confirm')
order

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order,VIP
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1,True
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2,True
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3,True
4,4,2,2017-01-07,500,,cancel,KB Card,4,False
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5,True
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6,False
7,2,5,2017-01-10,200,,cancel,KB Card,7,False


`.loc[검색 조건, 추가할 column] = 입력 내용` 

In [183]:
# amount > 500 이상이고, result가 confirm이면 status에 VIP, 아니면 None-VIP

order.loc[(order['amount'] >= 500) & (order['result'] == 'confirm'), 'status'] = 'VIP'
order.loc[(order['amount'] < 500) | (order['result'] == 'cancle'), 'status'] = 'None-VIP'
order

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order,VIP,status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1,True,VIP
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2,True,VIP
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3,True,VIP
4,4,2,2017-01-07,500,,cancel,KB Card,4,False,VIP
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5,True,VIP
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6,False,VIP
7,2,5,2017-01-10,200,,cancel,KB Card,7,False,None-VIP


### 6. 삭제하기
* colums 삭제하기    
`.drop(컬럼명, axis='columns)`    

In [185]:
order.drop('status', axis='columns')  # axis='columns' 컬럼삭제

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order,VIP
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1,True
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2,True
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3,True
4,4,2,2017-01-07,500,,cancel,KB Card,4,False
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5,True
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6,False
7,2,5,2017-01-10,200,,cancel,KB Card,7,False


In [186]:
# 두개 이상 지울때는 리스트!
order.drop(['status', 'VIP'], axis='columns')

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3
4,4,2,2017-01-07,500,,cancel,KB Card,4
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6
7,2,5,2017-01-10,200,,cancel,KB Card,7


In [187]:
order  # 근데 다시 order하면 다 살아있으므로 수정된 내용은 자기 자신한테 다시 넣어줘야 함

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order,VIP,status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1,True,VIP
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2,True,VIP
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3,True,VIP
4,4,2,2017-01-07,500,,cancel,KB Card,4,False,VIP
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5,True,VIP
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6,False,VIP
7,2,5,2017-01-10,200,,cancel,KB Card,7,False,None-VIP


In [189]:
order = order.drop(['status', 'VIP'], axis='columns')
order

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3
4,4,2,2017-01-07,500,,cancel,KB Card,4
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6
7,2,5,2017-01-10,200,,cancel,KB Card,7


* rows 삭제하기    
`.drop(index, axis='rows)` 

In [190]:
order.drop(3, axis='rows')

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2
4,4,2,2017-01-07,500,,cancel,KB Card,4
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6
7,2,5,2017-01-10,200,,cancel,KB Card,7


### 6. apply
* python의 for문과 비슷
* `.apply(함수명, axis='columns)`

In [194]:
def is_vip(amount):
    return amount

order['amount'].apply(is_vip)

id
1    500
2    700
3    900
4    500
5    700
6    600
7    200
Name: amount, dtype: int64

In [195]:
def is_vip(amount):
    
    # 여기서는 python 코드를 사용할 수 있다!
    if amount >= 500:
        return 'VIP'
    else:
        return 'None-VIP'

order['status'] = order['amount'].apply(is_vip)
order

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order,status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1,VIP
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2,VIP
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3,VIP
4,4,2,2017-01-07,500,,cancel,KB Card,4,VIP
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5,VIP
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6,VIP
7,2,5,2017-01-10,200,,cancel,KB Card,7,None-VIP


In [196]:
def is_vip(row):  # order의 한줄한줄을 row라는 변수로 받음
    return row

order.apply(is_vip, axis='columns')  # axis='columns'하면 row 전체를 반환

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order,status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1,VIP
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2,VIP
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3,VIP
4,4,2,2017-01-07,500,,cancel,KB Card,4,VIP
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5,VIP
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6,VIP
7,2,5,2017-01-10,200,,cancel,KB Card,7,None-VIP


In [203]:
def is_vip(row): 
    amount = row['amount']
    result = row['result']
    
    if amount >= 500 and result == 'confirm':
        return 'VIP'
    else:
        return 'None-VIP'

order['status2'] = order.apply(is_vip, axis='columns')
order

Unnamed: 0_level_0,user_id,product_id,date,amount,address,result,card-holder,order,status,status2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,9,2017-01-01,500,Seoul,confirm,KB Card,1,VIP,VIP
2,1,7,2017-01-03,700,Seoul,confirm,KB Card,2,VIP,VIP
3,3,8,2017-01-03,900,Daejeon,confirm,KB Card,3,VIP,VIP
4,4,2,2017-01-07,500,,cancel,KB Card,4,VIP,None-VIP
5,7,3,2017-01-09,700,Incheon,confirm,KB Card,5,VIP,VIP
6,5,7,2017-01-09,600,Busan,cancel,KB Card,6,VIP,None-VIP
7,2,5,2017-01-10,200,,cancel,KB Card,7,None-VIP,None-VIP


### python
점프 투 파이썬(2~4장)        
https://wikidocs.net/book/1

### pandas
10 minutes to pandas