# 🚩 DataFrame에 대해서
## 주요 토픽
1. 데이터프레임 기초
2. 데이터 접근 및 제거
3. 데이터프레임 결측값
4. 데이터프레임 정렬과 필터링
5. 데이터프레임 컬럼 수정
6. 판다스 데이터 타입
7. 메모리 최적화
## 목표
- 행 필터링/정렬, 결측값 처리, 새로운 컬럼 추가, 컬럼에 함수 적용하기

In [1]:
import numpy as np
import pandas as pd

## 1. 데이터프레임이란?
- 데이터프레임의 열은 하나의 Series이다
    - 각 컬럼은 동일한 행 인덱스를 공유한다.
    - 각 컬럼명은 열 인덱스(column index)의 레이블이며, 해당 컬럼(Series)의 name 속성값과 같다.
### > 주요 속성
1. shape
2. index (행 인덱스)
3. columns
4. axes (행과 열의 인덱스)
5. dtypes (각 Series의 데이터 타입)

In [2]:
# Load the oil price table from a path relative to the working directory.
oil = pd.read_csv('./data/retail/oil.csv')
# A bare name as the last line of a notebook cell displays the DataFrame.
oil

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.20
...,...,...
1213,2017-08-25,47.65
1214,2017-08-28,46.40
1215,2017-08-29,46.46
1216,2017-08-30,45.96


In [3]:
oil.shape

(1218, 2)

In [4]:
oil.axes

[RangeIndex(start=0, stop=1218, step=1),
 Index(['date', 'dcoilwtico'], dtype='object')]

In [5]:
oil.index

RangeIndex(start=0, stop=1218, step=1)

In [6]:
oil.columns

Index(['date', 'dcoilwtico'], dtype='object')

In [7]:
# Rename both columns at once by assigning a new label list to `columns`;
# the labels are applied positionally (first label -> first column).
oil.columns = ['price_date', 'oil_price']
oil.head()

Unnamed: 0,price_date,oil_price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


### > 주요 메서드
1. head, tail
2. sample (임의의 행 데이터 추출)
3. info (데이터프레임 기본 정보)
4. describe (데이터프레임의 통계 정보; ONLY numeric columns BY DEFAULT)

In [8]:
oil.head()

Unnamed: 0,price_date,oil_price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [9]:
oil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   price_date  1218 non-null   object 
 1   oil_price   1175 non-null   float64
dtypes: float64(1), object(1)
memory usage: 19.2+ KB


In [10]:
oil.describe()

Unnamed: 0,oil_price
count,1175.0
mean,67.714366
std,25.630476
min,26.19
25%,46.405
50%,53.19
75%,95.66
max,110.62


In [11]:
oil.describe(include='all')

Unnamed: 0,price_date,oil_price
count,1218,1175.0
unique,1218,
top,2013-01-01,
freq,1,
mean,,67.714366
std,,25.630476
min,,26.19
25%,,46.405
50%,,53.19
75%,,95.66


## 2. 데이터프레임 조회
### > 단일 컬럼 조회
- 대괄호를 사용한다
### > 복수 컬럼 조회
- 대괄호를 사용한다
### > ✅ Pro Tips
1. 1개 이상의 컬럼을 조회하는 경우에는 loc 접근자를 사용하는 편이 권장된다.
    - 대괄호는 새로운 컬럼을 생성하거나 빠른 확인이 필요할 때만 사용한다.

In [12]:
oil.columns = ['date', 'price']
oil.head()

Unnamed: 0,date,price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [13]:
oil['price'].head()

0      NaN
1    93.14
2    92.97
3    93.12
4    93.20
Name: price, dtype: float64

In [14]:
type(oil['price'])

pandas.core.series.Series

In [15]:
type(oil[['price']])

pandas.core.frame.DataFrame

In [16]:
oil[['price', 'date']]

Unnamed: 0,price,date
0,,2013-01-01
1,93.14,2013-01-02
2,92.97,2013-01-03
3,93.12,2013-01-04
4,93.20,2013-01-07
...,...,...
1213,47.65,2017-08-25
1214,46.40,2017-08-28
1215,46.46,2017-08-29
1216,45.96,2017-08-30


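### > iloc 접근자
- Position-based (정수 위치 기준)
- [행, 열]
- 슬라이스의 끝 위치는 포함되지 않는다 (예: `iloc[:3]` → 0~2번째 행)
### > loc 접근자
- Label-based (레이블 기준)
- [행, 열]
- 슬라이스의 끝 레이블이 포함된다 (예: `loc[:3]` → 레이블 0~3 행)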

In [17]:
# Add a derived column by scaling `price` by 1.1 element-wise.
# NOTE(review): 1.1 looks like an illustrative USD->EUR factor — confirm.
# Rows whose `price` is NaN stay NaN in the new column.
oil['euro_price'] = oil['price'] * 1.1
oil.head()

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.2,102.52


In [18]:
oil.iloc[:3, :]

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267


In [19]:
oil.iloc[:3, -1]

0        NaN
1    102.454
2    102.267
Name: euro_price, dtype: float64

In [20]:
oil.iloc[:3, -2:]

Unnamed: 0,price,euro_price
0,,
1,93.14,102.454
2,92.97,102.267


In [21]:
oil.loc[:5, :]

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.2,102.52
5,2013-01-08,93.21,102.531


In [22]:
oil.loc[:3, 'date']

0    2013-01-01
1    2013-01-02
2    2013-01-03
3    2013-01-04
Name: date, dtype: object

In [23]:
oil.loc[:3, ['date', 'euro_price']]

Unnamed: 0,date,euro_price
0,2013-01-01,
1,2013-01-02,102.454
2,2013-01-03,102.267
3,2013-01-04,102.432


In [24]:
oil.loc[:3, ['euro_price', 'date']]

Unnamed: 0,euro_price,date
0,,2013-01-01
1,102.454,2013-01-02
2,102.267,2013-01-03
3,102.432,2013-01-04


In [25]:
oil.loc[:3, 'date':'price']

Unnamed: 0,date,price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12


## 3. 데이터프레임 삭제/중복
### > drop 메서드
- axis=0 (drop rows by label)
- axis=1 (drop columns)

In [26]:
oil.head()

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.2,102.52


In [27]:
# Build a new frame without the `price` column; `oil` itself is unchanged.
oil_euro = oil.drop(columns='price')
oil_euro.head()

Unnamed: 0,date,euro_price
0,2013-01-01,
1,2013-01-02,102.454
2,2013-01-03,102.267
3,2013-01-04,102.432
4,2013-01-07,102.52


In [28]:
# Build a new frame without the row labelled 0; the remaining rows keep
# their original index labels, so the result starts at label 1.
oil_0_off = oil.drop(index=0)
oil_0_off.head()

Unnamed: 0,date,price,euro_price
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.2,102.52
5,2013-01-08,93.21,102.531


In [29]:
oil_0_off.reset_index(drop=True)

Unnamed: 0,date,price,euro_price
0,2013-01-02,93.14,102.454
1,2013-01-03,92.97,102.267
2,2013-01-04,93.12,102.432
3,2013-01-07,93.20,102.520
4,2013-01-08,93.21,102.531
...,...,...,...
1212,2017-08-25,47.65,52.415
1213,2017-08-28,46.40,51.040
1214,2017-08-29,46.46,51.106
1215,2017-08-30,45.96,50.556


### > duplicated 메서드
- 중복된 행 검색 (중복이면 True를 리턴)
- duplicated(subset=None, keep='first')
    - subset: 중복 여부를 판단할 때 비교할 컬럼을 지정 (기본값은 모든 컬럼)
    - keep: 'first'/'last'는 첫 번째/마지막 행만 중복이 아닌 것으로 표시하고, False는 중복된 행 모두를 True로 표시
### > drop_duplicates 메서드
- 중복된 행을 제거하고 새로운 데이터프레임을 리턴

In [30]:
# Toy product table: rows 0 and 1 are identical and 'Dairy' appears three
# times, which makes it handy for demonstrating duplicate detection.
items = pd.DataFrame(
    [
        ('Dairy', 2.56),
        ('Dairy', 2.56),
        ('Dairy', 4.55),
        ('Vegetables', 2.74),
        ('Fruits', 5.44),
    ],
    columns=['product', 'price'],
)
items

Unnamed: 0,product,price
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetables,2.74
4,Fruits,5.44


In [31]:
items.duplicated()

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [32]:
items.drop_duplicates(subset='product', keep='last', ignore_index=True)

Unnamed: 0,product,price
0,Dairy,4.55
1,Vegetables,2.74
2,Fruits,5.44


In [33]:
# Append a copy of the last row to create an exact duplicate. With the
# default 0..n-1 integer index, `len(oil)` is the first unused label, so
# assigning to it via `loc` adds a new row to `oil` in place.
oil.loc[len(oil)] = oil.iloc[-1]
oil.tail()

Unnamed: 0,date,price,euro_price
1214,2017-08-28,46.4,51.04
1215,2017-08-29,46.46,51.106
1216,2017-08-30,45.96,50.556
1217,2017-08-31,47.26,51.986
1218,2017-08-31,47.26,51.986


In [34]:
oil.shape

(1219, 3)

In [35]:
# Signature: DataFrame.nunique(axis=0, dropna=True)
# Count distinct values per column; dropna=False also counts NaN as a value.
oil.nunique(dropna=False)

date          1218
price          999
euro_price     999
dtype: int64

In [36]:
oil.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1214    False
1215    False
1216    False
1217    False
1218     True
Length: 1219, dtype: bool

In [37]:
oil.duplicated(subset='price').sum()

220

In [38]:
oil.drop_duplicates(keep='last')

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.20,102.520
...,...,...,...
1213,2017-08-25,47.65,52.415
1214,2017-08-28,46.40,51.040
1215,2017-08-29,46.46,51.106
1216,2017-08-30,45.96,50.556


In [39]:
oil.drop_duplicates(keep='last', ignore_index=True)

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.20,102.520
...,...,...,...
1213,2017-08-25,47.65,52.415
1214,2017-08-28,46.40,51.040
1215,2017-08-29,46.46,51.106
1216,2017-08-30,45.96,50.556


## 4. 데이터프레임 결측값
### > isna, sum 메서드

In [40]:
# Toy table with gaps: `product` and `price` each contain two NaN entries,
# while `product_id` is fully populated.
products = pd.DataFrame(
    dict(
        product=[np.nan, 'Dairy', 'Dairy', np.nan, 'Fruits'],
        price=[2.56, np.nan, 4.55, 2.74, np.nan],
        product_id=list(range(1, 6)),
    )
)

products

Unnamed: 0,product,price,product_id
0,,2.56,1
1,Dairy,,2
2,Dairy,4.55,3
3,,2.74,4
4,Fruits,,5


In [41]:
# Count the missing values in each column
products.isna().sum()

product       2
price         2
product_id    0
dtype: int64

In [42]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   product     3 non-null      object 
 1   price       3 non-null      float64
 2   product_id  5 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes


### > dropna 메서드
- 결측값이 포함된 행을 제거한다 (subset으로 검사할 컬럼을 지정할 수 있다)
### > fillna 메서드
- 결측값을 지정한 값으로 채운다 (딕셔너리로 컬럼별 값을 지정할 수 있다)

In [43]:
products

Unnamed: 0,product,price,product_id
0,,2.56,1
1,Dairy,,2
2,Dairy,4.55,3
3,,2.74,4
4,Fruits,,5


In [44]:
products.fillna({'price': 0})

Unnamed: 0,product,price,product_id
0,,2.56,1
1,Dairy,0.0,2
2,Dairy,4.55,3
3,,2.74,4
4,Fruits,0.0,5


In [45]:
products.dropna()

Unnamed: 0,product,price,product_id
2,Dairy,4.55,3


In [46]:
# Drop only the rows whose `price` is missing; gaps in other columns are kept.
# `subset` is given as a list because that form is accepted by every pandas
# release, whereas a bare column label is only accepted by newer ones.
products.dropna(subset=['price'])

Unnamed: 0,product,price,product_id
0,,2.56,1
2,Dairy,4.55,3
3,,2.74,4


## 5. 데이터프레임 필터링
- 불리언 조건(마스크)으로 행을 선택한다. 대괄호뿐 아니라 loc 접근자에도 같은 마스크를 사용할 수 있다.

In [47]:
# Reload the raw file so `oil` has its original column names again.
oil = pd.read_csv('./data/retail/oil.csv')
# Assigning a scalar broadcasts it to every row of the new column.
oil['benchmark'] = 100
oil.head()

Unnamed: 0,date,dcoilwtico,benchmark
0,2013-01-01,,100
1,2013-01-02,93.14,100
2,2013-01-03,92.97,100
3,2013-01-04,93.12,100
4,2013-01-07,93.2,100


In [48]:
# Keep only the rows where the boolean condition is True.
# The same mask also works with the label accessor: oil.loc[oil['dcoilwtico'] > 100]
oil[oil['dcoilwtico'] > 100]

Unnamed: 0,date,dcoilwtico,benchmark
131,2013-07-03,101.92,100
133,2013-07-05,103.09,100
134,2013-07-08,103.03,100
135,2013-07-09,103.46,100
136,2013-07-10,106.41,100
...,...,...,...
407,2014-07-24,102.76,100
408,2014-07-25,105.23,100
409,2014-07-28,105.68,100
410,2014-07-29,104.91,100


In [49]:
# Combine two row-wise conditions: the price exceeds the benchmark column, and
# the first four characters of the (string) date are '2013'.
above_benchmark = oil['dcoilwtico'] > oil['benchmark']
in_2013 = oil['date'].str[:4] == '2013'
mask = above_benchmark & in_2013
oil[mask]

Unnamed: 0,date,dcoilwtico,benchmark
131,2013-07-03,101.92,100
133,2013-07-05,103.09,100
134,2013-07-08,103.03,100
135,2013-07-09,103.46,100
136,2013-07-10,106.41,100
...,...,...,...
204,2013-10-14,102.46,100
205,2013-10-15,101.15,100
206,2013-10-16,102.34,100
207,2013-10-17,100.72,100


### > ✅ Pro Tips
- query 메서드
    - 문자열 표현식으로 행을 필터링한다.
    - 표현식 안에서 파이썬 변수는 '@' 기호를 붙여 참조할 수 있다 (예: `'dcoilwtico > @limit'`).

In [50]:
# Reload with `date` parsed as datetimes instead of plain strings.
oil = pd.read_csv('./data/retail/oil.csv', parse_dates=['date'])
oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [51]:
# Filter with a query string instead of a boolean mask.
# Column names are used directly; `@name` pulls in a Python variable.
oil['benchmark'] = 100
target_year = 2013
oil.query('dcoilwtico > benchmark and date.dt.year == @target_year')

Unnamed: 0,date,dcoilwtico,benchmark
131,2013-07-03,101.92,100
133,2013-07-05,103.09,100
134,2013-07-08,103.03,100
135,2013-07-09,103.46,100
136,2013-07-10,106.41,100
...,...,...,...
204,2013-10-14,102.46,100
205,2013-10-15,101.15,100
206,2013-10-16,102.34,100
207,2013-10-17,100.72,100
