# 🚩 DataFrame에 대해서
## 주요 토픽
1. 데이터프레임 기초
2. 데이터 접근 및 제거
3. 데이터프레임 결측값
4. 데이터프레임 정렬과 필터링
5. 데이터프레임 컬럼 수정
6. 판다스 데이터 타입
7. 메모리 최적화
## 목표
- 행 필터링/정렬, 결측값 처리, 새로운 컬럼 추가, 컬럼에 함수 적용하기
## 목차
1. 데이터프레임이란?
2. 데이터프레임 조회
3. 데이터프레임 삭제/중복
4. 데이터프레임 결측값
5. 데이터프레임 필터링
6. 데이터프레임 정렬
7. 데이터프레임 컬럼 수정
8. 데이터프레임 컬럼 추가
9. map 메서드
10. assign 메서드
11. 데이터 타입

In [1]:
import numpy as np
import pandas as pd

## 1. 데이터프레임이란?
- 데이터프레임의 열은 하나의 Series이다
    - 각 컬럼은 동일한 행 인덱스를 공유한다.
    - 각 컬럼명은 column index의 레이블이며, 해당 Series의 name 속성값과 같다.
### > 주요 속성
1. shape
2. index (행 인덱스)
3. columns
4. axes (행과 열의 인덱스)
5. dtypes (각 Series의 데이터 타입)

In [2]:
oil = pd.read_csv('./data/retail/oil.csv')
oil

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.20
...,...,...
1213,2017-08-25,47.65
1214,2017-08-28,46.40
1215,2017-08-29,46.46
1216,2017-08-30,45.96


In [3]:
oil.shape

(1218, 2)

In [4]:
oil.axes

[RangeIndex(start=0, stop=1218, step=1),
 Index(['date', 'dcoilwtico'], dtype='object')]

In [5]:
oil.index

RangeIndex(start=0, stop=1218, step=1)

In [6]:
oil.columns

Index(['date', 'dcoilwtico'], dtype='object')

In [7]:
oil.columns = ['price_date', 'oil_price']
oil.head()

Unnamed: 0,price_date,oil_price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


### > 주요 메서드
1. head, tail
2. sample (임의의 행 데이터 추출)
3. info (데이터프레임 기본 정보)
4. describe (데이터프레임의 통계 정보; ONLY numeric columns BY DEFAULT)

In [8]:
oil.head()

Unnamed: 0,price_date,oil_price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [9]:
oil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   price_date  1218 non-null   object 
 1   oil_price   1175 non-null   float64
dtypes: float64(1), object(1)
memory usage: 19.2+ KB


In [10]:
oil.describe()

Unnamed: 0,oil_price
count,1175.0
mean,67.714366
std,25.630476
min,26.19
25%,46.405
50%,53.19
75%,95.66
max,110.62


In [11]:
oil.describe(include='all')

Unnamed: 0,price_date,oil_price
count,1218,1175.0
unique,1218,
top,2013-01-01,
freq,1,
mean,,67.714366
std,,25.630476
min,,26.19
25%,,46.405
50%,,53.19
75%,,95.66


## 2. 데이터프레임 조회
### > 단일 컬럼 조회
- 대괄호를 사용한다
### > 복수 컬럼 조회
- 대괄호를 사용한다
### > ✅ Pro Tips
1. 1개 이상의 컬럼을 조회하는 경우에는 loc 접근자를 사용하는 편이 권장된다.
    - 대괄호는 새로운 컬럼을 생성하거나 빠른 확인이 필요할 때만 사용한다.

In [12]:
oil.columns = ['date', 'price']
oil.head()

Unnamed: 0,date,price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [13]:
oil['price'].head()

0      NaN
1    93.14
2    92.97
3    93.12
4    93.20
Name: price, dtype: float64

In [14]:
type(oil['price'])

pandas.core.series.Series

In [15]:
type(oil[['price']])

pandas.core.frame.DataFrame

In [16]:
oil[['price', 'date']]

Unnamed: 0,price,date
0,,2013-01-01
1,93.14,2013-01-02
2,92.97,2013-01-03
3,93.12,2013-01-04
4,93.20,2013-01-07
...,...,...
1213,47.65,2017-08-25
1214,46.40,2017-08-28
1215,46.46,2017-08-29
1216,45.96,2017-08-30


### > iloc 접근자
- Position-based
- [행, 열]
### > loc 접근자
- Label-based
- [행, 열]

In [17]:
oil['euro_price'] = oil['price'] * 1.1
oil.head()

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.2,102.52


In [18]:
oil.iloc[:3, :]

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267


In [19]:
oil.iloc[:3, -1]

0        NaN
1    102.454
2    102.267
Name: euro_price, dtype: float64

In [20]:
oil.iloc[:3, -2:]

Unnamed: 0,price,euro_price
0,,
1,93.14,102.454
2,92.97,102.267


In [21]:
oil.loc[:5, :]

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.2,102.52
5,2013-01-08,93.21,102.531


In [22]:
oil.loc[:3, 'date']

0    2013-01-01
1    2013-01-02
2    2013-01-03
3    2013-01-04
Name: date, dtype: object

In [23]:
oil.loc[:3, ['date', 'euro_price']]

Unnamed: 0,date,euro_price
0,2013-01-01,
1,2013-01-02,102.454
2,2013-01-03,102.267
3,2013-01-04,102.432


In [24]:
oil.loc[:3, ['euro_price', 'date']]

Unnamed: 0,euro_price,date
0,,2013-01-01
1,102.454,2013-01-02
2,102.267,2013-01-03
3,102.432,2013-01-04


In [25]:
oil.loc[:3, 'date':'price']

Unnamed: 0,date,price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12


## 3. 데이터프레임 삭제/중복
### > drop 메서드
- axis=0 (drop rows by label)
- axis=1 (drop columns)

In [26]:
oil.head()

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.2,102.52


In [27]:
oil_euro = oil.drop('price', axis=1)
oil_euro.head()

Unnamed: 0,date,euro_price
0,2013-01-01,
1,2013-01-02,102.454
2,2013-01-03,102.267
3,2013-01-04,102.432
4,2013-01-07,102.52


In [28]:
oil_0_off = oil.drop(0, axis=0)
oil_0_off.head()

Unnamed: 0,date,price,euro_price
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.2,102.52
5,2013-01-08,93.21,102.531


In [29]:
oil_0_off.reset_index(drop=True)

Unnamed: 0,date,price,euro_price
0,2013-01-02,93.14,102.454
1,2013-01-03,92.97,102.267
2,2013-01-04,93.12,102.432
3,2013-01-07,93.20,102.520
4,2013-01-08,93.21,102.531
...,...,...,...
1212,2017-08-25,47.65,52.415
1213,2017-08-28,46.40,51.040
1214,2017-08-29,46.46,51.106
1215,2017-08-30,45.96,50.556


### > duplicated 메서드
- 중복된 행 검색 (중복이면 True를 리턴)
- duplicated(subset=None, keep='first')
    - subset (specifies columns)
### > drop_duplicates 메서드
- 중복된 행을 제거하고 새로운 데이터프레임을 리턴

In [30]:
# Toy frame for the duplicate demos: rows 0 and 1 are identical, and the
# first three rows all share the same product.
item_rows = [
    ('Dairy', 2.56),
    ('Dairy', 2.56),
    ('Dairy', 4.55),
    ('Vegetables', 2.74),
    ('Fruits', 5.44),
]
items = pd.DataFrame(item_rows, columns=['product', 'price'])
items

Unnamed: 0,product,price
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetables,2.74
4,Fruits,5.44


In [31]:
items.duplicated()

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [32]:
items.drop_duplicates(subset='product', keep='last', ignore_index=True)

Unnamed: 0,product,price
0,Dairy,4.55
1,Vegetables,2.74
2,Fruits,5.44


In [33]:
oil.loc[len(oil)] = oil.iloc[-1]
oil.tail()

Unnamed: 0,date,price,euro_price
1214,2017-08-28,46.4,51.04
1215,2017-08-29,46.46,51.106
1216,2017-08-30,45.96,50.556
1217,2017-08-31,47.26,51.986
1218,2017-08-31,47.26,51.986


In [34]:
oil.shape

(1219, 3)

In [35]:
# DataFrame.nunique(axis=0, dropna=True)
oil.nunique(dropna=False)

date          1218
price          999
euro_price     999
dtype: int64

In [36]:
oil.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1214    False
1215    False
1216    False
1217    False
1218     True
Length: 1219, dtype: bool

In [37]:
oil.duplicated(subset='price').sum()

220

In [38]:
oil.drop_duplicates(keep='last')

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.20,102.520
...,...,...,...
1213,2017-08-25,47.65,52.415
1214,2017-08-28,46.40,51.040
1215,2017-08-29,46.46,51.106
1216,2017-08-30,45.96,50.556


In [39]:
oil.drop_duplicates(keep='last', ignore_index=True)

Unnamed: 0,date,price,euro_price
0,2013-01-01,,
1,2013-01-02,93.14,102.454
2,2013-01-03,92.97,102.267
3,2013-01-04,93.12,102.432
4,2013-01-07,93.20,102.520
...,...,...,...
1213,2017-08-25,47.65,52.415
1214,2017-08-28,46.40,51.040
1215,2017-08-29,46.46,51.106
1216,2017-08-30,45.96,50.556


## 4. 데이터프레임 결측값
### > isna, sum 메서드

In [40]:
# Toy frame with gaps in two columns for the missing-value demos;
# product_id is the only fully populated column.
missing = np.nan
products = pd.DataFrame(
    data={
        'product': [missing, 'Dairy', 'Dairy', missing, 'Fruits'],
        'price': [2.56, missing, 4.55, 2.74, missing],
        'product_id': list(range(1, 6)),
    }
)

products

Unnamed: 0,product,price,product_id
0,,2.56,1
1,Dairy,,2
2,Dairy,4.55,3
3,,2.74,4
4,Fruits,,5


In [41]:
# 결측값 개수 카운팅
products.isna().sum()

product       2
price         2
product_id    0
dtype: int64

In [42]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   product     3 non-null      object 
 1   price       3 non-null      float64
 2   product_id  5 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes


### > dropna 메서드
### > fillna 메서드

In [43]:
products

Unnamed: 0,product,price,product_id
0,,2.56,1
1,Dairy,,2
2,Dairy,4.55,3
3,,2.74,4
4,Fruits,,5


In [44]:
products.fillna({'price': 0})

Unnamed: 0,product,price,product_id
0,,2.56,1
1,Dairy,0.0,2
2,Dairy,4.55,3
3,,2.74,4
4,Fruits,0.0,5


In [45]:
products.dropna()

Unnamed: 0,product,price,product_id
2,Dairy,4.55,3


In [46]:
products.dropna(subset='price')

Unnamed: 0,product,price,product_id
0,,2.56,1
2,Dairy,4.55,3
3,,2.74,4


## 5. 데이터프레임 필터링
- loc 접근자 사용 가능

In [47]:
oil = pd.read_csv('./data/retail/oil.csv')
oil['benchmark'] = 100
oil.head()

Unnamed: 0,date,dcoilwtico,benchmark
0,2013-01-01,,100
1,2013-01-02,93.14,100
2,2013-01-03,92.97,100
3,2013-01-04,93.12,100
4,2013-01-07,93.2,100


In [48]:
oil[oil['dcoilwtico'] > 100]

Unnamed: 0,date,dcoilwtico,benchmark
131,2013-07-03,101.92,100
133,2013-07-05,103.09,100
134,2013-07-08,103.03,100
135,2013-07-09,103.46,100
136,2013-07-10,106.41,100
...,...,...,...
407,2014-07-24,102.76,100
408,2014-07-25,105.23,100
409,2014-07-28,105.68,100
410,2014-07-29,104.91,100


In [49]:
# Row filter: price above the benchmark AND a date string starting with 2013.
# `date` is still plain text here, so the year is its first four characters.
above_benchmark = oil['dcoilwtico'].gt(oil['benchmark'])
in_2013 = oil['date'].str[:4].eq('2013')
mask = above_benchmark & in_2013
oil[mask]

Unnamed: 0,date,dcoilwtico,benchmark
131,2013-07-03,101.92,100
133,2013-07-05,103.09,100
134,2013-07-08,103.03,100
135,2013-07-09,103.46,100
136,2013-07-10,106.41,100
...,...,...,...
204,2013-10-14,102.46,100
205,2013-10-15,101.15,100
206,2013-10-16,102.34,100
207,2013-10-17,100.72,100


### > ✅ Pro Tips
- query 메서드
    - 변수는 '@' 기호로 접근이 가능

In [50]:
oil = pd.read_csv('./data/retail/oil.csv', parse_dates=['date'])
oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [51]:
oil['benchmark'] = 100
oil.query(
    'dcoilwtico > benchmark and date.dt.year == 2013' 
)

Unnamed: 0,date,dcoilwtico,benchmark
131,2013-07-03,101.92,100
133,2013-07-05,103.09,100
134,2013-07-08,103.03,100
135,2013-07-09,103.46,100
136,2013-07-10,106.41,100
...,...,...,...
204,2013-10-14,102.46,100
205,2013-10-15,101.15,100
206,2013-10-16,102.34,100
207,2013-10-17,100.72,100


## 6. 데이터프레임 정렬
### > sort_index 메서드
- axis=0 (row index 정렬)
- axis=1 (column name 정렬)
### > sort_values 메서드
- 복수 컬럼 정렬 (대괄호 사용)

In [52]:
oil.head()

Unnamed: 0,date,dcoilwtico,benchmark
0,2013-01-01,,100
1,2013-01-02,93.14,100
2,2013-01-03,92.97,100
3,2013-01-04,93.12,100
4,2013-01-07,93.2,100


In [53]:
oil.sort_index(ascending=False)

Unnamed: 0,date,dcoilwtico,benchmark
1217,2017-08-31,47.26,100
1216,2017-08-30,45.96,100
1215,2017-08-29,46.46,100
1214,2017-08-28,46.40,100
1213,2017-08-25,47.65,100
...,...,...,...
4,2013-01-07,93.20,100
3,2013-01-04,93.12,100
2,2013-01-03,92.97,100
1,2013-01-02,93.14,100


In [54]:
oil.sort_values('dcoilwtico', ascending=False)

Unnamed: 0,date,dcoilwtico,benchmark
178,2013-09-06,110.62,100
171,2013-08-28,110.17,100
179,2013-09-09,109.62,100
170,2013-08-27,109.11,100
182,2013-09-12,108.72,100
...,...,...,...
1079,2017-02-20,,100
1118,2017-04-14,,100
1149,2017-05-29,,100
1174,2017-07-03,,100


In [55]:
oil['month'] = oil['date'].astype('datetime64[ns]').dt.month
oil.sort_values(['month', 'dcoilwtico'], ascending=[True, False])

Unnamed: 0,date,dcoilwtico,benchmark,month
282,2014-01-30,98.25,100,1
21,2013-01-30,97.98,100,1
22,2013-01-31,97.65,100,1
20,2013-01-29,97.62,100,1
283,2014-01-31,97.55,100,1
...,...,...,...,...
774,2015-12-21,34.55,100,12
256,2013-12-25,,100,12
517,2014-12-25,,100,12
778,2015-12-25,,100,12


## 7. 데이터프레임 컬럼 수정
### > 컬럼명 변경
1. columns 속성
2. rename 메서드
### > 컬럼 재정렬
- reindex 메서드
    - labels, axis

## 8. 데이터프레임 컬럼 추가
### > 산술/비교 연산자
- create columns with arithmetic by assigning them Series operations (산술연산자)
- create BOOLEAN columns by assigning them a logical test (비교연산자)

In [56]:
oil['benchmark'] = 90
oil = oil.drop('month', axis=1)
oil.head()

Unnamed: 0,date,dcoilwtico,benchmark
0,2013-01-01,,90
1,2013-01-02,93.14,90
2,2013-01-03,92.97,90
3,2013-01-04,93.12,90
4,2013-01-07,93.2,90


In [57]:
oil['benchmark ratio'] = oil.loc[:, 'dcoilwtico'] / oil.loc[:, 'benchmark'] * 100
oil.head()

Unnamed: 0,date,dcoilwtico,benchmark,benchmark ratio
0,2013-01-01,,90,
1,2013-01-02,93.14,90,103.488889
2,2013-01-03,92.97,90,103.3
3,2013-01-04,93.12,90,103.466667
4,2013-01-07,93.2,90,103.555556


In [58]:
# 마스킹
oil['buy'] = (oil.loc[:, 'benchmark ratio'] < 80) * (1000000 / oil.loc[:, 'dcoilwtico'])
oil.head()

Unnamed: 0,date,dcoilwtico,benchmark,benchmark ratio,buy
0,2013-01-01,,90,,
1,2013-01-02,93.14,90,103.488889,0.0
2,2013-01-03,92.97,90,103.3,0.0
3,2013-01-04,93.12,90,103.466667,0.0
4,2013-01-07,93.2,90,103.555556,0.0


### > select 메서드
- 넘파이 select 메서드
    - create columns based on multiple conditions
- 카테고리별 값 지정

In [59]:
oil = pd.read_csv('./data/retail/oil.csv')
oil.columns = ['date', 'price']
oil.head()

Unnamed: 0,date,price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [60]:
# Bucket every price into a buy signal. The three tests are mutually
# exclusive; a NaN price fails all of them and falls through to the default.
price = oil['price']

conditions = [
    price.gt(100),
    price.le(100) & price.gt(50),
    price.le(50),
]
choices = ["Don't Buy", "Buy", "Strong Buy"]

# np.select pairs conditions[i] with choices[i], row by row.
oil['buy'] = np.select(conditions, choices, default='Missing')
oil

Unnamed: 0,date,price,buy
0,2013-01-01,,Missing
1,2013-01-02,93.14,Buy
2,2013-01-03,92.97,Buy
3,2013-01-04,93.12,Buy
4,2013-01-07,93.20,Buy
...,...,...,...
1213,2017-08-25,47.65,Strong Buy
1214,2017-08-28,46.40,Strong Buy
1215,2017-08-29,46.46,Strong Buy
1216,2017-08-30,45.96,Strong Buy


## 9. map 메서드
- maps values to a column
    1. 딕셔너리 전달 (existing values as keys, new values as values)
    2. lambda 함수 (포맷팅)

In [61]:
retail = pd.read_csv('./data/retail/retail_2016_2017.csv')

product_cat_dict = {
    'PRODUCE': 'Grocery',
    'POULTRY': 'Grocery',
    'Grocery I': 'Grocery',
    'Grocery II': 'Grocery',
    'EGGS': 'Grocery'
}

retail.loc[:, 'family'].map(product_cat_dict).value_counts(dropna=False)

NaN        959040
Grocery     95904
Name: family, dtype: int64

## 10. assign 메서드
- creates multiple columns at once (returns DataFrame)
    - 체이닝 가능

In [62]:
retail.head(1)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,1945944,2016-01-01,1,AUTOMOTIVE,0.0,0


In [63]:
# Add four derived columns in one chained expression, then keep only the rows
# whose sales-per-promotion ratio exceeds 100.
(
    retail
    .assign(
    # True where the onpromotion value is positive.
    onpromotion_flag=retail['onpromotion'] > 0,
    # First three characters of the family name.
    family_abbrev=retail['family'].str[:3],
    # NOTE(review): rows with onpromotion == 0 and non-zero sales divide to inf
    # (visible in the recorded output), and inf passes the `> 100` test below,
    # so rows with no promotion are kept too. Confirm whether that is intended.
    onpromotion_ratio=retail['sales'] / retail['onpromotion'],
    # A lambda is used because onpromotion_ratio exists only on the frame
    # being built by this assign call, not on `retail` itself.
    sales_onprom_target=lambda x: x['onpromotion_ratio'] > 100)
    .query('sales_onprom_target == True')
)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,onpromotion_flag,family_abbrev,onpromotion_ratio,sales_onprom_target
561,1946505,2016-01-01,25,AUTOMOTIVE,4.000,0,False,AUT,inf,True
563,1946507,2016-01-01,25,BEAUTY,13.000,0,False,BEA,inf,True
564,1946508,2016-01-01,25,BEVERAGES,5104.000,1,True,BEV,5104.000,True
566,1946510,2016-01-01,25,BREAD/BAKERY,680.952,0,False,BRE,inf,True
567,1946511,2016-01-01,25,CELEBRATION,13.000,0,False,CEL,inf,True
...,...,...,...,...,...,...,...,...,...,...
1054937,3000881,2017-08-15,9,PET SUPPLIES,6.000,0,False,PET,inf,True
1054938,3000882,2017-08-15,9,PLAYERS AND ELECTRONICS,6.000,0,False,PLA,inf,True
1054939,3000883,2017-08-15,9,POULTRY,438.133,0,False,POU,inf,True
1054940,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,True,PRE,154.553,True


## 11. Data Types
### > 판다스 데이터 타입
- pandas data-types expand on python/numpy
- Numeric
    1. bool: boolean True/False → 8 (결측값을 허용하는 nullable 타입: boolean)
    2. int64(default): whole numbers → 8, 16, 32, 64 (결측값을 허용하는 nullable 타입: Int64)
    3. float64(default): decimal numbers → 32, 64 (결측값을 허용하는 nullable 타입: Float64)
- Time Series
    1. datetime
    2. timedelta
    3. period
### > categorical data type
- stores text data with repeated values efficiently
- use when (unique categories < number of rows / 2) 

In [64]:
retail = pd.read_csv('./data/retail/retail_2016_2017.csv')
retail.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054944 entries, 0 to 1054943
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   id           1054944 non-null  int64  
 1   date         1054944 non-null  object 
 2   store_nbr    1054944 non-null  int64  
 3   family       1054944 non-null  object 
 4   sales        1054944 non-null  float64
 5   onpromotion  1054944 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 167.8 MB


In [65]:
retail['family'].value_counts()

AUTOMOTIVE                    31968
HOME APPLIANCES               31968
SCHOOL AND OFFICE SUPPLIES    31968
PRODUCE                       31968
PREPARED FOODS                31968
POULTRY                       31968
PLAYERS AND ELECTRONICS       31968
PET SUPPLIES                  31968
PERSONAL CARE                 31968
MEATS                         31968
MAGAZINES                     31968
LIQUOR,WINE,BEER              31968
LINGERIE                      31968
LAWN AND GARDEN               31968
LADIESWEAR                    31968
HOME CARE                     31968
HOME AND KITCHEN II           31968
BABY CARE                     31968
HOME AND KITCHEN I            31968
HARDWARE                      31968
GROCERY II                    31968
GROCERY I                     31968
FROZEN FOODS                  31968
EGGS                          31968
DELI                          31968
DAIRY                         31968
CLEANING                      31968
CELEBRATION                   31968

In [66]:
retail = retail.astype({'family': 'category'})
retail.dtypes

id                int64
date             object
store_nbr         int64
family         category
sales           float64
onpromotion       int64
dtype: object

In [67]:
retail.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054944 entries, 0 to 1054943
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype   
---  ------       --------------    -----   
 0   id           1054944 non-null  int64   
 1   date         1054944 non-null  object  
 2   store_nbr    1054944 non-null  int64   
 3   family       1054944 non-null  category
 4   sales        1054944 non-null  float64 
 5   onpromotion  1054944 non-null  int64   
dtypes: category(1), float64(1), int64(3), object(1)
memory usage: 100.6 MB


### > 메모리 최적화 순서
1. 불필요한 컬럼 제거 (when possible, avoid reading them in at all)
2. 데이터 타입 변환: object → numeric 또는 datetime (where possible)
3. Downcast numeric data to the smallest appropriate bit size
4. Use categorical datatype for columns where 'the number of unique values < (rows /2)'

- DataFrame.memory_usage(deep=True)

## 🚩 DataFrame 정리
1. 데이터프레임은 collections of Series
2. Exploration methods to take a glimpse of data and its characteristics
    - head, tail
    - describe, info
3. filter/sort/modify 데이터프레임
4. Memory optimization with large datasets