# A.2 메모리 절약을 위한 데이터 다운캐스팅

데이터 다운캐스팅은 9장 [향후 판매량 예측 경진대회 데이터](https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data)를 활용해 수행했습니다.

## A.2.1 데이터 다운캐스팅

In [None]:
import pandas as pd

# Directory that holds the competition CSV files.
data_path = '/kaggle/input/competitive-data-science-predict-future-sales/'

# Read the four tables in one pass; the tuple order matches the target names.
sales_train, shops, items, item_categories = (
    pd.read_csv(data_path + file_name)
    for file_name in (
        'sales_train.csv',
        'shops.csv',
        'items.csv',
        'item_categories.csv',
    )
)

In [None]:
# Attach shop, item and item-category attributes to every sales row.
# Left joins keep all rows of `sales_train`, even when a lookup key is missing.
train = (
    sales_train
    .merge(shops, on='shop_id', how='left')
    .merge(items, on='item_id', how='left')
    .merge(item_categories, on='item_category_id', how='left')
)

In [None]:
# Show each column's dtype before any downcasting is applied.
train.dtypes

In [None]:
# Per-column memory usage in bytes (plus the index) before downcasting.
train.memory_usage()

In [None]:
# Total size of `train` before downcasting, converted from bytes to MiB.
# The index is included and object columns are measured shallowly
# (these are the pandas defaults, spelled out here).
start_mem = train.memory_usage(index=True, deep=False).sum() / 1024**2
start_mem

In [None]:
# Shrink every boolean/numeric column of `train` to the smallest dtype that
# can still hold its values. Columns are replaced in place.
for col in train.columns:
    col_dtype = train[col].dtype
    if col_dtype.name == 'bool':
        # Plain booleans fit in a single signed byte.
        train[col] = train[col].astype('int8')
    elif pd.api.types.is_integer_dtype(col_dtype):
        train[col] = pd.to_numeric(train[col], downcast='integer')
    elif pd.api.types.is_float_dtype(col_dtype):
        if (train[col].round() == train[col]).all():
            # Floats that only hold whole numbers can be stored as integers.
            # A column containing NaN never takes this branch (NaN != NaN).
            train[col] = pd.to_numeric(train[col], downcast='integer')
        else:
            # NOTE: float32 keeps roughly 7 significant digits.
            train[col] = pd.to_numeric(train[col], downcast='float')
    # Any other dtype (object, datetime, timedelta, category, string, ...)
    # is left untouched: rounding it or passing it to pd.to_numeric can
    # raise, or silently turn the values into raw integers.

In [None]:
# Show the column dtypes again, now that the downcasting loop has run.
train.dtypes

In [None]:
# Per-column memory usage in bytes (plus the index) after downcasting.
train.memory_usage()

In [None]:
# Total size of `train` after downcasting, converted from bytes to MiB.
# The index is included and object columns are measured shallowly
# (these are the pandas defaults, spelled out here).
end_mem = train.memory_usage(index=True, deep=False).sum() / 1024**2
end_mem

In [None]:
# Report how much smaller `train` became, as a percentage of its starting size.
print("{:.1f}% 압축됨".format(100 * (start_mem - end_mem) / start_mem))

## A.2.2 데이터 다운캐스팅 함수

In [None]:
def downcast(df, verbose=True):
    """Downcast the numeric columns of *df* to smaller dtypes to save memory.

    The frame is modified in place and also returned, so both
    ``downcast(df)`` and ``df = downcast(df)`` work.

    Conversion rules, applied per column:

    * ``bool`` columns become ``int8``.
    * Integer columns are downcast to the smallest integer dtype that fits.
    * Float columns whose values are all whole numbers are stored as the
      smallest fitting integer dtype; other float columns are downcast to
      the smallest float dtype pandas allows (``float32`` at minimum, which
      keeps roughly 7 significant digits).
    * Every other dtype (object, datetime, timedelta, category, string, ...)
      is left untouched.

    Args:
        df: The pandas DataFrame to shrink.
        verbose: When true, print the percentage of memory saved.

    Returns:
        The same DataFrame object that was passed in.
    """
    is_integer = pd.api.types.is_integer_dtype
    is_float = pd.api.types.is_float_dtype

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype.name == 'bool':
            # Plain booleans fit in a single signed byte.
            df[col] = df[col].astype('int8')
        elif is_integer(col_dtype):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif is_float(col_dtype):
            if (df[col].round() == df[col]).all():
                # Whole-number floats can be stored as integers. A column
                # containing NaN never takes this branch (NaN != NaN).
                df[col] = pd.to_numeric(df[col], downcast='integer')
            else:
                df[col] = pd.to_numeric(df[col], downcast='float')
        # Non-numeric dtypes are skipped on purpose: rounding them or passing
        # them to pd.to_numeric can raise, or silently turn values such as
        # timestamps into raw integers.
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-sized frame so the report never divides by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('{:.1f}% 압축됨'.format(saved_pct))

    return df