# 09 향후 판매량 예측

## 9.3 베이스라인 모델

In [90]:
import gc

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

In [91]:
# Load the competition CSV files from the data directory.
data_path = '../../data/09_sales/'

# One read per file, unpacked in the same order as the file names below.
sales_train, shops, items, item_categories, test, submission = (
    pd.read_csv(data_path + file_name)
    for file_name in (
        'sales_train.csv',
        'shops.csv',
        'items.csv',
        'item_categories.csv',
        'test.csv',
        'sample_submission.csv',
    )
)

### 9.3.1 피처 엔지니어링 I: 피처명 한글화

In [92]:
# Korean labels for the columns of the daily sales table.
sales_train_labels = {
    'date': '날짜',
    'date_block_num': '월ID',
    'shop_id': '상점ID',
    'item_id': '상품ID',
    'item_price': '판매가',
    'item_cnt_day': '판매량',
}
sales_train = sales_train.rename(columns=sales_train_labels)
sales_train.head()

Unnamed: 0,날짜,월ID,상점ID,상품ID,판매가,판매량
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [93]:
# Korean labels for the columns of the shop table.
shop_labels = {'shop_name': '상점명', 'shop_id': '상점ID'}
shops = shops.rename(columns=shop_labels)
shops.head()

Unnamed: 0,상점명,상점ID
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [94]:
# Korean labels for the columns of the item table.
item_labels = {
    'item_name': '상품명',
    'item_id': '상품ID',
    'item_category_id': '상품분류ID',
}
items = items.rename(columns=item_labels)
items.head()

Unnamed: 0,상품명,상품ID,상품분류ID
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [95]:
# Korean labels for the columns of the item-category table.
category_labels = {
    'item_category_name': '상품분류명',
    'item_category_id': '상품분류ID',
}
item_categories = item_categories.rename(columns=category_labels)
item_categories.head()

Unnamed: 0,상품분류명,상품분류ID
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [96]:
# Korean labels for the key columns of the test table (the ID column is kept as is).
test_labels = {'shop_id': '상점ID', 'item_id': '상품ID'}
test = test.rename(columns=test_labels)
test.head()

Unnamed: 0,ID,상점ID,상품ID
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


### 9.3.2 피처 엔지니어링 II: 데이터 다운캐스팅

In [97]:
def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    Columns are reassigned on ``df`` itself, so the caller's frame is
    modified; the same object is also returned for convenience.

    Args:
        df: DataFrame whose columns should be downcast.
        verbose: When true, print the percentage of memory saved.

    Returns:
        The same ``df`` object, with bool columns stored as int8, integer
        (or integer-valued float) columns stored in the smallest signed
        integer dtype, and remaining float columns stored in the smallest
        float dtype. Object and other non-numeric columns are left untouched.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            continue
        if dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif not pd.api.types.is_numeric_dtype(df[col]):
            # datetime / timedelta / category / string columns cannot be
            # rounded or passed through to_numeric without failing or being
            # reinterpreted as integers, so they are skipped.
            continue
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            # Integer dtype, or a float column holding only whole numbers.
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a zero-byte frame reports 0 % instead of nan.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'{saved_pct:.2f} % 압축됨')
    return df

### 9.3.3 피처 엔지니어링 III: 데이터 조합 생성

In [None]:
from itertools import product

# For each month, build every (month, shop, item) combination from the shops
# and items that appear in that month's sales records.
train = []
for month_id in sales_train['월ID'].unique():
    month_rows = sales_train[sales_train['월ID'] == month_id]
    shop_ids = month_rows['상점ID'].unique()
    item_ids = month_rows['상품ID'].unique()
    combos = list(product([month_id], shop_ids, item_ids))
    train.append(np.array(combos))

# Stack the per-month arrays into one frame keyed by the three ID columns.
idx_features = ['월ID', '상점ID', '상품ID']
train = pd.DataFrame(np.vstack(train), columns=idx_features)
train.head()

### 9.3.4 피처 엔지니어링 IV: 타깃값(월간 판매량) 추가

In [99]:
# Sum the daily sales per (month, shop, item) and expose the total as the
# monthly-sales target column.
group = (
    sales_train
    .groupby(idx_features)
    .agg({'판매량': 'sum'})
    .reset_index()
    .rename(columns={'판매량': '월간판매량'})
)
group.head()

Unnamed: 0,월ID,상점ID,상품ID,월간판매량
0,0,0,32,6.0
1,0,0,33,3.0
2,0,0,35,1.0
3,0,0,43,1.0
4,0,0,51,2.0


In [100]:
# Attach the monthly totals to the full grid; combinations without any sales
# record keep NaN in the target column because this is a left join.
train = pd.merge(train, group, how='left', on=idx_features)
train.head()

Unnamed: 0,월ID,상점ID,상품ID,월간판매량
0,0,59,22154,1.0
1,0,59,2552,
2,0,59,2554,
3,0,59,2555,
4,0,59,2564,


### 9.3.5 피처 엔지니어링 V: 테스트 데이터 이어붙이기

In [101]:
# Tag every test row with month ID 34; the split further down selects
# 월ID == 34 as the test set.
test['월ID'] = 34

In [102]:
# Stack the test rows under the training grid. `keys` is intentionally not
# passed: it labels each concatenated frame in a hierarchical index, which
# ignore_index=True discards anyway, and supplying three keys for two frames
# is a length mismatch that recent pandas versions warn about.
all_data = pd.concat([train, test.drop(columns='ID')], ignore_index=True)
# Grid rows without sales and the test rows (which have no target column)
# are NaN at this point; treat them as zero sales.
all_data = all_data.fillna(0)
all_data.head(10)

Unnamed: 0,월ID,상점ID,상품ID,월간판매량
0,0,59,22154,1.0
1,0,59,2552,0.0
2,0,59,2554,0.0
3,0,59,2555,0.0
4,0,59,2564,0.0
5,0,59,2565,0.0
6,0,59,2572,0.0
7,0,59,2573,0.0
8,0,59,2574,2.0
9,0,59,2593,0.0


### 9.3.6 피처 엔지니어링 VI: 나머지 데이터 병합(최종 데이터 생성)

In [103]:
# Enrich each row with the shop, item and item-category lookup tables.
# The category join must come after the item join, which supplies 상품분류ID.
all_data = (
    all_data
    .merge(shops, how='left', on='상점ID')
    .merge(items, how='left', on='상품ID')
    .merge(item_categories, how='left', on='상품분류ID')
)

In [104]:
# Shrink the column dtypes with the downcast() helper defined earlier; with
# its default verbose=True it prints the percentage of memory saved.
all_data = downcast(all_data)
all_data.head()

51.56 % 압축됨


Unnamed: 0,월ID,상점ID,상품ID,월간판매량,상점명,상품명,상품분류ID,상품분류명
0,0,59,22154,1,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,0,59,2552,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
2,0,59,2554,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил
3,0,59,2555,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства
4,0,59,2564,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,Музыка - Музыкальное видео


In [105]:
# The lookup tables have already been merged into all_data, so drop the
# names and run a garbage-collection pass.
del shops, items, item_categories
gc.collect()

372

In [106]:
# Remove the free-text name columns; only the ID columns are kept as features.
all_data = all_data.drop(columns=['상점명', '상품명', '상품분류명'])

### 9.3.7 피처 엔지니어링 VII: 마무리

In [107]:
# Row masks for the three month ranges: months below 33 train the model,
# month 33 validates it, and month 34 holds the test rows.
is_train = all_data['월ID'] < 33
is_valid = all_data['월ID'] == 33
is_test = all_data['월ID'] == 34

# Feature matrices: every column except the monthly-sales target.
X_train = all_data.loc[is_train].drop(columns='월간판매량')
X_valid = all_data.loc[is_valid].drop(columns='월간판매량')
X_test = all_data.loc[is_test].drop(columns='월간판매량')

# Targets, clipped to the range [0, 20].
y_train = all_data.loc[is_train, '월간판매량'].clip(0, 20)
y_valid = all_data.loc[is_valid, '월간판매량'].clip(0, 20)

In [108]:
# The train/valid/test splits have been taken from all_data, so drop the
# name and run a garbage-collection pass.
del all_data
gc.collect()

0

### 9.3.8 모델 훈련 및 성능 검증

In [109]:
import lightgbm as lgb

In [112]:
# Columns passed to LightGBM as categorical features.
cat_features = ['상점ID', '상품분류ID']

# LightGBM training parameters for the baseline model.
params = {
    'metric': 'rmse',
    'num_leaves': 255,
    'learning_rate': 0.01,
    'force_col_wise': True,
    'random_state': 10,
}

# Wrap the training and validation splits as LightGBM datasets.
dtrain = lgb.Dataset(data=X_train, label=y_train)
dvalid = lgb.Dataset(data=X_valid, label=y_valid)

In [113]:
# Train the baseline model for 500 boosting rounds, evaluating on both the
# training and the validation set.
# Progress is logged every 50 rounds through the log_evaluation callback:
# the `verbose_eval` keyword was removed from lgb.train() in LightGBM 4.0,
# while the callback is accepted by both 3.3+ and 4.x releases.
lgb_model = lgb.train(
    params=params,
    train_set=dtrain,
    num_boost_round=500,
    valid_sets=[dtrain, dvalid],
    categorical_feature=cat_features,
    callbacks=[lgb.log_evaluation(period=50)],
)

New categorical_feature is ['상점ID', '상품분류ID']


[LightGBM] [Info] Total Bins 426
[LightGBM] [Info] Number of data points in the train set: 10675678, number of used features: 4
[LightGBM] [Info] Start training from score 0.299125




[50]	training's rmse: 1.14777	valid_1's rmse: 1.06755
[100]	training's rmse: 1.11425	valid_1's rmse: 1.0386
[150]	training's rmse: 1.09673	valid_1's rmse: 1.02671
[200]	training's rmse: 1.08573	valid_1's rmse: 1.02027
[250]	training's rmse: 1.07722	valid_1's rmse: 1.01661
[300]	training's rmse: 1.0698	valid_1's rmse: 1.0138
[350]	training's rmse: 1.06317	valid_1's rmse: 1.01084
[400]	training's rmse: 1.05734	valid_1's rmse: 1.00936
[450]	training's rmse: 1.05224	valid_1's rmse: 1.00818
[500]	training's rmse: 1.04792	valid_1's rmse: 1.00722


### 9.3.9 예측 및 결과 제출

In [114]:
# Predict the test month and clip to [0, 20], the same range the training
# and validation targets were clipped to.
raw_preds = lgb_model.predict(X_test)
preds = raw_preds.clip(0, 20)

# Fill the sample submission with the predictions and write it next to the
# input data.
submission['item_cnt_month'] = preds
submission_file = data_path + 'submission.csv'
submission.to_csv(submission_file, index=False)

In [115]:
# The submission file has been written; drop the splits, datasets and model,
# then run a garbage-collection pass.
del X_train, y_train, X_valid, y_valid, X_test, lgb_model, dtrain, dvalid
gc.collect()

422