In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import lightgbm as lgb
from tqdm import tqdm
from itertools import product
from sklearn.model_selection import train_test_split
import seaborn as sns
import gc

# DataLoad

In [None]:
# Directory that holds the competition CSV files.
PATH = '../input/competitive-data-science-predict-future-sales/'

# Read every table in one pass; the target names follow the file order below.
df, df_test, sample, items, shops, item_cats = (
    pd.read_csv(PATH + file_name)
    for file_name in (
        'sales_train.csv',
        'test.csv',
        'sample_submission.csv',
        'items.csv',
        'shops.csv',
        'item_categories.csv',
    )
)

# EDA

## Check data
各テーブルの内容を確認

In [None]:
# Row count of every loaded table, written with a single print call
# (one argument per output line, the empty string yields the blank line).
print(
    'Record num of each tables',
    '',
    f'train: {len(df)}',
    f'test: {len(df_test)}',
    f'items: {len(items)}',
    f'item_categories: {len(item_cats)}',
    f'shops: {len(shops)}',
    sep='\n',
)

In [None]:
# One record per shop x item x day.
# 'date_block_num' is the monthly time step; 34 months are present (values 0-33).
display(df.head())
print(f'train num: {len(df)}')

In [None]:
# One record per shop x item x month.
# The task is to predict 'item_cnt_day' for the 35th month ('date_block_num' value 34).
display(df_test.head())
print(f'test num: {len(df_test)}')

In [None]:
# Preview the items table and count its distinct item ids and category ids.
display(items.head())
print(f'items num: {items.item_id.nunique()}')
print(f'item categories num: {items.item_category_id.nunique()}')

In [None]:
# Preview the shops table and count its distinct shop ids.
display(shops.head())
print(f'shop num: {shops.shop_id.nunique()}')

In [None]:
# Preview the item category table (rendered as the cell output).
item_cats.head()

## Difference between train and test

In [None]:
# Build each id set once, then report sizes and the two one-sided differences.
train_item_ids = set(df.item_id)
test_item_ids = set(df_test.item_id)
train_shop_ids = set(df.shop_id)
test_shop_ids = set(df_test.shop_id)

print('【item_id】')
print(f'total item num: {len(items)}')
print(f'item num in train: {len(train_item_ids)}')
print(f'item num in test: {len(test_item_ids)}')
print(f"train have, test doesn't have: {len(train_item_ids - test_item_ids)}")
print(f"test have, train doesn't have: {len(test_item_ids - train_item_ids)}")
print()
print('【shop_id】')
print(f'total shop num: {len(shops)}')
print(f'shop num in train: {len(train_shop_ids)}')
print(f'shop num in test: {len(test_shop_ids)}')
print(f"train have, test doesn't have: {len(train_shop_ids - test_shop_ids)}")
print(f"test have, train doesn't have: {len(test_shop_ids - train_shop_ids)}")

## Check 'item_cnt_day' and 'item_price' columns
それぞれのカラムの外れ値を確認

In [None]:
# Two stacked horizontal box plots (daily sales count on top, item price below)
# drawn through the explicit Axes interface.
fig, (ax1, ax2) = plt.subplots(2, 1)
fig.subplots_adjust(wspace=0.4, hspace=0.6)

ax1.set_title("count of item sales(per shop per day)")
ax1.boxplot(df['item_cnt_day'], labels=['item_cnt_day'], vert=False)
ax1.set_xlabel('count')

ax2.set_title("item_price")
ax2.boxplot(df['item_price'], labels=['item_price'], vert=False)
ax2.set_xlabel('price')

plt.show()

## Check shop sales each month
ショップごとの売上推移（周期性があることを確認）

In [None]:
# Monthly revenue per shop.
# Only the 'revenue' column is aggregated: summing the whole frame would also
# run over the still-string 'date' column (and every other column), which is
# wasted work at best and an error/string concatenation on newer pandas.
tmp = df[['shop_id', 'date_block_num']].copy()
tmp['revenue'] = df['item_price'] * df['item_cnt_day']
tmp = tmp.groupby(['shop_id', 'date_block_num'], as_index=False)['revenue'].sum()

# X tick positions/labels: the first month and every 6th month after it.
# date_block_num 0 corresponds to 2013/1, so year and month follow from divmod.
month = []
tmp_month = []
for i in range(tmp['date_block_num'].max() + 1):
    if i == 0 or (i + 1) % 6 == 0:
        year_offset, month_index = divmod(i, 12)
        tmp_month.append(i)
        month.append(f'{2013 + year_offset}/{month_index + 1}')

colors = list(matplotlib.colors.CSS4_COLORS.values())
plt.figure(figsize=(20, 8))
plt.title('Shop Sales Trend')
plt.xlabel('month')
plt.xticks(tmp_month, month)
plt.ylabel('Shop Sales')

# One line per shop; groupby hands over each shop's rows without re-filtering
# the frame twice per shop. The modulo keeps the colour lookup in range.
for shop_id, shop_sales in tmp.groupby('shop_id'):
    plt.plot(shop_sales['date_block_num'], shop_sales['revenue'],
             color=colors[shop_id % len(colors)], marker='o')
plt.show()

# PreProcessing

## Correct shop_id
ショップ名の表記揺れにて別々に扱われていたshopのidを修正

In [None]:
# Duplicate shop ids and the id each one is folded into.
duplicate_shop_ids = {0: 57, 1: 58, 40: 39}

# Apply the same remapping to the train and the test frame.
for old_id, new_id in duplicate_shop_ids.items():
    for frame in (df, df_test):
        frame.loc[frame.shop_id == old_id, 'shop_id'] = new_id

# Remove the now-unused rows from the shops table (rows are addressed by index label).
shops.drop(index=list(duplicate_shop_ids), inplace=True)

## Delete Outliers of item_price & item_cnt_day
'item_price'と'item_cnt_day'の外れ値レコードを削除

In [None]:
# Keep only records below both outlier thresholds.
price_ok = df['item_price'] < 100000
count_ok = df['item_cnt_day'] < 999
df = df[price_ok & count_ok]

## Convert dtype of 'date' column
'date'カラムをdate型へ変換

In [None]:
# Parse the 'date' strings (day.month.year) into datetime values.
df['date'] = pd.to_datetime(df.date,format='%d.%m.%Y')

# Feature engineering

## Create MinMaxAvg 'item_price' features
ショップ×商品ごとの商品価格の最大値・最小値・平均値の特徴量を作成

In [None]:
# Max / mean / min price per shop x item.
# Prices are kept as float32: the outlier filter allows prices up to 100000,
# which does not fit into int16 (max 32767), and casting the mean to an integer
# would also truncate it. Only the id columns are downcast to int16.
item_prices = (
    df.groupby(['shop_id', 'item_id'], as_index=False)
      .agg(max_price=('item_price', 'max'),
           avg_price=('item_price', 'mean'),
           min_price=('item_price', 'min'))
      .astype({'shop_id': 'int16', 'item_id': 'int16',
               'max_price': 'float32', 'avg_price': 'float32', 'min_price': 'float32'})
)

## Create 'first_sale_day' feature
'商品ごとに初めて売れるまでの日数'と'ショップごとの各商品が初めて売れるまでの日数'の特徴量を作成

In [None]:
# Day number of each sale: day-of-year plus 365 for every year after 2013.
tmp = df[['date', 'date_block_num', 'shop_id', 'item_id']].copy()
tmp['first_sale'] = tmp.date.dt.dayofyear + 365 * (tmp.date.dt.year - 2013)

# Earliest sale day per item (added to the items table).
item_first_sale = tmp.groupby('item_id', as_index=False)['first_sale'].min().astype('int16')
items = pd.merge(items, item_first_sale, on='item_id', how='left')
del item_first_sale

# Earliest sale day per shop x item (kept as its own table).
shop_item_first_sale = (
    tmp.groupby(['item_id', 'shop_id'], as_index=False)['first_sale'].min()
       .astype('int16')
       .sort_values('first_sale')
       .drop_duplicates(subset=['item_id', 'shop_id'])
       .rename(columns={'first_sale': 'shop_first_sale'})[['shop_id', 'item_id', 'shop_first_sale']]
)

## Merge train and test
粒度を揃えて、trainデータとtestデータを結合

In [None]:
#売上毎→月毎のデータへ変換
df = df.groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False
        ).agg({'item_cnt_day':'sum'}
        ).rename(columns={'item_cnt_day':'mon_shop_item_cnt'})


df_test['date_block_num'] = 34
df_test['mon_shop_item_cnt'] = np.nan
del df_test['ID']

df = df.append(df_test)
df

## Clip Monthly item cnt ※as written in the [Overview](https://www.kaggle.com/c/competitive-data-science-predict-future-sales/overview/evaluation)

月々の売上個数が0を下回る場合0へ変換。20を上回る場合は20へ変換

In [None]:
# Clamp the monthly count into the [0, 20] range (NaN rows stay NaN).
df.mon_shop_item_cnt = df.mon_shop_item_cnt.clip(0, 20)

## Create mean and median of monthly shop item cnt
ショップ×商品ごとの平均（中央値も）売上個数の特徴量を作成

In [None]:
# Mean and median monthly count per shop x item, attached to every row of that pair.
# NOTE(review): the statistics are taken over all rows of the pair, including
# the row's own month - confirm this is intended for a column that is later
# used as the prediction target.
tmp = (
    df.groupby(['shop_id', 'item_id'], as_index=False)
      .agg(mean_sales_cnt=('mon_shop_item_cnt', 'mean'),
           median_sales_cnt=('mon_shop_item_cnt', 'median'))
)

df = df.merge(tmp, on=['shop_id', 'item_id'], how='left')

## Create sum and mean, median of item sales cnt
商品の売上個数に関する合計、平均、中央値の特徴量を作成

In [None]:
# Sum and mean of the monthly counts per item, computed with one groupby
# instead of one full scan of df per item.
item_sales = df.groupby('item_id')['mon_shop_item_cnt'].agg(['sum', 'mean'])

# Align on the item_id column rather than on the row label, so the result does
# not depend on items having an index equal to item_id.
# The sum is stored as int32: int8 (max 127) overflows as soon as an item sells
# more than 127 units in total. Items absent from df get a sum of 0.
items['sum_item_sales_ctn'] = (
    items['item_id'].map(item_sales['sum']).fillna(0).astype('int32')
)
# Items without any known count keep NaN as their mean.
items['mean_item_sales_ctn'] = items['item_id'].map(item_sales['mean'])
del item_sales

## Create matrix

In [None]:
# For every month, enumerate all (month, shop, item) combinations of the shops
# and items that appear in that month (shop-major, item-minor order).
df_tmp = []
for num in df.date_block_num.unique():
    tmp = df[df.date_block_num == num]
    shop_ids = tmp.shop_id.unique()
    item_ids = tmp.item_id.unique()
    df_tmp.append(
        np.column_stack((
            np.full(len(shop_ids) * len(item_ids), num),
            np.repeat(shop_ids, len(item_ids)),
            np.tile(item_ids, len(shop_ids)),
        )).astype('int16')
    )

# Turn the grid into a dataframe
df_tmp = pd.DataFrame(np.vstack(df_tmp), columns=['date_block_num', 'shop_id', 'item_id'], dtype=np.int16)

# Add the features from sales data to the matrix
df_tmp = df_tmp.merge(df, how='left', on=['date_block_num', 'shop_id', 'item_id'])

In [None]:
# Continue with the expanded grid as the main frame and release the temporaries.
df = df_tmp.copy()
del df_tmp
del tmp
gc.collect()

## Create month feature
月を追加

In [None]:
# Calendar-month feature derived from the month index; because of the modulo,
# every 12th month (date_block_num 11, 23, ...) maps to 0 rather than 12.
df['month'] = ( (df['date_block_num'] + 1) % 12 ).astype('int8')

## Create lag features
ショップ×商品×月ごとの1〜12ヶ月前に売れた商品の個数の特徴量を作成

In [None]:
# def lag_feature_item_cnt(df, lags, col):
#     tmp = df[['date_block_num','shop_id','item_id',col]]
#     for i in lags:
#         shifted = tmp.copy()
#         shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
#         shifted['date_block_num'] += i
        
#         df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
#         df[col+'_lag_'+str(i)] = df[col+'_lag_'+str(i)].astype('float16')
#     return df

# df = lag_feature_item_cnt(df, [1, 2, 3], 'mon_shop_item_cnt')

## Create 'moving_average' feature
単純移動平均線、指数平滑移動平均

In [None]:
def calc_moving_ave(df):
    """Attach the moving-average features to ``df``.

    The features are computed shop by shop through ``_calc`` and joined back
    on (date_block_num, shop_id, item_id).

    Args:
        df: frame with at least the columns 'date_block_num', 'shop_id',
            'item_id' and 'mon_shop_item_cnt'.

    Returns:
        ``df`` left-merged with the columns 'moving_average', 'expanding_mean',
        'rolling_6month_mean' and 'rolling_12month_mean'.
    """
    key_cols = ['date_block_num', 'shop_id', 'item_id']
    feature_cols = ['moving_average', 'expanding_mean', 'rolling_6month_mean', 'rolling_12month_mean']

    # Collect the per-shop results and concatenate once: concatenating inside
    # the loop re-copies everything gathered so far on every iteration.
    # The leading empty frame keeps the expected columns when df has no shops.
    frames = [pd.DataFrame(columns=key_cols + feature_cols)]
    for shop_id in tqdm(df.shop_id.unique()):
        shop_rows = df[df['shop_id'] == shop_id][key_cols + ['mon_shop_item_cnt']]
        frames.append(_calc(shop_rows))
    moving_ave = pd.concat(frames, axis=0)

    return pd.merge(df, moving_ave, on=key_cols, how='left')

def _calc(df):
    """Compute the moving-average features for every item of one shop.

    For each item the months missing from ``df`` are filled with placeholder
    rows whose count is 0 (see ``make_base_tmp``); the series is sorted by
    month, the four averages are computed, and only the months that were
    present in ``df`` are returned.

    NOTE(review): each average includes the row's own 'mon_shop_item_cnt'
    (no shift is applied) - confirm this is intended.

    Args:
        df: rows of a single shop with the columns 'date_block_num',
            'shop_id', 'item_id' and 'mon_shop_item_cnt'.

    Returns:
        Frame with the columns 'date_block_num', 'shop_id', 'item_id',
        'moving_average', 'expanding_mean', 'rolling_6month_mean' and
        'rolling_12month_mean'.
    """
    row_cols = ['date_block_num', 'shop_id', 'item_id', 'mon_shop_item_cnt']
    out_cols = ['date_block_num', 'shop_id', 'item_id', 'moving_average',
                'expanding_mean', 'rolling_6month_mean', 'rolling_12month_mean']
    base_tmp = make_base_tmp()

    # Gather the per-item frames and concatenate once at the end instead of
    # growing a frame inside the loop. The empty template keeps the expected
    # columns when df holds no items.
    frames = [pd.DataFrame(columns=out_cols, dtype='float16')]

    # groupby(sort=False) yields the items in order of first appearance without
    # re-scanning the whole frame for every item.
    for _, item_rows in df.groupby('item_id', sort=False):
        exist_tmp = item_rows[row_cols]
        exist_months = exist_tmp['date_block_num'].to_list()

        # Zero-count placeholders for the months this item has no row for.
        tmp = base_tmp[~base_tmp['date_block_num'].isin(exist_months)].copy()
        tmp = pd.concat((tmp, exist_tmp), axis=0).sort_values('date_block_num')

        counts = tmp['mon_shop_item_cnt']
        tmp['moving_average'] = counts.ewm(halflife=1).mean().astype('float16')
        tmp['expanding_mean'] = counts.expanding().mean().astype('float16')
        tmp['rolling_6month_mean'] = counts.rolling(window=6, min_periods=1).mean().astype('float16')
        tmp['rolling_12month_mean'] = counts.rolling(window=12, min_periods=1).mean().astype('float16')

        # Drop the placeholder months again; only the original rows are returned.
        tmp = tmp[tmp['date_block_num'].isin(exist_months)].drop('mon_shop_item_cnt', axis=1)
        frames.append(tmp)

    return pd.concat(frames, axis=0)

def make_base_tmp():
    """Return the 35-row placeholder frame used to fill missing months.

    'date_block_num' runs from 0 to 34; 'shop_id', 'item_id' and
    'mon_shop_item_cnt' are all 0. Every column is float16.
    """
    columns = ['date_block_num', 'shop_id', 'item_id', 'mon_shop_item_cnt']
    values = np.zeros((35, len(columns)))
    values[:, 0] = np.arange(35)
    return pd.DataFrame(values, columns=columns, dtype='float16')

# Add the moving-average feature columns to the main frame.
df = calc_moving_ave(df)

## Create new item category id instead of original one
既存の商品カテゴリを新たに作成したものへ変更  
※学習の結果、既存カテゴリが結果に好影響を与えない為

In [None]:
# Names without a '-' get a 'blank - ' prefix so that every name contains one.
item_cats['item_category_name'] = ['blank - ' + x if '-' not in x else x for x in item_cats['item_category_name']]
item_cats['split'] = item_cats['item_category_name'].str.split('-')

# New category = first word of the text after the last '-'.
# The previous filter tested the length of the whole column instead of the
# element, so it never protected the [0] lookup; an empty segment now maps to ''.
last_segment_words = [parts[-1].split() for parts in item_cats['split']]
item_cats['new_cat'] = [words[0] if words else '' for words in last_segment_words]
item_cats['new_cat_code'], _ = pd.factorize(item_cats['new_cat'])

# Lookup used to replace the category id in the items table.
# Pairing the two columns directly does not depend on the frame's index labels.
cat_dict = dict(zip(item_cats['item_category_id'], item_cats['new_cat_code']))

# From here on item_cats only lists the new category ids.
item_cats = pd.DataFrame(data=item_cats['new_cat_code'].unique(), columns=['item_category_id'])
print(f'new category id num: {len(item_cats)}')

In [None]:
#itemsテーブルのカテゴリーIDを新たなIDへ置換
items.item_category_id = items.item_category_id.map(cat_dict)

## Create 'city_code' feature
ショップ名をスペースで区切り、一要素目を都市名とする

In [None]:
# The first space-separated token of the shop name is used as the city.
shops['split'] = shops['shop_name'].str.split(' ')
shops['city'] = shops['split'].str[0]
shops['city_code'] = pd.factorize(shops['city'])[0]

print(f'city num: {shops.city_code.nunique()}')

## Merge tables
各テーブルをメインテーブルへ結合

In [None]:
# Left-join every side table onto the main frame, in the same order as before.
df = (
    df.merge(item_prices, on=['shop_id', 'item_id'], how='left')
      .merge(shop_item_first_sale, on=['shop_id', 'item_id'], how='left')
      .merge(items.drop('item_name', axis=1), on='item_id', how='left')
      .merge(item_cats, on='item_category_id', how='left')
      .merge(shops[['shop_id', 'city_code']], on='shop_id', how='left')
)

In [None]:
# Print column dtypes, non-null counts and memory usage of the merged frame.
df.info()

## check correlation
変数間の相関関係を確認

In [None]:
# change order of columns to see easy
col_y = 'mon_shop_item_cnt'
col = df.columns.tolist()
col.remove(col_y)
col.append(col_y)

df = df.reindex(columns=col)

cor = df.corr()
sns.set(rc = {'figure.figsize':(20,10)})
sns.heatmap(cor, cmap= sns.color_palette('coolwarm', 10), annot=True,fmt='.2f', vmin=-1, vmax=1)

# Modeling

## Set Category columns
カテゴリ変数を設定し、そのカラムのデータ型を変換

In [None]:
# Columns handled as categorical variables; each one is converted to the 'category' dtype.
categorical_features = ['shop_id', 'item_id', 'month', 'item_category_id', 'city_code']
df = df.astype({name: 'category' for name in categorical_features})

## Split train, val and test
訓練データ、検証データ、推論データに分割する（date_block_numをドロップさせる）

In [None]:
# Month 34 is the inference set; months up to 33 form the labelled data.
# NOTE(review): the train/validation split below is a shuffled random split
# across all months - confirm this is the intended validation scheme.
X_test = df[df['date_block_num'] == 34].drop(['mon_shop_item_cnt', 'date_block_num'], axis=1)

labelled = df[df.date_block_num <= 33].drop('date_block_num', axis=1)
y = labelled.mon_shop_item_cnt
X = labelled.drop('mon_shop_item_cnt', axis=1).copy()

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

## Set learning parameters
学習パラメータを設定

In [None]:
# Options shared by the training and the validation dataset.
dataset_options = dict(categorical_feature=categorical_features, free_raw_data=False)
lgb_train = lgb.Dataset(X_train, y_train, **dataset_options)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train, **dataset_options)

# Hyper-parameters optimized with Optuna.
lgbm_params = {
    'objective': 'mse',
    'metric': 'rmse',
    'num_leaves': 966,
    'cat_smooth': 45.01680827234465,
    'min_child_samples': 27,
    'min_child_weight': 0.021144950289224463,
    'max_bin': 214,
    'learning_rate': 0.01,
    'subsample_for_bin': 300000,
    'min_data_in_bin': 7,
    'colsample_bytree': 0.8,
    'subsample': 0.6,
    'subsample_freq': 5,
    'n_estimators': 3000,
}

## Training with LightGBM
LightGBMを用いて、学習実行

In [None]:
# Train on lgb_train and evaluate on both the training and the validation set,
# logging every 100 rounds and stopping early after 20 rounds.
# NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments that
# newer LightGBM releases replaced with callbacks - confirm against the
# installed version.
model = lgb.train(lgbm_params,
                  lgb_train,
                  valid_names=['train', 'valid'],
                  valid_sets=[lgb_train, lgb_val],
                  categorical_feature=categorical_features, 
                  early_stopping_rounds=20,
                  verbose_eval=100)

# Predict the inference rows with the best iteration found during training.
Y_pred = model.predict(X_test, num_iteration=model.best_iteration)

## Check feature importance
重要度の高い変数を確認

In [None]:
# Plot gain-based feature importance; the result is bound to `_` so that it is not echoed as cell output.
_ = lgb.plot_importance(model, figsize=(10,10), height=0.7, importance_type="gain")

# Submit

## Create submit data
提出データの作成 ※推論データを0-20へ再度クリップする

In [None]:
# Clip the predictions to [0, 20], as written in the Overview
# (https://www.kaggle.com/c/competitive-data-science-predict-future-sales/overview/evaluation)
# NOTE(review): the predictions are assigned by position, so this relies on
# X_test having the same row order as the sample submission - confirm.
sample['item_cnt_month'] = Y_pred.clip(0,20)

sample.to_csv('submission.csv', index=False)