In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw competition tables from the Kaggle input directory.
# The tuple on the left is unpacked in the same order as the paths below.
items, item_categories, shops, sales_train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/competitive-data-science-predict-future-sales/items.csv",
        "/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv",
        "/kaggle/input/competitive-data-science-predict-future-sales/shops.csv",
        "/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv",
        "/kaggle/input/competitive-data-science-predict-future-sales/test.csv",
    )
)

In [None]:
# Add explanatory variables to a working copy of sales_train.
sales_train_x = sales_train.copy()
test_x = test.copy()

# Attach the item table by item_id, then drop the item_name column.
sales_train_x = pd.merge(sales_train_x, items, on='item_id').drop('item_name', axis=1)

# Daily sales amount = units sold that day * unit price.
sales_train_x['date_sales'] = sales_train_x['item_cnt_day'] * sales_train_x['item_price']

# Parse the date column once, with an explicit day-first format. Without it,
# pandas guesses the layout and can read ambiguous values such as "02.01.2013"
# month-first, corrupting the month and weekday features.
# NOTE(review): assumes the dataset's dates are "dd.mm.yyyy" strings; a different
# layout will raise here instead of being silently mis-parsed.
sale_dates = pd.to_datetime(sales_train_x['date'].astype('str'), format='%d.%m.%Y')

# Column name 'mouth' (sic, month) is kept unchanged so existing references still work.
sales_train_x['mouth'] = sale_dates.dt.strftime('%m')   # zero-padded month string
sales_train_x['year'] = sale_dates.dt.strftime('%y')    # two-digit year string
sales_train_x['weekday'] = sale_dates.dt.dayofweek      # 0 = Monday ... 6 = Sunday
sales_train_x

In [None]:
# Neither of these two records is December data, so treating them as outliers seems fine.
# NOTE(review): the rows are addressed by hard-coded index labels of the merged
# frame; confirm they still point at the intended records if the upstream merge
# or the pandas version changes.
outlier_rows = [1181188, 1494614]
sales_train_x_no_out = sales_train_x.drop(index=outlier_rows)
sales_train_x_no_out.shape

In [None]:
# Collapse the daily records into one row per (date_block_num, shop_id, item_id).
month_shop_item_keys = ['date_block_num', 'shop_id', 'item_id']

# Monthly units sold per shop/item pair.
sales_train_by_month = (
    sales_train_x_no_out
    .groupby(month_shop_item_keys, as_index=False)[['item_cnt_day']]
    .sum()
    .rename(columns={'item_cnt_day': 'mon_shop_item_cnt'})
)

# Monthly sales amount per shop/item pair.
mon_shop_item_sales = (
    sales_train_x_no_out
    .groupby(month_shop_item_keys, as_index=False)[['date_sales']]
    .sum()
    .rename(columns={'date_sales': 'mon_shop_item_sales'})
)

In [None]:
# Build the training skeleton restricted to the (shop_id, item_id) pairs in test:
# one copy of the test pairs for every date_block_num in 0..34. This is the
# base frame onto which per-month sales are joined later.
# `.assign` returns a fresh frame per month (no assignment on a slice of `test`),
# and all pieces are concatenated once instead of growing a frame in a loop.
test_pairs = test[['shop_id', 'item_id']]
train_ID_only = pd.concat(
    [test_pairs.assign(date_block_num=month) for month in range(35)],
    axis=0,
)
train_ID_only

In [None]:
# Left-join the monthly aggregates onto the test-pair skeleton, so months in
# which a pair has no aggregate row are kept (with NaN values).
join_keys = ['date_block_num', 'shop_id', 'item_id']
train = (
    train_ID_only
    .merge(sales_train_by_month, on=join_keys, how='left')  # monthly units sold per ID
    .merge(mon_shop_item_sales, on=join_keys, how='left')   # monthly sales amount
)
train

In [None]:
# Also attach each item's item_category_id.
item_to_category = items[['item_id', 'item_category_id']]
train = train.merge(item_to_category, on='item_id', how='left')
train

In [None]:
# Plot the total number of items sold per month.
# Only the plotted column is aggregated; summing ID columns (shop_id, item_id,
# item_category_id) would be meaningless.
plt_df_1 = train.groupby(
    ['date_block_num'],
    as_index=False
)[['mon_shop_item_cnt']].sum()
plt.figure(figsize=(20, 10))
sns.lineplot(x='date_block_num', y='mon_shop_item_cnt', data=plt_df_1)
plt.title('Monthly item counts')

In [None]:
# Plot the total sales amount per month.
# Only the plotted column is aggregated; summing ID columns would be meaningless.
plt_df_2 = train.groupby(
    ['date_block_num'],
    as_index=False
)[['mon_shop_item_sales']].sum()
plt.figure(figsize=(20, 10))
sns.lineplot(x='date_block_num', y='mon_shop_item_sales', data=plt_df_2)
# This chart shows sales amount, not counts, so the title says so.
plt.title('Monthly item sales')

In [None]:
# Constraint: cap the monthly count to the range [0, 20].
train = train.assign(
    mon_shop_item_cnt=train['mon_shop_item_cnt'].clip(lower=0, upper=20)
)

In [None]:
# Columns for which lag features are generated.
lag_col_list = ['mon_shop_item_cnt', 'mon_shop_item_sales']
# Lags in months: 1 through 6 months back, and 12 months back.
lag_num_list = [1, 2, 3, 4, 5, 6, 12]

# Sort by shop_id, item_id, date_block_num so each pair's rows are chronological.
train = train.sort_values(
    ['shop_id', 'item_id', 'date_block_num'],
    ascending=[True, True, True]
).reset_index(drop=True)

# Shift within each (shop_id, item_id) series. A plain frame-wide shift would
# pull the last months of the previous pair into the first months of the next
# one; the grouped shift leaves NaN there instead.
pair_groups = train.groupby(['shop_id', 'item_id'], sort=False)
lag_features = {
    lag_col + '_' + str(lag): pair_groups[lag_col].shift(lag)
    for lag_col in lag_col_list
    for lag in lag_num_list
}
# Columns are added in one step, in the order <col>_1 ... <col>_12 per source
# column; re-running the cell overwrites them instead of duplicating them.
train = train.assign(**lag_features)
train

In [None]:
# Replace every remaining NaN in the frame with 0.
train = train.fillna(value=0)

In [None]:
# Lag features reach back up to 12 months, so only data from January 2014
# (date_block_num 12) onward is used for training. Block 33 is the validation
# month and block 34 is the month to predict.
month = train['date_block_num']
train_ = train[month.between(12, 32)].reset_index(drop=True)
val_ = train[month == 33].reset_index(drop=True)
test_ = train[month == 34].reset_index(drop=True)

# Split into model input features and the target variable.
non_feature_cols = ['date_block_num', 'mon_shop_item_cnt', 'mon_shop_item_sales']
train_X, train_y = train_.drop(columns=non_feature_cols), train_['mon_shop_item_cnt']
val_X, val_y = val_.drop(columns=non_feature_cols), val_['mon_shop_item_cnt']
test_X = test_.drop(columns=non_feature_cols)

LightGBM実装

In [None]:
from lightgbm.sklearn import LGBMRegressor
from lightgbm import plot_importance

# Baseline LightGBM regressor, evaluated with RMSE on the validation month.
# If this lands around 0.85 it may be promising.
gbm_params = dict(n_estimators=10000, early_stopping_rounds=100, n_jobs=-1)
gbm = LGBMRegressor(**gbm_params)
gbm.fit(X=train_X, y=train_y, eval_set=(val_X, val_y), eval_metric='rmse')

In [None]:
from optuna.integration import lightgbm as lgb
from lightgbm import plot_importance
import time

In [None]:
# Imported here so the cell stands on its own; the file already depends on lightgbm.
from lightgbm import early_stopping

# Wall-clock timer for the whole tuning/training run.
ts = time.time()

dtrain = lgb.Dataset(train_X, label=train_y)
eval_data = lgb.Dataset(val_X, label=val_y)

param = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
    }

# `lgb` is Optuna's LightGBM integration, so this `train` call also tunes
# hyperparameters. Early stopping is passed as a callback because the
# `early_stopping_rounds=` keyword was removed from `lightgbm.train` in
# LightGBM 4; the callback form works on old and new versions.
best = lgb.train(param,
                 dtrain,
                 valid_sets=[eval_data],
                 callbacks=[early_stopping(stopping_rounds=50)])

# Elapsed seconds, shown as the cell output.
time.time() - ts

In [None]:
# Show the tuned booster's parameters, best iteration and best score, one per line.
print(best.params, best.best_iteration, best.best_score, sep='\n')

In [None]:
# Horizontal bar chart of the baseline model's (gbm) feature importances,
# one bar per training feature column.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=gbm.feature_importances_, y=train_X.columns.values, ax=ax)
ax.set_title('Importance of features')

In [None]:
# Preview the first five rows of the prediction-month feature matrix.
test_X.head(n=5)

In [None]:
# Predict the target month with the tuned model. The training target was
# clipped to [0, 20], so predictions are clipped to the same range.
test_y = np.clip(best.predict(test_X), 0, 20)

# Keep the predictions in a separate frame rather than adding a column to
# test_X: mutating test_X would feed an extra feature to `predict` if this
# cell were run again.
test_pred = test_X[['shop_id', 'item_id']].assign(item_cnt_month=test_y)

# Map predictions back onto the original test rows (and their ID) by shop/item pair.
submission = pd.merge(
    test,
    test_pred,
    on=['shop_id', 'item_id'],
    how='left'
)
# A left join on duplicated keys would multiply rows; fail loudly rather than
# write a malformed submission file.
assert len(submission) == len(test), 'submission row count differs from test'

# Write the submission file.
submission[['ID', 'item_cnt_month']].to_csv('submission.csv', index=False)