# Predict Future Sales 日本語  

## ライブラリのインポート

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import product
from sklearn.preprocessing import LabelEncoder
import datetime
import re

import xgboost as xgb
# from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

import time
import gc
import pickle

# Default figure size used by every plot that does not set its own.
plt.rcParams["figure.figsize"] = (20, 10)

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

## データの読み込み

In [None]:
# Load the competition CSV files; all of them live in the same input directory.
input_dir = '/kaggle/input/competitive-data-science-predict-future-sales'
items = pd.read_csv(f'{input_dir}/items.csv')
shops = pd.read_csv(f'{input_dir}/shops.csv')
cats = pd.read_csv(f'{input_dir}/item_categories.csv')
train = pd.read_csv(f'{input_dir}/sales_train.csv')
test = pd.read_csv(f'{input_dir}/test.csv')

## 外れ値除外

In [None]:
# Visualise the distributions of daily counts and prices, then drop outliers.

ts = time.time()

# Marker style shared by both box plots for the points beyond the whiskers.
flierprops = {
    'marker': 'o',
    'markerfacecolor': 'purple',
    'markersize': 6,
    'linestyle': 'none',
    'markeredgecolor': 'black',
}

# Box plot of the daily item counts.
plt.figure(figsize=(10, 4))
plt.xlim(-100, 3000)
sns.boxplot(x=train.item_cnt_day, flierprops=flierprops)

# Box plot of the item prices, with 10% head-room above the maximum price.
plt.figure(figsize=(10, 4))
plt.xlim(train.item_price.min(), train.item_price.max() * 1.1)
sns.boxplot(x=train.item_price, flierprops=flierprops)
plt.show()

# exclude outliers: keep rows with 0 < price < 300000 and daily count < 1000
is_inlier = (
    (train.item_price > 0)
    & (train.item_price < 300000)
    & (train.item_cnt_day < 1000)
)
train = train[is_inlier].reset_index(drop=True)
# Disabled alternative: treat returns (negative counts) as zero sales.
# train.loc[train.item_cnt_day < 1, "item_cnt_day"] = 0

# print run time
print(f"outlier fix run time: {time.time()-ts}")

## データの整理  

### shops df

In [None]:
# Preprocess the shops DataFrame.
ts = time.time()

# Fix duplicated shop names: each duplicate shop_id is rewritten to the id that
# is kept for that shop, in both train and test.
duplicate_shop_ids = {
    0: 57,   # Якутск Орджоникидзе, 56
    1: 58,   # Якутск ТЦ "Центральный"
    10: 11,  # Жуковский ул. Чкалова 39м²
}
for old_id, kept_id in duplicate_shop_ids.items():
    train.loc[train.shop_id == old_id, 'shop_id'] = kept_id
    test.loc[test.shop_id == old_id, 'shop_id'] = kept_id

# city_name / shop_category_name are the first / second space-separated token
# of shop_name. One shop name is rewritten first so that its first token
# becomes 'СергиевПосад'.
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', "shop_name"] = 'СергиевПосад ТЦ "7Я"'
name_tokens = shops.shop_name.str.split(" ")
shops["city_name"] = name_tokens.map(lambda tokens: tokens[0])
shops["shop_category_name"] = name_tokens.map(lambda tokens: tokens[1])
# Strip the leading "!" so both spellings map to the same city.
shops.loc[shops.city_name == "!Якутск", "city_name"] = "Якутск"



# Label-encode the two derived text columns.
shops["shop_category_id"] = LabelEncoder().fit_transform(shops.shop_category_name)
shops["city_id"] = LabelEncoder().fit_transform(shops.city_name)
display(shops.head())

# Keep only the id columns that are merged into the training matrix later.
shops = shops[["shop_id", "shop_category_id", "city_id"]]
display(shops.head())

# print run time
print(f"run time: {time.time()-ts}")

### cats-df

In [None]:
# Preprocess the item-category DataFrame.
ts = time.time()

# type_name is the first space-separated word of the category name.
cats["type_name"] = cats.item_category_name.apply(lambda name: name.split(" ")[0]).astype(str)
# Two of the type names are folded into "Игры".
cats.loc[cats.type_name.isin(["Игровые", "Аксессуары"]), "type_name"] = "Игры"



# Label-encode the type.
cats["type_id"] = LabelEncoder().fit_transform(cats.type_name)

# subtype is the second "-"-separated piece of the category name when there is
# one, otherwise the whole name; surrounding whitespace is stripped either way.
cats["split"] = cats.item_category_name.apply(lambda name: name.split("-"))
cats["subtype"] = cats["split"].apply(
    lambda parts: (parts[1] if len(parts) > 1 else parts[0]).strip()
)
cats["subtype_id"] = LabelEncoder().fit_transform(cats.subtype)
display(cats.head())

# Keep only the id columns that are merged into the training matrix later.
cats = cats[["item_category_id", "subtype_id", "type_id"]]
display(cats.head())

# print run time
print(f"run time: {time.time()-ts}")

### items-df

## データをまとめる
### 組み合わせ作成  
> 組み合わせパターンを複数用意した

In [None]:
# Build the (month, shop, item) grid on which the lag features are computed.
ts = time.time()
# Only combinations taken from the training data are used. Taking lags leaves
# missing values, which are filled with 0 later.

cols = ["date_block_num", "shop_id", "item_id"]

##################################################
# Alternative grids that were tried; kept here, disabled, for reference.
#
# (a) Only the (month, shop, item) triples that actually occur in train:
#     matrix = train.groupby(cols, as_index=False)\
#                   .item_cnt_day.sum()\
#                   .rename(columns={'item_cnt_day': 'item_cnt_month'})
#
# (b) Every month x every shop x every item seen anywhere in train:
#     matrix = []
#     matrix.append(np.array(list(product(train.date_block_num.unique(),
#                                         train.shop_id.unique(),
#                                         train.item_id.unique())), dtype=np.int16))
#     matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
#
# (c) The test (shop, item) pairs repeated for every month:
#     matrix = pd.DataFrame()
#     for i in range(34):
#         mid = test[['shop_id', 'item_id']]
#         mid['date_block_num'] = i
#         matrix = pd.concat([matrix, mid], axis=0)

# Active choice: for each month, every shop seen that month crossed with every
# item seen that month.
matrix = []
for i in range(34):
    sales = train[train.date_block_num == i]
    matrix.append(
        np.array(
            list(product([i], sales.shop_id.unique(), sales.item_id.unique())),
            dtype=np.int16,
        )
    )
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)

################################################
# Merge the monthly sales total (item_cnt_month) onto the grid. Grid rows that
# have no sales record keep NaN here.
group = train.groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": ["sum"]})
group.columns = ["item_cnt_month"]
group.reset_index(inplace=True)
matrix = pd.merge(matrix, group, on=cols, how="left")
matrix["item_cnt_month"] = matrix["item_cnt_month"].astype(np.float16)



# Tag the test rows with month index 34 so they can be stacked under the grid.
test["date_block_num"] = 34

# Append the test rows below the grid. `keys` is intentionally not passed:
# with ignore_index=True the keys never reach the result, and supplying three
# keys for two frames is deprecated in recent pandas releases.
matrix = pd.concat([matrix, test.drop(["ID"], axis=1)],
                   ignore_index=True, sort=False)

# Inspect the assembled matrix.
display(matrix.head())
print(matrix.isna().sum())
print(matrix.info())

# print run time
print(f"run time: {time.time()-ts}")

### 訓練データへこれまでに作成した特徴量をすべてmergeする

In [None]:
# Merge every feature table created so far into the training matrix.
ts = time.time()

# Left joins so that every row of the matrix is kept.
matrix = (
    matrix
    .merge(shops, on=["shop_id"], how="left")
    .merge(items, on=["item_id"], how="left")
    .merge(cats, on=["item_category_id"], how="left")
)

# Calendar month (1-12) and year derived from the month index, which counts
# months starting at 0 with 2013 as the base year.
block_num = matrix["date_block_num"]
matrix["month"] = (block_num % 12) + 1
matrix["year"] = (block_num // 12) + 2013

# Disabled feature: months elapsed since the first sale, per (item, shop) and
# per item, i.e. how long the item has been on sale.
# matrix["item_shop_first_sale"] = matrix["date_block_num"]\
# - matrix.groupby(["item_id","shop_id"])["date_block_num"].transform('min')
# matrix["item_first_sale"] = matrix["date_block_num"]\
# - matrix.groupby(["item_id"])["date_block_num"].transform('min')

# Downcast each column to a smaller dtype, one column at a time.
compact_dtypes = {
    "item_id": np.int16,
    "shop_id": np.int8,
    "date_block_num": np.int8,
    "item_cnt_month": np.float16,
    "city_id": np.int8,
    "shop_category_id": np.int8,
    "item_category_id": np.int8,
    "type_id": np.int8,
    "subtype_id": np.int8,
    "month": np.int8,
    "year": np.int16,
}
for column, dtype in compact_dtypes.items():
    matrix[column] = matrix[column].astype(dtype)
# matrix["item_shop_first_sale"] = matrix["item_shop_first_sale"].astype(np.int8)
# matrix["item_first_sale"] = matrix["item_first_sale"].astype(np.int16)



# Inspect the merged matrix.
display(matrix.head())
print(matrix.isna().sum())
print(matrix.info())

# print run time
print(f"run time: {time.time()-ts}")

### ラグ特徴量を作成する

In [None]:
# Create the lag features; start the timer reported at the end of this cell.
ts = time.time()

# Helper that builds the lag features.
def lag_feature( df,lags, cols ):
    """Append lagged copies of the given columns to df.

    For every column in ``cols`` and every ``lag`` in ``lags`` a new column
    named ``<col>_lag_<lag>`` is added. Its value is the value ``col`` had for
    the same ``shop_id`` / ``item_id`` at ``date_block_num - lag``, or NaN when
    no such row exists.

    Args:
        df: DataFrame containing ``date_block_num``, ``shop_id``, ``item_id``
            and every column listed in ``cols``.
        lags: iterable of offsets, in ``date_block_num`` units.
        cols: names of the columns to lag.

    Returns:
        A new DataFrame with the lag columns appended; ``df`` itself is not
        modified. Progress (column name, then each lag) is printed.
    """
    key_cols = ["date_block_num", "shop_id", "item_id"]
    for col in cols:
        print(col)
        source = df[key_cols + [col]]
        for lag in lags:
            print(lag)
            lagged = source.copy()
            lagged.columns = key_cols + [f"{col}_lag_{lag}"]
            # Moving the month forward by `lag` makes the left join below pick
            # up, for month m, the value recorded at month m - lag.
            lagged["date_block_num"] = lagged["date_block_num"] + lag
            df = pd.merge(df, lagged, on=key_cols, how='left')
    return df

# Add lagged copies of the monthly sales count.
lags = [1, 2, 3]
matrix = lag_feature(matrix, lags, ["item_cnt_month"])

# Drop the earliest months, which cannot have a complete lag history (as many
# months as the largest lag), then fill the remaining missing values with 0.
# The fill is chained instead of done in place: the filtered frame is a
# selection of the original, and in-place edits on such a selection can raise
# pandas' SettingWithCopyWarning.
matrix = matrix[matrix["date_block_num"] >= max(lags)].fillna(0)

# Inspect the matrix with the lag columns.
display(matrix.head())
print(matrix.isna().sum())
print(matrix.info())

# print run time
print(f"run time: {time.time()-ts}")

### 使う特徴量を選定

In [None]:
# List the columns currently available as candidate features.
matrix.columns

In [None]:
# Disabled: restrict the matrix to the month index, the target and its lags.
#matrix = matrix[['date_block_num', 'item_cnt_month', 'item_cnt_month_lag_1',\
                 #'item_cnt_month_lag_2', 'item_cnt_month_lag_3']]

# Show the first rows transposed, so each column is displayed as a row.
matrix.head().T

In [None]:
# Remove the item_name column before the correlation and modelling steps.
matrix = matrix.drop(columns=["item_name"])
matrix

In [None]:
# Heat map of the pairwise correlations between all columns of the matrix,
# with the colour scale fixed to [-1, 1] and centred on 0.
fig, ax = plt.subplots(figsize=(12, 9))
df_corr = matrix.corr()
sns.heatmap(df_corr, vmin=-1, vmax=1, center=0, square=True)

## 予測
### モデリング

In [None]:
# Prepare the train / validation / test splits for modelling.
ts = time.time()

# Work on a copy; matrix itself is kept (its deletion is disabled below).
data = matrix.copy()
# del matrix
gc.collect()

# Split on the month index: < 33 is train, 33 is validation, 34 is test.
is_train = data.date_block_num < 33
is_valid = data.date_block_num == 33
is_test = data.date_block_num == 34

target_col = 'item_cnt_month'
train_X = data[is_train].drop(columns=[target_col])
val_X = data[is_valid].drop(columns=[target_col])
test_X = data[is_test].drop(columns=[target_col])

# Targets are clipped to the range [0, 20].
train_y = data.loc[is_train, target_col].clip(0, 20)
val_y = data.loc[is_valid, target_col].clip(0, 20)

# Release the working copy and the row masks.
del data, is_train, is_valid, is_test
gc.collect()

# print run time
print(f"run time: {time.time()-ts}")

### 学習

#### LightGBM

In [None]:
# Train through optuna's LightGBM integration, imported under the usual `lgb`
# alias (presumably so that train() also searches hyper-parameters - confirm
# against the optuna documentation).
from optuna.integration import lightgbm as lgb
from lightgbm import plot_importance


ts = time.time()

# Training data and the validation month used for early stopping.
dtrain = lgb.Dataset(train_X, label=train_y)
eval_data = lgb.Dataset(val_X, label=val_y)

# Fixed settings: RMSE regression with gradient-boosted trees, logging silenced.
param = dict(
    objective='regression',
    metric='rmse',
    verbosity=-1,
    boosting_type='gbdt',
)

# NOTE(review): recent LightGBM releases no longer accept the
# early_stopping_rounds keyword in train(); if this call raises a TypeError,
# early stopping has to be requested via callbacks or params instead - confirm
# against the installed lightgbm / optuna versions.
best = lgb.train(param, dtrain, valid_sets=eval_data, early_stopping_rounds=50)

# Elapsed seconds, shown as the cell output.
time.time() - ts

In [None]:
ts = time.time()
# Rebind `lgb` to plain LightGBM for this and the following cells.
import lightgbm as lgb

# Training data and the validation month used for early stopping.
dtrain = lgb.Dataset(train_X, label=train_y)
eval_data = lgb.Dataset(val_X, label=val_y)

# Fixed hyper-parameters (presumably the values found by the tuning cell -
# confirm). The number of boosting rounds and the early-stopping patience are
# part of this dict.
param = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'feature_pre_filter': False,
    'lambda_l1': 3.389173441244444e-07,
    'lambda_l2': 3.821912829254147e-07,
    'num_leaves': 68,
    'feature_fraction': 0.4,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'min_child_samples': 20,
    'num_iterations': 1000,
    'early_stopping_round': 50,
}

# Early stopping is driven by 'early_stopping_round' in `param`. The
# early_stopping_rounds keyword is not passed: it duplicated the same value and
# is rejected by newer LightGBM releases. valid_sets is given as a list.
best = lgb.train(param, dtrain, valid_sets=[eval_data])

print(best.best_score)
# Elapsed seconds, shown as the cell output.
time.time() - ts

In [None]:
# Report the trained booster's parameters, best iteration and best score.
for summary in (best.params, best.best_iteration, best.best_score):
    print(summary)

In [None]:
# Score the LightGBM model on the validation month.
pred_y = best.predict(val_X)

mse = mean_squared_error(val_y, pred_y)
rmse = np.sqrt(mse)
print(f"検証データのRMSE:{rmse}\n")

# Predicted vs. actual scatter on a 0-20 square; the red diagonal marks
# perfect predictions.
diagonal = np.linspace(0, 20, 100)

plt.figure(figsize=(8, 8))
plt.scatter(val_y, pred_y, s=0.5, alpha=0.5)
plt.plot(diagonal, diagonal, "red")
plt.xlabel("val_y")
plt.ylabel("pred_y")
plt.xlim(0, 20)
plt.ylim(0, 20)

plt.show()

#### XGBoost

In [None]:
# Train the XGBoost model.
ts = time.time()

dtrain = xgb.DMatrix(train_X, label=train_y)
dvalid = xgb.DMatrix(val_X, label=val_y)

# Squared-error regression, evaluated with RMSE.
params = dict(objective="reg:squarederror", eval_metric="rmse")

# Receives the per-round metric history of every evaluation set.
results_dict = {}

# Both sets are monitored each round; training runs for at most 1000 rounds
# with an early-stopping patience of 20 rounds.
watchlist = [(dtrain, "train"), (dvalid, "valid")]
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=20,
    evals_result=results_dict,
)

# print run time
print(f"run time: {time.time()-ts}")

### 提出

In [None]:
# Predict the test month and clip to [0, 20], the same range the targets were
# clipped to.
test_y = model.predict(xgb.DMatrix(test_X)).clip(0, 20)

# Use the ID column shipped with the test file rather than the positional
# index, so the submission stays correct even if `test` is ever filtered or
# re-indexed.
# NOTE(review): this assumes test_X rows are still in the same order as `test`
# (they were appended in order and only left-joined afterwards) - confirm.
submission = pd.DataFrame({
    "ID": test["ID"].values,
    "item_cnt_month": test_y
})
# Check for missing values before submitting, so the upload does not fail.
print(submission.isna().sum())

submission.to_csv('xgb_submission.csv', index=False)

### 結果の可視化  
#### 訓練データのRMSE

In [None]:
# RMSE of the XGBoost model on the training data.
train_pred_y = model.predict(xgb.DMatrix(train_X))

mse = mean_squared_error(train_y, train_pred_y)
rmse = np.sqrt(mse)
print(f"訓練データのRMSE:{rmse}\n")

# Predicted vs. actual scatter on a 0-20 square; the red diagonal marks
# perfect predictions.
diagonal = np.linspace(0, 20, 100)

plt.figure(figsize=(8, 8))
plt.scatter(train_y, train_pred_y, s=0.5, alpha=0.5)
plt.plot(diagonal, diagonal, "red")
plt.xlabel("train_y")
plt.ylabel("train_pred_y")
plt.xlim(0, 20)
plt.ylim(0, 20)

plt.show()

#### 検証データのRMSE

In [None]:
# RMSE of the XGBoost model on the validation month (predictions are used
# as-is; rounding is disabled).
pred_y = model.predict(xgb.DMatrix(val_X))# .round()

mse = mean_squared_error(val_y, pred_y)
rmse = np.sqrt(mse)
print(f"検証データのRMSE:{rmse}\n")

# Predicted vs. actual scatter on a 0-20 square; the red diagonal marks
# perfect predictions.
diagonal = np.linspace(0, 20, 100)

plt.figure(figsize=(8, 8))
plt.scatter(val_y, pred_y, s=0.5, alpha=0.5)
plt.plot(diagonal, diagonal, "red")
plt.xlabel("val_y")
plt.ylabel("pred_y")
plt.xlim(0, 20)
plt.ylim(0, 20)

plt.show()

#### 学習曲線

In [None]:
# Learning curve: per-round RMSE recorded during XGBoost training for the
# training set (red) and the validation set (blue).
for split_name, line_color in (("train", "red"), ("valid", "blue")):
    plt.plot(results_dict[split_name]["rmse"], color=line_color, label=split_name)
plt.legend()
plt.show()

#### 重要度(分岐寄与数)

In [None]:
# Feature importance by number of splits ("weight" is plot_importance's
# default importance type).
xgb.plot_importance(model, importance_type="weight")
plt.show()

#### 重要度(寄与度)

In [None]:
# Feature importance measured by gain.
xgb.plot_importance(booster=model, importance_type="gain")
plt.show()

#### 決定木

In [None]:
# Draw one tree of the model (selected with num_trees=2) laid out left to
# right, on a very large canvas so the nodes stay readable.
fig, ax = plt.subplots(figsize=(100, 100))
xgb.plot_tree(model, num_trees=2, rankdir='LR', ax=ax)
# fig.savefig("img.png")
plt.show()