In [None]:
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import catboost
from catboost import Pool
from catboost import CatBoostRegressor

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)


from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
%matplotlib inline
#sns.set(style="darkgrid")
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
warnings.filterwarnings("ignore")

import time

from xgboost import XGBRegressor
from string import punctuation
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

In [None]:
# Folder that holds the competition CSV files; every read below resolves against it.
DATA_DIR = '../input/competitive-data-science-predict-future-sales/'

# Column dtypes are fixed at read time (32-bit numerics, plain strings for text).
test = pd.read_csv(
    DATA_DIR + 'test.csv',
    dtype={'ID': 'int32', 'shop_id': 'int32', 'item_id': 'int32'},
)
item_categories = pd.read_csv(
    DATA_DIR + 'item_categories.csv',
    dtype={'item_category_name': 'str', 'item_category_id': 'int32'},
)
items = pd.read_csv(
    DATA_DIR + 'items.csv',
    dtype={'item_name': 'str', 'item_id': 'int32', 'item_category_id': 'int32'},
)
shops = pd.read_csv(
    DATA_DIR + 'shops.csv',
    dtype={'shop_name': 'str', 'shop_id': 'int32'},
)
sales = pd.read_csv(
    DATA_DIR + 'sales_train.csv',
    dtype={
        'date': 'str',
        'date_block_num': 'int32',
        'shop_id': 'int32',
        'item_id': 'int32',
        'item_price': 'float32',
        'item_cnt_day': 'int32',
    },
)

### Data
本次資料來源為2013~2015俄羅斯商店銷售資料，希望預測未來銷售數值。
初步計劃：著重data engineering部分

### Sale data (main training data)

In [None]:
# Parse the 'DD.MM.YYYY' strings into datetime64 values with one vectorised call
# instead of a Python-level strptime per row. pd.to_datetime also accepts a column
# that is already datetime, so re-running this cell no longer raises.
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')
# check: DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
sales.info()
sales.head()

### Link sales with item, shop, and item category metadata

In [None]:
# Attach item, shop and category attributes to every sales row.
# DataFrame.join(on=...) matches the left column against the *index* of the right
# frame, which is only correct while each lookup table's row position happens to
# equal its id. Merging on the id columns themselves removes that assumption and
# needs no suffix/drop clean-up; how='left' keeps every sales row in its order.
train = (
    sales
    .merge(items, on='item_id', how='left')
    .merge(shops, on='shop_id', how='left')
    .merge(item_categories, on='item_category_id', how='left')
)
train.head()

In [None]:
# Per (month block, shop, item): first/last sale date, mean price, total units.
# The columns are selected with a list ([[...]]); selecting several columns from a
# GroupBy with a bare tuple ("a", "b", "c") is deprecated and rejected by newer pandas.
monthly_sales = (
    sales
    .groupby(["date_block_num", "shop_id", "item_id"])[["date", "item_price", "item_cnt_day"]]
    .agg({"date": ["min", "max"], "item_price": "mean", "item_cnt_day": "sum"})
)
monthly_sales.head(10)

### Find the range of dates, prices, and items sold per day

In [None]:
# Compute each extreme once, then report them in min/max pairs.
first_day = train['date'].min().date()
last_day = train['date'].max().date()
lowest_price = train['item_price'].min()
highest_price = train['item_price'].max()
lowest_cnt = train['item_cnt_day'].min()
highest_cnt = train['item_cnt_day'].max()

print('Min date from train set: %s' % first_day)
print('Max date from train set: %s' % last_day)
print('Min item_price from train set: %s' % lowest_price)
print('Max item_price from train set: %s' % highest_price)
print('Min item_cnt_day from train set: %s' % lowest_cnt)
print('Max item_cnt_day from train set: %s' % highest_cnt)

### 由Max item_cnt_day from train set 得知銷售行為與價格相關 （價格浮動）（特價）
若價格調降有週期性 -> 銷售量有週期性 -> 不需要觀察價格即可得知

### Future Plan:
根據價格做預測，進一步預測銷售數量


### training data info

In [None]:
# Preview the first rows of the joined training frame.
train.head()

### 查看價格分佈

In [None]:
# Histogram of log10(price) with a log-scaled count axis.
# log10 is undefined for values <= 0 (it yields NaN or -inf), so only strictly
# positive prices are plotted; any other rows are excluded explicitly here instead
# of being dropped, or breaking the binning, implicitly.
positive_prices = train.loc[train['item_price'] > 0, 'item_price']

fig, ax = plt.subplots()
sns.distplot(np.log10(positive_prices), kde=False, ax=ax)
ax.set_yscale('log')
ax.set_xlabel('log10(item_price)')
ax.set_ylabel('number of rows')

### 查看每日銷量分佈

In [None]:
# Histogram of daily unit counts restricted to the open interval (-5, 10000),
# drawn with a log-scaled count axis.
daily_cnt = train['item_cnt_day']
in_plot_range = (daily_cnt < 10000) & (daily_cnt > -5)
sns.distplot(daily_cnt.loc[in_plot_range], kde=False)
plt.yscale('log')

### 由於分佈有巨大落差，選擇每日銷量<750作為分界點，以避免outlier

多數資料單日有巨大銷量可能因為新品上市(如GTA5)，特價促銷，或某些資料上無法確認的狀態造成。

note: 750只是一個隨意抓取的數值，有優化空間



In [None]:
# Keep rows that pass both outlier cut-offs: fewer than 750 units sold in the day
# and a price below 10**5.
within_cnt_limit = train['item_cnt_day'] < 750
within_price_limit = train['item_price'] < 10**5
train_df = train.loc[within_cnt_limit & within_price_limit, :]

### 查看價格與銷量是否有相關性

note: 一定金額之內有此特性

In [None]:
# Log-log scatter of price (x) against daily units sold (y) for the filtered rows.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(train_df['item_price'], train_df['item_cnt_day'], alpha=0.25)
ax.set_xscale('log')
ax.set_yscale('log')
# Fixed axis windows: y from 1e-1 to 1e4, x from 1e-3 to 1e6.
ax.set_ylim((1e-1, 1e4))
ax.set_xlim((1e-3, 1e6))
plt.show()

### 由於training data 包含許多非test data中有的item，選擇剔除

future plan: 檢查以全數dataset是否能得到更好結果

In [None]:
test_shop_ids = test['shop_id'].unique()
test_item_ids = test['item_id'].unique()

# A training row is kept only when both its shop_id and its item_id occur in the test set.
shop_in_test = train['shop_id'].isin(test_shop_ids)
item_in_test = train['item_id'].isin(test_item_ids)
train_in_data = train[shop_in_test & item_in_test]

print('Data set size in train:', train.shape[0])
print('Data set size in both train & test:', train_in_data.shape[0])

### 簡易EDA

credit: NowYSM@kaggle

In [None]:
def eda(data):
    """Print a quick textual overview of a DataFrame.

    Sections, in order: first five rows, ``info()`` report, column dtypes,
    per-column missing-value counts (printed under two headings), and shape.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("----------Top-5- Record----------")
    print(data.head(5))
    print("-----------Information-----------")
    # info() writes its report itself and returns None; printing the return value
    # would add a stray "None" line after the report.
    data.info()
    print("-----------Data Types-----------")
    print(data.dtypes)
    print("----------Missing value-----------")
    print(data.isnull().sum())
    # isna() is an alias of isnull(), so this section repeats the counts above;
    # both headings are kept so the printed layout stays the same.
    print("----------Null value-----------")
    print(data.isna().sum())
    print("----------Shape of Data----------")
    print(data.shape)

def graph_insight(data):
    """Print the set of dtypes in ``data`` and draw a histogram per numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns are plotted. It is only read.

    Returns
    -------
    None
    """
    print(set(data.dtypes.tolist()))
    # Select every numeric dtype rather than a hard-coded width list: with only
    # ['float32', 'int32'], columns of any other width (e.g. after astype(int))
    # were silently left out of the figure.
    df_num = data.select_dtypes(include='number')
    if df_num.shape[1] == 0:
        # DataFrame.hist raises when there is nothing to plot; report it instead.
        print('No numeric columns to plot.')
        return
    df_num.hist(figsize=(16, 16), bins=50, xlabelsize=8, ylabelsize=8)
    
def drop_duplicate(data, subset):
    """Remove duplicate rows from ``data`` in place and report how many were dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to de-duplicate. It is modified in place and its index is reset
        to 0..n-1.
    subset : list
        Column labels that define a duplicate; the first occurrence is kept.

    Returns
    -------
    None
    """
    rows_before = data.shape[0]
    print('Before drop shape:', data.shape)
    data.drop_duplicates(subset=subset, keep='first', inplace=True)
    data.reset_index(drop=True, inplace=True)
    print('After drop shape:', data.shape)
    rows_after = data.shape[0]
    print('Total Duplicate:', rows_before - rows_after)
# Print the textual overview of the joined training frame, then draw its
# per-column histograms.
eda(train)
graph_insight(train)

In [None]:
# Show the first 20 rows of the joined training frame.
train.head(20)

### 預期default分類方式過細，根據商品型態進行category merge

In [None]:
# Coarse product group for runs of consecutive category positions, written as
# (first position, last position inclusive, label). Entries are applied top to
# bottom, so the narrower entries listed after a broad range override it.
# NOTE(review): positions index the rows of item_categories in file order; this
# presumably coincides with item_category_id - confirm against the CSV.
CATEGORY_GROUPS = [
    (0, 0, 'pc_headphone'),
    (1, 7, 'console_accessory'),
    (8, 8, 'ticket'),
    (9, 9, 'delivary'),
    (10, 17, 'console'),
    (12, 12, 'new_console'),
    (14, 14, 'new_console'),
    (16, 16, 'new_console'),
    (18, 24, 'console_game'),
    (21, 22, 'mobile_game'),
    (25, 25, 'console_game_accessory'),
    (26, 26, 'mobile_game'),
    (27, 31, 'pc_game'),
    (32, 36, 'payment_card'),
    (37, 41, 'dvd'),
    (42, 54, 'book'),
    (55, 60, 'music'),
    (61, 72, 'gift'),
    (73, 78, 'software'),
]

# Start from the original category names; rows past the last listed position keep
# their own name and therefore each form their own group.
l_cat = list(item_categories.item_category_name)
for first, last, label in CATEGORY_GROUPS:
    for ind in range(first, last + 1):
        l_cat[ind] = label

item_categories['my_category_tmp'] = l_cat

# Integer code per group, numbered in order of first appearance (0, 1, 2, ...).
# pd.factorize replaces the hand-written "look the label up in a growing list" loop.
l_cat_int = pd.factorize(pd.Series(l_cat))[0].tolist()
item_categories['my_category'] = l_cat_int

item_categories.head()

In [None]:
# Number of month blocks to enumerate (date_block_num 0 .. N_MONTHS - 1).
N_MONTHS = 34

daily_rows = train_in_data[['date', 'date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day']]
shop_ids = daily_rows['shop_id'].unique()
item_ids = daily_rows['item_id'].unique()

# Every (month, shop, item) combination, months varying slowest. from_product builds
# the grid in one call instead of appending row by row in three nested Python loops.
empty_df = pd.MultiIndex.from_product(
    [range(N_MONTHS), shop_ids, item_ids],
    names=['date_block_num', 'shop_id', 'item_id'],
).to_frame(index=False)

# Left merge: combinations without any sale get NaN in the sales columns.
train_monthly = pd.merge(empty_df, daily_rows, on=['date_block_num', 'shop_id', 'item_id'], how='left')

# The category is a property of the item, so look it up per item_id. A blanket
# fillna(0) put category 0 on every combination that had no sale, which filed all
# of those rows under category 0 in the later per-category aggregates.
item_to_category = train_in_data.drop_duplicates('item_id').set_index('item_id')['item_category_id']
train_monthly['item_category_id'] = train_monthly['item_id'].map(item_to_category)

# Only the numeric sales columns are zero-filled; 'date' stays missing (NaT) for
# combinations without a sale instead of being overwritten with 0.
sales_columns = ['item_price', 'item_cnt_day']
train_monthly[sales_columns] = train_monthly[sales_columns].fillna(0)

train_monthly = train_monthly[['date_block_num', 'shop_id', 'item_id', 'date',
                               'item_category_id', 'item_price', 'item_cnt_day']]

In [None]:
# Per-column non-null counts over the rows whose item_cnt_day exceeds 20.
train_monthly[train_monthly["item_cnt_day"]>20].count()

### 比較每日銷量在20以上/以下 和 僅在test data中出現的item / training data的所有item 的比例


In [None]:
# Share of "more than 20 units in a day" within train_monthly: first by distinct
# item_id (denominator: items with a count >= 0), then by rows (denominator: rows
# with a count > 0).
cnt = train_monthly["item_cnt_day"]
items_over_20 = train_monthly[cnt > 20]['item_id'].unique()
items_non_negative = train_monthly[cnt >= 0]['item_id'].unique()
rows_over_20 = train_monthly[cnt > 20]
rows_positive = train_monthly[cnt > 0]

print("unique item day sale cnt>20 amt in train_monthly (keep only item_id in test) is", len(items_over_20), "/", len(items_non_negative))
print("item day sale cnt>20 amt in train_monthly (keep only item_id in test) is", len(rows_over_20), "/", len(rows_positive))

In [None]:
# Same two ratios computed on the full joined training frame: distinct item_id
# with a count > 20 over distinct item_id with a count >= 0, then rows with a
# count > 20 over rows with a count > 0.
cnt_all = train["item_cnt_day"]
all_items_over_20 = train[cnt_all > 20]['item_id'].unique()
all_items_non_negative = train[cnt_all >= 0]['item_id'].unique()
all_rows_over_20 = train[cnt_all > 20]
all_rows_positive = train[cnt_all > 0]

print("unique item day sale cnt>20 amt in train is", len(all_items_over_20), "/", len(all_items_non_negative))
print("item day sale cnt>20 amt in train is", len(all_rows_over_20), "/", len(all_rows_positive))

In [None]:
# Preview the first rows of the month/shop/item grid joined with the sales rows.
train_monthly.head()

In [None]:
# Preview the first rows of the joined training frame for comparison.
train.head()

In [None]:
# Cast the four key columns to int, order the rows by month block and group on the
# keys. After this cell train_monthly is a GroupBy object, not a DataFrame; the
# aggregation cell that follows turns it back into a frame. Because the name is
# rebound, this cell cannot be re-run without rebuilding train_monthly first.
key_columns = ['date_block_num', 'shop_id', 'item_category_id', 'item_id']
train_monthly = train_monthly.astype({column: int for column in key_columns})
train_monthly = (
    train_monthly
    .sort_values('date_block_num')
    .groupby(key_columns, as_index=False)
)

In [None]:
# Aggregate each (month, shop, category, item) group: sum and mean of price; sum,
# mean and row count of the daily unit counts.
aggregations = {
    'item_price': ['sum', 'mean'],
    'item_cnt_day': ['sum', 'mean', 'count'],
}
# Flat names for the result: the four group keys followed by the five aggregates,
# in the order they are listed above.
flat_column_names = [
    'date_block_num', 'shop_id', 'item_category_id', 'item_id',
    'item_price', 'mean_item_price',
    'item_cnt', 'mean_item_cnt', 'transactions',
]
train_monthly = train_monthly.agg(aggregations)
# Rename feature
train_monthly.columns = flat_column_names

In [None]:
# Spot-check: first aggregated rows that belong to item_category_id 40.
train_monthly[train_monthly['item_category_id']==40].head()

In [None]:
# train_monthly.columns = ['date_block_num', 'shop_id', 'item_id', 'date', 'item_category_id', 'item_price',  'item_cnt', 'mean_item_cnt', 'transactions']
train_monthly['year'] = train_monthly['date_block_num'].apply(lambda x: ((x//12) + 2013))
train_monthly['month'] = train_monthly['date_block_num'].apply(lambda x: (x % 12))
gp_month_mean = train_monthly.groupby(['month'], as_index=False)['item_cnt'].mean()
gp_month_sum = train_monthly.groupby(['month'], as_index=False)['item_cnt'].sum()
gp_category_mean = train_monthly.groupby(['item_category_id'], as_index=False)['item_cnt'].mean()
gp_category_sum = train_monthly.groupby(['item_category_id'], as_index=False)['item_cnt'].sum()
gp_shop_mean = train_monthly.groupby(['shop_id'], as_index=False)['item_cnt'].mean()
gp_shop_sum = train_monthly.groupby(['shop_id'], as_index=False)['item_cnt'].sum()

In [None]:
# Preview the aggregated frame, now including the derived year and month columns.
train_monthly.head()

### 檢查各類別每月銷量

In [None]:
# Two stacked bar charts sharing the x axis: mean (top) and sum (bottom) of the
# monthly item count per category.
f, axes = plt.subplots(2, 1, figsize=(20, 10), sharex=True)
panels = [
    (gp_category_mean, "Monthly mean"),
    (gp_category_sum, "Monthly sum"),
]
for panel_ax, (panel_data, panel_title) in zip(axes, panels):
    sns.barplot(
        x="item_category_id", y="item_cnt", data=panel_data, ax=panel_ax, palette="mako"
    ).set_title(panel_title)
plt.show()

### 利用boxplot檢查極端值分佈

In [None]:
# Box plot of the monthly item count per category, to expose extreme values.
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=train_monthly, x="item_category_id", y="item_cnt", palette="mako", ax=ax)
plt.show()

In [None]:
# First 10 aggregated rows whose monthly item count exceeds 500.
train_monthly[train_monthly['item_cnt']>500].head(10)

In [None]:
# Number of distinct items and categories in the aggregated frame.
## print('unique shop #: ',len(train_monthly['shop_id'].unique()))
for label, column in (('unique item #: ', 'item_id'), ('unique category #: ', 'item_category_id')):
    print(label, len(train_monthly[column].unique()))

In [None]:
# Total item count for each value of the zero-based 'month' column.
train_monthly.groupby(['month'], as_index=False)['item_cnt'].sum()

In [None]:
# First aggregated rows whose mean item price exceeds 10.
train_monthly[train_monthly['mean_item_price']>10].head()

In [None]:
# Attach the merged category code (my_category) to every aggregated row.
input_data = pd.merge(train_monthly, item_categories.drop(columns=['item_category_name']), on=['item_category_id'])

# Identifier-like columns are stored as strings.
for id_column in ['item_id', 'item_category_id', 'shop_id', 'year', 'month']:
    input_data[id_column] = input_data[id_column].astype('str')

# Drop every same-month sales statistic. mean_item_price is included: it is measured
# in the very month whose item_cnt is the target and is 0 exactly on the zero-filled
# no-sale rows, so leaving it among the features leaks the target.
input_data = input_data.drop(
    ['item_price', 'mean_item_price', 'mean_item_cnt', 'transactions', 'my_category_tmp'],
    axis=1,
)

# Target first, then the features, spelled out by name so the order does not
# depend on how many columns were dropped above.
cols = ['item_cnt', 'year', 'month', 'my_category',
        'date_block_num', 'shop_id', 'item_category_id', 'item_id']
input_data[cols].head()



### 使用xgboost訓練/預測

In [None]:
# Encode Categories
from sklearn import preprocessing

# .copy() detaches the working frame from input_data, so the assignments below
# write to an independent frame rather than to a slice of another one.
input_data_clean = input_data[cols].copy()
number = preprocessing.LabelEncoder()

# Each column is replaced by consecutive integer codes. Values are cast to int
# first: the encoder numbers classes in sorted order, and sorting the string form
# of the ids would place '10' before '2' and scramble e.g. the month order.
encoded_columns = ['item_id', 'item_category_id', 'shop_id', 'year', 'month',
                   'my_category', 'date_block_num']
for column in encoded_columns:
    input_data_clean[column] = number.fit_transform(input_data_clean[column].astype(int))

input_data_clean.head()

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Column 0 is the target (item_cnt); every remaining column is a feature.
X, y = input_data_clean.iloc[:, 1:], input_data_clean.iloc[:, 0]
# NOTE(review): this is a random 80/20 split of rows from all months, so the
# hold-out contains months the model also trains on - consider holding out the
# last month block(s) instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# xgb.train takes the number of boosting rounds through its num_boost_round
# argument (default 10). A 'num_round' entry in the params dict is not read by it,
# so only 10 of the intended 1000 rounds were being trained.
NUM_BOOST_ROUND = 1000
# Stop once the hold-out RMSE has not improved for this many rounds, which keeps
# the 1000-round budget affordable.
EARLY_STOPPING_ROUNDS = 10

param = {'max_depth': 10,
         'subsample': 1,
         'min_child_weight': 0.5,
         'eta': 0.3,
         'seed': 1,
         'silent': 0,  # NOTE(review): newer XGBoost releases use 'verbosity' - confirm for the installed version
         'eval_metric': 'rmse'}

progress = dict()  # filled by xgb.train with the per-round metric history
xgbtrain = xgb.DMatrix(X_train, y_train)
xgbvalid = xgb.DMatrix(X_test, y_test)
# Early stopping monitors the last entry of the watch list, i.e. the hold-out set.
watchlist = [(xgbtrain, 'train'), (xgbvalid, 'valid')]

bst = xgb.train(
    param,
    xgbtrain,
    num_boost_round=NUM_BOOST_ROUND,
    evals=watchlist,
    evals_result=progress,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    verbose_eval=50,
)

# NOTE(review): the same hold-out drives early stopping and the score below, so
# the reported RMSE is slightly optimistic.
preds = bst.predict(xgbvalid)
rmse = np.sqrt(mean_squared_error(y_test, preds))  # (y_true, y_pred) order
print(rmse)

In [None]:
# Draw the feature importances on an explicitly sized axes. plot_importance creates
# its own figure when no axes is passed, so a preceding plt.figure(figsize=...)
# call was left empty and the chart came out at the default size.
fig, ax = plt.subplots(figsize=(20, 10))
xgb.plot_importance(bst, ax=ax)
plt.show()

### 可以發現自定義的分類效果有限，重要性遠低於原始定義

future plan：
    1. 去掉自分類
    2. 此輸入資料單純把xgboost當成回歸工具使用(針對單一item給予item_category_id,shop_id,year,month,date_block_num)，對於週期性波動預測度恐有限。
       應針對單一item輸入最接近的數筆資料作為預測最新時間點的參考