In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Problem Statement

Predict total sales for every product and store in the next month.

#### Data fields

|Field| Meaning|
| --- | --- |
|ID | an Id that represents a (Shop, Item) tuple within the test set|
|shop_id | unique identifier of a shop|
|item_id | unique identifier of a product|
|item_category_id | unique identifier of item category|
|item_cnt_day | number of products sold. You are predicting a monthly amount of this measure|
|item_price | current price of an item|
|date | date in format dd/mm/yyyy|
|date_block_num | a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33|
|item_name | name of item|
|shop_name | name of shop|
|item_category_name | name of item category|

#### Data files

|File|descriptions|
|---|---|
|sales_train.csv | the training set. Daily historical data from January 2013 to October 2015.|
|test.csv | the test set. You need to forecast the sales for these shops and products for November 2015.|
|sample_submission.csv | a sample submission file in the correct format.|
|items.csv | supplemental information about the items/products.|
|item_categories.csv  | supplemental information about the item categories.|
|shops.csv | supplemental information about the shops.|

In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
pd.set_option('max_colwidth', 400)

# Download and View Data

In [None]:
# Read the CSVs into pandas: the daily sales history and the item table
# (the item table is used later to look up item_category_id per item_id).
train_data = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv")
items_info = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/items.csv")

# Category and shop name tables are unneeded in this model, so they are not loaded
#category_info = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv")
#shops_info = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/shops.csv")

In [None]:
# Change the condition to True to print length, dtypes, null counts and memory usage.
# Only the frames that are actually loaded are listed: the category_info and
# shops_info reads are commented out above, so calling .info() on them would
# raise NameError as soon as this guard is enabled.
if False:
    train_data.info()
    items_info.info()

In [None]:
# Preview the first rows of the raw daily sales data
train_data.head()

# Data Preparation & Feature Creation

In [None]:
# Parse the date strings into datetimes; dayfirst=True makes pandas read the
# leading number as the day rather than the month.
train_data["date"] = pd.to_datetime(train_data["date"], dayfirst=True)

In [None]:
# Collapse the daily rows into one row per (date_block_num, shop_id, item_id),
# i.e. per month / shop / item, with price and quantity statistics. The author's
# note reports 28 duplicated date/shop/item rows; this aggregation absorbs them too.
#
# Named aggregation produces flat column names directly. A dict-of-lists agg
# would create two-level columns, and merging a two-level frame with the
# single-level item table is rejected by recent pandas versions.
grouped_data = (
    train_data
    .groupby(["date_block_num", "shop_id", "item_id"], as_index=False)
    .agg(
        item_price_mean=("item_price", "mean"),
        item_price_max=("item_price", "max"),
        item_price_min=("item_price", "min"),
        item_cnt_day_sum=("item_cnt_day", "sum"),
        item_cnt_day_max=("item_cnt_day", "max"),
        item_cnt_day_min=("item_cnt_day", "min"),
        item_cnt_day_median=("item_cnt_day", "median"),
    )
)

# Attach each item's category. items_info is already in memory from the loading
# cell, so items.csv is not read a second time. Merging on the shared key keeps a
# single item_id column, and item_category_id ends up as the last column.
grouped_data = grouped_data.merge(
    items_info[["item_id", "item_category_id"]].drop_duplicates(),
    on="item_id",
    how="left",
)

In [None]:
# Positional rename to flat column names. The list must supply exactly one name
# per column in the frame's current order: the three group keys, the three
# item_price statistics (mean, max, min), the four item_cnt_day statistics
# (sum, max, min, median) and finally item_category_id.
grouped_data.columns = ['date_block_num', 'shop_id', 'item_id', 'item_price_mean', 'item_price_max', 
                        'item_price_min', 'item_cnt_day_sum', 'item_cnt_day_max', 'item_cnt_day_min',
                        'item_cnt_day_median', 'item_category_id']

In [None]:
# Build previous-month (t-1) features: take every month except the last one (33)
# and shift its month index forward by one so it lines up with the following month.
# .assign returns a new frame, which avoids mutating a .loc slice in place
# (the source of pandas' SettingWithCopyWarning).
t_minus1_data = (
    grouped_data
    .loc[
        grouped_data.date_block_num != 33,
        ["date_block_num", "shop_id", "item_id", "item_price_mean", "item_cnt_day_sum", "item_cnt_day_median"],
    ]
    .assign(date_block_num=lambda frame: frame["date_block_num"] + 1)
)

# Month 0 has no predecessor, so it is excluded. merge() defaults to an inner
# join: only (month, shop, item) rows that also have a row in the previous month
# survive. Lagged columns receive the "_tminus1" suffix.
driver_data = grouped_data[grouped_data.date_block_num != 0].merge(
    t_minus1_data,
    on=["date_block_num", "shop_id", "item_id"],
    suffixes=("", "_tminus1"),
)

In [None]:
# Create deltas: overwrite each "_tminus1" column with
# log(previous month) - log(current month).
# Monthly sums can be zero or negative (the quantities are signed), and np.log
# would turn those into -inf / NaN with RuntimeWarnings. Non-positive inputs are
# masked to NaN first, so the delta is simply missing for those rows instead of
# infinite.
for col in ["item_price_mean", "item_cnt_day_sum", "item_cnt_day_median"]:
    previous = driver_data[col + "_tminus1"]
    current = driver_data[col]
    driver_data[col + "_tminus1"] = (
        np.log(previous.where(previous > 0)) - np.log(current.where(current > 0))
    )

In [None]:
# Copy data with all features for later: this snapshot is taken before the target
# join below filters driver_data, and is used in the Submission section to score
# the most recent row of every shop/item pair.
submission_data = driver_data.copy()

In [None]:
# Join with T+1 response data: the target for a feature row of month t is the
# item_cnt_day_sum of month t+1. Shifting the response's month index back by one
# aligns it with the feature row. .assign is used instead of an in-place "-="
# on a .loc slice, which triggers pandas' SettingWithCopyWarning.
response_data = (
    grouped_data
    .loc[grouped_data.date_block_num != 0, ["date_block_num", "shop_id", "item_id", "item_cnt_day_sum"]]
    .assign(date_block_num=lambda frame: frame["date_block_num"] - 1)
)

# Month 33 has no following month to learn from, so it is dropped. The default
# inner join keeps only rows whose shop/item also has a row in the next month.
# The target column is named "item_cnt_day_sum_realised".
driver_data = driver_data[driver_data.date_block_num != 33].merge(
    response_data,
    on=["date_block_num", "shop_id", "item_id"],
    suffixes=("", "_realised"),
)

# Fit XGBoost Model

In [None]:
# Baseline regressor with manually chosen hyper-parameters.
xgb_model = XGBRegressor(
    n_estimators=500, learning_rate=0.1, max_depth=12, min_child_weight=1,
    subsample=0.8, colsample_bytree=0.8, colsample_bylevel=0.8, colsample_bynode=0.8,
    gamma=10, reg_alpha=50, reg_lambda=200,
    random_state=27,
)

# Every column except the row identifiers and the target is a model feature.
feature_frame = driver_data.drop(columns=["date_block_num", "shop_id", "item_id", "item_cnt_day_sum_realised"])
training_cols = feature_frame.columns
X = feature_frame.values
y = driver_data["item_cnt_day_sum_realised"].values

# Shuffled 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=27)

In [None]:
# The hold-out split is the evaluation set monitored for early stopping.
eval_set = [(X_test, y_test)]
# early_stopping_rounds is configured on the estimator: passing it to fit() has
# been deprecated since XGBoost 1.6 and is rejected by recent releases.
xgb_model.set_params(early_stopping_rounds=20)
xgb_model.fit(X_train, y_train, eval_set=eval_set, verbose=True)

In [None]:
# Make predictions for the hold-out data and round them to whole units
# (np.rint rounds half to even, like round() on each element did).
y_pred = xgb_model.predict(X_test)
predictions = np.rint(y_pred)

# Evaluate predictions. mean_squared_error returns the MSE; the square root is
# taken explicitly so the value stored in `rmse` really is a root mean squared error.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

# Scores noted by the author; presumably MSE values, since this cell previously
# stored mean_squared_error's result without taking the root:
# Default: 103.00
# Manual tuned 1: 89.51
pd.Series({"mse": mse, "rmse": rmse})

# Bayesian Optimisation

In [None]:
# Install Gpyopt
!pip install GPyOpt
import GPyOpt

In [None]:
# https://machinelearningapplied.com/hyperparameter-search-with-gpyopt-part-2-xgboost-classification-and-ensembling/

# Hyperparameter tuning with Bayesian optimisation
# Cross-validated hyperparameter tuning

In [None]:
# GPyOpt search domain. The ORDER of the entries matters: xgb_function reads the
# sampled values by position (index 0 = colsample_bylevel ... index 8 = reg_lambda).
search_space = [
    # Column / row sampling fractions
    dict(name='colsample_bylevel', type='continuous', domain=(0.5, 0.99)),
    dict(name='colsample_bynode', type='continuous', domain=(0.5, 0.99)),
    dict(name='colsample_bytree', type='continuous', domain=(0.5, 0.99)),
    dict(name='subsample', type='continuous', domain=(0.6, 0.99)),

    # Tree shape
    # NOTE(review): for a 'discrete' variable the domain is the explicit set of
    # candidate values, so (1, 10) offers only 1 and 10 - confirm that a full
    # 1..10 range was not intended.
    dict(name='min_child_weight', type='discrete', domain=(1, 10)),
    dict(name='max_depth', type='discrete', domain=(8, 9, 10, 11, 12, 13, 14, 15)),

    # Regularisation
    dict(name='gamma', type='continuous', domain=(0.1, 10)),
    dict(name='reg_alpha', type='discrete', domain=(0, 1, 10, 50)),
    dict(name='reg_lambda', type='discrete', domain=(1, 10, 50, 100, 150, 200, 500, 1000)),
]

In [None]:
def xgb_function(params):
    """
    Objective for the Bayesian optimiser: fit an XGBRegressor with the sampled
    hyper-parameters and return its error on the hold-out split.

    Training data defined outside function: X_train, y_train, X_test, y_test and
    eval_set are read from the notebook's global scope. The same hold-out split
    is used both for early stopping and for the returned score.

    Inputs
    ------
    params : 2-D array-like
        Only row 0 is read. Its nine entries are taken by position, in the
        order of `search_space`: colsample_bylevel, colsample_bynode,
        colsample_bytree, subsample, min_child_weight, max_depth, gamma,
        reg_alpha, reg_lambda. min_child_weight, max_depth, reg_alpha and
        reg_lambda are cast to int.

    Outputs
    -------
    float
        Root mean squared error between y_test and the rounded predictions.
        (mean_squared_error alone returns the MSE, so the square root is
        taken explicitly to match the "rmse" label.)
    """

    # NOTE(review): learning_rate is not set here, so the library default
    # applies, while the final tuned model in this notebook passes
    # learning_rate=0.1 - confirm this difference is intended.
    dict_parameters = {'colsample_bylevel':params[0][0], 
                         'colsample_bynode':params[0][1],
                         'colsample_bytree':params[0][2],
                         'subsample':params[0][3],
                         'min_child_weight':int(params[0][4]),
                         'max_depth':int(params[0][5]),
                         'gamma':params[0][6],
                         'reg_alpha':int(params[0][7]),
                         'reg_lambda':int(params[0][8]),
                         "random_state": 27,
                         "n_estimators": 500}

    # early_stopping_rounds is given to the estimator rather than to fit():
    # the fit() argument has been deprecated since XGBoost 1.6 and is rejected
    # by recent releases. It is kept out of dict_parameters so the printed
    # dictionary still lists only the tuned values.
    opt_model = XGBRegressor(early_stopping_rounds=20, **dict_parameters)
    opt_model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

    # Round predictions to whole units before scoring (half-to-even).
    y_pred = opt_model.predict(X_test)
    predictions = np.rint(y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    print('\ndict_parameters:',dict_parameters)
    print('best_iteration =',opt_model.best_iteration)
    print('rmse =',rmse)

    return rmse

In [None]:
# Bayesian optimiser that minimises xgb_function over search_space.
gpyopt_bo = GPyOpt.methods.BayesianOptimization(
    # Objective and domain
    f=xgb_function,
    domain=search_space,
    maximize=False,
    exact_feval=False,
    normalize_Y=True,
    de_duplication=True,
    # Surrogate model and acquisition
    model_type='GP',
    model_update_interval=1,
    acquisition_type='EI',
    acquisition_optimizer_type='lbfgs',
    # Initial design: 5 randomly drawn points
    initial_design_numdata=5,
    initial_design_type='random',
    # Evaluation strategy
    evaluator_type='sequential',
    batch_size=1,
    num_cores=os.cpu_count(),
    # Logging
    verbosity=True,
    verbosity_model=False,
)

In [None]:
# Run the optimisation loop with max_iter=10; every evaluation calls xgb_function,
# i.e. trains one model.
gpyopt_bo.run_optimization(max_iter=10)

In [None]:
# One row per evaluated configuration: the sampled hyper-parameters (columns in
# search_space order) plus the objective value the optimiser recorded for them.
header_params = [entry['name'] for entry in search_space]

df_results = pd.DataFrame(data=gpyopt_bo.X, columns=header_params)
df_results["rmse"] = gpyopt_bo.Y

In [None]:
# Show every evaluated hyper-parameter configuration with its score
df_results

In [None]:
# Final model using hyper-parameter values hard-coded from an optimisation run.
# early_stopping_rounds is set on the estimator because passing it to fit() has
# been deprecated since XGBoost 1.6 and is rejected by recent releases.
xgb_opt_model = XGBRegressor(
    colsample_bylevel=0.5095174790289587,
    colsample_bynode=0.6375048884962791,
    colsample_bytree=0.6727943521088571,
    subsample=0.6096441166795321,
    min_child_weight=10,
    max_depth=13,
    gamma=2.797555559145436,
    reg_alpha=1,
    reg_lambda=100,
    random_state=27,
    n_estimators=500,
    learning_rate=0.1,
    early_stopping_rounds=20,
)
xgb_opt_model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

In [None]:
# Make predictions for the hold-out data and round them to whole units
# (np.rint rounds half to even, like round() on each element did).
y_pred = xgb_opt_model.predict(X_test)
predictions = np.rint(y_pred)

# Evaluate predictions. mean_squared_error returns the MSE; both the MSE and its
# square root are shown with labels so the two cannot be confused.
opt_mse = mean_squared_error(y_test, predictions)
pd.Series({"mse": opt_mse, "rmse": np.sqrt(opt_mse)})

# Submission

In [None]:
# Read test CSVs and ID labels: sample_submission supplies the ID / item_cnt_month
# layout of the output file, test.csv maps every ID to its shop_id / item_id pair.
sample_submission = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv")
test_labels = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/test.csv")

In [None]:
# For every shop/item pair keep only its most recent feature row (rows are sorted
# by month first, so keep="last" selects the highest date_block_num), then score
# those rows with the tuned model.
submissions = (
    submission_data
    .sort_values(["date_block_num", "shop_id", "item_id"], ascending=True)
    .drop_duplicates(["shop_id", "item_id"], keep="last")
    .assign(y_predict=lambda frame: xgb_opt_model.predict(frame[training_cols].values))
)

In [None]:
# Look up the prediction of every test (shop_id, item_id) pair. A left join keeps
# exactly the test rows, so ID stays an integer column and no training-only pairs
# are carried along (a right join would add them with a missing ID). Selecting the
# three key columns first also makes the cell safe to re-run.
test_labels = test_labels[["ID", "shop_id", "item_id"]].merge(
    submissions[["shop_id", "item_id", "y_predict"]],
    on=["shop_id", "item_id"],
    how="left",
)

# Replace the placeholder item_cnt_month of the sample file with the predictions.
sample_submission = (
    sample_submission[["ID"]]
    .merge(test_labels[["ID", "y_predict"]], on="ID", how="left")
    .rename(columns={"y_predict": "item_cnt_month"})
)

# Pairs without a feature row have no prediction and get the constant 1.
# The result is assigned back explicitly: fillna(inplace=True) on a selected
# column is chained assignment, which pandas warns about and which does not
# update the parent frame under Copy-on-Write.
sample_submission["item_cnt_month"] = sample_submission["item_cnt_month"].fillna(1)

In [None]:
# Preview the first rows of the final submission frame
sample_submission.head()

In [None]:
# Write the submission file to the Kaggle working directory, without the index column
sample_submission.to_csv('/kaggle/working/submission.csv', index=False)