In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the pickled feature matrix and display its column names as the cell output.
# NOTE(review): a later cell reads this same pickle again, so this load only
# serves the column inspection below.
M = pd.read_pickle('/kaggle/input/sales-data-prep/matrix.pkl')
M.columns

In [None]:
import pandas as pd
import gc
import pickle
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance

# Retrieve dataset

# Load the pickled feature matrix.
M = pd.read_pickle('/kaggle/input/sales-data-prep/matrix.pkl')

# Remove the three new_item_cat_enc lag columns so they are not used as features.
M = M.drop(columns=["new_item_cat_enc_lag_1", "new_item_cat_enc_lag_2", "new_item_cat_enc_lag_3"])

# Keep only rows whose date_block_num is greater than 2
# (presumably the first blocks lack complete 3-step lag features — TODO confirm).
M = M[M["date_block_num"] > 2]

# DataFrame.fillna returns a new frame. The previous bare `M.fillna(0)` threw
# that result away, so missing values were never replaced; assign it back.
M = M.fillna(0)
#M = pd.read_pickle('/kaggle/input/sales-lag3/matrix.pkl')

#M.drop(["new_item_cat_enc_lag_1", "new_item_cat_enc_lag_2", "new_item_cat_enc_lag_3",
#        "item_target_shop_enc_lag_1", "item_target_shop_enc_lag_2", "item_target_shop_enc_lag_3"], axis=1, inplace=True)

# Report, for each column of the matrix, how many distinct values it holds.
for column_name in M.columns:
    distinct_count = M[column_name].nunique()
    print(column_name, distinct_count)

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place to narrower dtypes to save memory.

    Per column, based on its NumPy dtype:

    * signed integers   -> smallest of int8/int16/int32/int64 holding [min, max]
    * unsigned integers -> smallest of uint8/uint16/uint32/uint64 holding [min, max]
    * floats            -> float16 (only if ``use_float16``) or float32 when the
                           value range fits; otherwise left unchanged
    * object            -> ``category``
    * everything else (bool, datetime, timedelta, categorical and other
      extension dtypes) is left untouched.

    Range checks are inclusive, so a column whose extremes sit exactly on a type
    limit (e.g. -128 or 127) still gets the narrow type. Columns with no
    non-missing values (empty or all-NaN) are left unchanged because there is
    no range to measure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default False
        Allow float16 as a target for float columns. float16 keeps only about
        three significant decimal digits, so values may be rounded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.

    Side effects
    ------------
    Prints the memory usage before and after, and the percentage saved.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy dtypes are handled; pandas extension dtypes
        # (categorical, tz-aware datetime, nullable ints, strings) are skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == 'O':
            df[col] = df[col].astype('category')
            continue
        if kind not in ('i', 'u', 'f'):
            # bool, datetime64, timedelta64, complex, ...: nothing to downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: no value range to base a decision on.
            continue

        if kind == 'f':
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if use_float16 and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            # Otherwise (e.g. infinities or values beyond float32) keep the dtype.
            continue

        # Integer column: compare as Python ints so the bounds check cannot
        # be affected by NumPy scalar promotion rules.
        lowest, highest = int(c_min), int(c_max)
        candidates = signed_types if kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= lowest and highest <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not print "nan%".
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

# reduce matrix memory usage

#M = reduce_mem_usage(M, use_float16=True)

# Separate Train, Test and Validation

# Partition the matrix on date_block_num: blocks below 33 are used for
# training, block 33 for validation and block 34 for prediction.
# item_cnt_month is the target and is removed from every feature frame.
block = M.date_block_num

train_part = M[block < 33]
X_train = train_part.drop(columns=['item_cnt_month'])
Y_train = train_part['item_cnt_month']

X_test = M[block == 34].drop(columns=['item_cnt_month'])

val_part = M[block == 33]
X_val = val_part.drop(columns=['item_cnt_month'])
Y_val = val_part['item_cnt_month']

#Y_train = Y_train.clip(0, 20)
#Y_val = Y_val.clip(0, 20)

# The full matrix and the split scaffolding are no longer needed; release them.
del M, block, train_part, val_part
gc.collect()

In [None]:
# Fit model

# Configure the gradient-boosted tree regressor.
# eval_metric and early_stopping_rounds are given to the constructor: passing
# them to fit() was deprecated in XGBoost 1.6 and removed in 2.0, where it
# raises a TypeError. This form requires XGBoost >= 1.6.
# NOTE(review): tree_method='gpu_hist' is deprecated since XGBoost 2.0 in
# favour of tree_method='hist' with device='cuda'; it is left unchanged here
# to stay compatible with 1.x installations.
model = XGBRegressor(
    max_depth=10,
    booster='gbtree',
    n_estimators=1000,
    min_child_weight=0.5,
    subsample=0.8,
    sampling_method="uniform",
    colsample_bynode=1,
    colsample_bytree=0.8,
    eta=0.1,
    #base_score=0.05,
    #gamma=0.001,
    tree_method='gpu_hist',
    eval_metric="rmse",
    early_stopping_rounds=20,
    seed=42)

# Train while reporting RMSE on both the training and the validation data.
# Early stopping monitors the last entry of eval_set, i.e. (X_val, Y_val).
model.fit(
    X_train,
    Y_train,
    eval_set=[(X_train, Y_train), (X_val, Y_val)],
    verbose=True)

In [None]:
# Save model

# Serialise the fitted model to model.pkl. The context manager closes the file
# (flushing it to disk) even if pickling fails; the previous bare open() left
# the handle to be closed whenever the interpreter got around to it.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Retrieve model

# Read the model back from model.pkl, closing the file handle deterministically.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickles that were produced by a trusted source.
with open("model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Plot feature importance

# Draw the reloaded model's feature importances on a single 15x20 axes.
# The trailing semicolon suppresses the textual repr of the returned object.
fig, ax = plt.subplots(figsize=(15, 20))
plot_importance(loaded_model, ax=ax);

In [None]:
# Read the competition test file. An absolute path is used, like every other
# input read in this notebook, so the cell does not depend on the current
# working directory.
test = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/test.csv')

# Predict for the held-out feature rows and clamp the result into [0, 20].
Y_test = loaded_model.predict(X_test).clip(0, 20)

# Fail early with a readable message if the predictions do not line up
# one-to-one with the test rows.
if len(Y_test) != len(test):
    raise ValueError(
        "Prediction count ({}) does not match number of test rows ({})".format(len(Y_test), len(test))
    )

# NOTE(review): the row index of test.csv is used as the submission ID, which
# assumes X_test rows are in the same order as the rows of test.csv — TODO confirm.
submission = pd.DataFrame({
    "ID": test.index,
    "item_cnt_month": Y_test
})

In [None]:
# load data for postprocessing

# Item table from the competition dataset.
items = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/items.csv"
)
# Item-category table from the competition dataset.
item_categories = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv"
)

In [None]:
# Inner-join items with their categories on the columns the two tables share,
# then show the joined frame as the cell output.
df = items.merge(item_categories)
df

In [None]:
# Write the submission frame to my_submission.csv (relative to the current
# working directory) without the DataFrame index column.
submission.to_csv('my_submission.csv', index=False)