In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import gc

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

# `sklearn.externals.joblib` was deprecated and later removed from scikit-learn,
# so prefer the standalone package and fall back only on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the four competition tables in one pass: calendar, weekly prices,
# wide-format daily sales, and the sample submission.
calendar_df, sell_prices_df, train_df, sample_sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/m5-forecasting-accuracy/calendar.csv',
        '/kaggle/input/m5-forecasting-accuracy/sell_prices.csv',
        '/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv',
        '/kaggle/input/m5-forecasting-accuracy/sample_submission.csv',
    )
)

## Data Description

### Calendar.csv: Contains information about the dates the products are sold.
* **date**: The date in “y-m-d” format.
* **wm_yr_wk**: The id of the week the date belongs to.
* **weekday**: The type of the day (Saturday, Sunday, …, Friday).
* **wday**: The id of the weekday, starting from Saturday.
* **month**: The month of the date.
* **year**: The year of the date.
* **event_name_1**: If the date includes an event, the name of this event.
* **event_type_1**: If the date includes an event, the type of this event.
* **event_name_2**: If the date includes a second event, the name of this event.
* **event_type_2**: If the date includes a second event, the type of this event.
* **snap_CA, snap_TX, and snap_WI**: A binary variable (0 or 1) indicating whether the stores of CA, TX or WI allow SNAP purchases on the examined date. 1 indicates that SNAP purchases are allowed.

In [None]:
# Display the calendar table as the cell output.
calendar_df

`weekday` is redundant with `wday`, so let's drop it. The date is also already expanded into separate features (`wday`, `month`, `year`), so let's drop the `date` column as well.

In [None]:
# Remove the two redundant calendar columns in a single call.
calendar_df = calendar_df.drop(columns=['weekday', 'date'])

### sell_prices.csv: Contains information about the price of the products sold per store and date.

* **store_id**: The id of the store where the product is sold.
* **item_id**: The id of the product.
* **wm_yr_wk**: The id of the week.
* **sell_price**: The price of the product for the given week/store. The price is provided per week (average across seven days). If not available, this means that the product was not sold during the examined week. Note that although prices are constant on a weekly basis, they may change over time (both training and test set).

In [None]:
# Preview the first ten rows of the weekly price table.
sell_prices_df.head(10)

### sales_train.csv: Contains the historical daily unit sales data per product and store.
* **item_id**: The id of the product.
* **dept_id**: The id of the department the product belongs to.
* **cat_id**: The id of the category the product belongs to.
* **store_id**: The id of the store where the product is sold.
* **state_id**: The State where the store is located.
* **d_1, d_2, …, d_i, … d_1941**: The number of units sold at day i, starting from 2011-01-29.

In [None]:
# Preview the first ten rows of the wide-format sales table.
train_df.head(10)

In [None]:
# Preview the first rows of the sample submission.
sample_sub_df.head()

In [None]:
# Report the (rows, columns) shape of each loaded table, one per line.
print(
    f'Shape of calendar: {calendar_df.shape}',
    f'Shape of sell prices: {sell_prices_df.shape}',
    f'Shape of validation dataset: {train_df.shape}',
    f'Shape of test dataset: {sample_sub_df.shape}',
    sep='\n',
)

In [None]:
## Function to reduce the memory usage
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are cast to the first of int8/int16/int32/int64 whose range
    contains the column's min and max (bounds inclusive). Float columns are cast
    to float16 or float32 when their range fits, otherwise to float64.
    Columns whose dtype is not one of the listed numeric dtypes are left untouched.

    Note: float16/float32 only guarantee the *range* fits; values may lose precision.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        verbose: When True, print the final memory usage and the percentage saved.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to a dtype's limit still fits that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range too wide for float32, or min/max are NaN (empty / all-NaN column).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero baseline so the percentage is always a finite number.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast the calendar and price tables, then force a garbage-collection pass.
calendar_df, sell_prices_df = (
    reduce_mem_usage(frame) for frame in (calendar_df, sell_prices_df)
)
gc.collect()

## Let's analyse Missing values

In [None]:
# Count missing values per calendar column.
calendar_df.isna().sum()

In [None]:
# Count missing values per price-table column.
sell_prices_df.isna().sum()

In [None]:
# True only when no column of the sales table contains a missing value.
(train_df.isna().sum() == 0).all()

Since the missing values correspond to days on which there were no events, we will fill them with the string 'None'. Here 'None' serves as a category instead of a missing value.

In [None]:
# Replace every missing calendar value with the string 'None',
# then re-count missing values per column to confirm none remain.
calendar_df = calendar_df.fillna('None')
calendar_df.isna().sum()

In [None]:
# Per-column memory footprint of the calendar before encoding the event columns.
calendar_df.memory_usage()

Let's convert the categorical variables to `category` datatype to reduce memory usage.

In [None]:
# Inspect the calendar column dtypes before encoding.
calendar_df.dtypes

In [None]:
# Replace each event column by the integer codes of its categorical encoding.
calendar_df = calendar_df.assign(**{
    feature: calendar_df[feature].astype('category').cat.codes
    for feature in ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
})

In [None]:
# Per-column memory footprint of the calendar after encoding the event columns.
calendar_df.memory_usage()

Also let's map `d` to integer type to reduce some more memory usage.

In [None]:
# Number of day labels (d_1 .. d_1969) generated for mapping the calendar's `d` column to integers.
TOTAL_TRAINING_DAYS = 1969
# Number of daily sales columns (d_1 .. d_1913) taken from the sales table for training.
TRAINING_DAYS = 1913

In [None]:
# Lookup from the 'd_<n>' label to the integer n, for n = 1 .. TOTAL_TRAINING_DAYS.
day_dict = {f'd_{n}': n for n in range(1, TOTAL_TRAINING_DAYS + 1)}

# Swap the string day labels in the calendar for their compact integer index.
calendar_df['d'] = calendar_df['d'].map(day_dict).astype(np.int16)
calendar_df

In [None]:
# Inspect the calendar column dtypes after the `d` mapping.
calendar_df.dtypes

In [None]:
# Per-column memory footprint of the price table before encoding its id columns.
sell_prices_df.memory_usage()

In [None]:
# Encode the price table's id columns as integer category codes.
# NOTE(review): the sales table is encoded separately further down; the codes only
# line up for the later merge if both tables hold the same set of ids — confirm.
for id_col in ('store_id', 'item_id'):
    sell_prices_df[id_col] = sell_prices_df[id_col].astype('category').cat.codes

In [None]:
# Per-column memory footprint of the price table after encoding its id columns.
sell_prices_df.memory_usage()

In [None]:
# Per-column memory footprint of the sales table before encoding its id columns.
train_df.memory_usage()

In [None]:
# Inspect the sales table column dtypes.
train_df.dtypes

In [None]:
# Encode every hierarchy id column of the sales table as integer category codes.
hierarchy_cols = ['store_id', 'item_id', 'cat_id', 'dept_id', 'state_id']
for id_col in hierarchy_cols:
    as_category = train_df[id_col].astype('category')
    train_df[id_col] = as_category.cat.codes

In [None]:
# Per-column memory footprint of the sales table after encoding its id columns.
train_df.memory_usage()

In [None]:
%%time
full_train_df = train_df.drop([f'd_{i+1}' for i in range(TRAINING_DAYS)], axis=1)
full_train_df = pd.concat([full_train_df]*TRAINING_DAYS, ignore_index=True)
full_train_df['sales'] = pd.Series(train_df[[f'd_{i+1}' for i in range(TRAINING_DAYS)]].values.ravel('F'))

days = [i+1 for i in range(TRAINING_DAYS)] * len(train_df)
days.sort()

full_train_df['d'] = pd.Series(days)
full_train_df = reduce_mem_usage(full_train_df)

In [None]:
# Number of days to forecast; also the number of F1..F28 columns built for the submission.
TEST_DAYS = 28

In [None]:
%%time
full_test_df = train_df.drop([f'd_{i+1}' for i in range(TRAINING_DAYS)], axis=1)
full_test_df = pd.concat([full_test_df]*TEST_DAYS, ignore_index=True)

days = [i for i in range(1914, 1914+TEST_DAYS)] * len(train_df)
days.sort()

full_test_df['d'] = pd.Series(days)
full_test_df = reduce_mem_usage(full_test_df)
full_test_df

In [None]:
# Release the day-index helper and the wide sales table, then reclaim memory.
del days, train_df
gc.collect()

In [None]:
%%time
full_train_df = full_train_df.merge(calendar_df, how='inner', on='d')
full_train_df

In [None]:
# Calendar rows for the forecast window: day indices 1914 through 1941 inclusive.
in_test_window = calendar_df['d'].between(1914, 1941)
test_cal_df = calendar_df[in_test_window]
test_cal_df

In [None]:
%%time
full_test_df = full_test_df.merge(test_cal_df, how='inner', on='d')
full_test_df

In [None]:
# Downcast both long frames again after the calendar merge, then reclaim memory.
full_train_df, full_test_df = (
    reduce_mem_usage(frame) for frame in (full_train_df, full_test_df)
)
gc.collect()

In [None]:
%%time
full_train_df = full_train_df.merge(sell_prices_df, how='inner', on=['store_id', 'item_id', 'wm_yr_wk'])
full_train_df

In [None]:
%%time
full_test_df = full_test_df.merge(sell_prices_df, how='inner', on=['store_id', 'item_id', 'wm_yr_wk'])
full_test_df

In [None]:
# Downcast both long frames once more after the price merge, then reclaim memory.
full_test_df = reduce_mem_usage(full_test_df) if False else full_test_df
full_train_df = reduce_mem_usage(full_train_df)
full_test_df = reduce_mem_usage(full_test_df)
gc.collect()

In [None]:
# The calendar, the sample submission and the day lookup are no longer needed here.
del calendar_df, sample_sub_df, day_dict
gc.collect()

In [None]:
# Remove the join keys and the row id from the training frame.
full_train_df = full_train_df.drop(columns=['wm_yr_wk', 'd', 'id'])
gc.collect()
full_train_df.shape

In [None]:
# Remove the same join keys and row id from the test frame.
full_test_df = full_test_df.drop(columns=['wm_yr_wk', 'd', 'id'])
full_test_df.shape

In [None]:
# Separate the target column from the feature columns.
Y_train = full_train_df['sales']
X_train = full_train_df.drop(columns='sales')

In [None]:
# Drop the combined training frame now that features and target are held separately.
del full_train_df
gc.collect()

In [None]:
# Feature columns passed to LightGBM as categorical.
categoricals = [
    # product / store hierarchy
    'item_id',
    'dept_id',
    'cat_id',
    'store_id',
    'state_id',
    # calendar position
    'wday',
    'month',
    'year',
    # events
    'event_name_1',
    'event_name_2',
    'event_type_1',
    'event_type_2',
    # SNAP flags per state
    'snap_CA',
    'snap_TX',
    'snap_WI',
]

In [None]:
# LightGBM hyper-parameters shared by every fold model.
# NOTE(review): `bagging_fraction` is set without `bagging_freq`; confirm bagging is
# actually active with these settings.
params = {
      'num_leaves': 555,
      'min_child_weight': 0.034,
      'feature_fraction': 0.379,
      'bagging_fraction': 0.418,
      'min_data_in_leaf': 106,
      'objective': 'regression',
      'max_depth': -1,
      'learning_rate': 0.007,
      "boosting_type": "gbdt",
      "bagging_seed": 11,
      "metric": 'rmse',
      "verbosity": -1,
      'reg_alpha': 0.3899,
      'reg_lambda': 0.648,
      'random_state': 666,
    }
folds = 5
seed = 666

# With shuffle=False the splits are deterministic and `random_state` has no effect;
# newer scikit-learn raises a ValueError if it is passed anyway, so it is omitted.
kf = StratifiedKFold(n_splits=folds, shuffle=False)

# Early stopping and periodic logging are passed as callbacks, since newer LightGBM
# no longer accepts `early_stopping_rounds` / `verbose_eval` in `lgb.train`.
# Older releases expose the logging callback as `print_evaluation`.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Train one booster per fold and keep them all for prediction averaging.
models = []
for train_index, val_index in kf.split(X_train, Y_train):
    x_train = X_train.iloc[train_index]
    x_val = X_train.iloc[val_index]

    y_train = Y_train.iloc[train_index]
    y_val = Y_train.iloc[val_index]

    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categoricals)
    lgb_eval = lgb.Dataset(x_val, y_val, categorical_feature=categoricals)

    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=[lgb_train, lgb_eval],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=100),
                    log_evaluation(period=100),
                ])

    models.append(gbm)

In [None]:
# save model
# joblib.dump(models, 'models.pkl')
# load model
# models = joblib.load('/kaggle/input/m5models/models.pkl')

In [None]:
# Average the fold models' predictions over the test rows.
# - Divide by len(models) rather than `folds`, so this also works when the models
#   were loaded from disk and the training cell (which defines `folds`) was skipped.
# - Exclude a previously attached 'sales' column so the cell can be re-run safely.
test_features = full_test_df.drop(columns='sales', errors='ignore')
preds = sum(model.predict(test_features) for model in tqdm(models)) / len(models)

In [None]:
# Store the averaged predictions as a new `sales` column on the test frame and display it.
full_test_df['sales'] = preds
full_test_df

In [None]:
# Display the encoded item ids of the test rows.
full_test_df['item_id']

In [None]:
# Reload the sample submission (the earlier copy was deleted); its `id` column is reused below.
sample_sub_df = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')
sample_sub_df

In [None]:
# Reshape the flat prediction vector into one column per forecast day (F1..F28).
#
# The test frame was built by stacking the per-series rows once per day, so `preds`
# is day-major: all series for the first test day, then all series for the second,
# and so on. Each day therefore occupies one contiguous block, which becomes one
# row of the reshaped array. (Striding through `preds` with step 28 would instead
# mix different days and series.)
preds_by_day = np.asarray(preds).reshape(TEST_DAYS, -1)
n_series = preds_by_day.shape[1]

# Pad each day's column with one zero per series, doubling the number of rows.
# NOTE(review): presumably this fills the sample submission's second half of ids — confirm.
zeros = [0] * n_series
daywise_preds = {
    f'F{day + 1}': preds_by_day[day].tolist() + zeros
    for day in range(TEST_DAYS)
}

In [None]:
# Turn the {column name: values} mapping into a DataFrame with one column per forecast day.
daywise_preds = pd.DataFrame(daywise_preds)

In [None]:
# Display the per-day prediction table.
daywise_preds

In [None]:
# Add the submission ids and make `id` the first column.
# The order is built by name rather than by rotating the last column to the front,
# so re-running this cell leaves the column order unchanged.
daywise_preds['id'] = sample_sub_df['id']
cols = ['id'] + [col for col in daywise_preds.columns if col != 'id']
daywise_preds = daywise_preds[cols]

In [None]:
# Write the submission file without the index column, then display the table.
daywise_preds.to_csv('submission.csv', index=False)
daywise_preds

In [None]:
# Render a link to the written submission file in the notebook output.
from IPython.display import FileLink
FileLink('submission.csv')