In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_log_error
import lightgbm as lgb
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and echo the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Seeded NumPy generator for any random draws made later in the notebook.
rng = np.random.default_rng(seed=673)

# Load the training rows; 'date' is parsed to datetime so the .dt accessors work.
train_data = pd.read_csv(
    '/kaggle/input/store-sales-time-series-forecasting/train.csv',
    parse_dates=['date'],
)
print(train_data.head(n=5))

In [None]:
# Load the rows to be predicted, parsing 'date' the same way as for the training file.
test_data = pd.read_csv(
    '/kaggle/input/store-sales-time-series-forecasting/test.csv',
    parse_dates=['date'],
)
print(test_data.head(n=5))

In [None]:
# Columns that are ordinal-encoded and also used as the grouping keys for the
# rolling features.
categorical_features = ['family', 'store_nbr']
# Fit the encoder once on the training frame; fit() returns the encoder itself.
enc = OrdinalEncoder().fit(train_data[categorical_features])

def extract_features(data):
    """Build the model's base feature frame from a raw train/test frame.

    The result has the calendar parts of ``data['date']`` (month, year,
    day of week, day of year) followed by the columns listed in
    ``categorical_features`` after transformation by the fitted ``enc``.
    It is indexed like ``data``.
    """
    when = data['date'].dt
    extracted = pd.DataFrame({
        'month': when.month,
        'year': when.year,
        'dow': when.dayofweek,
        'doy': when.dayofyear,
    })
    # The encoder returns a plain array, one column per categorical feature.
    extracted[categorical_features] = enc.transform(data[categorical_features])
    return extracted

def get_rolling(train_data, test_data, column, window=28, lag=21, group_cols=None):
    """Compute a lagged rolling mean of ``column`` separately for every group.

    Train and test rows are stacked so the window can run across the
    train/test boundary. Within each group the mean of the last ``window``
    rows is taken and the result is then moved ``lag`` rows later *inside
    that same group*.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames holding a ``date`` column and the grouping columns. Where
        ``test_data`` has no ``column`` the stacked values are NaN.
        Assumes the rows of each group are already in chronological order,
        since an integer window counts rows rather than days -- TODO confirm.
    column : str
        Column to average.
    window : int, default 28
        Number of rows in the rolling window.
    lag : int, default 21
        Number of rows the rolling mean is shifted forward within a group.
    group_cols : list of str, optional
        Grouping columns; defaults to the module-level ``categorical_features``.

    Returns
    -------
    tuple of pandas.Series
        ``(train_part, test_part)``, named ``f"{column}_{window}"`` and
        indexed like the rows of ``train_data`` / ``test_data``.
    """
    keys = list(categorical_features if group_cols is None else group_cols)
    feature_name = f'{column}_{window}'

    data = pd.concat([train_data, test_data])
    # Result is indexed by (group keys..., date), which is what the join below uses.
    rolled = data.groupby(keys).rolling(window, on='date')[column].mean()
    # Shift per group: a plain .shift() on the stacked result would push the
    # tail of one group into the head of the next one.
    lagged = rolled.groupby(level=keys).shift(lag).rename(feature_name)

    c = data.join(lagged, on=keys + ['date'])[feature_name]
    n_train = len(train_data)
    return c.iloc[:n_train], c.iloc[n_train:]

In [None]:
# print(data.head())
# groups = data.groupby(categorical_features).rolling(2, on='date')['sales'].mean().shift(16)
# data.join(groups, on=categorical_features + ['date'], rsuffix='_28')

In [None]:
# Lagged rolling means; each call yields a (train part, test part) pair of Series.
sales_28 = get_rolling(train_data, test_data, 'sales')
op_28 = get_rolling(train_data, test_data, 'onpromotion')

# Training matrix: base features plus the train halves of the rolling features.
X = extract_features(train_data).assign(sales_28=sales_28[0], op_28=op_28[0])
y = train_data['sales']
print(X.head(5))

# Test matrix, assembled the same way from the test halves.
X_test = extract_features(test_data).assign(sales_28=sales_28[1], op_28=op_28[1])
print(X_test.head(5))

In [None]:
def _rmsle(y_true, raw_pred):
    """Root mean squared log error of clipped, integer-rounded predictions."""
    # Clip at zero first: the log-based metric is undefined for negative values.
    pred = np.round(np.maximum(0, raw_pred)).astype(int)
    # Take the root explicitly instead of passing `squared=False`, a keyword
    # that newer scikit-learn releases no longer accept.
    return np.sqrt(mean_squared_log_error(y_true, pred))


def train(num_boost_round, params, n_splits=10, val_days=15):
    """Cross-validate a LightGBM regressor on the global ``X`` / ``y``.

    Uses an expanding-window ``TimeSeriesSplit``; for every fold a booster is
    fitted on the training part and scored on both parts.

    Parameters
    ----------
    num_boost_round : int
        Number of boosting iterations per fold.
    params : dict
        LightGBM parameters; they override the defaults
        ``{'objective': 'regression', 'seed': 673}``.
    n_splits : int, default 10
        Number of folds.
    val_days : int, default 15
        Validation fold size as a multiple of the number of distinct series.
        This equals a number of days only if ``X`` holds one row per series
        per day, ordered by date -- TODO confirm against the data.

    Returns
    -------
    tuple of list
        ``(rmsle_val, rmsle_train)`` with one score per fold.
    """
    # Count the distinct (family, store) combinations instead of hard-coding it,
    # so the fold size is an exact multiple of the number of series.
    n_series = len(X[categorical_features].drop_duplicates())
    tscv = TimeSeriesSplit(test_size=val_days * n_series, n_splits=n_splits)
    lgb_params = {'objective': 'regression', 'seed': 673, **params}
    rmsle_val, rmsle_train = [], []

    for train_index, val_index in tscv.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        # The split yields positions, so index y by position as well.
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        train_dataset = lgb.Dataset(X_train, y_train)
        # Validation data is tied to the training Dataset via `reference`.
        val_dataset = lgb.Dataset(X_val, y_val, reference=train_dataset)

        booster = lgb.train(lgb_params,
                            train_set=train_dataset, valid_sets=(val_dataset,),
                            num_boost_round=num_boost_round)

        rmsle_train.append(_rmsle(y_train, booster.predict(X_train)))
        rmsle_val.append(_rmsle(y_val, booster.predict(X_val)))

    return rmsle_val, rmsle_train

In [None]:
# Hyper-parameter grid for the sweep; the values below are declared once and
# every later line reads them from here.
params = {
    'max_depth': [3, 8, 15],
}
max_depth = params['max_depth']
num_boost_round = [130]

# One (rmsle_val, rmsle_train) pair per max_depth value, in grid order.
rmsle = []
for p in max_depth:
    rmsle.append(train(num_boost_round[0], {'max_depth': p}))

In [None]:
# Report the mean and standard deviation of the per-fold validation scores
# for every depth that was tried.
for i, _ in enumerate(max_depth):
    print(f'Mean validation score for max depth {max_depth[i]}: {np.mean(rmsle[i][0]):.2f}+-{np.std(rmsle[i][0]):.2f}')
# print(f'Submission score: {1.48719:.2f}')

In [None]:
# Fold-averaged validation and training scores, plotted against tree depth.
val_means = [np.mean(val_scores) for val_scores, _ in rmsle]
train_means = [np.mean(train_scores) for _, train_scores in rmsle]

fig, ax = plt.subplots()
ax.plot(max_depth, val_means, label='val')
ax.plot(max_depth, train_means, label='train')
ax.legend()
plt.show()

In [None]:
# Refit on the full training set and write the submission file.
train_dataset = lgb.Dataset(X, y)

# The key must be spelled exactly 'max_depth'; a key with surrounding
# whitespace is not the documented parameter name, so the depth limit would
# not be passed to LightGBM as intended.
booster = lgb.train({'objective': 'regression', 'seed': 673, 'max_depth': 15},
                    train_set=train_dataset,
                    num_boost_round=150)

result = pd.DataFrame()
result['id'] = test_data['id']
# Clip predictions at zero, then round to whole numbers, as done during validation.
result['sales'] = np.round(np.maximum(0, booster.predict(X_test))).astype(int)
result.to_csv('result.csv', index=False)

In [None]:
# train_data.merge(
#     train_data.groupby(['family','store_nbr']).rolling(14,on='date').mean().reset_index().rename(columns={'sales':'avg_sales'}),
#     how='left',
#     on=['family','store_nbr', 'date']
# )