In [1]:
import os
import pickle
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder

from features import (
    read_csv_from_7z,
    StoreFeatureService,
    ItemFeatureService,
    DateFeatureService,
    MovingAverageFeatureService
)


In [2]:
MODEL_PATH = 'model'
MODEL_VERSION = 'latest'
MAX_MOVING_AVERAGE_WINDOW_WEEKS = 2

# Test data loading

In [3]:
test_df = read_csv_from_7z('dataset/test.csv.7z')
dates_to_predict = test_df['date'].unique()
test_df['date'] = pd.to_datetime(test_df['date'])
test_df['onpromotion'] = test_df['onpromotion'].fillna(False)


target = 'unit_sales'
# dummy target, we will use model prediction to overwrite this
test_df[target] = 0



# Fetching features from feature services

In [4]:
store_fs = StoreFeatureService()
item_fs = ItemFeatureService()
date_fs = DateFeatureService()

ma_s_i_7d_fs = MovingAverageFeatureService(["store_nbr", "item_nbr", "date"], 7)
ma_i_7d_fs = MovingAverageFeatureService(["item_nbr", "date"], 7)
ma_s_7d_fs = MovingAverageFeatureService(["store_nbr", "date"], 7)

ma_s_i_7d_fs = MovingAverageFeatureService(["store_nbr", "item_nbr", "date"], 7)
ma_i_7d_fs = MovingAverageFeatureService(["item_nbr", "date"], 7)
ma_s_7d_fs = MovingAverageFeatureService(["store_nbr", "date"], 7)

ma_s_i_3d_fs = MovingAverageFeatureService(["store_nbr", "item_nbr", "date"], 3)
ma_i_3d_fs = MovingAverageFeatureService(["item_nbr", "date"], 3)
ma_s_3d_fs = MovingAverageFeatureService(["store_nbr", "date"], 3)

all_feature_services = [store_fs, item_fs, date_fs, ma_s_i_7d_fs, ma_i_7d_fs, ma_s_7d_fs, ma_s_i_3d_fs, ma_i_3d_fs, ma_s_3d_fs,]

categorical = ['onpromotion']
continuous = []

for fs in all_feature_services:
    categorical.extend(fs.categorical)
    continuous.extend(fs.continuous)

# Loading model and encoder from the latest model push

In [5]:
model_push_path = os.path.join(MODEL_PATH, MODEL_VERSION)
with open(os.path.join(model_push_path, 'encoder'), 'rb') as f:
    ordinal_encoder = pickle.load(f)
model = lgb.Booster(model_file=os.path.join(model_push_path, 'model.txt'))

# Generate model prediction

In [6]:
for date in dates_to_predict:
    print(f"Predicting {date}")
    test_set = test_df.copy()
    
    for fs in all_feature_services:
        test_set = fs.join(test_set)

    today_index = test_set[test_set.date == date].index
    test_current_date_df = test_set.loc[today_index]
    
    test_current_date_df[categorical] = ordinal_encoder.transform(test_current_date_df[categorical])
    dataset =  test_current_date_df[categorical+continuous]
    test_set.loc[today_index,target] = model.predict(dataset, num_iteration=model.best_iteration)
    merged = test_df.merge(test_set[['id', target]], how='left', on='id')
    merged[target] = merged[[f"{target}_x",f"{target}_y"]].max(axis=1)
    test_df = merged.drop([f"{target}_x",f"{target}_y"], axis=1)

Predicting 2017-08-16


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  test_set.loc[today_index,target] = model.predict(dataset, num_iteration=model.best_iteration)


Predicting 2017-08-17


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-18


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-19


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-20


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-21


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-22


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-23


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-24


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-25


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-26


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-27


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-28


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-29


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-30


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


Predicting 2017-08-31


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


In [7]:
submission = test_df[test_df.date.isin(dates_to_predict)].sort_values("id")[["id", "unit_sales"]]
submission[target] = np.clip(np.expm1(submission[target]), 0, 1000)

submission.to_csv(f'{MODEL_VERSION}_output.csv', float_format='%.4f', index=None)

  submission = test_df[test_df.date.isin(dates_to_predict)].sort_values("id")[["id", "unit_sales"]]
