In [1]:
!pip install py7zr scikit-learn --quiet

In [6]:
!conda install -c conda-forge lightgbm==4.1.0 -y --quiet

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [1]:
EVAL_WEEKS = 0
TRAIN_WEEKS = 8
MAX_MOVING_AVERAGE_WINDOW_WEEKS = 1
MODEL_PATH = 'model'


In [11]:
import os
import json
import pathlib
import pickle
from datetime import datetime, timedelta
import math

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb

from features import (
    read_csv_from_7z,
    StoreFeatureService,
    ItemFeatureService,
    DateFeatureService,
    MovingAverageFeatureService
)

In [3]:
# Metric used by the competition
class NWRMSLE:
    def __init__(self, epsilon=1e-7):
        self.epsilon = epsilon

    def NWRMSLE(self, a, p, w):
        a = np.array(a)
        p = np.array(p)
        w = np.array(w)
        
        a = np.maximum(a, 0) + self.epsilon
        p = np.maximum(p, 0) + self.epsilon
        
        weighted_errors = np.dot(np.square(p - a), np.transpose(w))
        weights_sum = np.sum(w)
        return math.sqrt(weighted_errors / weights_sum)

    def NWRMSLE_lgb(self, preds, train_data):
        labels = train_data.get_label()
        w = train_data.get_weight()  
        NWRMSLE_score = self.NWRMSLE(labels, preds, w)
        return 'NWRMSLE', NWRMSLE_score, False

In [4]:

train_df = read_csv_from_7z('dataset/train.csv.7z')
train_df['date'] = pd.to_datetime(train_df['date'])

end_date = train_df['date'].max()
start_date = end_date - timedelta(weeks=TRAIN_WEEKS+EVAL_WEEKS+MAX_MOVING_AVERAGE_WINDOW_WEEKS)
train_df = train_df[train_df['date'] >= start_date]

  return pd.read_csv(io.BytesIO(data))


In [5]:
target = 'unit_sales'

# As we know from the EDA, the unit_sales column heavily skew to the left, so I decide 
# sense to apply log1p here to make it easier for the model to predict
train_df[target] = train_df[target].clip(lower=0).apply(np.log1p)


train_df['onpromotion'] = train_df['onpromotion'].fillna(False)

  train_df['onpromotion'] = train_df['onpromotion'].fillna(False)


# Fetching features from feature services

In [6]:
store_fs = StoreFeatureService()
item_fs = ItemFeatureService()
date_fs = DateFeatureService()

ma_s_i_7d_fs = MovingAverageFeatureService(["store_nbr", "item_nbr", "date"], 7)
ma_i_7d_fs = MovingAverageFeatureService(["item_nbr", "date"], 7)
ma_s_7d_fs = MovingAverageFeatureService(["store_nbr", "date"], 7)

ma_s_i_7d_fs = MovingAverageFeatureService(["store_nbr", "item_nbr", "date"], 7)
ma_i_7d_fs = MovingAverageFeatureService(["item_nbr", "date"], 7)
ma_s_7d_fs = MovingAverageFeatureService(["store_nbr", "date"], 7)

ma_s_i_3d_fs = MovingAverageFeatureService(["store_nbr", "item_nbr", "date"], 3)
ma_i_3d_fs = MovingAverageFeatureService(["item_nbr", "date"], 3)
ma_s_3d_fs = MovingAverageFeatureService(["store_nbr", "date"], 3)

all_feature_services = [store_fs, item_fs, date_fs, ma_s_i_7d_fs, ma_i_7d_fs, ma_s_7d_fs, ma_s_i_3d_fs, ma_i_3d_fs, ma_s_3d_fs,]

categorical = ['onpromotion']
continuous = []

for fs in all_feature_services:
    categorical.extend(fs.categorical)
    continuous.extend(fs.continuous)

In [7]:
train_set = train_df.copy()

# Joining the features together
for fs in all_feature_services:
    train_set = fs.join(train_set)
train_set['sample_weight'] = train_set['perishable'].map(lambda x: 1.0 if x==0 else 1.5)


  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)
  rolling_mean = df.groupby(group_cols).sum()['unit_sales'].rolling(window, min_periods=1).mean().unstack(id_cols).shift(1,freq="D").stack(id_cols)


# Feature transformation

In [8]:

# Create an ordinal encoder to map distinct categories to numbers
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
features = categorical + continuous

end_date = train_set['date'].max()

if EVAL_WEEKS > 0:
    eval_end_date = end_date
    eval_start_date = eval_end_date - timedelta(weeks=EVAL_WEEKS)
    train_end_date = eval_start_date
    train_start_date = train_end_date - timedelta(weeks=TRAIN_WEEKS)
else:
    train_end_date = end_date
    train_start_date = train_end_date - timedelta(weeks=TRAIN_WEEKS)


train_data = train_set[(train_set['date'] >= train_start_date) & (train_set['date'] < train_end_date)]
train_data = train_data.drop(columns=['date', 'id'])

# Fit the encoder to training set
train_data[categorical] = ordinal_encoder.fit_transform(train_data[categorical])


# X_train = train_data[features]
# y_train = train_data[target]
# w_train = train_data['sample_weight']
train_dataset = lgb.Dataset(train_data[features], label=train_data[target], weight=train_data['sample_weight'], categorical_feature=categorical)

valid_sets = [train_dataset]
if EVAL_WEEKS > 0:
    eval_data = train_set[(train_set['date'] >= eval_start_date) & (train_set['date'] <= eval_end_date)]
    eval_data = eval_data.drop(columns=['date', 'id'])
    # Apply the same mapping to the eval set
    eval_data[categorical] = ordinal_encoder.transform(eval_data[categorical])
    # X_eval = eval_data[features]
    # y_eval = eval_data[target]
    # w_eval = eval_data['sample_weight']
    eval_dataset = lgb.Dataset(eval_data[features], label=eval_data[target], weight=eval_data['sample_weight'], categorical_feature=categorical, reference=train_dataset)
    valid_sets.append(eval_dataset)


In [9]:
params = {
    'num_leaves': 33,
    'objective': 'regression',
    'min_data_in_leaf': 1500,
    'learning_rate': 0.02,
    'feature_fraction': 0.7,
    'min_split_gain': 0,
    'metric': 'l2',
    'subsample': 0.9,
    'drop_rate': 0.1,
    'min_child_samples': 10,
    'min_child_weight': 150,
    'max_drop': 50,
    'boosting':'gbdt',
    'num_threads':6,
}


nwrmsle = NWRMSLE()

# Train the model
model = lgb.train(
    params,
    train_dataset,
    num_boost_round=500,
    valid_sets=valid_sets,
    callbacks=[lgb.log_evaluation()],
    feval=nwrmsle.NWRMSLE_lgb,
)
# Mark the completion time
completion_time = datetime.now()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.087380 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5490
[LightGBM] [Info] Number of data points in the train set: 5892313, number of used features: 19
[LightGBM] [Info] Start training from score 1.710765
[1]	training's l2: 0.748064	training's NWRMSLE: 0.864907
[2]	training's l2: 0.730733	training's NWRMSLE: 0.854829
[3]	training's l2: 0.713035	training's NWRMSLE: 0.844414
[4]	training's l2: 0.696166	training's NWRMSLE: 0.834366
[5]	training's l2: 0.67982	training's NWRMSLE: 0.824512
[6]	training's l2: 0.664135	training's NWRMSLE: 0.814945
[7]	training's l2: 0.649031	training's NWRMSLE: 0.805625
[8]	training's l2: 0.634569	training's NWRMSLE: 0.796598
[9]	training's l2: 0.620673	training's NWRMSLE: 0.787828
[10]	training's l2: 0.607277	training's NWRMSLE: 0.77928
[11]	training's l2: 0.5

In [12]:
# We save model to two locations, one tagged with the completion time, the other being the latest.
# This is assuming that we never use old data to train a prod model.
# The following data is saved:
# Training metadata - this is for monitoring
# Encoder - this is to keep feature encoding the same
# Model artifact

for model_version in [completion_time.strftime("%Y-%m-%d_%H-%M-%S"), 'latest']:
    model_push_path = os.path.join(MODEL_PATH, model_version)
    pathlib.Path(model_push_path).mkdir(parents=True, exist_ok=True) 

    metadata = {
        "completion_time": completion_time.strftime('%s'),
        "training_set_end_date": str(train_set.date.max()),
        "training_set_start_date": str(train_set.date.min()),
        "categorical_features": categorical,
        "continuous_features": continuous
    }

    
    with open(os.path.join(model_push_path,'metadata.json'), 'w') as f:
        json.dump(metadata,f)

    with open(os.path.join(model_push_path,'encoder'), 'wb') as f:
        pickle.dump(ordinal_encoder, f)

    model.save_model(os.path.join(model_push_path,'model.txt'), num_iteration=model.best_iteration) 

    
