In [None]:
# read the csv file
import warnings
import random
import pandas as pd
# Load the training data and convert the `date` column to datetimes.
train_csv = '../input/demand-forecasting-kernels-only/train.csv'
data = pd.read_csv(train_csv)
data['date'] = pd.to_datetime(data['date'])

# Switch off pandas' chained-assignment check (the pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

data.head()

In [None]:
# Combine the store and item columns into one `time_series` key of the form
# 'store_<store>_item_<item>'.
# Built with vectorised string concatenation: a row-wise apply(axis=1) plus
# per-element list comprehensions is far slower on a frame this size and
# produces exactly the same strings.
data['time_series'] = (
    'store_' + data['store'].astype(str)
    + '_item_' + data['item'].astype(str)
)

# The combined key replaces the two source columns.
data = data.drop(columns=['store', 'item'])
data.head()

In [None]:
# Extract calendar features from `date`.
# The `.dt` accessor computes these in one vectorised pass instead of a
# Python-level loop over every Timestamp in the column.
data['month'] = data['date'].dt.month
data['year'] = data['date'].dt.year
data['day_of_week'] = data['date'].dt.dayofweek   # Monday=0 ... Sunday=6
data['day_of_year'] = data['date'].dt.dayofyear
data.head()

In [None]:
# How many distinct store/item series are there?
n_series = data['time_series'].nunique()
n_series

# Visualize Time-Series

In [None]:
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns

# Silence FutureWarnings once, before plotting. Registering the filter with
# action='error' (as was done inside the loop before) turns every
# FutureWarning raised afterwards, by any library, into an exception.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Plot daily sales and a 30-day moving average for each series in the slice.
for i in data['time_series'].unique()[300:400]:
    # .copy() yields an independent frame, so adding a column below does not
    # write into a filtered view of `data`.
    subset = data[data['time_series'] == i].copy()
    subset['moving_average'] = subset['sales'].rolling(30).mean()
    # rolling(30) is NaN until a full 30-row window exists; drop those rows
    # (and any other rows containing missing values).
    subset = subset.dropna()

    fig, ax = plt.subplots(figsize=(20, 10))

    # raw daily sales
    sns.lineplot(x='date', y='sales', data=subset, label='sales', ax=ax)

    # 30-day rolling mean of sales
    sns.lineplot(x='date', y='moving_average', data=subset,
                 label='moving_average', ax=ax)

    ax.set_ylabel("Values")
    ax.grid()
    ax.set_title(i)

    # Render and release each figure as we go, so 100 figures are not held
    # open at the same time.
    plt.show()
    plt.close(fig)

In [None]:
# Progress bars for the long-running loops further down (`tqdm(...)` wraps
# the per-series training and scoring loops).
from tqdm.auto import tqdm
# Register tqdm's pandas integration (progress_apply); nothing visible in
# this notebook calls it.
tqdm.pandas()

In [None]:
# install slim version (default)
!pip install pycaret

In [None]:
# Import only the PyCaret functions this cell uses, rather than
# `from pycaret.regression import *`, so the origin of each name is explicit
# and the notebook namespace is not flooded.
from pycaret.regression import (
    compare_models,
    finalize_model,
    pull,
    save_model,
    setup,
)

# Train one model per series, on a 100-series slice of the data.
all_ts = data['time_series'].unique()[300:400]

all_results = []   # one-row leaderboard frame (best model) per series
final_model = {}   # series name -> finalized pipeline

for i in tqdm(all_ts):

    # independent copy of this series' rows
    df_subset = data[data['time_series'] == i].copy()

    # initialize setup from pycaret.regression
    # shuffle is off and the fold strategy is 'timeseries', so rows are split
    # in their existing order.
    # NOTE(review): `silent` is a PyCaret 2.x argument - confirm the installed
    # version accepts it.
    s = setup(df_subset, target = 'sales', train_size = 0.99,
              data_split_shuffle = False, fold_strategy = 'timeseries', fold = 4,
              ignore_features = ['date', 'time_series'],
              numeric_features = ['day_of_year', 'year'],
              categorical_features = ['month', 'day_of_week'],
              silent = True, verbose = False, session_id = 42)

    # compare all models and select best one based on MAE
    best_model = compare_models(sort = 'MAE', verbose=False)

    # capture the top row of the compare grid and tag it with the series name;
    # .assign() returns a new frame instead of writing into a slice of the
    # pulled grid.
    best_row = pull().iloc[0:1].assign(time_series=str(i))
    all_results.append(best_row)

    # finalize model i.e. fit on entire data including test set
    f = finalize_model(best_model)

    # keep the finalized model in memory, keyed by series name
    final_model[i] = f

    # save transformation pipeline and model as pickle file
    save_model(f, model_name='./' + str(i), verbose=False)


In [None]:
# Stack the per-series best-model rows into one leaderboard frame
# (row-wise concatenation is pd.concat's default axis).
concat_results = pd.concat(all_results)
concat_results.head()

In [None]:
# Daily calendar from 2013-01-01 through 2019-12-31, both ends included.
first_day, last_day = '2013-01-01', '2019-12-31'
all_dates = pd.date_range(first_day, last_day, freq='D')

In [None]:
# Start from an empty frame; the scoring columns are added in the next cell.
score_df = pd.DataFrame(data=None)
score_df.head(5)

In [None]:
# Add the date column and the same calendar features that were derived for
# the training data.
# The `.dt` accessor computes them in one vectorised pass instead of a
# Python-level loop over every Timestamp.
score_df['date'] = all_dates
score_df['month'] = score_df['date'].dt.month
score_df['year'] = score_df['date'].dt.year
score_df['day_of_week'] = score_df['date'].dt.dayofweek   # Monday=0 ... Sunday=6
score_df['day_of_year'] = score_df['date'].dt.dayofyear
score_df.head()

In [None]:
from pycaret.regression import load_model, predict_model

# Score every series in the slice: reload its saved pipeline from disk,
# predict over the full calendar in `score_df`, and tag the rows with the
# series name.
all_score_df = []
for series_name in tqdm(data['time_series'].unique()[300:400]):
    pipeline = load_model('./' + str(series_name), verbose=False)
    predictions = predict_model(pipeline, data=score_df)
    all_score_df.append(predictions.assign(time_series=series_name))

# one long frame holding the predictions of all series
concat_df = pd.concat(all_score_df, axis=0)
concat_df.head()

In [None]:
# Left-join the observed data onto the predictions by (date, time_series).
# Every predicted row is kept; columns coming from `data` are missing for
# dates that have no match in `data`.
prediction_cols = ['date', 'time_series', 'Label']
join_keys = ['date', 'time_series']
final_df = concat_df[prediction_cols].merge(data, how='left', on=join_keys)
final_df.head()

In [None]:
# Plot observed sales against the model prediction (`Label`) for each series
# in the slice.
for i in data['time_series'].unique()[300:400]:
    sub_df = final_df[final_df['time_series'] == i]

    fig, ax = plt.subplots(figsize=(20, 10))

    # observed sales
    sns.lineplot(x='date', y='sales', data=sub_df, label='sales', ax=ax)

    # model prediction (this is the `Label` column, not a rolling average)
    sns.lineplot(x='date', y='Label', data=sub_df, label='Label', ax=ax)

    ax.set_ylabel("Values")
    ax.grid()
    ax.set_title(i)

    # Render and release each figure as we go, so 100 figures are not held
    # open at the same time.
    plt.show()
    plt.close(fig)