In [61]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import re
import xgboost

from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from statsmodels.graphics.tsaplots import plot_pacf
from xgboost import XGBRegressor
import lightgbm

In [62]:
all_csv = pd.read_csv('cleaned.csv.gz',
                 dtype = {
                     'store_nbr' : 'category',
                     'family' : 'category',
                     'sales': 'float',
                     'city': 'category',
                     'state': 'category',
                     'type': 'category',
                     'holiday_type': 'category',
                     'holiday_transferred': 'category'
                 },
                  parse_dates=['date'])
all_csv['date'] = pd.to_datetime(all_csv['date']).dt.to_period('D')

In [63]:
all = all_csv.copy()  # we can start experimenting from here without reloading the csv file

In [64]:
# this is for experimentation

filter_by_stores = None  # note: please use string here (unlike Mine.ipynb)
filter_by_family = None
filter_by_dates = None

#filter_by_stores = ['15']  # note: please use string here (unlike Mine.ipynb)
#filter_by_family = ['PRODUCE', 'AUTOMOBILE']
#filter_by_dates = '2014-06-05'

In [65]:
if filter_by_dates == None:
    train_start_date = '2015-06-15'
else:
    train_start_date = filter_by_dates
train_end_date = '2017-08-15'
test_start_date = '2017-08-16'
test_end_date = '2017-08-31'

In [66]:
if filter_by_family != None:
    all = all[all['family'].isin(filter_by_family)]
if filter_by_stores != None:
    all = all[all['store_nbr'].isin(filter_by_stores)]
if filter_by_dates != None:
    all = all[all['date'] >= filter_by_dates]

In [67]:
all['store_nbr'].unique()

['1', '10', '11', '12', '13', ..., '54', '6', '7', '8', '9']
Length: 54
Categories (54, object): ['1', '10', '11', '12', ..., '6', '7', '8', '9']

In [68]:
all.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3036528 entries, 0 to 3036527
Data columns (total 38 columns):
 #   Column               Dtype    
---  ------               -----    
 0   date                 period[D]
 1   store_nbr            category 
 2   family               category 
 3   sales                float64  
 4   onpromotion          int64    
 5   sales_lag_01         float64  
 6   sales_lag_02         float64  
 7   sales_lag_03         float64  
 8   sales_lag_04         float64  
 9   sales_lag_05         float64  
 10  sales_lag_06         float64  
 11  sales_lag_07         float64  
 12  sales_lag_08         float64  
 13  sales_lag_09         float64  
 14  sales_lag_10         float64  
 15  sales_lag_11         float64  
 16  sales_lag_12         float64  
 17  sales_lag_13         float64  
 18  sales_lag_14         float64  
 19  sales_lag_15         float64  
 20  sales_lag_16         float64  
 21  sales_lag_17         float64  
 22  sales_lag_18      

In [69]:


dtypes = all.dtypes.to_dict()

for col_name, typ in dtypes.items():
    if typ == 'float64':
        all =all.astype({col_name: 'float32'})
    if typ == 'int64':
        all =all.astype({col_name: 'int32'})

all['store_nbr']=all.store_nbr.astype('int8')

all['onpromotion']=all.onpromotion.astype('int16')
all = all.drop(['sales_lag_01', 'sales_lag_02', 'sales_lag_03', 'sales_lag_04', 'sales_lag_05', 'sales_lag_06', 'sales_lag_07', 'sales_lag_08', 'sales_lag_09', 'sales_lag_10'], axis=1)
all = all.drop(['sales_lag_11', 'sales_lag_12', 'sales_lag_13', 'sales_lag_14', 'sales_lag_15', 'sales_lag_16', 'sales_lag_17', 'sales_lag_18', 'sales_lag_19', 'sales_lag_20'], axis=1)

In [70]:
all.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3036528 entries, 0 to 3036527
Data columns (total 18 columns):
 #   Column               Dtype    
---  ------               -----    
 0   date                 period[D]
 1   store_nbr            int8     
 2   family               category 
 3   sales                float32  
 4   onpromotion          int16    
 5   city                 category 
 6   state                category 
 7   type                 category 
 8   cluster              int32    
 9   month                int32    
 10  day_of_month         int32    
 11  day_of_year          int32    
 12  week_of_year         int32    
 13  day_of_week          int32    
 14  weekday              int32    
 15  year                 int32    
 16  holiday_type         category 
 17  holiday_transferred  category 
dtypes: category(6), float32(1), int16(1), int32(8), int8(1), period[D](1)
memory usage: 153.5 MB


## One Hot Encoding

In [71]:
def one_hot_encode(df):
    ohe = OneHotEncoder()
    #ohe.fit_transform(df[column])
    return pd.get_dummies(data=df, columns=['store_nbr', 'family', 'city', 'state', 'type',
                                     'cluster', 'holiday_type', 'holiday_transferred', 'weekday'])    

In [72]:
all_ohe = one_hot_encode(all)
all_ohe = all_ohe.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))  # remove bad char in column names

X = all_ohe[all_ohe['date'] <= train_end_date]
X = X.drop(['sales'], axis=1)
y = all_ohe[['date', 'sales']][all_ohe['date'] <= train_end_date]
y.set_index('date', inplace=True)

X_test = all_ohe[all_ohe['date'] >= test_start_date]
X_test = X_test.drop(['sales'], axis=1)

X.drop('date', axis=1, inplace=True)
X_test.drop('date', axis=1, inplace=True)
y.set_index(X.index, inplace=True)

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1)

In [73]:
X_train

Unnamed: 0,onpromotion,month,day_of_month,day_of_year,week_of_year,day_of_week,year,store_nbr_1,store_nbr_2,store_nbr_3,...,holiday_type_Bridge,holiday_transferred_False,holiday_transferred_True,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
1198478,0,11,4,308,45,1,2014,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1649869,0,7,15,196,29,2,2015,0,0,0,...,0,0,0,0,0,1,0,0,0,0
615038,0,12,12,346,50,3,2013,0,0,0,...,0,0,0,0,0,0,1,0,0,0
413595,0,8,21,233,34,2,2013,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1398041,0,2,24,55,9,1,2015,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836489,0,4,15,105,16,1,2014,0,0,1,...,0,0,0,0,1,0,0,0,0,0
491263,0,10,3,276,40,3,2013,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2568076,2,12,12,347,50,0,2016,0,0,0,...,0,0,0,1,0,0,0,0,0,0
491755,0,10,3,276,40,3,2013,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# LSTM Experiment


In [74]:
from keras.models import Sequential
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.layers import LSTM, Dense, Dropout, RepeatVector, TimeDistributed
from tqdm.keras import TqdmCallback

In [75]:
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()

In [83]:
generator = TimeseriesGenerator(X_train_np, y_train_np, length=7, batch_size = 1)


In [84]:
 
print('%s => %s' % (generator[0][0], generator[0][1]))

[[[ 0 11  4 ...  0  0  0]
  [ 0  7 15 ...  0  0  0]
  [ 0 12 12 ...  0  0  0]
  ...
  [ 0  2 24 ...  0  0  0]
  [ 0 11 15 ...  0  1  0]
  [ 0  3 30 ...  0  0  1]]] => [[0.6931472]]


In [85]:
model = Sequential()
model.add(LSTM(units = 50, activation='relu', return_sequences = True, input_shape = (7, len(X_train.columns)))) 
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 7, 50)             44200     
                                                                 
 dense_4 (Dense)             (None, 7, 1)              51        
                                                                 
Total params: 44,251
Trainable params: 44,251
Non-trainable params: 0
_________________________________________________________________


In [86]:
history = model.fit(generator, steps_per_epoch=len(generator), epochs=1, batch_size=1, verbose=1)

 454159/2256005 [=====>........................] - ETA: 1:07:55 - loss: 12.9875

## Test (Moment of Truth)

In [None]:
def main_predict(model, X_test):
    X_test_mod = X_test.copy()
    output = np.array([])
    start_day, end_day = X_test['day_of_month'].min(), X_test['day_of_month'].max()
        # we lost the dates, but we still have day_of_month, which is good enough for our experiment
        
    for day in range(start_day, end_day + 1):
        pred = model.predict(X_test_mod[X_test_mod['day_of_month'] == day])
        pred[pred < 0] = 0
        print(pred)
        output = np.concatenate([output, pred], axis=0)
        for future in range(day + 1, end_day + 1):
            X_test_mod.loc[X_test_mod[X_test_mod['day_of_month'] == future].index,
                           f'sales_lag_{(future - day):02d}'] = pred
            # fill out future values now that this sales figure is available
            
    return output

In [None]:
y_pred_test = main_predict(lgb, X_test)

In [None]:
delta_index = 3008016 - 3000888  # we inserted 4 Christmas days, 4 x 54 x 33 = 7128, which is the difference
submission = pd.DataFrame({'id': X_test.index - delta_index, 'sales': np.expm1(y_pred_test)})
submission.to_csv('submission.csv', index=False)

In [41]:
 
print('%s => %s' % (generator[0][0], generator[0][1]))

[[[0.         0.91629076 0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.6931472  0.6931472  ... 0.         0.         0.        ]
  ...
  [0.         1.7917595  1.0986123  ... 0.         0.         0.        ]
  [0.         1.0986123  0.6931472  ... 0.         0.         0.        ]
  [0.         5.379091   5.3112087  ... 0.         0.         0.        ]]] => [[5.6811314]]


In [None]:
model = Sequential()
model.add(LSTM(units = 4, activation='relu', return_sequences = True, input_shape = (25, len(X_train.columns)))) 
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.summary()

In [42]:
 
model = Sequential()    
    
    
model.add(LSTM(units = 50, activation='relu', return_sequences = True, input_shape = (25, len(X_train.columns)))) 
model.add(Dropout(0.3))
        
     
model.add(LSTM(units = 50,  activation='relu', return_sequences = True))                                    
model.add(Dropout(0.2))
        
    
model.add(LSTM(units = 50))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation = "linear"))
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_11 (LSTM)              (None, 25, 50)            48200     
                                                                 
 dropout_9 (Dropout)         (None, 25, 50)            0         
                                                                 
 lstm_12 (LSTM)              (None, 25, 50)            20200     
                                                                 
 dropout_10 (Dropout)        (None, 25, 50)            0         
                                                                 
 lstm_13 (LSTM)              (None, 50)                20200     
                                                                 
 dropout_11 (Dropout)        (None, 50)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                

In [55]:
model.compile(optimizer='adam', loss='mean_squared_error')
#model.fit(generator, steps_per_epoch=len(generator), epochs=10, verbose=2)
 


In [None]:
model.fit(generator, steps_per_epoch=len(generator), epochs=10, batch_size=1, verbose=1)


In [24]:
model = Sequential()
model.add(LSTM(units = 4, activation='relu', return_sequences = True, input_shape = (25, len(X_train.columns)))) 
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 25, 4)             2800      
                                                                 
 dense (Dense)               (None, 25, 1)             5         
                                                                 
Total params: 2,805
Trainable params: 2,805
Non-trainable params: 0
_________________________________________________________________


2022-11-30 12:28:23.043748: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [25]:
history = model.fit(generator, steps_per_epoch=len(generator), epochs=25, batch_size=1, verbose=1)

Epoch 1/25
   3174/2255987 [..............................] - ETA: 3:09:42 - loss: 6.4036

KeyboardInterrupt: 