<a href="https://colab.research.google.com/github/rtegao/M5Forecasting/blob/master/MultiFeature.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [0]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras import optimizers, regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler

# Functions

In [0]:
def normalizing_data(df_train,df_val,df_test):
  try:
    data_list = [df_train.copy(),df_val.copy(),df_test.copy()]
    min_list = df_train.min(axis=0)
    max_list = df_train.max(axis=0)
    for j in range(len(data_list)):
      for i in range(df_train.shape[1]):
        data_list[j].iloc[:,i] = (data_list[j].iloc[:,i]-min_list[i])/abs((min_list[i]-max_list[i]))
      data_list[j]=data_list[j].values
  except:
    df_train = df_train.values.reshape(-1,1)
    df_val = df_val.values.reshape(-1,1)
    df_test = df_test.values.reshape(-1,1)

    data_list = [df_train.copy(),df_val.copy(),df_test.copy()]
    min_list = df_train.min(axis=0)
    max_list = df_train.max(axis=0)
    for j in range(3):
      for i in range(df_train.shape[1]):
        data_list[j][:,i] = (data_list[j][:,i]-min_list[i])/abs((min_list[i]-max_list[i]))
        # data_list[j] = (data_list[j]-min_list)/abs((min_list-max_list))

  df_train_norm = data_list[0].copy()
  df_val_norm = data_list[1].copy()
  df_test_norm = data_list[2].copy()

  print(df_train_norm.max(axis=0))
  print(df_train_norm.min(axis=0))
  print('\n')

  print(df_val_norm.max(axis=0))
  print(df_val_norm.min(axis=0))
  print('\n')

  print(df_test_norm.max(axis=0))
  print(df_test_norm.min(axis=0))

  print('-------------------------')

  print(df_train_norm.shape)
  print(df_val_norm.shape)
  print(df_test_norm.shape)
  return df_train_norm,df_val_norm,df_test_norm

In [0]:
def Regressor(future_steps,time_steps,n_features,df_train,df_val,df_test):
  # future_steps: how many days in the furure to predict 
  # time_steps: number of samples the LSTM model will use for predict 28 days in the future
  # n_features: number of features (LSTM input) 

  #################### TRAIN #################################################
  # Defining the number of samples for the train data
  n_samples = df_train.shape[0]
  x_train = []
  y_train = []

  # shaping data to have the number of time_steps in the x_train ans the 28 future values in the y_train 
  for i in range(time_steps,n_samples - time_steps): 
    x_train.append(df_train[i - time_steps : i, :]) 
    y_train.append(df_train[i : i+future_steps,-1]) 

  x_train,y_train = np.array(x_train),np.array(y_train) # transforming in array
  # LSTM need 3D tensor as input 
  x_train = np.reshape(x_train,(x_train.shape[0],x_train.shape[1],n_features))  

  ##################### VALIDATION ###############################################

  # to predict the first 28 values of the validation data, LSTM need time_steps values. 
  # to make this possible I'm using the last 60 values that came before the validation data.
  df_val = np.concatenate((df_train[-time_steps:],df_val))

  n_samples = df_val.shape[0]
  x_val = []
  y_val = []

  for i in range(time_steps,n_samples - time_steps):
    x_val.append(df_val[i - time_steps : i, :]) 
    y_val.append(df_val[i : i+future_steps,-1]) 

  x_val,y_val = np.array(x_val),np.array(y_val)
  x_val = np.reshape(x_val,(x_val.shape[0],x_val.shape[1],n_features))

  ####################### TEST ################################################
  # the same methodologie implemented in training and validation data are here in the test data

  df_test = np.concatenate((df_val[-time_steps:],df_test))
  n_samples = df_test.shape[0]
  x_test = []
  y_test = []

  for i in range(time_steps,n_samples - time_steps):
    x_test.append(df_test[i - time_steps : i, :]) 
    y_test.append(df_test[i : i+future_steps,-1]) 

  x_test,y_test = np.array(x_test),np.array(y_test)
  x_test = np.reshape(x_test,(x_test.shape[0],x_test.shape[1],n_features))

  x_train = x_train.astype(np.float32)
  y_train = y_train.astype(np.float32)

  x_val = x_val.astype(np.float32)
  y_val = y_val.astype(np.float32)

  x_test = x_test.astype(np.float32)
  y_test = y_test.astype(np.float32)
  
  return x_train,x_test,x_val,y_train,y_val,y_test

# Importing Datasets

In [0]:
# Contains information about the dates the products are sold.
df_calendar = pd.read_csv('/content/drive/My Drive/Specialization/Projects/Recurrent Neural Networks/M5 - Data/calendar.csv')

# Contains information about the price of the products sold per store and date.
df_price = pd.read_csv('/content/drive/My Drive/Specialization/Projects/Recurrent Neural Networks/M5 - Data/sell_prices.csv')

#Contains the historical daily unit sales data per product and store.
df_sales = pd.read_csv('/content/drive/My Drive/Specialization/Projects/Recurrent Neural Networks/M5 - Data/sales_train_validation.csv')

In [3]:
df_calendar

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,Wednesday,5,6,2016,d_1965,,,,,0,1,1
1965,2016-06-16,11620,Thursday,6,6,2016,d_1966,,,,,0,0,0
1966,2016-06-17,11620,Friday,7,6,2016,d_1967,,,,,0,0,0
1967,2016-06-18,11621,Saturday,1,6,2016,d_1968,,,,,0,0,0


In [4]:
df_price

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26
...,...,...,...,...
6841116,WI_3,FOODS_3_827,11617,1.00
6841117,WI_3,FOODS_3_827,11618,1.00
6841118,WI_3,FOODS_3_827,11619,1.00
6841119,WI_3,FOODS_3_827,11620,1.00


In [5]:
df_sales

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1874,d_1875,d_1876,d_1877,d_1878,d_1879,d_1880,d_1881,d_1882,d_1883,d_1884,d_1885,d_1886,d_1887,d_1888,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,d_1896,d_1897,d_1898,d_1899,d_1900,d_1901,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,3,1,3,1,2,2,0,1,1,1,1,0,0,0,0,0,1,0,4,2,3,0,1,2,0,0,0,1,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,2,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,3,4,2,1,4,1,3,5,0,6,6,0,0,0,0,3,1,2,1,3,1,0,2,5,4,2,0,3,0,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,3,2,2,2,3,1,0,0,0,0,1,0,4,4,0,1,4,0,1,0,1,0,1,1,2,0,1,1,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,0,3,1,4,1,0,0,3,4,4,0,0,1,0,1,1,7,7,3,6,3,3,7,12,4,2,7,5,12,5,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,0,5,0,1,1,3,1,1,0,4,2,0,1,2,1,1,0,0,0,0,3,1,1,1,2,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,2,4,1,8,5,2,7,5,3,5,20,8,10,3,3,4,7,2,3,5,6,3,4,1,2,5,1,2,2,2,...,1,1,2,0,0,2,0,0,0,2,0,1,1,0,0,0,0,1,1,2,1,4,0,0,2,2,4,0,0,0,2,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,0,2,1,1,2,4,3,0,0,0,0,0,1,1,2,1,1,1,1,2,0,1,0,3,0,0,1,0,0,1,0,3,1,3


# Join DataFrames

By filtering one specific product and using the some valuable features from each DataFrame, it's possible to creat dis 'final' DataFrame

In [6]:
df_food =  df_sales.iloc[df_sales[df_sales['id'] == 'FOODS_3_495_WI_3_validation'].loc[:,'d_1':].index[0],6:]
df_calendar_price = df_calendar.set_index('wm_yr_wk').join(df_price[(df_price['store_id'] == df_sales[df_sales['id'] == 'FOODS_3_495_WI_3_validation']['store_id'].values[0]) & 
         (df_price['item_id'] == df_sales[df_sales['id'] == 'FOODS_3_495_WI_3_validation']['item_id'].values[0])].set_index('wm_yr_wk')).reset_index()
df_final = df_food.to_frame('qty_sales').reset_index().set_index('index').join(df_calendar_price.set_index('d'))
df_final = df_final[['sell_price','weekday','month','event_name_1','event_type_1','event_name_2','event_type_2','snap_WI','qty_sales']]
df_final

Unnamed: 0_level_0,sell_price,weekday,month,event_name_1,event_type_1,event_name_2,event_type_2,snap_WI,qty_sales
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
d_1,1.77,Saturday,1,,,,,0,1
d_2,1.77,Sunday,1,,,,,0,2
d_3,1.77,Monday,1,,,,,0,4
d_4,1.77,Tuesday,2,,,,,0,2
d_5,1.77,Wednesday,2,,,,,1,3
...,...,...,...,...,...,...,...,...,...
d_1909,1.98,Wednesday,4,,,,,0,1
d_1910,1.98,Thursday,4,,,,,0,2
d_1911,1.98,Friday,4,,,,,0,4
d_1912,1.98,Saturday,4,,,,,0,2


# One Hot Encoding

The information from week day,month, event name (1 or 2) and event type (1 or 2) need to be converted to binary features to be a input to a LSTM model.

In [7]:
months = pd.get_dummies(df_final['month'])
months.columns = ['January','February','March','April','May','June','July','August','September','October','November','December']
months

Unnamed: 0_level_0,January,February,March,April,May,June,July,August,September,October,November,December
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
d_1,1,0,0,0,0,0,0,0,0,0,0,0
d_2,1,0,0,0,0,0,0,0,0,0,0,0
d_3,1,0,0,0,0,0,0,0,0,0,0,0
d_4,0,1,0,0,0,0,0,0,0,0,0,0
d_5,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
d_1909,0,0,0,1,0,0,0,0,0,0,0,0
d_1910,0,0,0,1,0,0,0,0,0,0,0,0
d_1911,0,0,0,1,0,0,0,0,0,0,0,0
d_1912,0,0,0,1,0,0,0,0,0,0,0,0


In [8]:
weekday = pd.get_dummies(df_final['weekday'])
# weekday.columns = ['friday','monday','saturday','sunday','thursday','tuesday','wednesday']
weekday

Unnamed: 0_level_0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
d_1,0,0,1,0,0,0,0
d_2,0,0,0,1,0,0,0
d_3,0,1,0,0,0,0,0
d_4,0,0,0,0,0,1,0
d_5,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
d_1909,0,0,0,0,0,0,1
d_1910,0,0,0,0,1,0,0
d_1911,1,0,0,0,0,0,0
d_1912,0,0,1,0,0,0,0


In [9]:
event_name_1 = pd.get_dummies(df_final['event_name_1'])
event_name_1

Unnamed: 0_level_0,Chanukah End,Christmas,Cinco De Mayo,ColumbusDay,Easter,Eid al-Fitr,EidAlAdha,Father's day,Halloween,IndependenceDay,LaborDay,LentStart,LentWeek2,MartinLutherKingDay,MemorialDay,Mother's day,NBAFinalsEnd,NBAFinalsStart,NewYear,OrthodoxChristmas,OrthodoxEaster,Pesach End,PresidentsDay,Purim End,Ramadan starts,StPatricksDay,SuperBowl,Thanksgiving,ValentinesDay,VeteransDay
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
d_1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
d_2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
d_3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
d_4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
d_5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
d_1909,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
d_1910,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
d_1911,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
d_1912,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
event_type_1 = pd.get_dummies(df_final['event_type_1'])
event_type_1

Unnamed: 0_level_0,Cultural,National,Religious,Sporting
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d_1,0,0,0,0
d_2,0,0,0,0
d_3,0,0,0,0
d_4,0,0,0,0
d_5,0,0,0,0
...,...,...,...,...
d_1909,0,0,0,0
d_1910,0,0,0,0
d_1911,0,0,0,0
d_1912,0,0,0,0


In [11]:
event_name_2 = pd.get_dummies(df_final['event_name_2'])
event_name_2

Unnamed: 0_level_0,Cinco De Mayo,Easter,Father's day,OrthodoxEaster
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d_1,0,0,0,0
d_2,0,0,0,0
d_3,0,0,0,0
d_4,0,0,0,0
d_5,0,0,0,0
...,...,...,...,...
d_1909,0,0,0,0
d_1910,0,0,0,0
d_1911,0,0,0,0
d_1912,0,0,0,0


In [12]:
event_type_2 = pd.get_dummies(df_final['event_type_2'])
event_type_2

Unnamed: 0_level_0,Cultural,Religious
index,Unnamed: 1_level_1,Unnamed: 2_level_1
d_1,0,0
d_2,0,0
d_3,0,0
d_4,0,0
d_5,0,0
...,...,...
d_1909,0,0
d_1910,0,0
d_1911,0,0
d_1912,0,0


In [13]:
df_final = pd.concat([months,weekday,event_name_1,event_type_1,event_name_2,event_type_2,df_final[['snap_WI','sell_price','qty_sales']]],axis=1)
df_final

Unnamed: 0_level_0,January,February,March,April,May,June,July,August,September,October,November,December,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Chanukah End,Christmas,Cinco De Mayo,ColumbusDay,Easter,Eid al-Fitr,EidAlAdha,Father's day,Halloween,IndependenceDay,LaborDay,LentStart,LentWeek2,MartinLutherKingDay,MemorialDay,Mother's day,NBAFinalsEnd,NBAFinalsStart,NewYear,OrthodoxChristmas,OrthodoxEaster,Pesach End,PresidentsDay,Purim End,Ramadan starts,StPatricksDay,SuperBowl,Thanksgiving,ValentinesDay,VeteransDay,Cultural,National,Religious,Sporting,Cinco De Mayo,Easter,Father's day,OrthodoxEaster,Cultural,Religious,snap_WI,sell_price,qty_sales
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1
d_1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.77,1
d_2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.77,2
d_3,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.77,4
d_4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.77,2
d_5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.77,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
d_1909,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.98,1
d_1910,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.98,2
d_1911,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.98,4
d_1912,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.98,2


# Feature Importance

After the One-Hot-Encoding the DataFrame has 62 differents features, let's see if all of them are important.

In [14]:
df_final = df_final.astype(float)
corr = df_final.corr()
corr

Unnamed: 0,January,February,March,April,May,June,July,August,September,October,November,December,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Chanukah End,Christmas,Cinco De Mayo,ColumbusDay,Easter,Eid al-Fitr,EidAlAdha,Father's day,Halloween,IndependenceDay,LaborDay,LentStart,LentWeek2,MartinLutherKingDay,MemorialDay,Mother's day,NBAFinalsEnd,NBAFinalsStart,NewYear,OrthodoxChristmas,OrthodoxEaster,Pesach End,PresidentsDay,Purim End,Ramadan starts,StPatricksDay,SuperBowl,Thanksgiving,ValentinesDay,VeteransDay,Cultural,National,Religious,Sporting,Cinco De Mayo.1,Easter.1,Father's day.1,OrthodoxEaster.1,Cultural.1,Religious.1,snap_WI,sell_price,qty_sales
January,1.000000,-0.093706,-0.098469,-0.094911,-0.089094,-0.087520,-0.089094,-0.089094,-0.087520,-0.089094,-0.087520,-0.089094,0.002455,-0.002974,0.002003,0.002003,0.002455,-0.002974,-0.002974,-0.015360,-0.015360,-0.013735,-0.015360,-0.015360,-0.015360,-0.015360,-0.013735,-0.015360,-0.015360,-0.015360,-0.016830,-0.016830,0.170611,-0.015360,-0.015360,-0.015360,-0.015360,0.170611,0.170611,-0.013735,-0.015360,-0.016830,-0.016830,-0.015360,-0.016830,-0.016830,-0.015360,-0.016830,-0.015360,-0.040962,0.068231,0.008235,-0.027556,-0.006862,-0.006862,-0.006862,-0.006862,-0.011891,-0.006862,-0.008217,0.012586,0.026323
February,-0.093706,1.000000,-0.102491,-0.098787,-0.092733,-0.091095,-0.092733,-0.092733,-0.091095,-0.092733,-0.091095,-0.092733,-0.001367,0.003885,-0.001831,-0.001831,-0.001367,-0.001367,0.003885,-0.015987,-0.015987,-0.014296,-0.015987,-0.015987,-0.015987,-0.015987,-0.014296,-0.015987,-0.015987,-0.015987,0.113899,0.113899,-0.015987,-0.015987,-0.015987,-0.015987,-0.015987,-0.015987,-0.015987,-0.014296,-0.015987,0.179608,0.015337,-0.015987,-0.017518,0.179608,-0.015987,0.179608,-0.015987,0.039611,0.016740,0.049470,0.092350,-0.007142,-0.007142,-0.007142,-0.007142,-0.012377,-0.007142,0.015693,-0.274961,0.036671
March,-0.098469,-0.102491,1.000000,-0.103809,-0.097447,-0.095726,-0.097447,-0.097447,-0.095726,-0.097447,-0.095726,-0.097447,-0.002742,-0.002742,0.001809,0.001809,0.002302,0.002302,-0.002742,-0.016800,-0.016800,-0.015022,-0.016800,0.052314,-0.016800,-0.016800,-0.015022,-0.016800,-0.016800,-0.016800,0.044701,0.044701,-0.016800,-0.016800,-0.016800,-0.016800,-0.016800,-0.016800,-0.016800,-0.015022,-0.016800,-0.018408,0.139364,-0.016800,0.170919,-0.018408,-0.016800,-0.018408,-0.016800,0.060520,-0.054313,0.042794,-0.030140,-0.007505,-0.007505,-0.007505,-0.007505,-0.013006,-0.007505,-0.004710,-0.147822,-0.002072
April,-0.094911,-0.098787,-0.103809,1.000000,-0.093925,-0.092267,-0.093925,-0.093925,-0.092267,-0.093925,-0.092267,-0.093925,0.000878,0.000878,0.000404,0.000404,-0.004320,0.000878,0.000878,-0.016193,-0.016193,-0.014479,-0.016193,0.090624,-0.016193,-0.016193,-0.014479,-0.016193,-0.016193,-0.016193,-0.017743,-0.017743,-0.016193,-0.016193,-0.016193,-0.016193,-0.016193,-0.016193,-0.016193,0.104914,0.161834,-0.017743,-0.017743,-0.016193,-0.017743,-0.017743,-0.016193,-0.017743,-0.016193,-0.002489,-0.052350,0.036559,-0.029050,-0.007234,0.072299,-0.007234,0.072299,0.033406,0.072299,0.010434,0.051286,-0.014871
May,-0.089094,-0.092733,-0.097447,-0.093925,1.000000,-0.086612,-0.088168,-0.088168,-0.086612,-0.088168,-0.086612,-0.088168,0.004821,-0.006133,-0.001098,-0.001098,0.004821,-0.000656,-0.000656,-0.015200,-0.015200,0.154160,-0.015200,-0.015200,-0.015200,-0.015200,-0.013592,-0.015200,-0.015200,-0.015200,-0.016655,-0.016655,-0.015200,0.172401,0.172401,-0.015200,0.022320,-0.015200,-0.015200,0.028346,-0.015200,-0.016655,-0.016655,-0.015200,-0.016655,-0.016655,-0.015200,-0.016655,-0.015200,0.088111,0.010319,-0.037854,-0.006235,0.077019,-0.006791,-0.006791,-0.006791,0.036645,-0.006791,-0.004262,0.048143,0.016479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cultural,-0.011891,-0.012377,-0.013006,0.033406,0.036645,0.037583,-0.011768,-0.011768,-0.011560,-0.011768,-0.011560,-0.011768,-0.016170,-0.016170,-0.016204,0.096930,-0.016170,-0.016170,-0.016170,-0.002029,-0.002029,-0.001814,-0.002029,-0.002029,-0.002029,-0.002029,-0.001814,-0.002029,-0.002029,-0.002029,-0.002223,-0.002223,-0.002029,-0.002029,-0.002029,0.256711,-0.002029,-0.002029,-0.002029,0.576594,-0.002029,-0.002223,-0.002223,-0.002029,-0.002223,-0.002223,-0.002029,-0.002223,-0.002029,-0.005410,-0.006559,0.155853,0.141419,0.577048,0.577048,0.577048,-0.000906,1.000000,-0.000906,0.028448,0.006426,0.021643
Religious,-0.006862,-0.007142,-0.007505,0.072299,-0.006791,-0.006671,-0.006791,-0.006791,-0.006671,-0.006791,-0.006671,-0.006791,-0.009331,-0.009331,-0.009351,0.055933,-0.009331,-0.009331,-0.009331,-0.001171,-0.001171,-0.001047,-0.001171,0.446746,-0.001171,-0.001171,-0.001047,-0.001171,-0.001171,-0.001171,-0.001283,-0.001283,-0.001171,-0.001171,-0.001171,-0.001171,-0.001171,-0.001171,-0.001171,-0.001047,-0.001171,-0.001283,-0.001283,-0.001171,-0.001283,-0.001283,-0.001171,-0.001283,-0.001171,0.167521,-0.003785,-0.003823,-0.002100,-0.000523,-0.000523,-0.000523,1.000000,-0.000906,1.000000,-0.016026,0.003708,-0.028917
snap_WI,-0.008217,0.015693,-0.004710,0.010434,-0.004262,0.002487,-0.004262,-0.004262,0.002487,-0.004262,0.002487,-0.004262,-0.002881,-0.002881,0.005603,-0.003922,-0.002881,0.006659,0.000299,0.007698,-0.035872,0.065324,0.029483,0.007698,-0.014087,-0.014087,-0.032076,-0.035872,-0.035872,0.029483,0.000478,-0.019414,-0.035872,-0.035872,0.029483,0.007698,0.029483,-0.035872,-0.035872,0.040974,0.029483,-0.019414,0.000478,-0.014087,-0.039306,0.040263,-0.035872,0.080047,0.073053,0.028829,-0.033113,-0.007695,0.045566,0.032636,-0.016026,0.032636,-0.016026,0.028448,-0.016026,1.000000,-0.027198,0.397771
sell_price,0.012586,-0.274961,-0.147822,0.051286,0.048143,0.047293,0.048143,0.048143,0.047293,0.048143,0.047293,0.048143,-0.000069,-0.000069,0.000173,0.000173,-0.000069,-0.000069,-0.000069,0.008300,0.008300,0.007422,0.008300,0.008300,0.008300,0.008300,0.007422,0.008300,0.008300,0.008300,-0.050081,-0.050081,0.008300,0.008300,0.008300,0.008300,0.008300,0.008300,0.008300,0.007422,0.008300,-0.050081,0.009094,0.008300,-0.050081,-0.050081,0.008300,-0.050081,0.008300,-0.027245,0.006292,-0.013594,-0.021443,0.003708,0.003708,0.003708,0.003708,0.006426,0.003708,-0.027198,1.000000,-0.070568


## To optmizer the feature selection process, I'll just select those features that have a coefficient higher than the average from the absolute values.

In [17]:
corr['qty_sales'].abs().mean()

0.04663586608262574

In [15]:
corr[corr['qty_sales'].abs()>corr['qty_sales'].abs().mean()]['qty_sales']

August        0.054631
November     -0.051075
Sunday        0.103054
Christmas    -0.064728
EidAlAdha    -0.055460
snap_WI       0.397771
sell_price   -0.070568
qty_sales     1.000000
Name: qty_sales, dtype: float64

In [18]:
dataset = df_final.loc[:,corr[corr['qty_sales'].abs()>corr['qty_sales'].abs().mean()]['qty_sales'].index]
dataset

Unnamed: 0_level_0,August,November,Sunday,Christmas,EidAlAdha,snap_WI,sell_price,qty_sales
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
d_1,0.0,0.0,0.0,0.0,0.0,0.0,1.77,1.0
d_2,0.0,0.0,1.0,0.0,0.0,0.0,1.77,2.0
d_3,0.0,0.0,0.0,0.0,0.0,0.0,1.77,4.0
d_4,0.0,0.0,0.0,0.0,0.0,0.0,1.77,2.0
d_5,0.0,0.0,0.0,0.0,0.0,1.0,1.77,3.0
...,...,...,...,...,...,...,...,...
d_1909,0.0,0.0,0.0,0.0,0.0,0.0,1.98,1.0
d_1910,0.0,0.0,0.0,0.0,0.0,0.0,1.98,2.0
d_1911,0.0,0.0,0.0,0.0,0.0,0.0,1.98,4.0
d_1912,0.0,0.0,0.0,0.0,0.0,0.0,1.98,2.0


# Spliting Data

In [24]:
train_limit = int(round((dataset.shape[0])*0.7,0))
val_limit = int(round((dataset.shape[0])*0.85,0))

df_train = dataset[:train_limit]
df_val = dataset[train_limit:val_limit]
df_test = dataset[val_limit:]

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(1339, 8)
(287, 8)
(287, 8)


# Normalizing Data

In [25]:
df_train_norm,df_val_norm,df_test_norm = normalizing_data(df_train,df_val,df_test)

[1. 1. 1. 1. 1. 1. 1. 1.]
[0. 0. 0. 0. 0. 0. 0. 0.]


[0.  1.  1.  1.  1.  1.  1.  0.8]
[0. 0. 0. 0. 0. 0. 1. 0.]


[1.         1.         1.         1.         1.         1.
 1.         0.66666667]
[0. 0. 0. 0. 0. 0. 1. 0.]
-------------------------
(1339, 8)
(287, 8)
(287, 8)


# Data Pre-Processing

In [0]:
time_steps = 84
future_steps = 28
n_features = df_train_norm.shape[1]
x_train,x_test,x_val,y_train,y_val,y_test = Regressor(future_steps = future_steps,time_steps = time_steps,n_features = n_features,df_train = df_train_norm,df_val = df_val_norm,df_test = df_test_norm)

# LSTM

In [0]:
regressor = Sequential()
regressor.add(LSTM(units=50, return_sequences=False, input_shape=(time_steps, n_features))) 
regressor.add(Dropout(0.2))
regressor.add(Dense(units=future_steps))

#Compiling the LSTM
adam = optimizers.Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999, decay = 0.0)
regressor.compile(optimizer = adam, loss = 'mse', metrics=['mae'])

#Fitting to traning set
history = regressor.fit(x_train, y_train, validation_data=(x_val,y_val), 
                        epochs = 500, batch_size=100, verbose = 0, shuffle= False)

plt.figure(figsize=(20,6))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['Train Loss', 'Val Loss'])

plt.figure(figsize=(20,6))
plt.plot(history.history['mae'])
plt.plot(history.history['val_mae'])
plt.legend(['Train Metric', 'Val Metric'])
plt.show()

print('\n')
print('loss and metric train data',regressor.evaluate(x_train,y_train))
print('loss and metric val data',regressor.evaluate(x_val,y_val))
print('loss and metric test data',regressor.evaluate(x_test,y_test))