In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` in place to smaller dtypes.

    For each numeric column, NaN values are replaced by ``min - 1`` (integer
    dtypes cannot hold NaN). The column is then cast to the smallest
    unsigned/signed integer type that holds its range when every value is
    integral, and to ``float32`` otherwise. Non-numeric columns (strings,
    categories, datetimes, ...) are left untouched. Memory usage before and
    after the conversion is printed.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    tuple
        ``(props, NAlist)`` where ``NAlist`` lists the columns that contained
        NaN or infinite values (only NaN is replaced; infinities are kept and
        force the column to ``float32``).
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        column = props[col]
        # Only numeric columns can be downcast.
        if not pd.api.types.is_numeric_dtype(column):
            continue

        mx = column.max()
        mn = column.min()

        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(column).all():
            NAlist.append(col)
            # Assign the result back instead of a chained inplace fillna,
            # which does not reliably modify the frame.
            column = column.fillna(mn - 1)
            props[col] = column
            # The fill value is the new minimum; refresh it so the range
            # checks below do not pick an unsigned type for a negative fill.
            mn = column.min()

        # test if column can be converted to an integer
        IsInt = False
        if np.isfinite(column).all():
            # Sum absolute differences: signed differences of positive and
            # negative fractions could cancel and fake an integer column.
            IsInt = (column - column.astype(np.int64)).abs().sum() < 0.01

        # Make Integer/unsigned Integer datatypes
        if IsInt:
            if mn >= 0:
                if mx < 255:
                    props[col] = column.astype(np.uint8)
                elif mx < 65535:
                    props[col] = column.astype(np.uint16)
                elif mx < 4294967295:
                    props[col] = column.astype(np.uint32)
                else:
                    props[col] = column.astype(np.uint64)
            else:
                if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                    props[col] = column.astype(np.int8)
                elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                    props[col] = column.astype(np.int16)
                elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                    props[col] = column.astype(np.int32)
                elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                    props[col] = column.astype(np.int64)

        # Make float datatypes 32 bit
        else:
            props[col] = column.astype(np.float32)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

# Create a Train Set and Test Set

In [None]:
def createTargetFiles(sales_train_val, start_range, end_range):
    """Build a zero-filled placeholder frame for a range of day columns.

    Parameters
    ----------
    sales_train_val : pandas.DataFrame
        Frame that contains an ``id`` column; one output row is produced per
        input row.
    start_range, end_range : int
        First and last day number (both inclusive) for which a ``d_<n>``
        column is created.

    Returns
    -------
    pandas.DataFrame
        The ``id`` column followed by ``uint8`` columns
        ``d_<start_range>`` .. ``d_<end_range>``, all set to 0.
    """
    cat_val_set = [f"d_{cat}" for cat in range(start_range, end_range+1)]

    # Reuse the caller's index so the axis=1 concat below pairs the rows
    # one-to-one even when the input does not carry the default 0..n-1 index
    # (a fresh RangeIndex would otherwise yield NaN-padded extra rows).
    val_demand_set = pd.DataFrame(
        np.zeros((len(sales_train_val), len(cat_val_set)), dtype='uint8'),
        columns=cat_val_set,
        index=sales_train_val.index,
    )

    sales_train_basic_feat = sales_train_val.loc[:, ['id']]
    val_set = pd.concat([sales_train_basic_feat, val_demand_set], axis=1)

    return val_set

In [None]:
def createTrainTestSet(train_set, start_range, end_range, calenderSet, priceSet):
    """Assemble a long-format (one row per id and day) feature table.

    Steps, in order:

    1. Append zero-filled target columns ``d_<start_range>``..``d_<end_range>``
       to ``train_set`` and drop the day columns ``d_1``..``d_1037``.
    2. Melt the remaining day columns into ``d`` / ``demand`` rows.
    3. Merge calendar data on ``d`` and prices on
       ``store_id``/``item_id``/``wm_yr_wk``; missing prices are replaced by
       the mean price of the same ``dept_id``/``store_id``.
    4. Label-encode the categorical columns.
    5. Add per-id demand lags (28, 30, 31) and per-id rolling mean/std
       (windows 30 and 31) of the demand shifted by one day.
    6. Drop rows without a 31-day lag and downcast dtypes.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Wide sales frame with ``id``, ``item_id``, ``dept_id``, ``cat_id``,
        ``store_id``, ``state_id`` and ``d_<n>`` columns.
    start_range, end_range : int
        Inclusive range of day numbers to add as zero-filled targets.
    calenderSet : pandas.DataFrame
        Calendar frame; must contain ``d``, ``weekday``, ``wm_yr_wk`` and the
        ``year``/``date``/``wday`` columns that are dropped here.
    priceSet : pandas.DataFrame
        Price frame keyed by ``store_id``, ``item_id``, ``wm_yr_wk`` with a
        ``sell_price`` column.

    Returns
    -------
    pandas.DataFrame
        The feature table. The price column keeps the ``sell_price_x`` name
        produced by the merge with the average prices.
    """
    test_set = createTargetFiles(train_set, start_range, end_range)

    fullSet = train_set.merge(test_set, how='left', on='id')

    # Drops d_1 .. d_1037 (presumably to bound memory; the cut-off is
    # hard-coded).
    dropD = [f"d_{val}" for val in range(1, 1069-31)]
    fullSet.drop(dropD, axis=1, inplace=True)

    fullSet = pd.melt(fullSet,
                    id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                    var_name = 'day', value_name = 'demand')

    fullSet.columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', 'demand']

    calenderSet = calenderSet.drop(['year', 'date', 'wday'], axis = 1)

    fullSet = fullSet.merge(calenderSet, how='left', on='d')

    # Weekend flag as a uniform integer column (1 = Saturday/Sunday, else 0);
    # mixing int 1 with the string '0' would leave an object column.
    fullSet['weekday'] = fullSet['weekday'].isin(['Saturday', 'Sunday']).astype('uint8')

    # Use the price frame passed in, not a notebook-level global.
    fullSet = fullSet.merge(priceSet, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])
    fullSet.drop(['item_id','wm_yr_wk'], axis=1, inplace=True)

    avg_sellPrice_val = fullSet.groupby(by=['dept_id', 'store_id']).aggregate({'sell_price': 'mean'})
    avg_sellPrice_val.reset_index(drop=False, inplace=True)

    # Both frames carry 'sell_price', so the merge yields sell_price_x (row
    # price) and sell_price_y (department/store average).
    fullSet = fullSet.merge(avg_sellPrice_val, how='left', on=['dept_id', 'store_id'])

    # Release the references first so the collection can reclaim them.
    del avg_sellPrice_val, calenderSet
    gc.collect()

    # Assign back rather than using a chained inplace fillna, which does not
    # reliably write through to the frame.
    fullSet['sell_price_x'] = fullSet['sell_price_x'].fillna(fullSet['sell_price_y'])
    fullSet.drop('sell_price_y', axis=1, inplace=True)

    fullSet,_ = reduce_mem_usage(fullSet)

    cat_list = ['dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2',
           'event_type_2']

    for x, col in enumerate(cat_list):
        print(x)
        fullSet[col] = LabelEncoder().fit_transform(fullSet[col].astype(str))
        gc.collect()
        # Downcast right away so the freshly encoded column does not linger
        # at its wider default integer width.
        fullSet,_ = reduce_mem_usage(fullSet)

    demand_by_id = fullSet['demand'].groupby(fullSet['id'])

    fullSet['lag_t28'] = demand_by_id.shift(28)
    fullSet['lag_t30'] = demand_by_id.shift(30)
    fullSet['lag_t31'] = demand_by_id.shift(31)

    # The rolling window must be evaluated inside each id group. Calling
    # .rolling() on the result of groupby().shift() rolls over the whole
    # frame, whose consecutive rows belong to different ids after the melt.
    fullSet['rolling_mean30'] = demand_by_id.transform(lambda s: s.shift(1).rolling(30).mean())
    fullSet['rolling_mean31'] = demand_by_id.transform(lambda s: s.shift(1).rolling(31).mean())
    fullSet['rolling_std30'] = demand_by_id.transform(lambda s: s.shift(1).rolling(30).std())
    fullSet['rolling_std31'] = demand_by_id.transform(lambda s: s.shift(1).rolling(31).std())

    # Keep only rows that have a full 31-day history.
    fullSet = fullSet[fullSet['lag_t31'].notnull()]

    fullSet,_ = reduce_mem_usage(fullSet)

    return fullSet

In [None]:
def createTrainTestSet_Experiment(train_set, start_range, end_range, calenderSet, priceSet):
    """Assemble the experimental long-format feature table (one row per id and day).

    Steps, in order:

    1. Append zero-filled target columns ``d_<start_range>``..``d_<end_range>``
       to ``train_set`` and drop the day columns ``d_1``..``d_1038``.
    2. Melt the remaining day columns into ``d`` / ``demand`` rows.
    3. Merge calendar data on ``d`` and prices on
       ``store_id``/``item_id``/``wm_yr_wk``; missing prices become 0.
    4. Drop the event, ``month`` and ``snap_WI`` columns and label-encode
       ``dept_id``, ``cat_id``, ``snap_CA`` and ``snap_TX``.
    5. Add per-id lag and rolling features of demand and price.
    6. Drop rows without a 30-day lag and downcast dtypes.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Wide sales frame with ``id``, ``item_id``, ``dept_id``, ``cat_id``,
        ``store_id``, ``state_id`` and ``d_<n>`` columns.
    start_range, end_range : int
        Inclusive range of day numbers to add as zero-filled targets.
    calenderSet : pandas.DataFrame
        Calendar frame; must contain ``d``, ``weekday``, ``wm_yr_wk`` and the
        other calendar columns referenced below.
    priceSet : pandas.DataFrame
        Price frame keyed by ``store_id``, ``item_id``, ``wm_yr_wk`` with a
        ``sell_price`` column.

    Returns
    -------
    pandas.DataFrame
        The feature table.
    """
    test_set = createTargetFiles(train_set, start_range, end_range)

    fullSet = train_set.merge(test_set, how='left', on='id')

    # Drops d_1 .. d_1038 (presumably to bound memory; the cut-off is
    # hard-coded).
    dropD = [f"d_{val}" for val in range(1, 1069-30)]
    fullSet.drop(dropD, axis=1, inplace=True)

    fullSet = pd.melt(fullSet,
                    id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                    var_name = 'day', value_name = 'demand')

    fullSet.columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', 'demand']

    calenderSet = calenderSet.drop(['year', 'date', 'wday'], axis = 1)

    fullSet = fullSet.merge(calenderSet, how='left', on='d')

    # Weekend flag as a uniform integer column (1 = Saturday/Sunday, else 0);
    # mixing int 1 with the string '0' would leave an object column.
    fullSet['weekday'] = fullSet['weekday'].isin(['Saturday', 'Sunday']).astype('uint8')

    fullSet = fullSet.merge(priceSet, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])
    fullSet.drop(['item_id','wm_yr_wk', 'state_id', 'store_id'], axis=1, inplace=True)

    # Release the local references first so the collection can reclaim them.
    del calenderSet, priceSet
    gc.collect()

    fullSet['sell_price'] = fullSet['sell_price'].fillna(0)

    # These columns are not part of the result; dropping them before the
    # encoding step avoids label-encoding data that is discarded anyway.
    fullSet.drop(['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'month','snap_WI'], axis=1, inplace=True)

    fullSet,_ = reduce_mem_usage(fullSet)

    cat_list = ['dept_id', 'cat_id', 'snap_CA', 'snap_TX']

    for x, col in enumerate(cat_list):
        print(x)
        fullSet[col] = LabelEncoder().fit_transform(fullSet[col].astype(str))
        gc.collect()
        # Downcast right away so the freshly encoded column does not linger
        # at its wider default integer width.
        fullSet,_ = reduce_mem_usage(fullSet)

    # Use lag and rolling features to enrich the data.
    # Every window is evaluated inside its id group: calling .rolling() on
    # the result of groupby().shift() would roll over the whole frame, whose
    # consecutive rows belong to different ids after the melt.
    demand_by_id = fullSet['demand'].groupby(fullSet['id'])
    price_by_id = fullSet['sell_price'].groupby(fullSet['id'])

    fullSet['lag_t25'] = demand_by_id.shift(25)
    fullSet['lag_t30'] = demand_by_id.shift(30)
    fullSet['lag_t7'] = demand_by_id.shift(7)
    fullSet['lag_t1'] = demand_by_id.shift(1)
    fullSet['lag_t2'] = demand_by_id.shift(2)
    fullSet['rolling_mean7'] = demand_by_id.transform(lambda s: s.shift(1).rolling(7).mean())
    fullSet['rolling_std7'] = demand_by_id.transform(lambda s: s.shift(1).rolling(7).std())
    fullSet['rolling_mean2'] = demand_by_id.transform(lambda s: s.shift(1).rolling(2).mean())
    fullSet['rolling_std2'] = demand_by_id.transform(lambda s: s.shift(1).rolling(2).std())
    fullSet['rolling_mean_t30'] = demand_by_id.transform(lambda s: s.shift(28).rolling(30).mean())
    fullSet['rolling_std_t30'] = demand_by_id.transform(lambda s: s.shift(28).rolling(30).std())
    fullSet['rolling_skew_t30'] = demand_by_id.transform(lambda s: s.shift(28).rolling(30).skew())
    fullSet['rolling_kurt_t30'] = demand_by_id.transform(lambda s: s.shift(28).rolling(30).kurt())

    fullSet['rolling_pricemean7'] = price_by_id.transform(lambda s: s.shift(1).rolling(7).mean())
    fullSet['pricelag_t1'] = price_by_id.shift(1)

    # NOTE(review): missing prices were set to 0 above, so these ratios can
    # divide by zero and produce inf/NaN - confirm this is acceptable downstream.
    fullSet['demand_price_mean7'] = fullSet['rolling_mean7']/fullSet['rolling_pricemean7']
    fullSet['demand_price_mean2'] = fullSet['rolling_mean2']/price_by_id.transform(lambda s: s.shift(1).rolling(2).mean())

    # Keep only rows that have a full 30-day history.
    fullSet = fullSet[fullSet['lag_t30'].notnull()]

    fullSet,_ = reduce_mem_usage(fullSet)

    return fullSet

# Create a Validation Set

In [None]:
# Read the validation-period sales and downcast their dtypes in one step.
train_val, _ = reduce_mem_usage(
    pd.read_csv('../input/m5-forecasting-accuracy/sales_train_validation.csv')
)

In [None]:
# Read the calendar table and downcast its dtypes in one step.
cal, _ = reduce_mem_usage(
    pd.read_csv('../input/m5-forecasting-accuracy/calendar.csv')
)

In [None]:
# Read the sell-price table; it is not downcast in this cell.
sp = pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv')

In [None]:
# Build the validation feature table with zero-filled targets for d_1914..d_1941.
validationSet = createTrainTestSet_Experiment(
    train_set=train_val,
    start_range=1914,
    end_range=1941,
    calenderSet=cal,
    priceSet=sp,
)

In [None]:
# Write the validation feature table to the working directory, without the index.
validationSet.to_csv('validation_set.csv', index=False)

In [None]:
# Re-read the sell-price table and downcast its dtypes in one step.
sp, _ = reduce_mem_usage(
    pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv')
)

In [None]:
# Show the first rows of the validation feature table.
validationSet.head()

In [None]:
# Remove the name; the frame was already written to validation_set.csv above.
del validationSet

# Create the Evaluation Set

In [None]:
# Read the evaluation-period sales and downcast their dtypes in one step.
train_eval, _ = reduce_mem_usage(
    pd.read_csv('../input/m5-forecasting-accuracy/sales_train_evaluation.csv')
)

In [None]:
# Build the evaluation feature table with zero-filled targets for d_1942..d_1969.
evaluationSet = createTrainTestSet_Experiment(
    train_set=train_eval,
    start_range=1942,
    end_range=1969,
    calenderSet=cal,
    priceSet=sp,
)

In [None]:
# Write the evaluation feature table to the working directory, without the index.
evaluationSet.to_csv('evaluation_set.csv', index=False)

# Check the Correlation Matrix Between Features and Target

In [None]:
# Pairwise correlations of every column except the 'id' and 'd' identifiers.
feature_columns = evaluationSet.drop(columns=['id', 'd'])
corr = feature_columns.corr()

In [None]:
# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 9))
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
sns.heatmap(corr, mask=mask, cmap='BrBG', vmin=-1, vmax=1, annot=True, ax=ax)
ax.set_title('Correlation Matrix Between Features and Target')
plt.show()