## Credits:
*  We used the notebook: https://www.kaggle.com/rohitsingh9990/m5-lgbm-fe as a baseline for this notebook.

In [None]:
import pandas as pd
import os
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import preprocessing, metrics
import gc
import joblib
import warnings
from sklearn.neighbors import KNeighborsRegressor
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
import datetime

In [None]:
# Directory holding the input CSV files. The trailing slash matters: file
# names are appended to this value by plain string concatenation.
INPUT_DIR_PATH = '../input/m5-forecasting-accuracy/'

# Functions

In [None]:
def print_dir(path):
    """Print the full path of every file found under ``path``, recursively.

    Directories themselves are not printed; only the files they contain.
    """
    for root, _subdirs, names in os.walk(path):
        for full_path in (os.path.join(root, name) for name in names):
            print(full_path)


### Functions for loading the data

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return it.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are re-cast to the first candidate type whose representable range
    strictly contains the column's min and max (a value equal to a type's
    limit moves on to the next wider type). Integer candidates are tried in
    the order int8, int16, int32, int64; float candidates are float16 and
    float32, with float64 as the fallback. All other columns are untouched.

    NOTE(review): float16 keeps only about three significant decimal digits,
    so float columns that fit its range lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    megabyte = 1024**2

    mem_before = df.memory_usage().sum() / megabyte
    for name in df.columns:
        current = df[name].dtypes
        if current not in numerics:
            continue

        lowest = df[name].min()
        highest = df[name].max()

        if str(current)[:3] == 'int':
            # First integer type whose open range (min, max) holds the data;
            # None means no candidate fits and the column is left as it is.
            target = next(
                (kind for kind in int_candidates
                 if lowest > np.iinfo(kind).min and highest < np.iinfo(kind).max),
                None,
            )
            if target is not None:
                df[name] = df[name].astype(target)
        else:
            # Floats always get re-cast; float64 is the catch-all (it is also
            # what an all-NaN column ends up as, since NaN fails every test).
            target = next(
                (kind for kind in float_candidates
                 if lowest > np.finfo(kind).min and highest < np.finfo(kind).max),
                np.float64,
            )
            df[name] = df[name].astype(target)

    mem_after = df.memory_usage().sum() / megabyte
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, 100 * (mem_before - mem_after) / mem_before))
    return df


def read_data():
    """Load the four input CSV files from ``INPUT_DIR_PATH``.

    The sell-price and calendar tables are passed through
    ``reduce_mem_usage`` right after loading; the sales and submission
    tables are returned exactly as read. A row/column summary is printed for
    every table except the submission.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sell_prices_df, calendar_df, sales_train_validation_df,
        submission_df)``.
    """
    def load(file_name):
        # File names are appended to the directory constant as-is.
        return pd.read_csv(INPUT_DIR_PATH + file_name)

    sell_prices_df = reduce_mem_usage(load('sell_prices.csv'))
    print('Sell prices has {} rows and {} columns'.format(*sell_prices_df.shape))

    calendar_df = reduce_mem_usage(load('calendar.csv'))
    print('Calendar has {} rows and {} columns'.format(*calendar_df.shape))

    sales_train_validation_df = load('sales_train_validation.csv')
    print('Sales train validation has {} rows and {} columns'.format(*sales_train_validation_df.shape))

    submission_df = load('sample_submission.csv')
    return sell_prices_df, calendar_df, sales_train_validation_df, submission_df

def encode_categorical(df, cols):
    """Label-encode the listed columns of ``df`` in place, keeping NaN as NaN.

    For each column a fresh ``LabelEncoder`` is fitted on the non-null values
    only. The encoded values are written back aligned on the original index,
    so rows that were null have no encoded value and stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for name in cols:
        present = df[name].dropna()
        codes = preprocessing.LabelEncoder().fit_transform(present)
        # Index alignment on assignment leaves the originally-null rows as NaN.
        df[name] = pd.Series(codes, index=present.index)

    return df



### Function for merging the data

In [None]:
def melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 55000000, merge = False):
    """Build the long-format table from the wide sales table and the submission template.

    Steps, in order:

    1. Melt ``sales_train_validation`` so that every row is one (id, day)
       pair with its ``demand``, shrink it with ``reduce_mem_usage`` and keep
       only the last ``nrows`` rows.
    2. Split ``submission`` into the rows whose id contains ``'validation'``
       (``test1``) and ``'evaluation'`` (``test2``), rename their value
       columns to ``d_1914..d_1941`` and ``d_1942..d_1969``, attach the
       product columns and melt them the same way.
    3. Concatenate train / test1 / test2 with a ``part`` marker column, then
       drop the ``test2`` rows again.
    4. When ``merge`` is true, left-join the calendar on the day label and
       the sell prices on ``(store_id, item_id, wm_yr_wk)``.

    The shape of the intermediate tables is printed along the way.

    Parameters
    ----------
    calendar : pandas.DataFrame
        Calendar table with a ``d`` column plus ``weekday``, ``wday``,
        ``month`` and ``year`` columns (the latter four are not carried into
        the result). The caller's frame is NOT modified.
    sell_prices : pandas.DataFrame
        Price table with ``store_id``, ``item_id`` and ``wm_yr_wk`` columns.
    sales_train_validation : pandas.DataFrame
        Wide sales table: the six id columns plus one column per day.
    submission : pandas.DataFrame
        Submission template with an ``id`` column followed by 28 value columns.
    nrows : int, default 55000000
        Number of trailing rows of the melted sales table to keep.
    merge : bool, default False
        Whether to join the calendar and price tables onto the result.

    Returns
    -------
    pandas.DataFrame
        The combined train + test1 table.

    Notes
    -----
    The product lookup is derived from the truncated sales table, so ids that
    do not occur in the last ``nrows`` rows get NaN product columns in the
    test rows.
    """
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # melt sales data, get it ready for training
    sales_train_validation = pd.melt(sales_train_validation, id_vars = id_columns, var_name = 'day', value_name = 'demand')
    print('Melted sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
    sales_train_validation = reduce_mem_usage(sales_train_validation)

    # keep only a trailing sample of the melted rows for faster training
    sales_train_validation = sales_train_validation.iloc[-nrows:, :]

    # separate the test dataframes; .copy() gives each part its own frame so
    # the rename and id rewrite below never act on a slice of `submission`
    test1_rows = [row for row in submission['id'] if 'validation' in row]
    test2_rows = [row for row in submission['id'] if 'evaluation' in row]
    test1 = submission[submission['id'].isin(test1_rows)].copy()
    test2 = submission[submission['id'].isin(test2_rows)].copy()

    # change column names to the day labels that follow the training period
    test1.columns = ['id'] + [f'd_19{x}' for x in range(14, 42)]
    test2.columns = ['id'] + [f'd_19{x}' for x in range(42, 70)]

    # get product table
    product = sales_train_validation[id_columns].drop_duplicates()

    # merge with product table; the product ids use the '_validation' suffix,
    # so the evaluation ids are renamed for the join and restored afterwards
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation', regex = False)
    test1 = test1.merge(product, how = 'left', on = 'id')
    test2 = test2.merge(product, how = 'left', on = 'id')
    test2['id'] = test2['id'].str.replace('_validation', '_evaluation', regex = False)

    # bring the test frames into the same long format as the training data
    test1 = pd.melt(test1, id_vars = id_columns, var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = id_columns, var_name = 'day', value_name = 'demand')

    sales_train_validation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'

    data = pd.concat([sales_train_validation, test1, test2], axis = 0)

    del sales_train_validation, test1, test2

    print(data.shape)

    # drop some calendar features; work on a new frame so the caller's
    # calendar keeps its columns and the function can be called again
    calendar = calendar.drop(['weekday', 'wday', 'month', 'year'], axis = 1)

    # delete test2 for now
    data = data[data['part'] != 'test2']

    if merge:
        # the notebook crashes with the entire dataset, hence the nrows sample
        data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
        data.drop(['day'], inplace = True, axis = 1)
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

    gc.collect()

    return data

def transform(data):
    """Fill missing event labels and label-encode the categorical columns.

    The four event columns get ``'unknown'`` in place of missing values.
    Every column listed in ``cat`` is then replaced by the integer codes of
    a ``LabelEncoder`` fitted on that column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all of the columns named below; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the result back: fillna(inplace=True) on a column selection
        # is chained assignment and does not reliably update `data`.
        data[feature] = data[feature].fillna('unknown')

    # 'before_holiday' matches the column name created on the calendar table
    # (it was previously misspelled here, which raised a KeyError).
    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'before_holiday', 'after_holiday']
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])
    return data


def simple_fe(data):
    """Add lag, rolling-window, price and calendar features to ``data``.

    All demand and price features are computed per ``id`` group, in the
    existing row order of each group. Columns added:

    * ``shift_t28`` .. ``shift_t31``: demand shifted by 28-31 rows.
    * ``rolling_std_t7``, ``rolling_std_t30``: rolling std of the demand
      shifted by 28 rows.
    * ``rolling_mean_28_7``, ``rolling_mean_28_28``, ``rolling_mean_28_35``:
      rolling mean of the demand shifted by 28 rows.
    * ``lag_price_t1``, ``price_change_t1``: previous-row price and the
      relative change from it.
    * ``rolling_price_std_t7/t30``, ``rolling_price_mean_t7/t30``: rolling
      statistics of the (unshifted) sell price.
    * ``year``, ``quarter``, ``month``, ``week``, ``day``, ``dayofweek``, the
      six ``is_*_start`` / ``is_*_end`` flags and ``is_weekend``, derived
      from ``date`` (which is converted to datetime).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``id``, ``demand``, ``sell_price`` and ``date`` columns;
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    # Build each grouping once; they only read `id`, `demand` and
    # `sell_price`, none of which is modified below.
    demand_by_id = data.groupby(["id"])["demand"]
    price_by_id = data.groupby(['id'])['sell_price']

    # rolling demand features
    for val in range(28, 32):
        data[f"shift_t{val}"] = demand_by_id.transform(lambda x: x.shift(val))
    for val in [7, 30]:
        data[f"rolling_std_t{val}"] = demand_by_id.transform(lambda x: x.shift(28).rolling(val).std())
    for lag in [28]:
        for window in [7, 28, 35]:
            data[f"rolling_mean_{lag}_{window}"] = demand_by_id.transform(lambda x: x.shift(lag).rolling(window).mean())

    # price features
    data['lag_price_t1'] = price_by_id.transform(lambda x: x.shift(1))
    data['price_change_t1'] = (data['lag_price_t1'] - data['sell_price']) / (data['lag_price_t1'])
    data['rolling_price_std_t7'] = price_by_id.transform(lambda x: x.rolling(7).std())
    data['rolling_price_std_t30'] = price_by_id.transform(lambda x: x.rolling(30).std())

    # time features
    data['date'] = pd.to_datetime(data['date'])
    attrs = [
        "year", "quarter", "month", "week", "day", "dayofweek",
        "is_year_end", "is_year_start", "is_quarter_end",
        "is_quarter_start", "is_month_end", "is_month_start",
    ]

    # own features
    data['rolling_price_mean_t30'] = price_by_id.transform(lambda x: x.rolling(30).mean())
    data['rolling_price_mean_t7'] = price_by_id.transform(lambda x: x.rolling(7).mean())

    dates = data['date'].dt
    for attr in attrs:
        dtype = np.int16 if attr == "year" else np.int8
        if attr == "week" and hasattr(dates, "isocalendar"):
            # `.dt.week` no longer exists in recent pandas; the ISO calendar
            # week is its replacement and yields the same numbers.
            values = dates.isocalendar().week
        else:
            values = getattr(dates, attr)
        data[attr] = values.astype(dtype)
    data["is_weekend"] = data["dayofweek"].isin([5, 6]).astype(np.int8)

    return data

# Main part

### Load the data

In [None]:
# Load the raw tables.
prices, calendar, sales, submission = read_data()

# Run-wide settings.
NUM_ITEMS = len(sales)  # 30490
DAYS_PRED = 28
nrows = 20_000_000

# Label-encode the categorical columns of each table (NaN stays NaN), then
# shrink the numeric dtypes.
calendar_categoricals = ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
sales_categoricals = ["item_id", "dept_id", "cat_id", "store_id", "state_id"]
price_categoricals = ["item_id", "store_id"]

calendar = reduce_mem_usage(encode_categorical(calendar, calendar_categoricals))
sales = reduce_mem_usage(encode_categorical(sales, sales_categoricals))
prices = reduce_mem_usage(encode_categorical(prices, price_categoricals))

## Before and after holiday feature calculation

In [None]:
# Values of calendar['event_name_1'] that count as an event for the
# before/after-holiday checks below (used for `in` membership tests only).
# The list holds the floats 0-29 in no particular order, with 0-3 repeated.
# NOTE(review): presumably these are the label-encoded event codes produced
# by encode_categorical -- confirm against the encoded calendar.
event_names = [26., 28., 22., 11., 12., 25., 23., 20., 21.,  2., 15., 14.,
       17., 16.,  7.,  9., 24.,  5., 10.,  3.,  8.,  6., 29., 27.,  1.,
        0., 18., 19., 13.,  4., 1.,  0.,  3.,  2.] 

In [None]:
def check_before_holiday(date):
    """Tell whether the day after ``date`` carries a listed event.

    Looks up the row of the module-level ``calendar`` whose ``date`` equals
    ``date`` plus one day and checks its ``event_name_1`` value against the
    module-level ``event_names`` list.

    Returns ``False`` when the calendar has no row for the following day or
    when that row's ``event_name_1`` is not in ``event_names``.
    """
    following_day = date + datetime.timedelta(days=1)
    events = calendar.loc[calendar['date'] == following_day, 'event_name_1'].values
    if events.size == 0:
        return False
    return events[0] in event_names
            
        

    

In [None]:
def check_after_holiday(date):
    """Tell whether the day before ``date`` carries a listed event.

    Looks up the row of the module-level ``calendar`` whose ``date`` equals
    ``date`` minus one day and checks its ``event_name_1`` value against the
    module-level ``event_names`` list.

    Returns ``False`` when the calendar has no row for the preceding day or
    when that row's ``event_name_1`` is not in ``event_names``.
    """
    preceding_day = date - datetime.timedelta(days=1)
    events = calendar.loc[calendar['date'] == preceding_day, 'event_name_1'].values
    if events.size == 0:
        return False
    return events[0] in event_names
            
    

In [None]:
# Parse the date column into datetimes so that timedelta arithmetic and
# equality comparisons against shifted dates work in the holiday checks.
calendar['date'] = pd.to_datetime(calendar['date'])

In [None]:
# Flag every calendar day whose following / preceding day has an
# event_name_1 value listed in event_names.
# Each flag depends only on the row's own date, so the check functions are
# mapped over the date column once, instead of looping with iterrows() and
# writing through a full boolean mask for every single row.
calendar['before_holiday'] = calendar['date'].apply(check_before_holiday).astype(bool)
calendar['after_holiday'] = calendar['date'].apply(check_after_holiday).astype(bool)


In [None]:
# Preview the first 30 calendar rows, including the before_holiday and
# after_holiday columns.
calendar.head(30)

### Melting and Merging the data

In [None]:
# Build the long-format table from the last `nrows` melted sales rows plus the
# submission rows, with calendar and sell-price columns joined on (merge=True).
data = melt_and_merge(calendar, prices, sales, submission, nrows = nrows, merge = True)
data.shape

In [None]:
# Drop the references to the source tables so their memory can be reclaimed.
del calendar, sales, prices

### Feature extraction

In [None]:
# Add the lag, rolling, price and date-derived feature columns, then list the
# resulting columns with their dtypes and memory footprint.
data = simple_fe(data)
data.info()

In [None]:
# Downcast the newly created numeric feature columns and show the effect on
# dtypes and memory usage.
data = reduce_mem_usage(data)
data.info()

# Storing features

In [None]:
# Split the feature table into 5 consecutive row chunks and pickle each one
# as Features1.pkl ... Features5.pkl in the working directory.
folds = np.array_split(data, 5)
for counter, fold in enumerate(folds, start=1):
    fold.to_pickle(f'Features{counter}.pkl')