# Objective

* Make a baseline model that predicts the validation period (28 days).
* This competition has 2 stages, so the main objective is to make a model that can predict the demand for the next 28 days

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns
# import lightgbm as lgb
# import dask_xgboost as xgb
import dask.dataframe as dd
from sklearn import preprocessing, metrics
import gc
import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))  
from fastai import *
from fastai.tabular import *

In [None]:
# helps with memory
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column is left as-is. The frame is modified in
    place and also returned.

    Args:
        df: DataFrame to shrink.
        verbose: when true, print the resulting memory footprint and the
            percentage saved.

    Returns:
        The same ``df`` object, with downcast columns.

    Note:
        Float columns are narrowed based on their value *range* only, so a
        cast to float16/float32 can lose precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits int8.
            # If nothing matches (e.g. an empty column, whose min/max are NaN)
            # the column keeps its current dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                bounds = np.finfo(candidate)
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max not comparable (NaN/inf).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


# function to read the data and merge it (ignoring some columns, this is a very fast model)
def read_data():
    """Load the four competition CSV files from the Kaggle input directory.

    The calendar and sell-price tables are passed through ``reduce_mem_usage``
    right after loading; the sales and submission tables are returned as read.
    A row/column summary is printed for the first three tables.

    Returns:
        Tuple ``(calendar, sell_prices, sales_train_validation, submission)``
        of DataFrames.
    """
    print('Reading files...')

    calendar = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv'))
    n_rows, n_cols = calendar.shape
    print('Calendar has {} rows and {} columns'.format(n_rows, n_cols))

    sell_prices = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv'))
    n_rows, n_cols = sell_prices.shape
    print('Sell prices has {} rows and {} columns'.format(n_rows, n_cols))

    # The wide sales table is not downcast here.
    sales_train_validation = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')
    n_rows, n_cols = sales_train_validation.shape
    print('Sales train validation has {} rows and {} columns'.format(n_rows, n_cols))

    submission = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')

    return calendar, sell_prices, sales_train_validation, submission


# set up the data for fe and modeling
def melt_and_merge(calendar, sell_prices, sales_train_validation, submission, merge = False):
    """Build one long-format frame holding the training rows and the test1 placeholder rows.

    Steps:
      1. Melt the wide sales table into one row per (id, day) with a 'demand' column.
      2. Split the sample submission into its 'validation' (test1) and
         'evaluation' (test2) halves, rename their 28 forecast columns to the
         matching ``d_<n>`` day labels, attach the product columns, and melt them.
      3. Concatenate train/test1/test2, tagged through a 'part' column.
      4. Keep only rows dated on or after 2015-02-01 whose month is Feb-May,
         and drop the test2 rows.
      5. If ``merge`` is true, join the calendar columns (minus
         weekday/wday/month/year) and the sell prices.

    Args:
        calendar: calendar table; must contain 'date', 'd', 'weekday', 'wday',
            'month' and 'year' (plus 'wm_yr_wk' when ``merge`` is true).
            It is not modified.
        sell_prices: price table keyed by store_id, item_id, wm_yr_wk.
        sales_train_validation: wide sales table (one column per day).
        submission: sample submission with an 'id' column and 28 forecast columns.
        merge: when true, also join calendar and price columns.

    Returns:
        The combined long-format DataFrame.
    """
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # melt sales data, get it ready for training
    sales_train_validation = pd.melt(sales_train_validation, id_vars = id_columns, var_name = 'day', value_name = 'demand')
    print('Melted sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
    sales_train_validation = reduce_mem_usage(sales_train_validation)

    # Separate the two test frames. Copy them so the column renames and the
    # 'id' rewrite below never write into a view of the caller's `submission`.
    test1 = submission[submission['id'].str.contains('validation', regex = False)].copy()
    test2 = submission[submission['id'].str.contains('evaluation', regex = False)].copy()

    # Rename the forecast columns to the day labels they stand for:
    # test1 covers d_1914..d_1941, test2 covers d_1942..d_1969.
    test1.columns = ['id'] + ['d_{}'.format(day) for day in range(1914, 1942)]
    test2.columns = ['id'] + ['d_{}'.format(day) for day in range(1942, 1970)]

    # get product table
    product = sales_train_validation[id_columns].drop_duplicates()

    # Merge with the product table. The product ids all end in '_validation',
    # so test2 ids are switched temporarily to make the join match.
    test2['id'] = test2['id'].str.replace('_evaluation','_validation')
    test1 = test1.merge(product, how = 'left', on = 'id')
    test2 = test2.merge(product, how = 'left', on = 'id')
    test2['id'] = test2['id'].str.replace('_validation','_evaluation')

    test1 = pd.melt(test1, id_vars = id_columns, var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = id_columns, var_name = 'day', value_name = 'demand')

    sales_train_validation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'

    data = pd.concat([sales_train_validation, test1, test2], axis = 0)

    del sales_train_validation, test1, test2

    # Attach the calendar date of every day label so rows can be filtered by date.
    temp_calendar = calendar[['date', 'd']]
    data = pd.merge(data, temp_calendar, how = 'left', left_on = ['day'], right_on = ['d'])
    del temp_calendar

    data = reduce_mem_usage(data)

    # temporary, so I can train and learn fast
    data['date'] = pd.to_datetime(data['date'])
    data['month'] = data['date'].dt.month

    print('got past the month conversion')

    # NOTE: this leaves gaps in every series (June-January are removed), so a
    # row-offset lag computed downstream is not a fixed number of calendar
    # days across a gap.
    data = data[(data['date'] >= '2015-02-01') & (data['month'].isin([2,3,4,5]))]
    data = data.drop(['d', 'date'], axis = 1)

    # Drop some calendar features on a local copy: dropping in place would
    # change the caller's frame and make a second call fail with a KeyError.
    calendar = calendar.drop(['weekday', 'wday', 'month', 'year'], axis = 1)

    # delete test2 for now (test2 is for final part of contest which I never got to)
    data = data[data['part'] != 'test2']

    if merge:
        # notebook crash with the entire dataset (maybe use tensorflow, dask, pyspark)
        data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
        data = data.drop(['d', 'day'], axis = 1)
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

    gc.collect()

    return data
        
# Load the raw tables, then build the long-format frame with calendar and price columns joined in.
calendar, sell_prices, sales_train_validation, submission = read_data()
data = melt_and_merge(calendar, sell_prices, sales_train_validation, submission, merge = True)

# Quick look at the result: size, departments present, first rows.
print(data.shape)
print(data['dept_id'].unique())
print(data.head())

In [None]:
# calendar.head()
# temp_calendar = calendar[['date', 'd']]
# temp_calendar.head()
# sales_train_validation.head()
# sales_train_validation['dept_id'].unique()

# print(data.shape)
data.head()
# data['part'].unique()
# data['month'].unique()
# data[data['month'].isin([3,4,5])].head()

In [None]:
## Exploratory box
# data.hist(column='sell_price')
# data.columns
# data.groupby(['item_id'])['date'].agg(['count']).sort_values('count', ascending=True)
# data.sort_values(['item_id', 'date'], ascending=False).head(10)
# data['date'].min()
# data.groupby(['item_id'])['demand'].agg({"median_demand":["median"]})
# test = data['sell_price'].apply(lambda x: 1 if x < 1 else 0)

* We have the data to build our first model, so let's build a baseline and predict the validation data (test1 in our case).

In [None]:
# data = data.sort_values(by=['id', 'date'])
# data['test'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(28))
# data[data['id'] == 'HOBBIES_1_001_CA_1_validation']
# data.drop(['test'], inplace = True, axis = 1)
# data.head()

In [None]:
def transform(data):
    """Replace missing calendar-event labels with the string 'unknown'.

    Args:
        data: DataFrame containing the columns 'event_name_1', 'event_type_1',
            'event_name_2' and 'event_type_2'. It is modified in place.

    Returns:
        The same ``data`` object, with no missing values in those four columns.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the selected column: the in-place form acts
        # on an intermediate object and does not reliably update `data`
        # (it is a no-op under pandas Copy-on-Write).
        data[feature] = data[feature].fillna('unknown')

    # Categorical columns are deliberately not label-encoded here; that is
    # left to the fastai preprocessing used later in the notebook.
    return data

def simple_fe(data):
    """Add lag, rolling, price, calendar and flag features to the long-format frame.

    Args:
        data: DataFrame with at least the columns 'id', 'date', 'demand' and
            'sell_price'. The input frame is not modified.

    Returns:
        A new DataFrame sorted by (id, date) with a fresh 0..n-1 index, with
        'date' converted to datetime and these columns added or overwritten:
        lag_t28/29/30, rolling_mean_t7, rolling_std_t7, rolling_mean_t30,
        rolling_std_t30, rolling_price_std_t7, rolling_price_std_t30, year,
        month, week, day (day of month), dayofweek, price_binary_under_1,
        price_binary_under_6 and med_demand_above_0.

    Note:
        Lags and rolling windows are counted in rows within each id, not in
        calendar days, so they equal day offsets only where the rows of a
        series are consecutive days.
    """
    data = data.sort_values(by=['id', 'date']).reset_index(drop=True)

    # Group once and reuse; none of the features below change these columns.
    demand_by_id = data.groupby(['id'])['demand']
    price_by_id = data.groupby(['id'])['sell_price']

    # Demand features. Everything is shifted by at least 28 rows so a row only
    # sees demand from 28 or more rows earlier in its own series.
    data['lag_t28'] = demand_by_id.transform(lambda x: x.shift(28))
    data['lag_t29'] = demand_by_id.transform(lambda x: x.shift(29))
    data['lag_t30'] = demand_by_id.transform(lambda x: x.shift(30))
    data['rolling_mean_t7'] = demand_by_id.transform(lambda x: x.shift(28).rolling(7).mean())
    data['rolling_std_t7'] = demand_by_id.transform(lambda x: x.shift(28).rolling(7).std())
    data['rolling_mean_t30'] = demand_by_id.transform(lambda x: x.shift(28).rolling(30).mean())
    data['rolling_std_t30'] = demand_by_id.transform(lambda x: x.shift(28).rolling(30).std())

    # Price features (not shifted).
    data['rolling_price_std_t7'] = price_by_id.transform(lambda x: x.rolling(7).std())
    data['rolling_price_std_t30'] = price_by_id.transform(lambda x: x.rolling(30).std())

    # Time features.
    data['date'] = pd.to_datetime(data['date'])
    date_parts = data['date'].dt
    data['year'] = date_parts.year
    data['month'] = date_parts.month
    if hasattr(date_parts, 'isocalendar'):
        # Series.dt.week was removed in pandas 2.0; isocalendar().week is the
        # replacement. Cast back to int64, the dtype dt.week used to return.
        data['week'] = date_parts.isocalendar().week.astype('int64')
    else:
        # pandas < 1.1 has no isocalendar().
        data['week'] = date_parts.week
    data['day'] = date_parts.day
    data['dayofweek'] = date_parts.dayofweek

    # Price flags. A NaN price compares False and therefore maps to 0.
    data['price_binary_under_1'] = (data['sell_price'] <= 1).astype('int64')
    data['price_binary_under_6'] = (data['sell_price'] <= 6).astype('int64')

    # 1 when the median demand of the whole series (all rows present for that
    # id) is above zero. transform() broadcasts the per-id median to the rows
    # directly, avoiding a merge that would copy the full frame.
    median_demand = demand_by_id.transform('median')
    data['med_demand_above_0'] = (median_demand > 0).astype('int64')

    return data

    
# ok - predict reformats the predictions you made in run_fastai
def predict(test, submission):
    """Reshape long-format predictions into the submission layout.

    Args:
        test: DataFrame with 'id', 'date' and 'demand' columns, holding one
            predicted demand per (id, date) and exactly 28 distinct dates.
        submission: sample submission frame; supplies the row order of the
            predicted ids and the untouched 'evaluation' rows.

    Returns:
        DataFrame with columns id, F1..F28: the predicted rows (ordered as in
        ``submission``) followed by the original 'evaluation' rows.
    """
    # One row per id, one column per date; dates become F1..F28 in sorted order.
    wide = (test[['id', 'date', 'demand']]
            .pivot(index = 'id', columns = 'date', values = 'demand')
            .reset_index())
    wide.columns = ['id'] + ['F' + str(day) for day in range(1, 29)]

    # Rows of the sample submission that belong to the evaluation stage are kept as-is.
    eval_ids = [sub_id for sub_id in submission['id'] if 'evaluation' in sub_id]
    evaluation_part = submission.loc[submission['id'].isin(eval_ids)]

    # The inner merge keeps only ids that have predictions, in submission order.
    validation_part = pd.merge(submission[['id']], wide, on = 'id')

    # Writing the result to disk is left to the caller.
    return pd.concat([validation_part, evaluation_part])

def process_data(data):
    """Run the preprocessing pipeline: ``transform`` first, then ``simple_fe`` on its result."""
    return simple_fe(transform(data))

# Build the feature-engineered frame used by the modelling cells below.
fe_data = process_data(data)
print('all done')

In [None]:
# print(fe_data['dept_id'].unique())
# no values yet
# WHEN YOU RUN THE ENTIRE DATASET ON GCP, TAKE THIS OUT
print(fe_data.shape)
# fe_data.drop(['rolling_mean_t90', 'rolling_mean_t180', 'price_change_t365', 'rolling_kurt_t30'], inplace = True, axis = 1)
fe_data.head()

In [None]:
# print(fe_data.shape)
# fe_data = fe_data.dropna()
# print(fe_data.shape)
# fe_data[fe_data['rolling_mean_t30'].isna()].head()

In [None]:
# train_df = data[(data['date'] >= '2016-01-01') & (data['date'] <= '2016-04-24')].reset_index()
# Rows dated up to 2016-04-24 are used for fitting. Rows with a NaN in any
# column are dropped (e.g. rows whose lag/rolling windows are not yet filled).
train_df = fe_data[fe_data['date'] <= '2016-04-24']
train_df = train_df.dropna().reset_index()
# Validation split: the last 28 days of that range (2016-03-28 .. 2016-04-24),
# expressed as positions in the re-indexed train_df.
val_idx = (train_df[(train_df['date'] > '2016-03-27') & (train_df['date'] <= '2016-04-24')]).index
# Rows after 2016-04-24 are the ones to predict. Take an explicit copy: a
# 'demand' column of predictions is assigned into `test` later, and writing
# into a filtered slice of fe_data is a chained assignment.
test = fe_data[(fe_data['date'] > '2016-04-24')].copy()

In [None]:
# train_df.dtypes
# train_df.shape
# train_df.head()
# train_df[train_df['sell_price'].isna()].head()
# train_df['dept_id'].unique()
# print(len(val_idx))
# test.shape
# test.head()
# test['dept_id'].unique()
# test[test['rolling_skew_t30'].isna()].head()
# train_df.columns

In [None]:
# Working directory handed to fastai; the exported learner is re-loaded from this path later.
path = '/kaggle'
# Regression target column.
dep_var = 'demand'
# Columns treated as categorical. Note that 'day' is the day of month
# produced by simple_fe, not the original d_<n> day label.
cat_names = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'
            ,'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'
            ,'snap_CA', 'snap_TX', 'snap_WI'
            ,'price_binary_under_1', 'price_binary_under_6', 'med_demand_above_0'
            ,'year', 'month', 'week', 'day', 'dayofweek'
            ]
# Columns treated as continuous inputs.
cont_names = ['sell_price'
             ,'lag_t28', 'lag_t29', 'lag_t30'
             ,'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_std_t30'
             ,'rolling_price_std_t7', 'rolling_price_std_t30'
             # Disabled features: 'rolling_mean_t90','rolling_mean_t180', 'price_change_t1', 'price_change_t365', 'rolling_skew_t30', 'rolling_kurt_t30'
            ] 

# Only Categorify is applied; FillMissing and Normalize are deliberately left out,
# so continuous columns reach the model unscaled.
procs = [Categorify] # FillMissing,  Normalize

In [None]:
# Normalize(cat_names = [], cont_names = train_df['sell_price'])
# train_df.isnull().sum(axis = 0)

In [None]:
path

In [None]:
# Put the data into the form fastai expects: tabular items built from train_df,
# split into train/validation by val_idx, labelled with the float target
# `dep_var` (no log transform), with `test` attached as the test set, and
# batched with batch size 128.
new_data = (TabularList.from_df(df=train_df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs) 
                       .split_by_idx(val_idx)
                       .label_from_df(cols=dep_var, label_cls=FloatList, log=False) # perhaps change log= back to True?
                       .add_test(test)
                       .databunch(bs=128))

In [None]:
new_data.show_batch(rows=10)

In [None]:
## Constrain predictions to a fixed range: from 0 up to 1.2 times the largest
## demand seen in train_df (passed to tabular_learner as y_range).
max_y = np.max(train_df['demand'])*1.2 
y_range = torch.tensor([0, max_y], device=defaults.device)

In [None]:
# Earlier, larger configuration kept for reference:
# learn = tabular_learner(data, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04, y_range=y_range, metrics=rmse)
# Current model: two hidden layers (200, 100), light dropout, outputs bounded by y_range, RMSE reported as metric.
data_learn = tabular_learner(new_data, layers=[200,100], ps=[0.001,0.01], emb_drop=0.02, y_range=y_range, metrics=root_mean_squared_error)
# Train for one epoch at learning rate 1e-3.
data_learn.fit(1, 1e-3)
# Author's observations: a higher emb_drop seems to make the model perform worse (0.02 feels fine). A learning rate of 1e-3 seems to be the best all around.
# Changing ps doesn't seem to change anything. Layers of [200,100] seem to be optimal.

In [None]:
# Plot the losses recorded during the fit above.
data_learn.recorder.plot_losses()
# Author's note: this produces something unhelpful...

In [None]:
# Run the learning-rate finder and plot its result.
# NOTE(review): this runs after data_learn.fit(...) above, i.e. on the already-trained
# learner — confirm that is intended rather than running it before training.
data_learn.lr_find()
data_learn.recorder.plot()
# Author's reading of the plot: right before 1e-03 feels about right...

In [None]:
# try a different learning rate...
# data_learn.fit(1, 1e-4)

In [None]:
# data_learn.fit_one_cycle(1, 1e-5)

In [None]:
# data_learn.fit_one_cycle(1, 1e-2)

In [None]:
# data_learn.fit_one_cycle(1, 1e-3)

In [None]:
# Use the model to get predictions on `test` (the competition's validation period).
# The test rows are wrapped with the same column lists and procs used for training.
test_list = TabularList.from_df(test,
                            cat_names=cat_names,
                            cont_names=cont_names,
                            procs=procs)

# Export the trained learner, re-load it from `path` with the test items attached,
# and predict on the test set.
data_learn.export()
learner = load_learner(path, test=test_list)
# test_targs is returned alongside the predictions but is not used afterwards.
test_preds, test_targs = learner.get_preds(ds_type=DatasetType.Test)

In [None]:
# Take the first element of each prediction row and store the values as the
# 'demand' column of `test` (assigned positionally from a plain list).
test_predictions = [i[0] for i in test_preds.tolist()]
test['demand'] = test_predictions

In [None]:
# `predict` (defined above) does no modelling: it only pivots the filled-in
# `test` rows into the id, F1..F28 submission layout.
output_df = predict(test, submission)

In [None]:
output_df.head()

In [None]:
# import the modules we'll need
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

# function that takes in a dataframe and creates a text link to  
# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):  
    """Return an IPython ``HTML`` anchor that downloads ``df`` as a CSV file.

    The frame is serialised without its index, base64-encoded, and embedded in
    the link as a ``data:text/csv`` URI.

    Args:
        df: DataFrame to offer for download.
        title: link text.
        filename: name used for the link's ``download`` attribute.

    Returns:
        ``HTML`` object wrapping the anchor tag.
    """
    encoded = base64.b64encode(df.to_csv(index=False).encode()).decode()
    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(template.format(payload=encoded, title=title, filename=filename))

# Offer the submission for download. The name carries the .csv extension so the
# saved file is recognised as a CSV (the function's default is "data.csv").
create_download_link(df = output_df, filename ="attempt_2_embeddings.csv")

In [None]:
#### OLD

In [None]:
def run_fastai(data):
    """Legacy all-in-one training/prediction routine (the notebook marks this section as OLD).

    Builds a fastai tabular DataBunch with every input treated as continuous
    (``cat_names=[]``) and normalised, trains a [200, 100] network for one
    epoch, exports and re-loads the learner, and writes the predictions into
    the 'demand' column of ``test``.

    NOTE(review):
      * The ``data`` parameter is never used; the body reads the module-level
        ``train_df``, ``val_idx`` and ``test`` instead.
      * ``cont_names`` lists columns that ``simple_fe`` as it appears in this file
        does not create ('price_binary_under_5', 'rolling_mean_t90',
        'rolling_mean_t180', 'price_change_t1', 'price_change_t365',
        'rolling_skew_t30', 'rolling_kurt_t30').
      * The id/event columns are passed as continuous inputs; this presumably
        relied on the label-encoding step that is now commented out in
        ``transform`` — confirm before reuse.

    Returns:
        The module-level ``test`` frame with its 'demand' column overwritten by predictions.
    """
    

    path = '/kaggle'
    dep_var = 'demand'
    cont_names = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
            'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
            'snap_CA', 'snap_TX', 'snap_WI',
            # 'sell_price',
            'price_binary_under_1', 'price_binary_under_5', 'med_demand_above_0',
            'lag_t28', 'lag_t29', 'lag_t30',
            'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90',
            'rolling_mean_t180', 'rolling_std_t30', 'price_change_t1', 'price_change_t365',
            'rolling_price_std_t7', 'rolling_price_std_t30', 'rolling_skew_t30', 'rolling_kurt_t30',
            'month', 'week', 'day', 'dayofweek'
    ] # took out year as a variable - I don't think it helps looks forward and the rolling metrics should capture current 'level'
    
    procs = [Normalize] # Categorify, FillMissing, 

    # Same pipeline shape as the current cells: split by val_idx, float label, test set attached.
    new_data = (TabularList.from_df(df=train_df, path=path, cat_names = [], cont_names=cont_names, procs=procs) # cat_names=cat_names, 
                           .split_by_idx(val_idx)
                           .label_from_df(cols=dep_var, label_cls=FloatList, log=False) # perhaps change log= back to True?
                           .add_test(test)
                           .databunch())
    
    data_learn = tabular_learner(new_data, layers=[200,100], metrics=root_mean_squared_error)
    
    # One epoch at learning rate 1e-3.
    data_learn.fit(1, 1e-3)
    
    test_list = TabularList.from_df(test,
                                # cat_names=cat_names,
                                cont_names=cont_names,
                                procs=procs)
    
    # Export, re-load with the test items attached, then predict on the test set.
    data_learn.export()
    learner = load_learner(path, test=test_list)
    test_preds, test_targs = learner.get_preds(ds_type=DatasetType.Test)
    test_predictions = [i[0] for i in test_preds.tolist()]
    test['demand'] = test_predictions
    return test

In [None]:
# Legacy split (part of the OLD section).
# NOTE(review): this slices `data`, not `fe_data`, and rebinds the train_df / val_idx / test
# names used by the current cells above — presumably it predates `fe_data`; confirm before re-running.
# train_df = data[(data['date'] >= '2016-01-01') & (data['date'] <= '2016-04-24')].reset_index()
train_df = data[(data['date'] >= '2016-02-01') & (data['date'] <= '2016-04-24')].reset_index()
# Validation positions: rows dated 2016-03-28 .. 2016-04-24.
val_idx = (train_df[(train_df['date'] > '2016-03-27') & (train_df['date'] <= '2016-04-24')]).index
test = data[(data['date'] > '2016-04-24')]

In [None]:
test.head()

In [None]:
# Legacy configuration (OLD section): every input, including the id and event
# columns, is listed as continuous.
# NOTE(review): several names here ('price_binary_under_5', 'rolling_mean_t90',
# 'rolling_mean_t180', 'price_change_t1', 'price_change_t365', 'rolling_skew_t30',
# 'rolling_kurt_t30') are not created by simple_fe as it appears in this file.
path = '/kaggle'
dep_var = 'demand'
cont_names = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
        'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
        'snap_CA', 'snap_TX', 'snap_WI',
        # 'sell_price',
        'price_binary_under_1', 'price_binary_under_5', 'med_demand_above_0',
        'lag_t28', 'lag_t29', 'lag_t30',
        'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90',
        'rolling_mean_t180', 'rolling_std_t30', 'price_change_t1', 'price_change_t365',
        'rolling_price_std_t7', 'rolling_price_std_t30', 'rolling_skew_t30', 'rolling_kurt_t30',
        'month', 'week', 'day', 'dayofweek'
]

In [None]:
# Legacy preprocessing: Normalize only (Categorify and FillMissing disabled).
procs = [Normalize] # Categorify, FillMissing, 

In [None]:
# Legacy DataBunch: no categorical columns, default batch size.
new_data = (TabularList.from_df(df=train_df, path=path, cat_names = [], cont_names=cont_names, procs=procs) # cat_names=cat_names, 
                       .split_by_idx(val_idx)
                       .label_from_df(cols=dep_var, label_cls=FloatList, log=False) # perhaps change log= back to True?
                       .add_test(test)
                       .databunch())

In [None]:
# Legacy learner: [200, 100] hidden layers, no y_range or dropout arguments.
data_learn = tabular_learner(new_data, layers=[200,100], metrics=root_mean_squared_error)

# One epoch at learning rate 1e-3.
data_learn.fit(1, 1e-3)

In [None]:
# Wrap the test rows with the legacy (continuous-only) column list and procs.
test_list = TabularList.from_df(test,
                            # cat_names=cat_names,
                            cont_names=cont_names,
                            procs=procs)

In [None]:
# Export the legacy learner and re-load it from `path` with the test items attached.
data_learn.export()
learner = load_learner(path, test=test_list)

In [None]:
# Predict on the test set and store the first element of each prediction row
# as the 'demand' column of `test`.
test_preds, test_targs = learner.get_preds(ds_type=DatasetType.Test)
test_predictions = [i[0] for i in test_preds.tolist()]
test['demand'] = test_predictions