# Rossmann

The goal is to predict the daily sales of each Rossmann store.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
pd.__version__

In [None]:

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)

PATH='../input/rossmann-store-sales/'

In [None]:
PATH_EXTERNAL = '../input/rossmann-store-extra/'      

# Create Datasets

In [None]:
table_names = ['train','test','store']
external_table_names = ['store_states','state_names','googletrend','weather']

In [None]:
# Load every CSV as a DataFrame: the competition tables first, followed by
# the external ones, so `tables` keeps the order of the two name lists.
main_tables = [pd.read_csv(f'{PATH}{fname}.csv', low_memory=False) for fname in table_names]
extra_tables = [pd.read_csv(f'{PATH_EXTERNAL}{fname}.csv', low_memory=False)
                for fname in external_table_names]
tables = main_tables + extra_tables

In [None]:
from IPython.display import HTML, display

In [None]:
for t in tables: display(t.head())

Return a summary of each table.

In [None]:
for t in tables: display(DataFrameSummary(t).summary())

# Data Cleaning / Feature Engineering


In [None]:
train, test, store, store_states, state_names, googletrend, weather = tables

In [None]:
len(train),len(test)

We need to turn the holiday flags into booleans for more convenient modelling.

In [None]:
# Collapse StateHoliday to a boolean: True for every value other than the string '0'.
for holiday_frame in (train, test):
    holiday_frame['StateHoliday'] = holiday_frame['StateHoliday'] != '0'

`join_df` joins tables on specific fields. It performs a left outer join of `right` onto `left`, using the given fields of each table.

In [None]:
def join_df(left, right, left_on, right_on=None, suffix='_y'):
    """Left-outer-join `right` onto `left`.

    left_on:  key column(s) in `left`.
    right_on: key column(s) in `right`; falls back to `left_on` when None.
    suffix:   appended to overlapping column names coming from `right`
              (columns from `left` keep their original names).
    Returns the merged DataFrame.
    """
    right_keys = left_on if right_on is None else right_on
    return left.merge(
        right,
        how='left',
        left_on=left_on,
        right_on=right_keys,
        suffixes=("", suffix),
    )

In [None]:
# join weather/state names
weather = join_df(weather, state_names, "file", "StateName")

We need to replace instances of the state name 'NI' with 'HB,NI' to match the rest of the data.

In [None]:
# `week` is split on ' - ' and its first part becomes Date;
# `file` is split on '_' and its third part becomes State.
week_parts = googletrend.week.str.split(' - ', expand=True)
file_parts = googletrend.file.str.split('_', expand=True)
googletrend['Date'] = week_parts[0]
googletrend['State'] = file_parts[2]
# Rename the state code 'NI' to 'HB,NI'.
is_ni = googletrend.State == 'NI'
googletrend.loc[is_ni, "State"] = 'HB,NI'


To make categoricals we need to extract certain date fields. 

In [None]:
# Expand the Date column of each table into date-part columns, keeping Date itself.
for dated_frame in (weather, googletrend, train, test):
    add_datepart(dated_frame, "Date", drop=False)



In [None]:
#googletrends has a special category for germany
trend_de = googletrend[googletrend.file == 'Rossmann_DE']


We are about to outer join the data into a single dataframe and then check for null values. If we did an inner join instead, we would be assuming that we do not have any missing data.

In [None]:
# Attach the store_states columns to each store (left join on Store).
store = join_df(store, store_states, "Store")
# Number of stores that ended up without a State after the join.
len(store[store.State.isnull()])

In [None]:
# Add the store-level columns to every train/test row (left join on Store).
joined = join_df(train, store, "Store")
joined_test = join_df(test, store, "Store")
# Rows with no StoreType after the join, for train and test respectively.
len(joined[joined.StoreType.isnull()]), len(joined_test[joined_test.StoreType.isnull()])

In [None]:
# Add the Google trend columns, matched on State, Year and Week.
joined = join_df(joined, googletrend,["State","Year","Week"])
joined_test = join_df(joined_test, googletrend, ["State", "Year","Week"])
# Rows with no `trend` value after the join, for train and test respectively.
len(joined[joined.trend.isnull()]), len(joined_test[joined_test.trend.isnull()])

In [None]:
# Left-join the 'Rossmann_DE' trend rows on Year/Week; overlapping columns
# coming from trend_de receive the '_DE' suffix (e.g. trend_DE).
joined = joined.merge(trend_de, how='left', on=["Year", "Week"], suffixes=('', '_DE'))
joined_test = joined_test.merge(trend_de, how='left', on=["Year", "Week"], suffixes=('', '_DE'))
# Rows with no trend_DE value after the merge, for train and test respectively.
len(joined[joined.trend_DE.isnull()]),len(joined_test[joined_test.trend_DE.isnull()])

In [None]:
# Add the weather columns, matched on State and Date.
joined = join_df(joined, weather, ["State","Date"])
joined_test = join_df(joined_test, weather, ["State", "Date"])
# Rows with no Mean_TemperatureC after the join, for train and test respectively.
len(joined[joined.Mean_TemperatureC.isnull()]), len(joined_test[joined_test.Mean_TemperatureC.isnull()])

In [None]:
# Remove the '_y' columns: these are the right-hand duplicates created by
# join_df's default suffix.
for frame in (joined, joined_test):
    suffixed = [c for c in frame.columns.unique() if c.endswith('_y')]
    frame.drop(suffixed, axis=1, inplace=True)

Need to fill in missing values

In [None]:
# Placeholder used for each column's missing values before casting to int32.
fill_defaults = {
    'CompetitionOpenSinceYear': 1900,
    'CompetitionOpenSinceMonth': 1,
    'Promo2SinceYear': 1900,
    'Promo2SinceWeek': 1,
}
for frame in (joined, joined_test):
    for col, default in fill_defaults.items():
        frame[col] = frame[col].fillna(default).astype(np.int32)

Extract features "CompetitionOpenSince" and "CompetitionDaysOpen"

In [None]:
for frame in (joined, joined_test):
    # Build a date from the year/month columns, using day 15 of the month.
    opened_on = pd.to_datetime({
        'year': frame.CompetitionOpenSinceYear,
        'month': frame.CompetitionOpenSinceMonth,
        'day': 15,
    })
    frame["CompetitionOpenSince"] = opened_on
    # Whole days between the competitor's opening date and the row's Date.
    frame["CompetitionDaysOpen"] = (frame.Date - frame.CompetitionOpenSince).dt.days

Replace errors and outliers

In [None]:
# Zero out CompetitionDaysOpen where it is negative or where the
# competition's opening year is before 1990.
for frame in (joined, joined_test):
    invalid = (frame.CompetitionDaysOpen < 0) | (frame.CompetitionOpenSinceYear < 1990)
    frame.loc[invalid, "CompetitionDaysOpen"] = 0

The CompetitionMonthsOpen field needs to be capped at 2 years (24 months) to limit the number of unique categories.

In [None]:
for frame in (joined, joined_test):
    # Approximate months as 30-day blocks, capped at 24.
    months_open = frame["CompetitionDaysOpen"] // 30
    frame["CompetitionMonthsOpen"] = months_open.clip(upper=24)
joined.CompetitionMonthsOpen.unique()

Need to do the same for Promo dates

In [None]:
joined.head(200)

In [None]:
for df in (joined,joined_test):
    # Week(year, week).monday() gives the start date of the Promo2 week
    # (Week presumably comes from the fastai star import - confirm).
    # pd.to_datetime converts the resulting date objects directly, so the former
    # `.astype(pd.datetime)` step is dropped: the `pd.datetime` alias is
    # deprecated in pandas and absent from recent releases.
    promo2_start = df.apply(
        lambda x: Week(x.Promo2SinceYear, x.Promo2SinceWeek).monday(), axis=1)
    df["Promo2Since"] = pd.to_datetime(promo2_start)
    # Whole days between the Promo2 start and the row's Date.
    df["Promo2Days"] = df.Date.subtract(df["Promo2Since"]).dt.days

In [None]:
for df in (joined,joined_test):
    # Zero out Promo2Days where it is negative or where the Promo2 year is before 1990.
    df.loc[df.Promo2Days<0,"Promo2Days"] = 0
    df.loc[df.Promo2SinceYear<1990,"Promo2Days"]=0
    # Convert to whole weeks and clamp to the range [0, 25].
    df["Promo2Weeks"] = df["Promo2Days"]//7
    df.loc[df.Promo2Weeks<0, "Promo2Weeks"]=0
    df.loc[df.Promo2Weeks>25, "Promo2Weeks"]=25
# Displayed after the loop: a bare expression inside a `for` body is never
# rendered by the notebook, so the original in-loop call showed nothing.
joined.Promo2Weeks.unique()

In [None]:
PATH_WRITE = "/kaggle/working/"

In [None]:
# Checkpoint both joined tables in feather format under PATH_WRITE.
joined.to_feather(f'{PATH_WRITE}joined')
joined_test.to_feather(f'{PATH_WRITE}joined_test')

# Durations

We need to extract data that shows us relationships across rows and not columns as we are using time series data.

We define a function called `get_elapsed` for cumulative counting across a sorted dataframe. It tracks the time since the last occurrence of the given field; whenever it sees the field again, it resets its counter to zero.

In [None]:
def get_elapsed(fld, pre, frame=None):
    """Add a column `pre + fld` holding the days elapsed since `fld` was last truthy.

    Rows are processed in their current order, so the caller must sort by
    Store and Date first (ascending dates for "days after", descending dates
    for "days before", which produces negative values).

    fld:   name of the event column to track.
    pre:   prefix for the new column name.
    frame: DataFrame with Store, Date and `fld` columns; modified in place.
           When omitted, the notebook-level global `df` is used, which keeps
           existing `get_elapsed(fld, pre)` calls working.

    The counter restarts whenever the Store value changes. Rows that come
    before the first event of a store get NaN.
    """
    target = df if frame is None else frame
    day1 = np.timedelta64(1, 'D')
    last_date = np.datetime64('NaT')
    last_store = 0
    res = []

    for s, v, d in zip(target.Store.values, target[fld].values, target.Date.values):
        if s != last_store:
            # New store: forget the previous store's last event.
            last_date = np.datetime64('NaT')
            last_store = s
        if v:
            last_date = d
        # A difference against NaT stays NaT, which the division turns into NaN.
        res.append((d - last_date).astype('timedelta64[D]') / day1)
    target[pre + fld] = res

In [None]:
columns = ["Date", "Store", "Promo", "StateHoliday", "SchoolHoliday"]

In [None]:
# Stack the train and test rows (train first, original index labels kept).
# pd.concat replaces DataFrame.append, which is deprecated and absent from
# recent pandas releases.
df = pd.concat([train[columns], test[columns]])

In [None]:
fld = 'SchoolHoliday'
# 'After' is computed with each store's dates ascending, 'Before' with them
# descending. get_elapsed reads the global `df`, so `df` must be re-sorted
# before each call.
for prefix, date_ascending in (('After', True), ('Before', False)):
    df = df.sort_values(['Store', 'Date'], ascending=[True, date_ascending])
    get_elapsed(fld, prefix)

In [None]:
fld = 'StateHoliday'
# 'After' is computed with each store's dates ascending, 'Before' with them
# descending. get_elapsed reads the global `df`, so `df` must be re-sorted
# before each call.
for prefix, date_ascending in (('After', True), ('Before', False)):
    df = df.sort_values(['Store', 'Date'], ascending=[True, date_ascending])
    get_elapsed(fld, prefix)

In [None]:
fld = 'Promo'
# 'After' is computed with each store's dates ascending, 'Before' with them
# descending. get_elapsed reads the global `df`, so `df` must be re-sorted
# before each call.
for prefix, date_ascending in (('After', True), ('Before', False)):
    df = df.sort_values(['Store', 'Date'], ascending=[True, date_ascending])
    get_elapsed(fld, prefix)

In [None]:
#set the active index to Date
df = df.set_index("Date")

In [None]:
#set null values from elapsed field calculations to 0
columns = ['SchoolHoliday', 'StateHoliday', 'Promo']

In [None]:
# Replace the NaNs left by get_elapsed with 0 and store the day counts as ints.
elapsed_cols = [prefix + event for prefix in ['Before', 'After'] for event in columns]
for col in elapsed_cols:
    df[col] = df[col].fillna(0).astype(int)

We need to calculate rolling quantities. We will sort by date and count # of events of interest, grouped by store.

In [None]:
# Per store, with rows in ascending index (Date) order: sum of each event
# column over a window of up to 7 rows ending at the current row.
bwd = df[['Store']+columns].sort_index().groupby("Store").rolling(7, min_periods=1).sum()

In [None]:
# Same rolling sum, but over rows in descending index (Date) order, so each
# window covers the current row and up to 6 later dates.
fwd = df[['Store']+columns].sort_index(ascending=False).groupby("Store").rolling(7, min_periods=1).sum()

Now we need to drop Store indices grouped together.

In [None]:
# Drop the 'Store' value column so reset_index can turn the index levels back
# into columns without a name clash. `axis` is passed by keyword because newer
# pandas rejects it positionally. errors='ignore' covers pandas versions whose
# grouped rolling result presumably omits the grouping column - confirm
# against the installed version.
bwd.drop('Store', axis=1, inplace=True, errors='ignore')
bwd.reset_index(inplace=True)

In [None]:
# Drop the 'Store' value column so reset_index can turn the index levels back
# into columns without a name clash. `axis` is passed by keyword because newer
# pandas rejects it positionally. errors='ignore' covers pandas versions whose
# grouped rolling result presumably omits the grouping column - confirm
# against the installed version.
fwd.drop('Store', axis=1, inplace=True, errors='ignore')
fwd.reset_index(inplace=True)

In [None]:
df.reset_index(inplace=True)

We will merge the  values into the dataframe

In [None]:
# Attach the backward and forward rolling sums; their columns are
# distinguished by the '_bw' / '_fw' suffixes.
for rolled, tag in ((bwd, '_bw'), (fwd, '_fw')):
    df = df.merge(rolled, how='left', on=['Date', 'Store'], suffixes=['', tag])

In [None]:
# Drop the raw event columns listed in `columns`, leaving the derived
# Before*/After*/_bw/_fw features.
# `axis` is passed by keyword: newer pandas no longer accepts it positionally.
df.drop(columns, axis=1, inplace=True)

In [None]:
df.head()

We need to back up large tables of extracted features.

In [None]:
df.to_feather(f'{PATH_WRITE}df')

In [None]:
df = pd.read_feather(f'{PATH_WRITE}df')

In [None]:
df["Date"] = pd.to_datetime(df.Date)

In [None]:
df.columns

In [None]:
joined = join_df(joined, df, ['Store','Date'])

In [None]:
joined_test = join_df(joined_test,df, ['Store','Date'])

In [None]:
joined = joined[joined.Sales!=0]

In [None]:
joined.reset_index(inplace=True)
joined_test.reset_index(inplace=True)

In [None]:
joined.to_feather(f'{PATH_WRITE}joined')
joined_test.to_feather(f'{PATH_WRITE}joined_test')

Create Features

In [None]:
joined.head().T.head(40)

We need to convert to input that is compatible with a NN.

In [None]:
# Columns that are later cast to ordered pandas categoricals.
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw']

# Columns that are later NaN-filled with 0 and cast to float32.
contin_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']

# Number of training rows; reused below for sampling and the train/validation split.
n = len(joined); n

In [None]:
dep = 'Sales'
joined = joined[cat_vars+contin_vars+[dep, 'Date']].copy()

In [None]:
# Give the test frame a placeholder dependent column (presumably the test
# table has no Sales column - confirm) so it can be sliced like `joined`.
joined_test[dep] = 0
# Keep the model inputs plus the dependent column, Date and the row Id.
joined_test = joined_test[cat_vars+contin_vars+[dep,'Date','Id']].copy()

In [None]:
# Turn every categorical variable into an ordered pandas categorical.
for v in cat_vars:
    as_category = joined[v].astype('category')
    joined[v] = as_category.cat.as_ordered()

In [None]:
apply_cats(joined_test, joined)

In [None]:
# Continuous variables: missing values become 0 and everything is stored as float32.
for frame in (joined, joined_test):
    for v in contin_vars:
        frame[v] = frame[v].fillna(0).astype('float32')

In [None]:
idxs = get_cv_idxs(n, val_pct=150000/n)
joined_samp = joined.iloc[idxs].set_index("Date")
samp_size = len(joined_samp); samp_size

In [None]:
##to run on full dataset
samp_size = n
joined_samp = joined.set_index("Date")

In [None]:
#Process the data
joined_samp.head(2)

In [None]:
# Split the frame into model inputs `df` and target `y` (the 'Sales' column).
# `nas` and `mapper` are passed to the test-set proc_df call further down;
# presumably they carry the NA-fill values and the fitted scaler - confirm
# against the fastai proc_df documentation.
df, y, nas, mapper = proc_df(joined_samp, 'Sales', do_scale=True)
# The model is trained on log(Sales); rows with Sales == 0 were removed earlier.
yl = np.log(y)

In [None]:
joined_test = joined_test.set_index("Date")

In [None]:
# Process the test frame using the `mapper` and `nas` obtained from the
# training frame; the 'Id' column is skipped.
# The third result was previously bound to a misspelled `maopper`, leaving a
# stray variable behind; it is now bound to `mapper`.
df_test, _, nas, mapper = proc_df(joined_test, 'Sales', do_scale=True, skip_flds=['Id'],
                                  mapper=mapper, na_dict=nas)

In [None]:
df.head(2)

We will take the last 25% of rows as our validation set

In [None]:
# Fraction of the rows used for training; the remainder becomes the validation set.
train_ratio = 0.75
train_size = int(samp_size * train_ratio); train_size
# Positional indices of the rows from train_size to the end of df.
val_idx = list(range(train_size, len(df)))

We will use the exact same time period as the test set for the validation set.

In [None]:
# Replaces the ratio-based split: positions of the rows whose index date lies
# between 2014-08-01 and 2014-09-17 inclusive.
val_idx = np.flatnonzero(
    (df.index<=datetime.datetime(2014,9,17)) & (df.index>=datetime.datetime(2014,8,1)))

In [None]:
# Replaces both splits above: only row 0 is held out for validation, so
# every other row goes into the training set.
val_idx=[0]

# Time to put our model together! :D

This Kaggle competition uses the RMSPE (root mean squared percentage error) as its metric.

In [None]:
def inv_y(a):
    """Undo the log transform applied to the target."""
    return np.exp(a)


def exp_rmspe(y_pred, targ):
    """Root mean squared percentage error, computed on the original scale.

    Both `y_pred` and `targ` are given in log space; they are exponentiated
    before the relative error (actual - predicted) / actual is measured.
    """
    actual = inv_y(targ)
    predicted = inv_y(y_pred)
    relative_error = (actual - predicted) / actual
    squared = relative_error ** 2
    return math.sqrt(squared.mean())

# Range handed to the learner as `y_range`: from 0 up to 1.2x the largest
# log-sales value seen in training.
max_log_y = np.max(yl)
y_range = (0, max_log_y*1.2)

A ModelData object is created directly from our data frame

In [None]:

class _ColumnarModelData(ColumnarModelData):
    """ColumnarModelData variant with its own `from_data_frames` constructor."""

    @classmethod
    def from_data_frames(cls, path, trn_df, val_df, trn_y, val_y, cat_flds, bs, is_reg, test_df=None):
        """Build the model data from already-split train/validation frames.

        One ColumnarDataset is created for the training frame, one for the
        validation frame and, if `test_df` is given, one for the test frame
        (created with no targets).
        """
        def make_ds(frame, targets):
            return ColumnarDataset.from_data_frame(frame, cat_flds, targets, is_reg)

        test_ds = None if test_df is None else make_ds(test_df, None)
        trn_ds = make_ds(trn_df, trn_y)
        val_ds = make_ds(val_df, val_y)
        return cls(path, trn_ds, val_ds, bs, test_ds=test_ds)


In [None]:
# Build the model data from the processed frame, the log-sales targets and
# the validation row indices; the processed test frame is attached too.
# NOTE(review): bs=1 sets the batch size to a single row, which looks
# unintentionally small for training - confirm the intended value.
md = _ColumnarModelData.from_data_frame(PATH, val_idx, df, yl.astype(np.float32), cat_flds=cat_vars, bs=1,
                                       test_df=df_test, is_reg=True)


Some cat_vars have a lot more levels than others.

In [None]:
# (variable name, number of categories + 1) for every categorical variable.
cat_sz = []
for var_name in cat_vars:
    n_levels = len(joined_samp[var_name].cat.categories)
    cat_sz.append((var_name, n_levels + 1))

In [None]:
cat_sz

We need to check the cardinality of each variable to determine its embedding size.

In [None]:
# (cardinality, embedding width) pairs: the width is (cardinality + 1) // 2, capped at 50.
emb_szs = []
for _, cardinality in cat_sz:
    emb_width = min(50, (cardinality + 1) // 2)
    emb_szs.append((cardinality, emb_width))

In [None]:
emb_szs

In [None]:
# Create the learner. The second argument is the number of non-categorical
# input columns. The remaining positional arguments are presumably embedding
# dropout (0.04), output size (1), hidden layer sizes ([1000,500]) and
# per-layer dropout ([0.001,0.01]) - confirm against the fastai get_learner signature.
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                  0.04, 1, [1000,500], [0.001,0.01], y_range=y_range,
                  tmp_name=f"{PATH_WRITE}tmp", models_name=f"{PATH_WRITE}models")

In [None]:
lr = 1e-3
m.lr_find()