# fastai v3 on Rossmann

This combines the two notebooks from fastai lesson 6, the Rossmann data-cleaning notebook and the Rossmann modelling notebook, to create an end-to-end submission.

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
from fastai.basics import *

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Getting the data together

The extra Rossmann data has to be added to one's notebook.

In [None]:
PATH = Config().data_path()/Path('rossmann/')

In [None]:
# A plain `mkdir {PATH}` fails because the parent directories do not exist yet:
#   mkdir: cannot create directory '/root/.fastai/data/rossmann': No such file or directory
# Create the whole chain from PATH itself instead of hard-coding it, and
# tolerate an existing directory so the cell can be re-run safely.
PATH.mkdir(parents=True, exist_ok=True)

In [None]:
!cp /kaggle/input/rossmann/*.csv {PATH}

In [None]:
table_names = ['train', 'store', 'store_states', 'state_names', 'googletrend', 'weather', 'test']

In [None]:
tables = [pd.read_csv(PATH/f'{fname}.csv', low_memory=False) for fname in table_names]
train, store, store_states, state_names, googletrend, weather, test = tables
googletrend.tail()

In [None]:
store.head()

In [None]:
store_states.head()

In [None]:
weather.tail()

In [None]:
len(train), len(test)

# Data cleaning

1. Turning holidays into booleans to make them more convenient for modelling.

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Collapse the holiday code into a boolean flag: any value other than '0'
# counts as a state holiday.
for sales_table in (train, test):
    sales_table.StateHoliday = sales_table.StateHoliday != '0'

2. creating a custom join_df function for joining tables on specific fields

In [None]:
def join_df(left, right, left_on, right_on=None, suffix='_y'):
    """Left-join `right` onto `left` and return the merged frame.

    left_on  -- key column(s) in `left`.
    right_on -- key column(s) in `right`; falls back to `left_on` when omitted.
    suffix   -- appended to overlapping column names coming from `right`;
                overlapping columns from `left` keep their original name.
    """
    right_keys = left_on if right_on is None else right_on
    return left.merge(
        right,
        how='left',
        left_on=left_on,
        right_on=right_keys,
        suffixes=("", suffix),
    )

3. joining weather and state_names

In [None]:
weather = join_df(weather, state_names, "file", "StateName" )

In [None]:
weather.head()

4. Adding new columns to googletrend, and replacing all instances of NI with HB,NI, as that is the form used elsewhere in the datasets

In [None]:
# `week` holds a range string; keep the part before ' - ' as the Date.
googletrend['Date'] = googletrend.week.str.split(' - ', expand=True)[0]
# The third '_'-separated token of `file` is used as the State code.
googletrend['State'] = googletrend.file.str.split('_', expand=True)[2]

# Rewrite 'NI' as 'HB,NI' so the code matches the form used by the other tables.
googletrend.loc[googletrend.State=='NI', "State"] = 'HB,NI'


5. Getting particular date fields from a complete datetime. We should always consider this step when working with date-times. We will apply it to every table that has a date field.

In [None]:
def add_datepart(df, fldname, drop=True, time=False):
    """Expand the date column `fldname` of `df` into calendar features, in place.

    The column is parsed with `pd.to_datetime` first if it is not already a
    datetime column. New columns are named after `fldname` with a trailing
    'Date'/'date' stripped, followed by the feature name (e.g. 'SaleDate'
    produces 'SaleYear', while 'Date' produces plain 'Year').

    df      -- DataFrame to modify; nothing is returned.
    fldname -- name of the date column.
    drop    -- remove the original column once the features are added.
    time    -- also add 'Hour', 'Minute' and 'Second'.

    Besides the calendar attributes, an 'Elapsed' column is added holding the
    datetime's integer representation floor-divided by 10**9.
    """
    fld = df[fldname]
    # Public dtype check that accepts both naive and tz-aware datetime columns;
    # anything else is parsed. `infer_datetime_format` is deprecated in
    # pandas 2.x, so it is no longer passed.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # `Series.dt.week` was removed in pandas 2.0; isocalendar().week
            # is its replacement. It yields a nullable UInt32 column, so cast
            # back to the plain dtypes `.dt.week` used to produce.
            week = fld.dt.isocalendar().week
            df[targ_pre + n] = week.astype(np.int64) if week.notna().all() else week.astype(np.float64)
        else:
            df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop:
        df.drop(fldname, axis=1, inplace=True)
            

In [None]:
# Expand "Date" into calendar features on every table that carries one.
# The original column is kept (drop=False) because later joins key on it.
for dated_table in (weather, googletrend, train, test):
    add_datepart(dated_table, "Date", drop=False)

In [None]:
len(train), len(test)

6. The Google trends data has a special category for the whole of germany - we'll pull that out to use explicitly

In [None]:
trend_germany = googletrend[googletrend.file == "Rossmann_DE"]

7. Now we will left-join all the datasets and, after each join, do a null check to confirm that every record found a match.

In [None]:
# Attach each store's state, then count the rows whose State is missing.
store = join_df(store, store_states, "Store")
len(store[store.State.isnull()])

# If the null count is zero, every store found a matching state.

In [None]:
combined  = join_df(train, store, "Store")
combined_test = join_df(test, store, "Store")
len(combined[combined.StoreType.isnull()]), len(combined_test[combined_test.StoreType.isnull()])

In [None]:
#len(combined), len(combined_test)

In [None]:
combined  = join_df(combined, googletrend, ["State", "Year", "Week"])
combined_test = join_df(combined_test, googletrend, ["State", "Year", "Week"])
len(combined[combined.trend.isnull()]), len(combined_test[combined_test.trend.isnull()])

In [None]:
#len(combined), len(combined_test)

In [None]:
combined = combined.merge(trend_germany, 'left', ["Year", "Week"], suffixes=('', '_DE'))
combined_test = combined_test.merge(trend_germany, 'left', ["Year", "Week"], suffixes=('', '_DE'))
len(combined[combined.trend_DE.isnull()]),len(combined_test[combined_test.trend_DE.isnull()])

In [None]:
#len(combined), len(combined_test)

In [None]:
combined = join_df(combined, weather, ["State","Date"])
combined_test = join_df(combined_test, weather, ["State","Date"])
len(combined[combined.Mean_TemperatureC.isnull()]),len(combined_test[combined_test.Mean_TemperatureC.isnull()])

In [None]:
#len(combined), len(combined_test)

In [None]:
# Discard the duplicate right-hand columns that the joins tagged with "_y".
for df in (combined, combined_test):
    duplicate_cols = [c for c in df.columns if c.endswith('_y')]
    df.drop(duplicate_cols, inplace=True, axis=1)

In [None]:
#len(combined), len(combined_test)

8. We will fill in the missing values to avoid complications with NAs. NA is how pandas marks a missing value in a dataframe, and many models have a problem handling them.

In [None]:
# Placeholder used for each "since" field when its value is missing.
fill_defaults = {
    'CompetitionOpenSinceYear': 1900,
    'CompetitionOpenSinceMonth': 1,
    'Promo2SinceYear': 1900,
    'Promo2SinceWeek': 1,
}
for df in (combined, combined_test):
    for col, default in fill_defaults.items():
        df[col] = df[col].fillna(default).astype(np.int32)

9. Next we'll extract features "CompetitionOpenSince" and "CompetitionDaysOpen". 

In [None]:
#len(combined), len(combined_test)

In [None]:
for df in (combined,combined_test):
    # Build a date from the year/month columns, using the 15th as the day of month.
    df["CompetitionOpenSince"] = pd.to_datetime(dict(year=df.CompetitionOpenSinceYear, 
                                                     month=df.CompetitionOpenSinceMonth, day=15))
    # Whole days between the row's Date and that opening date (negative when
    # the opening date lies after the row's Date).
    df["CompetitionDaysOpen"] = df.Date.subtract(df.CompetitionOpenSince).dt.days

10. Replacing some erroneous data

In [None]:
#len(combined), len(combined_test)

In [None]:
for df in (combined, combined_test):
    # A negative span means the competitor opened after the row's Date.
    df.loc[df.CompetitionDaysOpen<0, "CompetitionDaysOpen"] = 0
    # Years before 1990 include the 1900 placeholder that stands in for a missing year.
    df.loc[df.CompetitionOpenSinceYear<1990, "CompetitionDaysOpen"] = 0

11. We add "CompetitionMonthsOpen" field, limiting the maximum to 2 years to limit number of unique categories.

In [None]:
for df in (combined,combined_test):
    df["CompetitionMonthsOpen"] = df["CompetitionDaysOpen"]//30
    df.loc[df.CompetitionMonthsOpen>24, "CompetitionMonthsOpen"] = 24
combined.CompetitionMonthsOpen.unique()

12. Same we will do for promo dates

In [None]:
!pip install isoweek

In [None]:
from isoweek import Week
for df in (combined,combined_test):
    # Convert each row's (year, week) pair to the Monday of that week; this is
    # evaluated row by row through DataFrame.apply.
    df["Promo2Since"] = pd.to_datetime(df.apply(lambda x: Week(
        x.Promo2SinceYear, x.Promo2SinceWeek).monday(), axis=1))
    # Whole days between the row's Date and the promo start date.
    df["Promo2Days"] = df.Date.subtract(df["Promo2Since"]).dt.days

In [None]:
for df in (combined,combined_test):
    # Promotions that start after the row's Date, or that carry a year below
    # 1990 (which includes the 1900 placeholder), count as zero elapsed days.
    df.loc[df.Promo2Days<0, "Promo2Days"] = 0
    df.loc[df.Promo2SinceYear<1990, "Promo2Days"] = 0
    # Bucket into whole weeks, clamped to the range 0..25.
    df["Promo2Weeks"] = df["Promo2Days"]//7
    df.loc[df.Promo2Weeks<0, "Promo2Weeks"] = 0
    df.loc[df.Promo2Weeks>25, "Promo2Weeks"] = 25
# Inside the loop this expression was evaluated and discarded; as the last
# expression of the cell it is actually displayed.
combined.Promo2Weeks.unique()

13. Converting to pickle for future

In [None]:
#len(combined), len(combined_test)

In [None]:
combined.to_pickle(PATH/'combined')
combined_test.to_pickle(PATH/'combined_test')

### Durations
It is common when working with time series data to extract data that explains relationships across rows as opposed to columns, e.g.:

Running averages
Time until next event
Time since last event

We'll define a function `get_elapsed` for cumulative counting across a sorted dataframe. Given a particular field `fld` to monitor, this function will start tracking time since the last occurrence of that field. When the field is seen again, the counter is set to zero.

Upon initialization, this will result in datetime na's until the field is encountered. This is reset every time a new store is seen. We'll see how to use this shortly.

In [None]:
def get_elapsed(fld, pre):
    """Add column `pre+fld` to the notebook-global `df`: days since `fld` was last truthy.

    Rows are walked in their current order, so `df` must already be sorted by
    Store and then by Date (ascending or descending) before calling. The
    reference date is forgotten whenever the Store value changes, and rows
    seen before the first truthy `fld` of a store get a missing value.

    fld -- name of the event column to monitor.
    pre -- prefix for the new column name.
    """
    one_day = np.timedelta64(1, 'D')
    not_seen = np.datetime64()  # NaT: no event seen yet
    prev_store, prev_event = 0, not_seen
    elapsed = []

    rows = zip(df.Store.values, df[fld].values, df.Date.values)
    for store_id, flag, date in rows:
        if store_id != prev_store:
            # New store: forget the previous store's last event.
            prev_store, prev_event = store_id, not_seen
        if flag:
            prev_event = date
        elapsed.append((date - prev_event).astype('timedelta64[D]') / one_day)
    df[pre+fld] = elapsed

14. applying to a subset of columns

In [None]:
columns = ["Date", "Store", "Promo", "StateHoliday", "SchoolHoliday"]
# Stack the train rows on top of the test rows. DataFrame.append was removed
# in pandas 2.0; pd.concat with its defaults keeps the same row order and
# the same (non-reset) index.
df = pd.concat([train[columns], test[columns]])

An example.

Say we're looking at School Holiday. We'll first sort by Store, then Date, and then call get_elapsed('SchoolHoliday', 'After'). This will apply to each row with School Holiday:

It is applied to every row of the dataframe in order of store and date

Will add to the dataframe the days since seeing a School Holiday

If we sort in the other direction, this will count the days until another holiday.


In [None]:
fld = 'SchoolHoliday'
df = df.sort_values(['Store', 'Date'])
get_elapsed(fld, 'After')
df = df.sort_values(['Store', 'Date'], ascending=[True, False])
get_elapsed(fld, 'Before')

In [None]:
df.head()

In [None]:
# Repeat for two more fields

fld = 'StateHoliday'
df = df.sort_values(['Store', 'Date'])
get_elapsed(fld, 'After')
df = df.sort_values(['Store', 'Date'], ascending=[True, False])
get_elapsed(fld, 'Before')

In [None]:
df.head()

In [None]:
fld = 'Promo'
df = df.sort_values(['Store', 'Date'])
get_elapsed(fld, 'After')
df = df.sort_values(['Store', 'Date'], ascending=[True, False])
get_elapsed(fld, 'Before')

In [None]:
df.tail()

In [None]:
# Make Date the active index.
df = df.set_index("Date")

# The elapsed counters are null until the first event is seen; store those as 0.
columns = ['SchoolHoliday', 'StateHoliday', 'Promo']
elapsed_cols = [direction + event for direction in ['Before', 'After'] for event in columns]
for col in elapsed_cols:
    df[col] = df[col].fillna(0).astype(int)


Next we'll demonstrate window functions in pandas to calculate rolling quantities.

Here we're sorting by date (sort_index()) and counting the number of events of interest (sum()) defined in columns in the following week (rolling()), grouped by Store (groupby()). We do the same in the opposite direction.

In [None]:
# Per-store rolling sums of the event columns over a window of up to 7 rows
# (min_periods=1 lets the first rows of each store use a shorter window).
# bwd walks the Date index in ascending order, fwd in descending order.
bwd = df[['Store']+columns].sort_index().groupby("Store").rolling(7, min_periods=1).sum()
fwd = df[['Store']+columns].sort_index(ascending=False
                                      ).groupby("Store").rolling(7, min_periods=1).sum()


Next we want to drop the Store indices grouped together in the window function.

Often in pandas, there is an option to do this in place. This is time and memory efficient when working with large datasets.

In [None]:
# Remove the summed 'Store' value column so that reset_index can bring the
# real Store level back as a column without a name clash. `axis` is passed by
# keyword because pandas 2.0 no longer accepts it positionally.
# NOTE(review): newer pandas releases appear to omit the grouped column from
# the rolling result already; errors='ignore' turns the drop into a no-op there.
bwd.drop('Store', axis=1, inplace=True, errors='ignore')
bwd.reset_index(inplace=True)
fwd.drop('Store', axis=1, inplace=True, errors='ignore')
fwd.reset_index(inplace=True)
df.reset_index(inplace=True)

In [None]:
# Merge the backward and forward rolling sums into df; overlapping event
# columns from bwd/fwd receive the '_bw' / '_fw' suffix.
df = df.merge(bwd, 'left', ['Date', 'Store'], suffixes=('', '_bw'))
df = df.merge(fwd, 'left', ['Date', 'Store'], suffixes=('', '_fw'))
# Drop the raw event columns now that the suffixed sums are in place.
# `axis` must be a keyword: pandas 2.0 removed the positional form.
df.drop(columns, axis=1, inplace=True)
df.head()


It's usually a good idea to back up large tables of extracted / wrangled features before you join them onto another one, that way you can go back to it easily if you need to make changes to it.



In [None]:
df.to_pickle(PATH/'df')
df["Date"] = pd.to_datetime(df.Date)
df.columns

In [None]:
joined = pd.read_pickle(PATH/'combined')
joined_test = pd.read_pickle(PATH/f'combined_test')
joined = join_df(joined, df, ['Store', 'Date'])
joined_test = join_df(joined_test, df, ['Store', 'Date'])

In [None]:
#len(joined), len(joined_test)

The original authors also removed all instances where the store had zero sales / was closed. We speculate that this may have cost them a higher standing in the competition. One reason this may be the case is that a little exploratory data analysis reveals that there are often periods where stores are closed, typically for refurbishment. Before and after these periods, there are naturally spikes in sales that one might expect. By omitting this data from their training, the authors gave up the ability to leverage information about these periods to predict this otherwise volatile behavior.

In [None]:
joined = joined[joined.Sales!=0]



In [None]:
len(joined), len(joined_test)

Backing up this data as well

In [None]:
joined.reset_index(inplace=True)
joined_test.reset_index(inplace=True)

joined.to_pickle(PATH/'train_clean')
joined_test.to_pickle(PATH/'test_clean')



In [None]:
len(joined), len(joined_test)

### lets look at final training data

In [None]:
#PATH
train_df = pd.read_pickle(PATH/'train_clean')

In [None]:
train_df.head().T

In [None]:
n = len(train_df)
print(n)

# Experimenting with a sample

In [None]:
from fastai.tabular import *

In [None]:
# Draw 2000 random row positions, restore their original order, and use the
# first 1000 as a small training frame and the rest as a small test frame.
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars =  ['Store', 'DayOfWeek', 'PromoInterval']
sample_cols = small_cat_vars + small_cont_vars + ['Sales']
# The preprocessors applied afterwards modify these frames in place, so take
# explicit copies rather than slices tied to train_df.
small_train_df = train_df.iloc[idx[:1000]][sample_cols].copy()
small_test_df = train_df.iloc[idx[1000:]][sample_cols].copy()

In [None]:
small_train_df.head()

In [None]:
small_test_df.head()

In [None]:
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)

Categorify does basically the same thing that .classes thing for image recognition does for a dependent variable. It's going to take these strings, it's going to find all of the possible unique values of it, and it's going to create a list of them, and then it's going to turn the strings into numbers. So if I call it on my training set, that'll create categories there (small_train_df) and then I call it on my test set passing in test=true, that makes sure it's going to use the same categories that I had before. Now when I say .head, it looks exactly the same:

In [None]:
small_test_df.head()

That's because Pandas has turned this into a categorical variable which internally is storing numbers but externally is showing me the strings. But I can look inside promo interval to look at the cat.categories, this is all standard Pandas here, to show me a list of all of what we would call "classes" in fast.ai or would be called just "categories" in Pandas.

In [None]:
small_train_df.PromoInterval.cat.categories

In [None]:
small_train_df['PromoInterval'].cat.codes[:5]

So then if I look at the cat.codes, you can see that this list holds the numbers that are actually stored (-1, -1, 1, -1, 1). What are these minus ones? The minus ones represent NaN - they represent "missing". So Pandas uses the special code -1 to mean missing.

As you know, these are going to end up in an embedding matrix, and we can't look up item -1 in an embedding matrix. So internally in fast.ai, we add one to all of these.

In [None]:
fill_missing = FillMissing(small_cat_vars,small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)

Another useful preprocessor is FillMissing. Again, you can call it on the data frame, you can call on the test passing in test=true.

In [None]:
small_train_df[small_train_df['CompetitionDistance_na'] == True]


This will create, for anything that has a missing value, it'll create an additional column with the column name underscore na (e.g. CompetitionDistance_na) and it will set it for true for any time that was missing. Then what we do is, we replace competition distance with the median for those. Why do we do this? Well, because very commonly the fact that something's missing is of itself interesting (i.e. it turns out the fact that this is missing helps you predict your outcome). So we certainly want to keep that information in a convenient boolean column, so that our deep learning model can use it to predict things.

But then, we need competition distance to be a continuous variable so we can use it in the continuous variable part of our model. So we can replace it with almost any number because if it turns out that the missingness is important, it can use the interaction of CompetitionDistance_na and CompetitionDistance to make predictions. So that's what FillMissing does.

# Preparing full dataset

In [None]:
train_df = pd.read_pickle(PATH/'train_clean')
test_df = pd.read_pickle(PATH/'test_clean')

In [None]:
#len(train_df), len(test_df)

The main thing you have to do if you want to create a data bunch of tabular data is tell it what are your categorical variables and what are your continuous variables. As we discussed last week briefly, your categorical variables are not just strings and things, but also I include things like day of week and month and day of month. Even though they're numbers, I make them categorical variables. Because, for example, day of month, I don't think it's going to have a nice smooth curve. I think that the fifteenth of the month and the first of the month and the 30th of the month are probably going to have different purchasing behavior to other days of the month. Therefore, if I make it a categorical variable, it's going to end up creating an embedding matrix and those different days of the month can get different behaviors.

You've actually got to think carefully about which things should be categorical variables. On the whole, if in doubt and there are not too many levels in your category (that's called the cardinality), if your cardinality is not too high, I would put it as a categorical variable. You can always try each and see which works best.

In [None]:
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw']

cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']

You don't have to manually call preprocesses yourself. When you call any kind of item list creator, you can pass in a list of pre processes which you can create like this:

In [None]:
procs = [FillMissing, Categorify, Normalize]

### Find validation set indexes

Our final data frame that we're going to pass in is going to be a training set with the categorical variables, the continuous variables, the dependent variable, and the date. The date, we're just going to use to create a validation set where we are basically going to say the validation set is going to be the same number of records at the end of the time period that the test set is for Kaggle. That way, we should be able to validate our model nicely.

In [None]:
dep_var = 'Sales'
df = train_df[cat_vars + cont_vars + [dep_var,'Date']].copy()
test_df['Date'].min(), test_df['Date'].max()

In [None]:
#print(len(train_df), len(test_df))

In [None]:
# Look up the Date stored at row label len(test_df), then take the largest
# row label that shares that Date; rows below this label form the validation range.
cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()



In [None]:
valid_idx = range(cut)

In [None]:
df[dep_var].head()

# getting data from fastai

In [None]:
# Build the databunch: rows listed in valid_idx become the validation set,
# the label is dep_var as a float taken in log space (log=True), and test_df
# is attached as the unlabeled test set.
data = (TabularList.from_df(df, path=PATH, cat_names=cat_vars, cont_names=cont_vars, procs=procs,)
                .split_by_idx(valid_idx)
                .label_from_df(cols=dep_var, label_cls=FloatList, log=True)
                .add_test(TabularList.from_df(test_df, path=PATH, cat_names=cat_vars, cont_names=cont_vars))
                .databunch())

This is saying "ok, I want to fill missing, I want to categorify, I want to normalize (i.e. for continuous variables, it'll subtract the mean and divide by the standard deviation to help a train more easily)." So you just say, those are my procs and then you can just pass it in there and that's it.

Later on, you can go data.export and it'll save all the metadata for that data bunch so you can, later on, load it in knowing exactly what your category codes are, exactly what median values used for replacing the missing values, and exactly what means and standard deviations you normalize by.

### Creating model

In [None]:
# Upper bound for predictions in log space: 20% above the largest Sales value in train_df.
max_log_y = np.log(train_df['Sales'].max() * 1.2)
y_range = torch.tensor([0, max_log_y], device=defaults.device)


In [None]:
learn = tabular_learner(data, layers= [1000,500], ps=[0.001,0.01], emb_drop=0.04, y_range=y_range, metrics =exp_rmspe)

In [None]:
learn.model

In [None]:
len(data.train_ds.cont_names)

In [None]:
learn.lr_find()

In [None]:
learn.recorder.plot()

In [None]:
learn.fit_one_cycle(5, 1e-3, wd=0.2)

In [None]:
learn.save('1')

In [None]:
learn.recorder.plot_losses(skip_start=1000)

In [None]:
learn.load('1')

In [None]:
learn.fit_one_cycle(5, 3e-4)

In [None]:
learn.fit_one_cycle(5, 3e-4)

# submit

In [None]:
# The labels were built with log=True, so exponentiate the predictions to get
# back to the sales scale before writing the submission file.
test_preds = learn.get_preds(DatasetType.Test)
log_sales = test_preds[0].data
test_df["Sales"] = np.exp(log_sales).numpy().T[0]
submission_cols = ["Id", "Sales"]
test_df[submission_cols] = test_df[submission_cols].astype("int")
test_df[submission_cols].to_csv("rossmann_submission.csv", index=False)