In [1]:
%matplotlib inline
from fastai import *
from fastai.tabular import *

# Rossmann Data Preparation

In [2]:
# Directory holding the Rossmann data; 'train_clean' is the pickled training DataFrame.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
path = Path('data/rossmann')
train_df = pd.read_pickle(path / 'train_clean')

In [4]:
# Transpose the first rows so that every column is listed as its own row.
train_df.head().transpose()

Unnamed: 0,0,1,2,3,4
index,0,1,2,3,4
Store,1,2,3,4,5
DayOfWeek,5,5,5,5,5
Date,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00
Sales,5263,6064,8314,13995,4822
Customers,555,625,821,1498,559
Open,1,1,1,1,1
Promo,1,1,1,1,1
StateHoliday,False,False,False,False,False
SchoolHoliday,1,1,1,1,1


In [5]:
# Number of rows in the training frame (used below to draw a random sample).
n = len(train_df)
n

844338

## Create a small sample for experimentation

In [6]:
# Draw 2000 distinct row positions at random. A dedicated, seeded RandomState
# makes the sample reproducible across "Restart & Run All" without touching
# NumPy's global random state; permutation(n) shuffles arange(n) directly, so
# there is no need to build range(n) first.
idx = np.random.RandomState(42).permutation(n)[:2000]
# Sort in place so the sampled positions follow the row order of train_df.
idx.sort()

In [7]:
# Variables used in this small experiment.
small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']

# idx was sorted above: its first 1000 positions form the training sample,
# the remaining positions form the test sample.
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]

In [8]:
# Keep only the selected categorical and continuous variables plus 'Sales'.
# .copy() makes each frame an independent object: the preprocessor further
# down is called on these frames without reassigning the result, i.e. it
# modifies them in place, which on a column slice raises pandas'
# SettingWithCopyWarning.
small_cols = small_cat_vars + small_cont_vars + ['Sales']
small_train_df = small_train_df[small_cols].copy()
small_test_df = small_test_df[small_cols].copy()

In [9]:
# Preview the first rows of the reduced training sample.
small_train_df.head()

Unnamed: 0,Store,DayOfWeek,PromoInterval,CompetitionDistance,Mean_Humidity,Sales
335,337,5,"Feb,May,Aug,Nov",10600.0,54,7345
2121,1011,4,"Feb,May,Aug,Nov",490.0,73,9227
3380,42,2,"Jan,Apr,Jul,Oct",290.0,66,10896
4098,761,2,,2390.0,69,9584
4633,182,1,"Mar,Jun,Sept,Dec",1390.0,69,9271


In [10]:
# Preview the first rows of the reduced test sample.
small_test_df.head()

Unnamed: 0,Store,DayOfWeek,PromoInterval,CompetitionDistance,Mean_Humidity,Sales
420009,327,3,,1390.0,65,7473
421344,548,2,,3760.0,53,5107
421390,594,2,,1790.0,72,5867
422772,863,1,,21370.0,76,11627
423118,64,6,"Jan,Apr,Jul,Oct",22560.0,48,9325


Next, we'll see how to use pre-processors. Recall that, unlike transforms, pre-processors are applied to the data only once.

In [11]:
# Build the pre-processor from the categorical and continuous variable names,
# run it on the training sample, then on the test sample with test=True.
# Both calls modify the frames in place (no result is assigned).
# NOTE(review): test=True presumably reuses the categories found on the
# training sample — confirm against the fastai version in use.
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)

In [12]:
# Preview the training sample again after the pre-processor has been applied.
small_train_df.head()

Unnamed: 0,Store,DayOfWeek,PromoInterval,CompetitionDistance,Mean_Humidity,Sales
335,337,5,"Feb,May,Aug,Nov",10600.0,54,7345
2121,1011,4,"Feb,May,Aug,Nov",490.0,73,9227
3380,42,2,"Jan,Apr,Jul,Oct",290.0,66,10896
4098,761,2,,2390.0,69,9584
4633,182,1,"Mar,Jun,Sept,Dec",1390.0,69,9271


In [13]:
# Distinct categories now attached to the PromoInterval column.
small_train_df['PromoInterval'].cat.categories

Index(['Feb,May,Aug,Nov', 'Jan,Apr,Jul,Oct', 'Mar,Jun,Sept,Dec'], dtype='object')

In [14]:
# Integer code of each row's category; rows with a missing PromoInterval get -1.
small_train_df['PromoInterval'].cat.codes

335       0
2121      0
3380      1
4098     -1
4633      2
5034      2
6012      1
6125     -1
6182      1
6659      2
7679     -1
7962      1
8465      1
8759     -1
9374      0
9552     -1
10218     1
10253    -1
10579    -1
10759     1
10928    -1
11097     1
11107    -1
11424     1
11996    -1
12674    -1
13031    -1
13154    -1
14533     1
14752    -1
         ..
408296   -1
408318    2
409759    1
410501   -1
411104    0
411151    1
411445    2
411506   -1
411599   -1
412480   -1
412938    1
413465   -1
413596   -1
413623   -1
414003   -1
414192   -1
414395   -1
414419   -1
414959    0
415104   -1
415317    1
416409    1
417908    0
418064    1
418815   -1
418930    0
419238    0
419242    0
419808    0
419933   -1
Length: 1000, dtype: int8