# Set-up

In [1]:
# DATA MANIPULATION
import numpy as np # linear algebra
import pandas as pd # data processing
import random 
import datetime # manipulating date formats

# VISUALIZATION
import matplotlib.pyplot as plt # basic plotting
%matplotlib inline

# SUPERVISED LEARNING
from sklearn import metrics
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

# Reducing train data

Try reducing the dataset size by choosing a subset of stores/items.

In [2]:
# Column dtypes applied to every raw train/test CSV read in this notebook.
# 'onpromotion' is read as text here and mapped to 0/1 later on.
dtypes = {
    'id': 'int64',
    'item_nbr': 'int32',
    'store_nbr': 'int8',
    'onpromotion': str,
}

# One-off preprocessing, disabled on purpose: it writes the reduced
# train_min.csv / test_min.csv files that the following cells load.
# Change the condition to True to regenerate them.
if False:
    # Reading train data
    train = pd.read_csv('../input/train.csv', dtype=dtypes, parse_dates=['date'])

    # Pick two stores at random; the fixed seed keeps the choice repeatable
    u_stores = train.store_nbr.unique()
    random.seed(115599)
    picked_positions = random.sample(range(len(u_stores)), 2)
    random_stores = sorted(u_stores[picked_positions])

    # Keep only the picked stores, from 2015 onwards
    keep_rows = train.store_nbr.isin(random_stores) & (train.date >= "2015-01-01")
    train = train[keep_rows]
    train.to_csv('../input/processed/train_min.csv', index=False)

    # For test data: same stores, no date restriction
    test = pd.read_csv('../input/test.csv', dtype=dtypes, parse_dates=['date'])
    test = test[test.store_nbr.isin(random_stores)]
    test.to_csv('../input/processed/test_min.csv', index=False)

# Load datasets

In [3]:
# Reading train (reduced subset). To work with the full dataset, read
# '../input/processed/train.csv' instead.
train = pd.read_csv(
    '../input/processed/train_min.csv',
    parse_dates=['date'],
    dtype=dtypes,
)

def merge_with_extra_datasets(df):
    """Left-join the item metadata and the completed sales table onto ``df``.

    Reads '../input/items.csv' and '../input/processed/sales+.csv' on every
    call. Both joins are left joins on whatever columns the frames share by
    name, so every row of ``df`` is kept; rows without a match get NaN in the
    joined columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns, minus 'lag_7' and 'lag_annual'.

    Raises
    ------
    KeyError
        If 'lag_7' or 'lag_annual' is missing after the merges.
    """
    # Reading extra datasets
    sales = pd.read_csv('../input/processed/sales+.csv', parse_dates=['date'])  # (completed) sales
    items = pd.read_csv('../input/items.csv')

    # Merging datasets
    df = pd.merge(df, items, how='left')
    df = pd.merge(df, sales, how='left')

    # Unnecessary - from transaction prediction.
    # `columns=` replaces the positional axis argument, which pandas 2.0 removed.
    return df.drop(columns=['lag_7', 'lag_annual'])
    
# Enrich the training rows with item metadata and the completed sales table
train = train.pipe(merge_with_extra_datasets)

In [4]:
# Rows left without a transactions value, counted per date and store
missing_transactions = train['transactions'].isna()
train[missing_transactions].groupby(['date', 'store_nbr'])['item_nbr'].count()

date        store_nbr
2016-01-03  21           2060
            32            986
2016-01-04  21           1872
Name: item_nbr, dtype: int64

In [5]:
# Drop, in place, every row that has a missing value in any column
train.dropna(inplace=True)
#print([(c,train[c].isnull().sum()) for c in train.columns])

## A couple of days have no transactions value for some stores.
## Since only two days are affected, it should be safe to drop those rows.

# Feature engineering

In [6]:
# Transform target
def transform_target(df):
    """Floor negative unit sales at zero, then apply log1p to them.

    The 'unit_sales' column of ``df`` is overwritten in place and the same
    frame is returned.
    """
    floored_sales = df['unit_sales'].clip(lower=0.)
    df['unit_sales'] = np.log1p(floored_sales)
    return df
    
## Transform onpromotion
def transform_onpromotion(df):
    """Recode the text flags 'False' / 'True' in 'onpromotion' as 0 / 1.

    Any other value is turned into NaN by the mapping. The column is
    overwritten on ``df`` itself, which is also the return value.
    """
    flag_codes = {'False': 0, 'True': 1}
    df['onpromotion'] = df.onpromotion.map(flag_codes)
    return df

## Some features were not very useful: year,month,day
def add_date_features(df):
    """Derive calendar features from the datetime column 'date'.

    Adds, on ``df`` itself:

    * 'week'      - ISO week number
    * 'dow'       - day of week, Monday=0 ... Sunday=6
    * 'dayofyear' - ordinal day within the year
    * 'dayoff'    - True for Saturday and Sunday

    Parameters
    ----------
    df : pandas.DataFrame
        Must hold a datetime64 column named 'date'.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the four columns added.
    """
    dates = df['date'].dt

    if hasattr(dates, 'isocalendar'):
        # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0.
        iso_week = dates.isocalendar().week
        # isocalendar() returns nullable UInt32; convert back to the plain
        # dtype dt.week used to give (float only when dates are missing).
        df['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        # pandas older than 1.1 has no isocalendar()
        df['week'] = dates.week

    df['dow'] = dates.dayofweek
    df['dayofyear'] = dates.dayofyear
    df['dayoff'] = df['dow'].isin([5, 6])  ## Weekends
    return df

# Add weight feature (from perishable feature) 
# for calculating error metric
def add_weight_feature(df):
    """Add 'perishable_w': 1.25 where 'perishable' is 1, 1.0 where it is 0.

    Values of 'perishable' other than 0 and 1 map to NaN. ``df`` is modified
    in place and returned.
    """
    weight_by_flag = {0: 1.0, 1: 1.25}
    df['perishable_w'] = df.perishable.map(weight_by_flag)
    return df


# Apply the feature-engineering steps in the order they were defined
train = (
    train
    .pipe(transform_target)
    .pipe(transform_onpromotion)
    .pipe(add_date_features)
    .pipe(add_weight_feature)
)

In [7]:
# Summary of columns, dtypes and memory use after feature engineering
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2263890 entries, 0 to 2268807
Data columns (total 20 columns):
id              int64
date            datetime64[ns]
store_nbr       int8
item_nbr        int32
unit_sales      float64
onpromotion     int64
family          object
class           int64
perishable      int64
transactions    float64
city            object
state           object
type            object
cluster         float64
dcoilwtico      float64
week            int64
dow             int64
dayofyear       int64
dayoff          bool
perishable_w    float64
dtypes: bool(1), datetime64[ns](1), float64(5), int32(1), int64(7), int8(1), object(4)
memory usage: 323.9+ MB


In [8]:
## Categorical features
def encode(df, column) -> pd.DataFrame:
    """One-hot encode ``df[column]``, omitting the first level.

    The resulting indicator columns are named '<column>_<level>'.
    """
    return pd.get_dummies(df[column], prefix=column, drop_first=True)

def encode_categorical_features(df):
    """Append one-hot columns for the categorical features to ``df``.

    Only 'family' is encoded at the moment. The original columns are kept
    and the indicator columns are added to their right.
    """
    #cat_columns = ['item_nbr']
    cat_columns = ['family']

    dummy_blocks = [encode(df, name) for name in cat_columns]
    return pd.concat([df] + dummy_blocks, axis=1)


#train = encode_categorical_features(train)

In [9]:
# Same summary again: the one-hot encoding call above is commented out,
# so the frame is unchanged at this point
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2263890 entries, 0 to 2268807
Data columns (total 20 columns):
id              int64
date            datetime64[ns]
store_nbr       int8
item_nbr        int32
unit_sales      float64
onpromotion     int64
family          object
class           int64
perishable      int64
transactions    float64
city            object
state           object
type            object
cluster         float64
dcoilwtico      float64
week            int64
dow             int64
dayofyear       int64
dayoff          bool
perishable_w    float64
dtypes: bool(1), datetime64[ns](1), float64(5), int32(1), int64(7), int8(1), object(4)
memory usage: 323.9+ MB


In [10]:
# Column names without the 'class_' prefix
print_cols = list(filter(lambda name: not name.startswith('class_'), train.columns))

In [11]:
def add_lag_features(df):
    """Add 'lag_7': the unit_sales of the same store and item 7 days earlier.

    Each store is processed separately. Its rows are expanded to a complete
    (date x item) grid so that a fixed row offset corresponds to exactly
    7 days, the lag is taken, and the injected rows (those without
    unit_sales) are removed again. 'lag_7' is NaN when the row 7 days
    earlier is missing or falls before the store's first date.

    Prints a progress line and the unit_sales / lag_7 correlation per store.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'store_nbr', 'date', 'item_nbr' and 'unit_sales',
        with at most one row per (store, date, item). It is not modified.

    Returns
    -------
    pandas.DataFrame
        The per-store results concatenated in order of first appearance of
        each store, with 'date' and 'item_nbr' as the leading columns.
        Index labels restart for every store. An empty DataFrame is returned
        when ``df`` has no rows.
    """
    per_store = []

    for s in df.store_nbr.unique():
        print("Processing store %d..."%(s))
        store_rows = df[df.store_nbr == s]
        dates_range = pd.date_range(store_rows.date.min(), store_rows.date.max())
        u_items = store_rows.item_nbr.unique()

        # Reindex by date and item; set_index returns a new frame, so neither
        # the slice nor the caller's frame is touched
        full_grid = pd.MultiIndex.from_product(
            [dates_range, u_items],
            names=["date", "item_nbr"]
        )
        grid = (
            store_rows
            .set_index(["date", "item_nbr"])
            .reindex(full_grid)
            .sort_index()
        )

        # Create lag features: after sorting, every date holds len(u_items)
        # consecutive rows, so this offset lands on the same item 7 days back
        grid['lag_7'] = grid['unit_sales'].shift(7 * len(u_items))

        # Back to flat columns, then drop the rows injected by the reindex
        # (they are the ones with no unit_sales)
        grid = grid.reset_index().dropna(subset=['unit_sales'])

        # Correlation coefficient
        print(grid[['unit_sales', 'lag_7']].corr())

        per_store.append(grid)

    if not per_store:
        return pd.DataFrame()

    # Single concat instead of growing a frame inside the loop
    return pd.concat(per_store)

    
# Attach the 7-day lag of unit_sales to every training row
train = train.pipe(add_lag_features)

Processing store 32...
            unit_sales     lag_7
unit_sales    1.000000  0.575435
lag_7         0.575435  1.000000
Processing store 21...
            unit_sales     lag_7
unit_sales    1.000000  0.646389
lag_7         0.646389  1.000000


### Note
- Ignore items which are not in the test data.
- A second stage is needed to create lag features from seasonal data.

### Check the missing data

This check is necessary at this point, because in the next step a lot of rows will be injected to provide complete time series.

In [12]:
# Drop every row with a missing value in any column (in place); this also
# removes the rows whose lag_7 is NaN.
# Alternative that only requires unit_sales to be present:
#train.dropna(subset=['unit_sales'], inplace=True)
train.dropna(inplace=True)

# Prediction


### Error metric

In [13]:
def NWRMSLE(y, pred, w):
    """Square root of the sample-weighted mean squared error.

    ``y`` are the true values, ``pred`` the predictions and ``w`` the
    per-sample weights. No logarithm is taken here; both inputs are expected
    to be on the scale the error should be measured on.
    """
    weighted_mse = metrics.mean_squared_error(y, pred, sample_weight=w)
    return weighted_mse ** 0.5

### Splitting data

In [14]:
# Everything except identifiers, raw categoricals, the metric weight and the
# target is used as a model input
non_features = {
    'id', 'date', 'store_nbr', 'city', 'state', 'type', 'cluster',
    'item_nbr', 'family', 'class', 'perishable_w',
    'unit_sales',
}
cols = [name for name in train.columns if name not in non_features]
cols

['onpromotion',
 'perishable',
 'transactions',
 'dcoilwtico',
 'week',
 'dow',
 'dayofyear',
 'dayoff',
 'lag_7']

In [15]:
target_column = 'unit_sales'

# Fit on 2016-01-01 up to (not including) 2017-08-01; validate on the rest
in_fit_window = (train.date >= '2016-01-01') & (train.date < '2017-08-01')
X1 = train.loc[in_fit_window]
X2 = train.loc[train.date >= '2017-08-01']

y1 = X1[target_column].values
y2 = X2[target_column].values

### Regressors

In [16]:
from sklearn import metrics

np.random.seed(1122)

# One entry per candidate: (banner to print, short label, estimator factory).
# Factories keep construction lazy, so each model is built right before its fit.
candidates = [
    ('Multilayer perceptron (MLP) neural network 01', 'MLP model01',
     lambda: MLPRegressor(hidden_layer_sizes=(3,), max_iter=100)),
    ('Bagging Regressor 01', 'BaggingRegressor01',
     lambda: BaggingRegressor(DecisionTreeRegressor(max_depth=6,max_features=0.85))),
    ('GradientBoosting 01', 'GradientBoosting01',
     lambda: GradientBoostingRegressor()),
]

number_regressors_to_test = 3
selected = candidates[:number_regressors_to_test]

for method, (banner, str_method, make_regressor) in enumerate(selected, start=1):
    print('\nmethod = ', method)
    print(banner)
    r = make_regressor()

    # Fit on the training window, score on the hold-out window
    r.fit(X1[cols], y1)
    yh2 = r.predict(X2[cols])
    #X2['prediction_%d'%method] = yh2
    m = NWRMSLE(y2, yh2, X2['perishable_w'])

    print("Error: %f" % (m))


method =  1
Multilayer perceptron (MLP) neural network 01
Error: 0.628859

method =  2
Bagging Regressor 01
Error: 0.615988

method =  3
GradientBoosting 01
Error: 0.613598


In [17]:
# First rows of the model inputs used for fitting
X1[cols].head()

Unnamed: 0,onpromotion,perishable,transactions,dcoilwtico,week,dow,dayofyear,dayoff,lag_7
919437,0.0,0.0,6.498282,36.97,53.0,5.0,2.0,True,1.609438
919441,0.0,0.0,6.498282,36.97,53.0,5.0,2.0,True,1.098612
919443,0.0,0.0,6.498282,36.97,53.0,5.0,2.0,True,1.791759
919452,0.0,0.0,6.498282,36.97,53.0,5.0,2.0,True,1.609438
919453,0.0,0.0,6.498282,36.97,53.0,5.0,2.0,True,1.098612


# Conclusions

Improvements (validation error, lower is better):
- Initial: 0.79
- With the lag_7 feature: 0.61

# Predictions for test data

In [None]:
# Training rows from the 2017 extract, indexed by date / store / item
train = pd.read_csv(
    '../input/processed/train_from2017.csv', dtype=dtypes, parse_dates=['date']
).set_index(['date','store_nbr','item_nbr'])

# Test rows go through the same steps as train, except the target transform
test = (
    pd.read_csv('../input/processed/test_min.csv', dtype=dtypes, parse_dates=['date'])
    .pipe(merge_with_extra_datasets)
    .pipe(transform_onpromotion)
    .pipe(add_date_features)
    .pipe(add_weight_feature)
)
#test = add_lag_features(test)

### Problem detected:

Some time series (store-item pairs) in the test dataset do not exist in the training dataset. This is a huge problem for the lag features.

In [76]:
# Training rows for item 96995 on 2017-08-09, across all stores
on_day = train.index.get_level_values('date') == '2017-08-09'
is_item = train.index.get_level_values('item_nbr') == 96995
train[on_day & is_item]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,unit_sales,onpromotion
date,store_nbr,item_nbr,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-08-09,3,96995,124780231,1.0,False
2017-08-09,6,96995,124786783,1.0,False
2017-08-09,7,96995,124789082,1.0,False
2017-08-09,36,96995,124835745,1.0,False
2017-08-09,44,96995,124851075,1.0,False
2017-08-09,45,96995,124853825,2.0,False
2017-08-09,47,96995,124859062,1.0,False
2017-08-09,48,96995,124861771,1.0,False


In [78]:
# Test rows for the same item on 2017-08-16
test.loc[(test['date'] == '2017-08-16') & (test['item_nbr'] == 96995)]

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,transactions,city,state,type,cluster,dcoilwtico,week,dow,dayofyear,dayoff,perishable_w
0,125575060,2017-08-16,21,96995,0,GROCERY I,1093,0,,,,,,,33,2,228,False,1.0
3901,125617971,2017-08-16,32,96995,0,GROCERY I,1093,0,,,,,,,33,2,228,False,1.0
