# Set-up

In [1]:
# DATA MANIPULATION
import numpy as np # linear algebra
import pandas as pd # data processing
import random 
import datetime # manipulating date formats

# VISUALIZATION
import matplotlib.pyplot as plt # basic plotting
%matplotlib inline

# SUPERVISED LEARNING
from sklearn import metrics
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [1]:
# Column dtypes applied when the CSV files are read. 'onpromotion' is read as
# text here; transform_onpromotion later maps it to 0/1.
dtypes = {
    'id': 'int64',
    'item_nbr': 'int32',
    'store_nbr': 'int8',
    'onpromotion': str,
}


# Load datasets

In [2]:
# Reading train
# The frame is bound to `train` because the very next statement (and every
# later cell) operates on `train`; binding it to `test` left `train` undefined.
# NOTE(review): the path still points at '../input/processed/test.csv' —
# confirm that this file is the intended training extract.
train = pd.read_csv('../input/processed/test.csv', dtype=dtypes, parse_dates=['date'])

# NOTE(review): merge_with_extra_datasets is not defined in the visible part of
# this notebook — presumably it comes from another cell or module; verify.
train = merge_with_extra_datasets(train)

NameError: name 'pd' is not defined

In [10]:
# Discard, in place, every row that has a missing value in any column.
train.dropna(axis=0, how='any', inplace=True)
# Per-column null counts (uncomment to check that none remain):
#print([(c,train[c].isnull().sum()) for c in train.columns])

[('id', 0), ('date', 0), ('store_nbr', 0), ('item_nbr', 0), ('unit_sales', 0), ('onpromotion', 0), ('family', 0), ('class', 0), ('perishable', 0), ('transactions', 0), ('city', 0), ('state', 0), ('type', 0), ('cluster', 0), ('dcoilwtico', 0), ('week', 0), ('dow', 0), ('dayofyear', 0), ('dayoff', 0)]


# Feature engineering

In [11]:
## Transform onpromotion
def transform_onpromotion(df):
    """Recode the textual ``onpromotion`` flag as a number, in place.

    'False' becomes 0 and 'True' becomes 1. Any value without an entry in the
    mapping (including missing entries) becomes NaN.

    Returns the same DataFrame object that was passed in.
    """
    flag_codes = {'False': 0, 'True': 1}
    df['onpromotion'] = df['onpromotion'].map(flag_codes)
    return df

## Some features were not very useful: year,month,day
def add_date_features(df):
    """Derive calendar features from the datetime column ``date``, in place.

    Columns written:
      * ``week``      - ISO week number of the date
      * ``dow``       - day of week, Monday=0 ... Sunday=6
      * ``dayofyear`` - ordinal day within the year
      * ``dayoff``    - True when the day is a Saturday or Sunday

    Year, month and day were tried and found not very useful, so they are
    not generated.

    Returns the same DataFrame object that was passed in.
    """
    # `Series.dt.week` no longer exists in pandas 2.x; isocalendar() provides
    # the same ISO week number. It comes back as a nullable unsigned integer,
    # so cast it to a plain numpy dtype (float only if dates are missing).
    iso_week = df['date'].dt.isocalendar().week
    df['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    df['dow'] = df['date'].dt.dayofweek
    df['dayofyear'] = df['date'].dt.dayofyear
    # Weekends: dayofweek 5 (Saturday) and 6 (Sunday).
    df['dayoff'] = df['dow'].isin([5, 6])
    return df

## Transform target variable
def transform_unit_sales(df):
    """Floor ``unit_sales`` at zero and replace it with log(1 + unit_sales).

    Negative values are treated as 0 before the logarithm is taken. The
    column is overwritten on the DataFrame that was passed in, and that same
    object is returned.
    """
    floored = df['unit_sales'].clip(lower=0.)
    df['unit_sales'] = np.log1p(floored)
    return df

## Categorical features
def encode(df, column) -> pd.DataFrame:
    """One-hot encode ``df[column]``.

    The first level is dropped, and every dummy column is named
    ``<column>_<level>``. The dummies are returned as they are, without any
    standardisation.
    """
    return pd.get_dummies(df[column], prefix=column, drop_first=True)

def encode_categorical_features(df):
    """Append the one-hot columns of each categorical feature to ``df``.

    Only 'item_nbr' is encoded at present. The original columns are kept and
    the dummy columns produced by ``encode`` are added to their right. A new
    DataFrame is returned.
    """
    cat_columns = ['item_nbr']

    dummy_blocks = [encode(df, name) for name in cat_columns]
    return pd.concat([df] + dummy_blocks, axis=1)

def add_weigth_feature(df):
    """Add the sample-weight column ``perishable_w``, in place.

    Rows with perishable == 1 get weight 1.25, rows with perishable == 0 get
    weight 1.0. Any other value has no entry in the mapping and becomes NaN.

    Returns the same DataFrame object that was passed in.
    """
    weight_by_flag = {0:1.0, 1:1.25}
    df['perishable_w'] = df['perishable'].map(weight_by_flag)
    return df


# Apply the feature-engineering steps in order; the one-hot encoding of
# item_nbr is currently left out of the chain.
train = (
    train
    .pipe(transform_unit_sales)
    .pipe(transform_onpromotion)
    .pipe(add_date_features)
    .pipe(add_weigth_feature)
)
#train = encode_categorical_features(train)

In [12]:
## Number of rows for each (store, item) pair — first ten pairs shown
train.groupby(['store_nbr', 'item_nbr'])['unit_sales'].size().head(10)

store_nbr  item_nbr
21         103501      575
           103520      332
           103665      417
           105576      581
           105693      472
           105857      538
           106716      543
           108079      379
           108634      122
           108696      541
Name: unit_sales, dtype: int64

In [13]:
def add_lag_features(df, lag_days=7, verbose=True):
    """Add a lagged-sales column, computed per item and store by store.

    For each store the rows are re-indexed onto the complete
    (calendar day x item) grid, so every item has exactly one slot per day
    between the store's first and last date. On that grid, sorted by date and
    then item, shifting ``unit_sales`` by ``lag_days * number_of_items``
    positions gives the value of the same item ``lag_days`` days earlier.
    Rows without a ``unit_sales`` value (the ones injected to complete the
    grid, plus any that were already missing) are dropped afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'store_nbr', 'item_nbr', 'date' and 'unit_sales'.
        It is not modified.
    lag_days : int, default 7
        Size of the lag in days; the result is stored in ``'lag_<lag_days>'``.
    verbose : bool, default True
        When True, print the correlation matrix between ``unit_sales`` and
        the lag column for every store.

    Returns
    -------
    pandas.DataFrame
        The per-store frames stacked in order of first appearance of each
        store, with 'date' and 'item_nbr' as the leading columns and the lag
        column appended. The index is not reset. For an input without rows,
        a copy of ``df`` with an all-NaN lag column is returned.
    """
    lag_column = 'lag_%d' % lag_days
    per_store = []

    # Work on the rows of `df` itself rather than on a notebook-level global,
    # so the function can be applied to any frame.
    for _, store_rows in df.groupby('store_nbr', sort=False):
        items = store_rows['item_nbr'].unique()
        calendar = pd.date_range(store_rows['date'].min(), store_rows['date'].max())
        full_grid = pd.MultiIndex.from_product(
            [calendar, items],
            names=['date', 'item_nbr']
        )

        # Reindex by date and item; non-inplace calls avoid mutating a slice.
        grid = (
            store_rows
            .set_index(['date', 'item_nbr'])
            .reindex(full_grid)
            .sort_index()
        )

        # One calendar day spans len(items) consecutive rows on the grid.
        grid[lag_column] = grid['unit_sales'].shift(lag_days * len(items))

        # Back to flat columns, then drop rows that have no unit_sales value.
        grid = grid.reset_index().dropna(subset=['unit_sales'])

        if verbose:
            # Correlation coefficient
            print(grid[['unit_sales', lag_column]].corr())

        per_store.append(grid)

    if not per_store:
        # pd.concat raises on an empty list; keep the schema for empty input.
        return df.assign(**{lag_column: np.nan})

    # Single concatenation instead of growing a frame inside the loop.
    return pd.concat(per_store)

    
# Pass the training frame itself; no variable named `df` exists at notebook level.
train = add_lag_features(train)

            unit_sales     lag_7
unit_sales    1.000000  0.580451
lag_7         0.580451  1.000000
            unit_sales     lag_7
unit_sales    1.000000  0.524472
lag_7         0.524472  1.000000


### Note
- Ignore items that are not in the test data.
- A second stage is needed to create lag features from seasonal data.

### Check the missing data

This check is necessary at this point because, in the next step, many rows will be injected to provide complete time series.

# (column name, number of missing values) for every column of train.
md = [(col, train[col].isnull().sum()) for col in train.columns]

# Report only the columns that actually contain missing values.
for col, n_missing in md:
    if n_missing > 0:
        print((col, n_missing))

In [16]:
# Drop, in place, every row that has a missing value in any column.
# (A narrower variant restricted to unit_sales is kept below for reference.)
#train.dropna(subset=['unit_sales'], inplace=True)
train.dropna(axis=0, how='any', inplace=True)

# Prediction


### Error metric

In [17]:
def NWRMSLE(y, pred, w):
    """Weighted root-mean-squared error between ``y`` and ``pred``.

    ``w`` is forwarded to sklearn's ``mean_squared_error`` as ``sample_weight``
    and the square root of that weighted mean is returned. No logarithm is
    applied here.
    NOTE(review): the name suggests the inputs are expected to be on the
    log1p scale already — confirm at the call sites.
    """
    weighted_mse = metrics.mean_squared_error(y, pred, sample_weight=w)
    return weighted_mse ** 0.5

### Splitting data

In [18]:
# Columns kept out of the feature matrix: identifiers, raw categorical
# columns, the sample weight and the target. Every other column is a feature.
non_feature_columns = ['id','date','store_nbr','city','state','type','cluster',
                       'item_nbr','family','class', 'perishable_w',
                       'unit_sales']
cols = [name for name in train.columns if name not in non_feature_columns]
cols

['onpromotion',
 'perishable',
 'transactions',
 'dcoilwtico',
 'week',
 'dow',
 'dayofyear',
 'dayoff',
 'lag_7']

In [19]:
# Training window: 2016-01-01 (inclusive) up to 2017-08-01 (exclusive).
in_training_window = (train.date<'2017-08-01') & (train.date>='2016-01-01')
# Validation window: 2017-08-01 onwards.
in_validation_window = train.date>='2017-08-01'

X1 = train.loc[in_training_window]
# X2 is an explicit copy because prediction columns are written to it later.
X2 = train.loc[in_validation_window].copy()

target_column = 'unit_sales'
y1, y2 = (part[target_column].values for part in (X1, X2))

In [20]:
# First rows of the training feature matrix.
X1.loc[:, cols].head()

Unnamed: 0,onpromotion,perishable,transactions,dcoilwtico,week,dow,dayofyear,dayoff,lag_7
21637,0.0,0.0,7.126891,32.607,1.0,5.0,9.0,True,1.223156
21639,0.0,1.0,7.126891,32.607,1.0,5.0,9.0,True,0.959135
21640,0.0,0.0,7.126891,32.607,1.0,5.0,9.0,True,1.457646
21642,0.0,0.0,7.126891,32.607,1.0,5.0,9.0,True,0.959135
21643,0.0,0.0,7.126891,32.607,1.0,5.0,9.0,True,1.026672


### Regressors

In [21]:
from sklearn import metrics

np.random.seed(1122)

# method number -> (banner printed for the run, short label, estimator factory)
candidates = {
    1: ('Multilayer perceptron (MLP) neural network 01',
        'MLP model01',
        lambda: MLPRegressor(hidden_layer_sizes=(3,), max_iter=100)),
    2: ('Bagging Regressor 01',
        'BaggingRegressor01',
        lambda: BaggingRegressor(DecisionTreeRegressor(max_depth=6,max_features=0.85))),
    3: ('GradientBoosting 01',
        'GradientBoosting01',
        lambda: GradientBoostingRegressor()),
}

number_regressors_to_test = 3
for method in range(1, number_regressors_to_test+1):
    print('\nmethod = ', method)

    if method in candidates:
        banner, str_method, make_regressor = candidates[method]
        print(banner)
        r = make_regressor()

    # Fit on the training window and predict the validation window; the
    # predictions are also stored on X2, one column per method.
    r.fit(X1[cols], y1)
    yh2 = r.predict(X2[cols])
    X2['prediction_%d'%method] = yh2

    # Weighted score (unweighted alternative: metrics.mean_squared_error(y2, yh2)**0.5).
    m = NWRMSLE(y2, yh2, X2['perishable_w'])

    print("Error: %f" % (m))


method =  1
Multilayer perceptron (MLP) neural network 01
Error: 0.236785

method =  2
Bagging Regressor 01
Error: 0.231168

method =  3
GradientBoosting 01
Error: 0.230324


In [22]:
# First rows of the training feature matrix (same view as before the model runs).
X1.loc[:, cols].head()

Unnamed: 0,onpromotion,perishable,transactions,dcoilwtico,week,dow,dayofyear,dayoff,lag_7
21637,0.0,0.0,7.126891,32.607,1.0,5.0,9.0,True,1.223156
21639,0.0,1.0,7.126891,32.607,1.0,5.0,9.0,True,0.959135
21640,0.0,0.0,7.126891,32.607,1.0,5.0,9.0,True,1.457646
21642,0.0,0.0,7.126891,32.607,1.0,5.0,9.0,True,0.959135
21643,0.0,0.0,7.126891,32.607,1.0,5.0,9.0,True,1.026672


# Conclusions

Improvements:
- Initial: 0.79
- Extra features (sales, items) + lag_7: 0.23

# Predictions for test data

In [23]:
## Train best method
# Same seed as the comparison run; fit() returns the fitted estimator itself.
np.random.seed(1122)
r = GradientBoostingRegressor().fit(X1[cols], y1)

# Score on the validation window with the perishable sample weights.
yh2 = r.predict(X2[cols])
m = NWRMSLE(y2, yh2, X2['perishable_w'])
print("Error: %f" % (m))

Error: 0.230324


In [24]:
import pickle

# Persist the fitted regressor. The context manager guarantees the file is
# flushed and closed, even if pickling raises.
with open('../input/models/simple.sav', 'wb') as model_file:
    pickle.dump(r, model_file)

In [25]:
# Load the test set with the same column dtypes and date parsing as before.
test = pd.read_csv(
    '../input/test.csv',
    dtype=dtypes,
    parse_dates=['date'],
)

In [None]:
# The assignment had no right-hand side, which is a syntax error.
# NOTE(review): the next cell uses sorted(u_stores) as store_nbr values, so
# the stores present in the test set are presumably what was meant — confirm.
u_stores = test.store_nbr.unique()

In [None]:
## Create initial dataset
dates_range =  pd.date_range('2017-08-16', '2017-08-31')

# A constant helper key on both frames turns the merge into a cross join:
# one row per (date, store) combination.
a = pd.DataFrame({'date':dates_range, 'key':0})
b = pd.DataFrame({'store_nbr':sorted(u_stores), 'key':0})

# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
X3 = a.merge(b, how='outer').drop(columns='key')

## Pre-processing
# NOTE(review): merge_sales and enable_holidays are not defined in the visible
# part of this notebook; whether enable_holidays works in place is assumed
# from the way it is called here — verify.
X3 = merge_sales(X3)
X3 = add_date_features(X3)
enable_holidays(X3)
X3 = encode_categorical_features(X3)