In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model, metrics
import gc; gc.enable()
import random
import time

from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import TheilSenRegressor, BayesianRidge

from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.tree import DecisionTreeRegressor

# Fix NumPy's global random seed.
np.random.seed(1122)


# Read datasets
# `st` marks the start of loading so the elapsed time can be reported below.
st = time.time()
# Explicit dtypes for the shared train/test columns; 'onpromotion' is read as
# text (df_transform later maps the strings 'False'/'True' to 0/1).
dtypes = {'id':'int64', 'item_nbr':'int32', 'store_nbr':'int8', 'onpromotion':str}
train = pd.read_csv('../input/train.csv', dtype=dtypes, parse_dates=['date'])
test = pd.read_csv('../input/test.csv', dtype=dtypes, parse_dates=['date'])
items = pd.read_csv('../input/items.csv')
stores = pd.read_csv('../input/stores.csv')
transactions = pd.read_csv('../input/transactions.csv', parse_dates=['date'])
holidays = pd.read_csv('../input/holidays_events.csv', dtype={'transferred':bool}, parse_dates=['date'])
oil = pd.read_csv('../input/oil.csv', parse_dates=['date'])

# The elapsed minutes must go through str.format; handing them to print() as a
# second argument emits the raw '{0:.2f}' template followed by an unrounded float.
print("runtime: {0:.2f} min".format((time.time()-st)/60))

runtime: {0:.2f} min 1.5055775403976441


In [2]:
# Dataset processing
print('Datasets processing')

# Keep every 2017 row plus the second half of August 2016.
train2017 = train[(train['date'].dt.year == 2017)]
# The month condition is required: without it this subset contained days 16-31
# of every month in 2016, which contradicts the name train201608.
# NOTE(review): the intended window is inferred from the variable name - confirm.
train201608 = train[
    (train['date'].dt.year == 2016)
    & (train['date'].dt.month == 8)
    & (train['date'].dt.day > 15)
]
train = pd.concat([train2017,train201608])
# Drop the intermediate subsets and ask the collector to reclaim them.
del train2017, train201608; gc.collect();

Datasets processing


In [3]:
# Transform target
# Negative unit_sales are clamped to zero, then the column is replaced by
# log1p of the clamped values.
# Series.clip builds a new Series instead of writing through the array exposed
# by `.values`; that array is read-only when pandas Copy-on-Write is active, so
# the in-place masked assignment would raise there.
# NOTE: this cell overwrites 'unit_sales', so running it twice applies log1p twice.
train['unit_sales'] = np.log1p(train['unit_sales'].clip(lower=0.).values)

In [4]:
def df_lbl_enc(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each column whose dtype compares equal to 'object' is replaced by the
    output of a freshly fitted ``preprocessing.LabelEncoder`` and its name is
    printed. Other columns are left untouched.

    Returns the same DataFrame object that was passed in.
    """
    for col_name in df.columns:
        # Skip anything that is not an object column.
        if df[col_name].dtype != 'object':
            continue
        encoder = preprocessing.LabelEncoder()
        df[col_name] = encoder.fit_transform(df[col_name])
        print(col_name)
    return df

def df_transform(df):
    """Derive calendar features and numeric encodings for a sales frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'date', 'onpromotion' and 'perishable'.
        'onpromotion' is expected to hold the strings 'False'/'True';
        'perishable' may hold 0/1 or False/True.

    Returns
    -------
    pandas.DataFrame
        The result of ``fillna(-1)`` on the transformed frame. The column
        assignments themselves are made on the frame that was passed in, so the
        caller's object is modified as well.

    Notes
    -----
    After the call 'date' holds the day of week (0-6), not a timestamp;
    'yea', 'mon' and 'day' hold year, month and day of month.
    """
    df['date'] = pd.to_datetime(df['date'])
    df['yea'] = df['date'].dt.year
    df['mon'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['date'] = df['date'].dt.dayofweek # Replace!!! 'date' becomes the weekday number
    # Values other than the two strings become NaN here and -1 after fillna.
    df['onpromotion'] = df['onpromotion'].map({'False': 0, 'True': 1})
    # Cast to float before the lookup: current pandas treats boolean labels as
    # "not found" in a numeric index, so mapping a bool column with the
    # int-keyed dict {0: ..., 1: ...} yields NaN for every row. After the cast,
    # 0/1 and False/True both resolve (0 -> 1.0, 1 -> 1.25); missing values
    # stay NaN and are filled below.
    df['perishable'] = df['perishable'].astype(float).map({0.0: 1.0, 1.0: 1.25})
    df = df.fillna(-1)
    return df

In [5]:
# Reload the items table, this time parsing 'perishable' as bool; this replaces
# the `items` frame read in the first cell.
items = pd.read_csv('../input/items.csv', dtype={'perishable':bool})
# Preview the first rows.
items.head()

Unnamed: 0,item_nbr,family,class,perishable
0,96995,GROCERY I,1093,False
1,99197,GROCERY I,1067,False
2,103501,CLEANING,3008,False
3,103520,GROCERY I,1028,False
4,103665,BREAD/BAKERY,2712,True


In [6]:
# Summary of the 'family' column: count, number of unique values, most frequent value.
items['family'].describe()

count          4100
unique           33
top       GROCERY I
freq           1334
Name: family, dtype: object

In [7]:
# One-hot encode 'family' into family_* indicator columns.
# NOTE: the 'family' column itself is consumed by the encoding, so this cell
# cannot be re-run without first reloading `items`.
items = pd.get_dummies(items, columns = ['family'] )
items.head()

Unnamed: 0,item_nbr,class,perishable,family_AUTOMOTIVE,family_BABY CARE,family_BEAUTY,family_BEVERAGES,family_BOOKS,family_BREAD/BAKERY,family_CELEBRATION,...,family_MAGAZINES,family_MEATS,family_PERSONAL CARE,family_PET SUPPLIES,family_PLAYERS AND ELECTRONICS,family_POULTRY,family_PREPARED FOODS,family_PRODUCE,family_SCHOOL AND OFFICE SUPPLIES,family_SEAFOOD
0,96995,1093,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,99197,1067,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,103501,3008,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,103520,1028,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,103665,2712,True,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
transactions.head()
# Notes:
# The number of transactions is available only for the training data set.
# Two options:
# 1) Create a feature from the historical data for the forecast, for instance
#    the average number of transactions per date and store.
# 2) Predict the number of transactions as a first step, before predicting the store sales.

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [9]:
# Preview the first rows of the stores table.
stores.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [10]:
# One-hot encode the store 'type' into type_* indicator columns.
# NOTE: the 'type' column itself is consumed by the encoding, so this cell
# cannot be re-run without first reloading `stores`.
stores = pd.get_dummies(stores, columns = ['type'] )
stores.head()

Unnamed: 0,store_nbr,city,state,cluster,type_A,type_B,type_C,type_D,type_E
0,1,Quito,Pichincha,13,0,0,0,1,0
1,2,Quito,Pichincha,13,0,0,0,1,0
2,3,Quito,Pichincha,8,0,0,0,1,0
3,4,Quito,Pichincha,9,0,0,0,1,0
4,5,Santo Domingo,Santo Domingo de los Tsachilas,4,0,0,0,1,0


In [11]:
# Reload the holidays table with the same arguments as the first cell, so the
# frame is fresh before any derived columns are added.
holidays = pd.read_csv('../input/holidays_events.csv', dtype={'transferred':bool}, parse_dates=['date'])
# No string-to-bool mapping is needed for 'transferred': the dtype argument
# above already requests bool.
holidays.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [12]:
# Distinct city names present in the stores table.
stores['city'].unique()

array(['Quito', 'Santo Domingo', 'Cayambe', 'Latacunga', 'Riobamba',
       'Ibarra', 'Guaranda', 'Puyo', 'Ambato', 'Guayaquil', 'Salinas',
       'Daule', 'Babahoyo', 'Quevedo', 'Playas', 'Libertad', 'Cuenca',
       'Loja', 'Machala', 'Esmeraldas', 'Manta', 'El Carmen'], dtype=object)

In [13]:
# Distinct locale names present in the holidays table.
holidays['locale_name'].unique()

array(['Manta', 'Cotopaxi', 'Cuenca', 'Libertad', 'Riobamba', 'Puyo',
       'Guaranda', 'Imbabura', 'Latacunga', 'Machala', 'Santo Domingo',
       'El Carmen', 'Cayambe', 'Esmeraldas', 'Ecuador', 'Ambato', 'Ibarra',
       'Quevedo', 'Santo Domingo de los Tsachilas', 'Santa Elena', 'Quito',
       'Loja', 'Salinas', 'Guayaquil'], dtype=object)

In [14]:
# Show every holiday row whose description contains 'Independencia de Guayaquil'.
holidays[holidays['description'].str.contains('Independencia de Guayaquil')]

Unnamed: 0,date,type,locale,locale_name,description,transferred
19,2012-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
20,2012-10-12,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
72,2013-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
73,2013-10-11,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
135,2014-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
136,2014-10-10,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
190,2015-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,False
274,2016-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,False
331,2017-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,False


In [15]:
# First use of the holidays table: a binary flag for national holidays, which
# are expected to affect overall sales. Regional holidays are a later step.
# A row is flagged when its locale is 'National' and it is either
#   - a 'Holiday' that was not transferred, or
#   - a 'Transfer' row.
holidays['national_holiday'] = (
    (holidays['locale'] == 'National')
    & (
        ((holidays['type'] == 'Holiday') & ~holidays['transferred'])
        | (holidays['type'] == 'Transfer')
    )
)

# Inspect the flag on the 'Independencia de Guayaquil' rows.
holidays[holidays['description'].str.contains('Independencia de Guayaquil')]

Unnamed: 0,date,type,locale,locale_name,description,transferred,national_holiday
19,2012-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True,False
20,2012-10-12,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False,True
72,2013-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True,False
73,2013-10-11,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False,True
135,2014-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True,False
136,2014-10-10,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False,True
190,2015-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,False,True
274,2016-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,False,True
331,2017-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,False,True


In [19]:
# Preview the two holiday columns that are joined onto the sales data.
holidays[['date','national_holiday']].head()

Unnamed: 0,date,national_holiday
0,2012-03-02,False
1,2012-04-01,False
2,2012-04-12,False
3,2012-04-14,False
4,2012-04-21,False


In [1]:
# Attach item, transaction, store, holiday and oil columns to every training row.
train = pd.merge(train, items, how='left', on=['item_nbr'])
train = pd.merge(train, transactions, how='left', on=['date','store_nbr'])
train = pd.merge(train, stores, how='left', on=['store_nbr'])
# `holidays` has one row per event (with its own locale), so a date is not
# guaranteed to be unique there. Collapse it to one row per date first,
# flagged if any event on that date is a national holiday; otherwise the left
# join would repeat every sales row once per extra event on the same date.
holiday_by_date = holidays.groupby('date')['national_holiday'].any().reset_index()
train = pd.merge(train, holiday_by_date, how='left', on=['date'])
train = pd.merge(train, oil, how='left', on=['date'])

NameError: name 'pd' is not defined

In [None]:
# List the column names of the merged training frame.
train.columns

In [18]:
# Attach item, transaction, store, holiday and oil columns to every test row.
test = pd.merge(test, items, how='left', on=['item_nbr'])
test = pd.merge(test, transactions, how='left', on=['date','store_nbr'])
test = pd.merge(test, stores, how='left', on=['store_nbr'])
# `holidays` has one row per event (with its own locale), so a date is not
# guaranteed to be unique there. Collapse it to one row per date first,
# flagged if any event on that date is a national holiday; otherwise the left
# join would repeat every test row once per extra event on the same date.
holiday_by_date = holidays.groupby('date')['national_holiday'].any().reset_index()
test = pd.merge(test, holiday_by_date, how='left', on=['date'])
test = pd.merge(test, oil, how='left', on=['date'])
test.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,class,perishable,family_AUTOMOTIVE,family_BABY CARE,family_BEAUTY,...,city,state,cluster,type_A,type_B,type_C,type_D,type_E,national_holiday,dcoilwtico
0,125497040,2017-08-16,1,96995,False,1093,False,0,0,0,...,Quito,Pichincha,13,0,0,0,1,0,,46.8
1,125497041,2017-08-16,1,99197,False,1067,False,0,0,0,...,Quito,Pichincha,13,0,0,0,1,0,,46.8
2,125497042,2017-08-16,1,103501,False,3008,False,0,0,0,...,Quito,Pichincha,13,0,0,0,1,0,,46.8
3,125497043,2017-08-16,1,103520,False,1028,False,0,0,0,...,Quito,Pichincha,13,0,0,0,1,0,,46.8
4,125497044,2017-08-16,1,103665,False,2712,True,0,0,0,...,Quito,Pichincha,13,0,0,0,1,0,,46.8
