In [56]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import cross_validation
import xgboost as xgb

## Get the data

In [57]:
# dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d')
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)
stores = pd.read_csv('data/store.csv', low_memory=False)

In [58]:
test.isnull().sum(axis=0)

Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64

In [35]:
# merge datasets with Stores data

train = pd.merge(train, stores, on='Store')
test = pd.merge(test, stores, on='Store')

## Prepare the data

### 1. Feature engineering

In [36]:
# Thanks to Chenglong Chen for providing this in the forum
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w


def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe


def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

In [37]:
# Gather some features
def build_features(features, data):
    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
                     'CompetitionOpenSinceYear', 'Open', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear'])

    # add some more with a bit of preprocessing
    features.append('SchoolHoliday')
    data['SchoolHoliday'] = data['SchoolHoliday'].astype(float)
    
    features.append('StateHoliday')
    data.loc[data['StateHoliday'] == 'a', 'StateHoliday'] = '1'
    data.loc[data['StateHoliday'] == 'b', 'StateHoliday'] = '2'
    data.loc[data['StateHoliday'] == 'c', 'StateHoliday'] = '3'
    data['StateHoliday'] = data['StateHoliday'].astype(float)
    
    features.append('DayOfWeek')
    features.append('Month')
    features.append('Day')
    features.append('Year')
    
    dates = pd.DatetimeIndex(data.Date)
    data['Day'] = dates.day
    data['Month'] = dates.month
    data['Year'] = dates.year

    
    features.append('StoreType')
    data.loc[data['StoreType'] == 'a', 'StoreType'] = '1'
    data.loc[data['StoreType'] == 'b', 'StoreType'] = '2'
    data.loc[data['StoreType'] == 'c', 'StoreType'] = '3'
    data.loc[data['StoreType'] == 'd', 'StoreType'] = '4'
    data['StoreType'] = data['StoreType'].astype(float)

    features.append('Assortment')
    data.loc[data['Assortment'] == 'a', 'Assortment'] = '1'
    data.loc[data['Assortment'] == 'b', 'Assortment'] = '2'
    data.loc[data['Assortment'] == 'c', 'Assortment'] = '3'
    data['Assortment'] = data['Assortment'].astype(float)

In [38]:
features = []

print("Features are: ")
build_features(features, train)
build_features([], test)
print(features)

Features are: 
['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Open', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'StoreType', 'Assortment']


## Train XGBoost and predict sales 

In [41]:
# Set params

params = {"objective": "reg:linear",
          "eta": 0.2,
          "max_depth": 8,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "silent": 1
          }
num_trees = 600

### 1. Check XGB score 

In [49]:
X_train, X_test = cross_validation.train_test_split(train, test_size=0.0125)
dtrain = xgb.DMatrix(X_train[features].as_matrix(), np.log(X_train["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[features].as_matrix(), np.log(X_test["Sales"] + 1))
dtest = xgb.DMatrix(test[features].as_matrix())
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True)

Will train until train error hasn't decreased in 50 rounds.
[0]	eval-rmspe:0.911543	train-rmspe:0.910109
[1]	eval-rmspe:0.908345	train-rmspe:0.906936
[2]	eval-rmspe:0.899959	train-rmspe:0.898579
[3]	eval-rmspe:0.882699	train-rmspe:0.881178
[4]	eval-rmspe:0.853712	train-rmspe:0.851308
[5]	eval-rmspe:0.812923	train-rmspe:0.807457
[6]	eval-rmspe:0.763684	train-rmspe:0.750653
[7]	eval-rmspe:0.712890	train-rmspe:0.684710
[8]	eval-rmspe:0.668216	train-rmspe:0.614073
[9]	eval-rmspe:0.637428	train-rmspe:0.544435
[10]	eval-rmspe:0.621369	train-rmspe:0.479576
[11]	eval-rmspe:0.627478	train-rmspe:0.423610
[12]	eval-rmspe:0.629487	train-rmspe:0.378931
[13]	eval-rmspe:0.634977	train-rmspe:0.344358
[14]	eval-rmspe:0.663393	train-rmspe:0.321908
[15]	eval-rmspe:0.691389	train-rmspe:0.308194
[16]	eval-rmspe:0.731932	train-rmspe:0.300183
[17]	eval-rmspe:0.756068	train-rmspe:0.297978
[18]	eval-rmspe:0.774775	train-rmspe:0.297651
[19]	eval-rmspe:0.787788	train-rmspe:0.299391
[20]	eval-rmspe:0.796904	train

In [50]:
# Validate predictions
train_probs = gbm.predict(xgb.DMatrix(X_test[features].as_matrix()))
indices = train_probs < 0
train_probs[indices] = 0
error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
print('error', error)

('error', 0.87372938780907095)


In [53]:
#Make predictions on the test set
test_probs = gbm.predict(xgb.DMatrix(test[features].as_matrix()))
indices = test_probs < 0
test_probs[indices] = 0

Make predictions on the test set


In [55]:
submission = pd.DataFrame({"Id": test["Id"], "Sales": np.exp(test_probs) - 1})
submission.to_csv("submit.csv", index=False)