In [64]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import cross_validation
import xgboost as xgb

## Get the data

In [65]:
# dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d')
train = pd.read_csv('data/train.csv', low_memory=False, parse_dates = ['Date'])
test = pd.read_csv('data/test.csv', low_memory=False, parse_dates = ['Date'])
stores = pd.read_csv('data/store.csv', low_memory=False)

In [66]:
print train.isnull().sum(axis=0)
print test.isnull().sum(axis=0)
print stores.isnull().sum(axis=0)

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64
Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64
Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64


In [67]:
test.fillna(1, inplace=True)
train = train[train["Open"] != 0]

In [68]:
# merge datasets with Stores data
train = pd.merge(train, stores, on='Store')
test = pd.merge(test, stores, on='Store')

## Prepare the data

### 1. Feature engineering

In [69]:
def build_features(data):
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1

    dates = pd.DatetimeIndex(data.Date)
    data['Day'] = data.Date.apply(lambda x: x.day)
    data['Month'] = data.Date.apply(lambda x: x.month)
    data['Year'] = data.Date.apply(lambda x: x.year)
    data['WeekOfYear'] = data.Date.apply(lambda x: x.weekofyear)
    
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + (data.Month - data.CompetitionOpenSinceMonth)
    data['CompetitionOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0)

    data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + (data.WeekOfYear - data.Promo2SinceWeek) / float(4)
    data['PromoOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0)

    data['p_1'] = data.PromoInterval.apply(lambda x: x[:3] if type(x) == str else 0)
    data['p_2'] = data.PromoInterval.apply(lambda x: x[4:7] if type(x) == str else 0)
    data['p_3'] = data.PromoInterval.apply(lambda x: x[8:11] if type(x) == str else 0)
    data['p_4'] = data.PromoInterval.apply(lambda x: x[12:15] if type(x) == str else 0)
    

In [70]:
build_features(train)
build_features(test)

In [71]:
train.dtypes

Store                                 int64
DayOfWeek                             int64
Date                         datetime64[ns]
Sales                                 int64
Customers                             int64
Open                                  int64
Promo                                 int64
StateHoliday                         object
SchoolHoliday                         int64
StoreType                            object
Assortment                           object
CompetitionDistance                 float64
CompetitionOpenSinceMonth           float64
CompetitionOpenSinceYear            float64
Promo2                                int64
Promo2SinceWeek                     float64
Promo2SinceYear                     float64
PromoInterval                        object
Day                                   int64
Month                                 int64
Year                                  int64
WeekOfYear                            int64
CompetitionOpen                 

In [72]:
# Replace labels with floats
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for c in ['StateHoliday', 'StoreType', 'Assortment', 'p_1', 'p_2', 'p_3', 'p_4']:
    train[c] = lbl_enc.fit_transform(train[c])
    test[c] = lbl_enc.transform(test[c])

In [73]:
# Choose columns
features = list(train.columns)
[features.remove(c) for c in ['Date', 'Sales', 'Customers', 'PromoInterval', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'Promo2SinceWeek']]
print "Features: "
print features

Features: 
['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2', 'Day', 'Month', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'p_1', 'p_2', 'p_3', 'p_4']


In [74]:
train.columns

Index([u'Store', u'DayOfWeek', u'Date', u'Sales', u'Customers', u'Open',
       u'Promo', u'StateHoliday', u'SchoolHoliday', u'StoreType',
       u'Assortment', u'CompetitionDistance', u'CompetitionOpenSinceMonth',
       u'CompetitionOpenSinceYear', u'Promo2', u'Promo2SinceWeek',
       u'Promo2SinceYear', u'PromoInterval', u'Day', u'Month', u'Year',
       u'WeekOfYear', u'CompetitionOpen', u'PromoOpen', u'p_1', u'p_2', u'p_3',
       u'p_4'],
      dtype='object')

## Train XGBoost and predict sales 

In [82]:
# Set params

params = {"objective": "reg:linear",
          "booster": "gbtree",
          "eta": 0.02,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1
          }
num_trees = 3000

In [77]:
# Thanks to Chenglong Chen for providing this in the forum
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w


def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe


def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

### 1. Check XGB score 

In [78]:
X_train, X_test = cross_validation.train_test_split(train, test_size=0.0113)
dtrain = xgb.DMatrix(X_train[features].as_matrix(), np.log(X_train["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[features].as_matrix(), np.log(X_test["Sales"] + 1))
dtest = xgb.DMatrix(test[features].as_matrix())
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

Will train until train error hasn't decreased in 100 rounds.
[0]	eval-rmspe:0.999837	train-rmspe:0.999805
[1]	eval-rmspe:0.999779	train-rmspe:0.999747
[2]	eval-rmspe:0.999711	train-rmspe:0.999680
[3]	eval-rmspe:0.999634	train-rmspe:0.999603
[4]	eval-rmspe:0.999547	train-rmspe:0.999515
[5]	eval-rmspe:0.999447	train-rmspe:0.999415
[6]	eval-rmspe:0.999333	train-rmspe:0.999302
[7]	eval-rmspe:0.999205	train-rmspe:0.999174
[8]	eval-rmspe:0.999060	train-rmspe:0.999029
[9]	eval-rmspe:0.998896	train-rmspe:0.998866
[10]	eval-rmspe:0.998714	train-rmspe:0.998683
[11]	eval-rmspe:0.998509	train-rmspe:0.998479
[12]	eval-rmspe:0.998280	train-rmspe:0.998251
[13]	eval-rmspe:0.998026	train-rmspe:0.997997
[14]	eval-rmspe:0.997744	train-rmspe:0.997715
[15]	eval-rmspe:0.997431	train-rmspe:0.997402
[16]	eval-rmspe:0.997085	train-rmspe:0.997057
[17]	eval-rmspe:0.996704	train-rmspe:0.996677
[18]	eval-rmspe:0.996286	train-rmspe:0.996259
[19]	eval-rmspe:0.995825	train-rmspe:0.995799
[20]	eval-rmspe:0.995322	trai

In [79]:
# Validate predictions
train_probs = gbm.predict(xgb.DMatrix(X_test[features].as_matrix()))
indices = train_probs < 0
train_probs[indices] = 0
error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
print('error', error)

('error', 0.10235136901226866)


In [80]:
#Make predictions on the test set
test_probs = gbm.predict(xgb.DMatrix(test[features].as_matrix()))
indices = test_probs < 0
test_probs[indices] = 0

In [81]:
submission = pd.DataFrame({"Id": test["Id"], "Sales": np.exp(test_probs) - 1})
submission.to_csv("submit.csv", index=False)