In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import cross_validation
import xgboost as xgb

## Get the data

In [2]:
# dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d')
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)
stores = pd.read_csv('data/store.csv', low_memory=False)

In [3]:
print train.isnull().sum(axis=0)
print test.isnull().sum(axis=0)
print stores.isnull().sum(axis=0)

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64
Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64
Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64


In [4]:
# merge datasets with Stores data
train = pd.merge(train, stores, on='Store')
test = pd.merge(test, stores, on='Store')

In [5]:
train.fillna(0, inplace=True)
train = train.loc[train.Open == 1]
train = train.loc[train.Sales != 0]

In [6]:
test.fillna(0, inplace=True)

## Prepare the data

### 1. Feature engineering

In [8]:
train.dtypes

Store                          int64
DayOfWeek                      int64
Date                          object
Sales                          int64
Customers                      int64
Open                           int64
Promo                          int64
StateHoliday                  object
SchoolHoliday                  int64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
dtype: object

In [9]:
# Format dates in Train
dates = pd.DatetimeIndex(train.Date)
train['Day'] = dates.day
train['Month'] = dates.month
train['Year'] = dates.year

In [10]:
# Format dates in Test
dates = pd.DatetimeIndex(test.Date)
test['Day'] = dates.day
test['Month'] = dates.month
test['Year'] = dates.year

In [11]:
# Replace labels with floats
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for c in ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']:
    train[c] = lbl_enc.fit_transform(train[c])
    test[c] = lbl_enc.fit_transform(test[c])

In [16]:
train.columns

Index([u'Store', u'DayOfWeek', u'Date', u'Sales', u'Customers', u'Open',
       u'Promo', u'StateHoliday', u'SchoolHoliday', u'StoreType',
       u'Assortment', u'CompetitionDistance', u'CompetitionOpenSinceMonth',
       u'CompetitionOpenSinceYear', u'Promo2', u'Promo2SinceWeek',
       u'Promo2SinceYear', u'PromoInterval', u'Day', u'Month', u'Year'],
      dtype='object')

In [18]:
# Choose columns
features = list(train.columns)
[features.remove(c) for c in ['Date', 'Year', 'Sales', 'Customers']]
print "Features: "
print features

Features: 
['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Day', 'Month']


## Train XGBoost and predict sales 

In [19]:
# Set params

params = {"objective": "reg:linear",
          "booster": "gbtree",
          "eta": 0.025,
          "max_depth": 10,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "silent": 1
          }
num_trees = 2000

In [20]:
# Thanks to Chenglong Chen for providing this in the forum
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w


def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe


def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

### 1. Check XGB score 

In [21]:
X_train, X_test = cross_validation.train_test_split(train, test_size=0.0125)
dtrain = xgb.DMatrix(X_train[features].as_matrix(), np.log(X_train["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[features].as_matrix(), np.log(X_test["Sales"] + 1))
dtest = xgb.DMatrix(test[features].as_matrix())
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

Will train until train error hasn't decreased in 100 rounds.
[0]	eval-rmspe:0.999824	train-rmspe:0.999823
[1]	eval-rmspe:0.999746	train-rmspe:0.999746
[2]	eval-rmspe:0.999654	train-rmspe:0.999654
[3]	eval-rmspe:0.999546	train-rmspe:0.999545
[4]	eval-rmspe:0.999419	train-rmspe:0.999418
[5]	eval-rmspe:0.999270	train-rmspe:0.999269
[6]	eval-rmspe:0.999096	train-rmspe:0.999095
[7]	eval-rmspe:0.998895	train-rmspe:0.998894
[8]	eval-rmspe:0.998662	train-rmspe:0.998661
[9]	eval-rmspe:0.998395	train-rmspe:0.998393
[10]	eval-rmspe:0.998089	train-rmspe:0.998087
[11]	eval-rmspe:0.997739	train-rmspe:0.997738
[12]	eval-rmspe:0.997342	train-rmspe:0.997341
[13]	eval-rmspe:0.996893	train-rmspe:0.996891
[14]	eval-rmspe:0.996386	train-rmspe:0.996384
[15]	eval-rmspe:0.995817	train-rmspe:0.995815
[16]	eval-rmspe:0.995178	train-rmspe:0.995176
[17]	eval-rmspe:0.994466	train-rmspe:0.994465
[18]	eval-rmspe:0.993674	train-rmspe:0.993673
[19]	eval-rmspe:0.992797	train-rmspe:0.992797
[20]	eval-rmspe:0.991825	trai

In [22]:
# Validate predictions
train_probs = gbm.predict(xgb.DMatrix(X_test[features].as_matrix()))
indices = train_probs < 0
train_probs[indices] = 0
error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
print('error', error)

('error', 0.12404202766598532)


In [23]:
#Make predictions on the test set
test_probs = gbm.predict(xgb.DMatrix(test[features].as_matrix()))
indices = test_probs < 0
test_probs[indices] = 0

In [24]:
submission = pd.DataFrame({"Id": test["Id"], "Sales": np.exp(test_probs) - 1})
submission.to_csv("submit.csv", index=False)