In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt

In [3]:
def rmspe(y, yhat):
    """Return the root mean square percentage error of `yhat` against `y`.

    Every prediction is divided by its target, the deviation of that ratio
    from 1 is squared, and the square root of the mean square is returned.
    Zero targets are not guarded against here.
    """
    relative_error = yhat / y - 1
    squared_error = relative_error ** 2
    return np.sqrt(np.mean(squared_error))

def rmspe_xg(yhat, y):
    """Custom evaluation metric: RMSPE after undoing the log1p transform.

    `yhat` holds the raw predictions and `y` is an object exposing
    `get_label()`. Both are mapped through `np.expm1` before being scored
    with `rmspe`. Returns the pair ("rmspe", score).
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    score = rmspe(actual, predicted)
    return "rmspe", score

# Gather some features
def build_features(features, data):
    """Add the engineered model columns to `data` and record their names.

    Parameters
    ----------
    features : list
        Extended in place with the names of the columns the model should use.
    data : pandas.DataFrame
        Sales rows already joined with the store table. Modified in place.
        `Date` must be a datetime column (its `.dt` accessor is used).

    Returns
    -------
    pandas.DataFrame
        The same `data` object that was passed in.
    """
    # Treat a missing Open flag as "store was open".
    data.loc[data.Open.isnull(), 'Open'] = 1

    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        # Assign the encoded column back explicitly: an in-place replace on an
        # attribute-accessed column is not guaranteed to reach the frame.
        # Values missing from the mapping are kept unchanged.
        data[column] = data[column].map(lambda value: mappings.get(value, value))

    # Calendar parts of the sale date
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    # `Series.dt.weekofyear` no longer exists in current pandas; the ISO
    # calendar week is the same quantity. Cast away the nullable dtype, and
    # fall back to float only when missing dates make an int cast impossible.
    iso_week = data.Date.dt.isocalendar().week
    data['WeekOfYear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    # CompetitionOpen and PromoOpen from https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
    # Time the competitor has been open, in months
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)

    # Time Promo2 has been running, in months; negative or missing values become 0
    features.append('PromoOpen')
    promo_open = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = promo_open.where(promo_open > 0, 0)
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    # NOTE(review): 'Sept' has four letters here; presumably this matches the
    # spelling used inside PromoInterval -- verify against the data.
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data.Month.map(month2str)

    # Rows without a promo interval may hold NaN or a numeric fill value
    # instead of text; normalise all of them to '' so `.split` below only
    # ever sees strings.
    has_interval = data.PromoInterval.map(lambda value: isinstance(value, str)).astype(bool)
    data['PromoInterval'] = data.PromoInterval.where(has_interval, '')

    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            # One membership test per interval instead of one mask per month.
            promo_months = interval.split(',')
            in_promo = (data.PromoInterval == interval) & data.monthStr.isin(promo_months)
            data.loc[in_promo, 'IsPromoMonth'] = 1

    return data

In [114]:
## Start of main script
# Loads the three CSV files, keeps only usable training rows, joins the store
# table onto both splits, builds the feature columns and defines the booster
# settings used by the training cell.

print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
# Select the date column by name: the two files keep it at different
# positions, and every later step refers to it as `Date` anyway.
train = pd.read_csv("Drugstore_data/train.csv", parse_dates=['Date'], dtype=types)
test = pd.read_csv("Drugstore_data/test.csv", parse_dates=['Date'], dtype=types)
store = pd.read_csv("Drugstore_data/store.csv")

print("Assume store open, if not provided")
# Only the Open flag gets a default; a frame-wide fillna(1) would also
# overwrite missing values in unrelated columns with 1.
train['Open'] = train['Open'].fillna(1)
test['Open'] = test['Open'].fillna(1)

print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
train = train[train["Sales"] > 0]

print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

features = []

print("augment features")
# `features` is filled by the first call; the second call only needs the
# engineered columns on `test`, so its name list is thrown away.
build_features(features, train)
build_features([], test)
print(features)

print('training data processed')

# NOTE(review): trees of depth 30 are very deep; the recorded training log
# ends with train-rmspe near 0.001 against eval-rmspe near 0.129.
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.2,
          "max_depth": 30,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 300

Load the training, test and store data using pandas
Assume store open, if not provided
Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe
Join with store
augment features
['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
training data processed


In [116]:
print("Train a XGBoost model")

# Hold out the rows dated strictly between the two boundaries for validation;
# everything on or outside the boundaries is used for fitting.
split_start, split_end = '2014-06-01', '2014-08-01'
sale_dates = train['Date']
in_train = (sale_dates <= split_start) | (sale_dates >= split_end)
in_valid = (sale_dates > split_start) & (sale_dates < split_end)
X_train = train[in_train]
X_valid = train[in_valid]

# The model is fitted on log1p(Sales).
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)

dtrain = xgb.DMatrix(X_train[features], label=y_train)
dvalid = xgb.DMatrix(X_valid[features], label=y_valid)

Train a XGBoost model


In [123]:
# Leftover interactive help lookup removed; run `xgb.train?` in a scratch cell to read the training API docs.

In [117]:
# Both splits are reported every round; per the recorded log, the rmspe on
# the 'eval' entry is what early stopping watches.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=watchlist,
    early_stopping_rounds=100,
    feval=rmspe_xg,
    verbose_eval=True,
)

print("Validating")
valid_matrix = xgb.DMatrix(X_valid[features])
yhat = gbm.predict(valid_matrix)
# Predictions are on the log1p scale, so map them back before scoring.
valid_sales = X_valid.Sales.values
error = rmspe(valid_sales, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)

[0]	train-rmse:6.61558	eval-rmse:6.64013	train-rmspe:0.998701	eval-rmspe:0.998738
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:5.29647	eval-rmse:5.32193	train-rmspe:0.994711	eval-rmspe:0.994871
[2]	train-rmse:4.2414	eval-rmse:4.26692	train-rmspe:0.984664	eval-rmspe:0.985118
[3]	train-rmse:3.3982	eval-rmse:3.42414	train-rmspe:0.964315	eval-rmspe:0.965424
[4]	train-rmse:2.72292	eval-rmse:2.74769	train-rmspe:0.93062	eval-rmspe:0.932645
[5]	train-rmse:2.183	eval-rmse:2.2063	train-rmspe:0.881946	eval-rmspe:0.885035
[6]	train-rmse:1.75262	eval-rmse:1.77819	train-rmspe:0.81893	eval-rmspe:0.823995
[7]	train-rmse:1.40703	eval-rmse:1.43068	train-rmspe:0.746477	eval-rmspe:0.752739
[8]	train-rmse:1.13138	eval-rmse:1.15458	train-rmspe:0.668214	eval-rmspe:0.675879
[9]	train-rmse:0.912375	eval-rmse:0.935292	train-rmspe:0.588342	eval-rmspe:0.597154
[10]	train-rmse:0.73558	eval-rmse:0.757

[96]	train-rmse:0.001197	eval-rmse:0.131323	train-rmspe:0.001197	eval-rmspe:0.129289
[97]	train-rmse:0.001197	eval-rmse:0.131323	train-rmspe:0.001197	eval-rmspe:0.129289
[98]	train-rmse:0.001197	eval-rmse:0.131323	train-rmspe:0.001197	eval-rmspe:0.129289
[99]	train-rmse:0.001191	eval-rmse:0.131323	train-rmspe:0.001191	eval-rmspe:0.12929
[100]	train-rmse:0.001176	eval-rmse:0.131323	train-rmspe:0.001177	eval-rmspe:0.129289
[101]	train-rmse:0.001176	eval-rmse:0.131323	train-rmspe:0.001177	eval-rmspe:0.129289
[102]	train-rmse:0.001176	eval-rmse:0.131323	train-rmspe:0.001177	eval-rmspe:0.129289
[103]	train-rmse:0.001176	eval-rmse:0.131323	train-rmspe:0.001177	eval-rmspe:0.129289
[104]	train-rmse:0.00116	eval-rmse:0.131323	train-rmspe:0.00116	eval-rmspe:0.129289
[105]	train-rmse:0.001151	eval-rmse:0.131323	train-rmspe:0.001151	eval-rmspe:0.12929
[106]	train-rmse:0.001151	eval-rmse:0.131323	train-rmspe:0.001151	eval-rmspe:0.12929
[107]	train-rmse:0.001151	eval-rmse:0.131323	train-rmspe:0.0011

In [118]:
# Map the log-scale predictions back to sales and pair them with the test ids.
predicted_sales = np.expm1(test_probs)
result = pd.DataFrame({"Id": test["ID"], 'Sales': predicted_sales})
# NOTE(review): the file is written here, before any later cell adjusts
# `result` -- confirm that the unadjusted predictions are what should be saved.
result.to_csv("xgboost_10_submission.csv", index=False)

In [119]:
# Force the prediction to zero wherever the test row is flagged as closed.
# The mask is built on `test` and applied to `result` by index label.
closed_mask = test['Open'] == 0
result.loc[closed_mask, 'Sales'] = 0

In [120]:
# Order the predictions by Id and renumber the rows from zero. Dropping the
# old index directly avoids creating a helper column and then removing it by
# the name 'index'.
sorted_result = result.sort_values('Id').reset_index(drop=True)

In [121]:
# NOTE(review): `test_ans` is not created in any visible cell -- it has to be
# loaded before this cell can run on a fresh kernel.
# NOTE(review): if both columns are pandas Series they are paired by index
# label, so this presumably expects `test_ans` to be ordered by Id with a
# 0..n-1 index -- confirm.
actual_sales = test_ans['Sales']
predicted_sorted = sorted_result['Sales']
rmspe(actual_sales, predicted_sorted)

0.16279381155105946