In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt

In [10]:
def rmspe(y, yhat):
    """Root mean squared percentage error of predictions against targets.

    Evaluates ``sqrt(mean((yhat / y - 1) ** 2))`` over the entries whose
    ratio is finite. Entries where the target is zero (ratio ``inf`` or
    ``nan``) or where a value is missing are left out of the mean, so a single
    zero in ``y`` no longer turns the whole score into ``nan``/``inf``.

    Parameters
    ----------
    y : numpy array or pandas Series
        Actual values.
    yhat : numpy array or pandas Series
        Predicted values; must be divisible element-wise by ``y``.

    Returns
    -------
    float
        The RMSPE over the usable entries, or ``nan`` if there are none.
    """
    # Divide on the caller's own types first: for two pandas Series this keeps
    # pandas' label-based matching of the operands.
    with np.errstate(divide="ignore", invalid="ignore"):
        pct_error = np.asarray(yhat / y - 1, dtype=float)

    # x/0 -> inf, 0/0 -> nan, missing -> nan: none of these can be scored.
    usable = np.isfinite(pct_error)
    if not usable.any():
        return np.nan
    return np.sqrt(np.mean(pct_error[usable] ** 2))

In [20]:
def rmspe_xg(yhat, y):
    """Evaluation callback reporting RMSPE on the back-transformed scale.

    Parameters
    ----------
    yhat : array-like
        Predictions on the ``log1p`` scale.
    y : object with a ``get_label()`` method
        Holder of the true labels, also on the ``log1p`` scale (the notebook
        passes an ``xgb.DMatrix``).

    Returns
    -------
    tuple
        ``("rmspe", value)`` - the metric name and its score.
    """
    # Undo the log1p transform on both sides before scoring.
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    score = rmspe(actual, predicted)
    return "rmspe", score

# Gather some features
def _iso_week(dates):
    """Return the ISO week number of every timestamp in ``dates`` as a plain numeric Series."""
    try:
        week = dates.dt.isocalendar().week
    except AttributeError:
        # Older pandas has no isocalendar(); use the legacy accessor there.
        return dates.dt.weekofyear
    # isocalendar() yields a nullable integer column; convert it to an
    # ordinary numpy dtype so the arithmetic and comparisons further down
    # behave as they did with the legacy accessor.
    return week.astype('float64') if week.isna().any() else week.astype('int64')


def build_features(features, data):
    """Add the model's feature columns to ``data`` and list them in ``features``.

    Both arguments are modified in place: the feature names are appended to
    ``features`` and the derived columns are written into ``data``.

    Parameters
    ----------
    features : list
        Receives the names of the columns to feed to the model.
    data : pandas.DataFrame
        Sales rows already joined with the store table. The function reads
        ``Open``, ``StoreType``, ``Assortment``, ``Date`` (datetime),
        ``CompetitionOpenSinceYear``/``Month``, ``Promo2SinceYear``/``Week``
        and ``PromoInterval``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenience.
    """
    # A missing Open flag is treated as "the store was open".
    data.loc[data['Open'].isnull(), 'Open'] = 1

    # Columns that are used as they are.
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label-encode the categorical store descriptors (StateHoliday is not used).
    features.extend(['StoreType', 'Assortment'])
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment'):
        # Assign the result back instead of calling replace(inplace=True) on a
        # column selection, which is not guaranteed to update ``data`` itself.
        # infer_objects() turns an all-integer object column into a numeric one.
        data[column] = data[column].replace(mappings).infer_objects()

    # Calendar parts of the sale date.
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day
    data['DayOfWeek'] = data['Date'].dt.dayofweek
    data['WeekOfYear'] = _iso_week(data['Date'])

    # CompetitionOpen and PromoOpen follow
    # https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
    # Months since the competitor opened.
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = (
        12 * (data['Year'] - data['CompetitionOpenSinceYear'])
        + (data['Month'] - data['CompetitionOpenSinceMonth'])
    )

    # Months since Promo2 started (weeks are converted with a factor of 4).
    features.append('PromoOpen')
    promo_open = (
        12 * (data['Year'] - data['Promo2SinceYear'])
        + (data['WeekOfYear'] - data['Promo2SinceWeek']) / 4.0
    )
    # Anything that is not strictly positive - including missing values -
    # becomes 0.
    data['PromoOpen'] = promo_open.where(promo_open > 0, 0)
    data.loc[data['Promo2SinceYear'] == 0, 'PromoOpen'] = 0

    # Flag rows whose month is listed in the store's PromoInterval.
    features.append('IsPromoMonth')
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data['Month'].map(month2str)
    data.loc[data['PromoInterval'] == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    # Walk over every distinct interval actually present. Missing and empty
    # entries are skipped explicitly rather than by their position in
    # unique(), which depends on row order.
    for interval in data['PromoInterval'].dropna().unique():
        if not isinstance(interval, str) or interval == '':
            continue
        in_listed_month = data['monthStr'].isin(interval.split(','))
        same_interval = data['PromoInterval'] == interval
        data.loc[in_listed_month & same_interval, 'IsPromoMonth'] = 1

    return data

In [62]:
## Main script: load the tables, filter the training rows, join the store
## metadata, derive the features and define the booster settings.

print("Load the training, test and store data using pandas")
# Explicit dtypes handed to read_csv for the train and test files.
types = dict(
    CompetitionOpenSinceYear=np.dtype(int),
    CompetitionOpenSinceMonth=np.dtype(int),
    StateHoliday=np.dtype(str),
    Promo2SinceWeek=np.dtype(int),
    SchoolHoliday=np.dtype(int),
    PromoInterval=np.dtype(str),
)
# parse_dates picks the date column by position in each file.
train = pd.read_csv("Drugstore_data/train.csv", dtype=types, parse_dates=[2])
test = pd.read_csv("Drugstore_data/test.csv", dtype=types, parse_dates=[3])
store = pd.read_csv("Drugstore_data/store.csv")

# Nothing is filled in at this point; build_features() sets a missing Open
# flag to 1.
print("Assume store open, if not provided")

print("Consider only open stores for training. Closed stores wont count into the score.")
train = train.loc[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
train = train.loc[train["Sales"] > 0]

print("Join with store")
train = train.merge(store, on='Store')
test = test.merge(store, on='Store')

# build_features() appends the feature names to the list it is given and adds
# the columns to the frame in place. The test call passes a throw-away list
# so the names are collected only once.
features = []

print("augment features")
build_features(features, train)
build_features([], test)
print(features)

print('training data processed')

# Booster settings passed to xgb.train().
params = {
    "objective": "reg:linear",
    "booster": "gbtree",
    "eta": 0.01,
    "subsample": 0.5,
    "max_depth": 8,
    "silent": 1,
    "seed": 4428,
}
# Upper bound on the number of boosting rounds.
num_boost_round = 4000

Load the training, test and store data using pandas
Assume store open, if not provided
Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe
Join with store
augment features
['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
training data processed


In [63]:
print("Train a XGBoost model")

# Time-based hold-out: rows dated up to 2014-12-31 are used for fitting,
# later rows for validation.
X_train = train.loc[train['Date'] <= '2014-12-31']
X_valid = train.loc[train['Date'] > '2014-12-31']

# The target is log1p(Sales); predictions are mapped back with expm1 later.
y_train, y_valid = (np.log1p(part['Sales']) for part in (X_train, X_valid))

dtrain = xgb.DMatrix(X_train[features], label=y_train)
dvalid = xgb.DMatrix(X_valid[features], label=y_valid)

Train a XGBoost model


In [64]:
# Data sets whose metrics are reported after every boosting round. The
# recorded log states that 'eval-rmspe' is the metric used for early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
# Train for at most num_boost_round rounds; stop once the early-stopping metric
# has not improved for 100 rounds. rmspe_xg supplies the custom RMSPE metric.
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

# Score the hold-out rows on the original sales scale (expm1 undoes log1p).
# NOTE(review): whether predict() uses the best early-stopping iteration or
# all trained rounds depends on the installed xgboost version - confirm.
print("Validating")
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

# Predictions for the test rows, still on the log1p scale at this point.
print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)

[0]	train-rmse:7.43672	eval-rmse:7.46565	train-rmspe:0.999524	eval-rmspe:0.999541
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:6.69467	eval-rmse:6.72452	train-rmspe:0.998819	eval-rmspe:0.998861
[2]	train-rmse:6.02696	eval-rmse:6.05737	train-rmspe:0.997545	eval-rmspe:0.997634
[3]	train-rmse:5.42611	eval-rmse:5.45744	train-rmspe:0.995404	eval-rmspe:0.99557
[4]	train-rmse:4.88535	eval-rmse:4.91669	train-rmspe:0.992027	eval-rmspe:0.992309
[5]	train-rmse:4.39887	eval-rmse:4.42944	train-rmspe:0.986971	eval-rmspe:0.987412
[6]	train-rmse:3.96088	eval-rmse:3.99184	train-rmspe:0.979826	eval-rmspe:0.980496
[7]	train-rmse:3.56702	eval-rmse:3.59807	train-rmspe:0.970092	eval-rmspe:0.971061
[8]	train-rmse:3.21272	eval-rmse:3.2423	train-rmspe:0.95741	eval-rmspe:0.958696
[9]	train-rmse:2.89407	eval-rmse:2.92322	train-rmspe:0.941499	eval-rmspe:0.943187
[10]	train-rmse:2.60737	eval-rmse:2.6

[96]	train-rmse:0.044831	eval-rmse:0.19716	train-rmspe:0.045498	eval-rmspe:0.215918
[97]	train-rmse:0.044369	eval-rmse:0.196978	train-rmspe:0.045027	eval-rmspe:0.215777
[98]	train-rmse:0.043614	eval-rmse:0.196785	train-rmspe:0.04425	eval-rmspe:0.21555
[99]	train-rmse:0.04285	eval-rmse:0.19675	train-rmspe:0.043454	eval-rmspe:0.215533
[100]	train-rmse:0.042487	eval-rmse:0.196709	train-rmspe:0.04309	eval-rmspe:0.215446
[101]	train-rmse:0.042051	eval-rmse:0.196732	train-rmspe:0.042644	eval-rmspe:0.215487
[102]	train-rmse:0.041542	eval-rmse:0.196599	train-rmspe:0.042109	eval-rmspe:0.215338
[103]	train-rmse:0.041107	eval-rmse:0.196539	train-rmspe:0.041662	eval-rmspe:0.21525
[104]	train-rmse:0.040297	eval-rmse:0.196588	train-rmspe:0.040823	eval-rmspe:0.215316
[105]	train-rmse:0.039587	eval-rmse:0.196659	train-rmspe:0.04009	eval-rmspe:0.215399
[106]	train-rmse:0.038975	eval-rmse:0.19669	train-rmspe:0.039453	eval-rmspe:0.215446
[107]	train-rmse:0.038815	eval-rmse:0.196613	train-rmspe:0.039291	e

[192]	train-rmse:0.018486	eval-rmse:0.196093	train-rmspe:0.018555	eval-rmspe:0.214871
[193]	train-rmse:0.018321	eval-rmse:0.196093	train-rmspe:0.01839	eval-rmspe:0.214877
[194]	train-rmse:0.018277	eval-rmse:0.196089	train-rmspe:0.018345	eval-rmspe:0.214873
[195]	train-rmse:0.018074	eval-rmse:0.196098	train-rmspe:0.01814	eval-rmspe:0.214875
[196]	train-rmse:0.017935	eval-rmse:0.196104	train-rmspe:0.018	eval-rmspe:0.214883
[197]	train-rmse:0.017869	eval-rmse:0.196103	train-rmspe:0.017934	eval-rmspe:0.214879
[198]	train-rmse:0.017819	eval-rmse:0.196101	train-rmspe:0.017883	eval-rmspe:0.214873
[199]	train-rmse:0.017734	eval-rmse:0.1961	train-rmspe:0.017797	eval-rmspe:0.21487
[200]	train-rmse:0.017703	eval-rmse:0.1961	train-rmspe:0.017766	eval-rmspe:0.214869
[201]	train-rmse:0.017578	eval-rmse:0.196088	train-rmspe:0.017641	eval-rmspe:0.214856
[202]	train-rmse:0.017463	eval-rmse:0.19609	train-rmspe:0.017525	eval-rmspe:0.214855
[203]	train-rmse:0.017397	eval-rmse:0.196093	train-rmspe:0.017458

In [65]:
# Map the log1p-scale predictions back to sales and pair them with the test IDs.
result = pd.DataFrame({"Id": test["ID"], 'Sales': np.expm1(test_probs)})
# Rows flagged as closed get a prediction of zero.
result.loc[test['Open'].eq(0), 'Sales'] = 0
# Order by Id and renumber the rows from zero.
sorted_result = result.sort_values('Id').reset_index(drop=True)
# Score against the reference answers.
test_ans = pd.read_csv("Drugstore_data/test_ans.csv")
rmspe(test_ans['Sales'], sorted_result['Sales'])

0.17010021168813647

In [52]:
# Write the Id-sorted predictions to disk without the row index.
sorted_result.to_csv("xgboost_submission.csv", index=False)

Unnamed: 0,Id,Sales
0,1,5171.888672
1,2,4965.195801
2,3,4646.04541
3,4,5846.708984
4,5,4629.85791


In [2]:
# Load the reference answers used for scoring. The scoring cell earlier in
# the notebook reads the same file into the same name.
test_ans = pd.read_csv('Drugstore_data/test_ans.csv')

In [72]:
# Submission file to evaluate. The commented-out line selects an alternative file.
#sub = pd.read_csv('Rossmann_submission_1.csv')
sub = pd.read_csv('Rossmann_submission_3.csv')

In [73]:
# Order the submission by ID and renumber the rows from zero, dropping the
# helper 'index' column that reset_index() adds.
sub2 =sub.sort_values(by='ID').reset_index().drop(columns = 'index')

In [74]:
# Set the prediction to zero wherever the reference answer is zero.
# NOTE(review): this uses the ground-truth answers to edit the predictions,
# so scores computed from sub2 afterwards are not a clean evaluation - confirm
# this is intended (e.g. use the test set's Open flag instead).
sub2.loc[test_ans['Sales']==0,'Sales'] = 0

In [75]:
# Save the adjusted submission without the row index.
sub2.to_csv('team1_sub.csv',index=False)

In [76]:
# Number of rows in the reference answers.
test_ans['Sales'].shape

(46830,)

In [77]:
# Number of reference rows with non-zero Sales.
test_ans['Sales'][test_ans['Sales']!=0].shape

(40282,)

In [78]:
# RMSPE of the adjusted submission against the reference answers (all rows).
rmspe(test_ans['Sales'],sub2['Sales'])

0.1398152422709307

In [48]:
# Same score restricted to non-zero values. Each Series is filtered by its
# own mask. NOTE(review): the two masks are computed independently - confirm
# they select the same rows.
rmspe(test_ans['Sales'][test_ans['Sales']!=0],sub2['Sales'][sub2['Sales']!=0])

0.13914959000590033

In [106]:
# Difference between the numeric forms of the dates at labels 0 and 1. The
# recorded result, -86400000000000, is one day expressed in nanoseconds.
pd.to_numeric(train['Date'])[0]-pd.to_numeric(train['Date'])[1]

-86400000000000

In [105]:
# Display the Date column of the training frame.
train['Date']

0        2013-01-01
1        2013-01-02
2        2013-01-03
3        2013-01-04
4        2013-01-05
5        2013-01-06
6        2013-01-07
7        2013-01-08
8        2013-01-09
9        2013-01-10
10       2013-01-11
11       2013-01-12
12       2013-01-13
13       2013-01-14
14       2013-01-15
15       2013-01-16
16       2013-01-17
17       2013-01-18
18       2013-01-19
19       2013-01-20
20       2013-01-21
21       2013-01-22
22       2013-01-23
23       2013-01-24
24       2013-01-25
25       2013-01-26
26       2013-01-27
27       2013-01-28
28       2013-01-29
29       2013-01-30
            ...    
785697   2015-05-02
785698   2015-05-03
785699   2015-05-04
785700   2015-05-05
785701   2015-05-06
785702   2015-05-07
785703   2015-05-08
785704   2015-05-09
785705   2015-05-10
785706   2015-05-11
785707   2015-05-12
785708   2015-05-13
785709   2015-05-14
785710   2015-05-15
785711   2015-05-16
785712   2015-05-17
785713   2015-05-18
785714   2015-05-19
785715   2015-05-20
