In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
import xgboost as xgb



In [2]:
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w


def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe


def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

In [3]:
# Gather some features
def build_features(features, data):
    # remove NaNs
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
                     'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear'])

    # add some more with a bit of preprocessing
    features.append('SchoolHoliday')
    data['SchoolHoliday'] = data['SchoolHoliday'].astype(float)
    #
    #features.append('StateHoliday')
    #data.loc[data['StateHoliday'] == 'a', 'StateHoliday'] = '1'
    #data.loc[data['StateHoliday'] == 'b', 'StateHoliday'] = '2'
    #data.loc[data['StateHoliday'] == 'c', 'StateHoliday'] = '3'
    #data['StateHoliday'] = data['StateHoliday'].astype(float)

    features.append('DayOfWeek')
    features.append('month')
    features.append('day')
    features.append('year')
    data['year'] = data.Date.apply(lambda x: x.split('-')[0])
    data['year'] = data['year'].astype(float)
    data['month'] = data.Date.apply(lambda x: x.split('-')[1])
    data['month'] = data['month'].astype(float)
    data['day'] = data.Date.apply(lambda x: x.split('-')[2])
    data['day'] = data['day'].astype(float)

    features.append('StoreType')
    data.loc[data['StoreType'] == 'a', 'StoreType'] = '1'
    data.loc[data['StoreType'] == 'b', 'StoreType'] = '2'
    data.loc[data['StoreType'] == 'c', 'StoreType'] = '3'
    data.loc[data['StoreType'] == 'd', 'StoreType'] = '4'
    data['StoreType'] = data['StoreType'].astype(float)

    features.append('Assortment')
    data.loc[data['Assortment'] == 'a', 'Assortment'] = '1'
    data.loc[data['Assortment'] == 'b', 'Assortment'] = '2'
    data.loc[data['Assortment'] == 'c', 'Assortment'] = '3'
    data['Assortment'] = data['Assortment'].astype(float)

In [4]:
print("Load the training, test and store data using pandas")
train = pd.read_csv("C:/Users/Rahul Pundir/Desktop/capstone dataset/train.csv")
test = pd.read_csv("C:/Users/Rahul Pundir/Desktop/capstone dataset/test.csv")
store = pd.read_csv("C:/Users/Rahul Pundir/Desktop/capstone dataset/store.csv")

Load the training, test and store data using pandas


  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [6]:
test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,17-09-2015,1.0,1,0,0
1,2,3,4,17-09-2015,1.0,1,0,0
2,3,7,4,17-09-2015,1.0,1,0,0
3,4,8,4,17-09-2015,1.0,1,0,0
4,5,9,4,17-09-2015,1.0,1,0,0


In [7]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,31-07-2015,5263,555,1,1,0,1
1,2,5,31-07-2015,6064,625,1,1,0,1
2,3,5,31-07-2015,8314,821,1,1,0,1
3,4,5,31-07-2015,13995,1498,1,1,0,1
4,5,5,31-07-2015,4822,559,1,1,0,1


In [8]:
print("Assume store open, if not provided")
test.fillna(1, inplace=True)

Assume store open, if not provided


In [9]:
print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]

Consider only open stores for training. Closed stores wont count into the score.


In [10]:
print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

Join with store


In [11]:
features = []

In [12]:
print("augment features")
build_features(features, train)

augment features


In [13]:
build_features([], test)
print(features)

['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'DayOfWeek', 'month', 'day', 'year', 'StoreType', 'Assortment']


In [14]:
params = {"objective": "reg:linear",
          "eta": 0.20,
          "max_depth": 8,
          "subsample": 0.8,
          "colsample_bytree": 0.7,
          "silent": 1
          }
num_trees = 700

In [15]:
print("Train a XGBoost model")
val_size = 100000

print(train.tail(1)['Date'])

X_train, X_test = cross_validation.train_test_split(train, test_size=0.01)

dtrain = xgb.DMatrix(X_train[features], np.log(X_train["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[features], np.log(X_test["Sales"] + 1))
dtest  = xgb.DMatrix(test[features])

watchlist = [(dvalid, 'eval'), (dtrain, 'train')]

gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=200, feval=rmspe_xg, verbose_eval=True)


Train a XGBoost model
844391    02-01-2013
Name: Date, dtype: object


  if getattr(data, 'base', None) is not None and \


[0]	eval-rmse:6.61518	train-rmse:6.61833	eval-rmspe:0.998701	train-rmspe:0.998674
Multiple eval metrics have been passed: 'train-rmspe' will be used for early stopping.

Will train until train-rmspe hasn't improved in 200 rounds.
[1]	eval-rmse:5.29626	train-rmse:5.29926	eval-rmspe:0.994699	train-rmspe:0.994684
[2]	eval-rmse:4.24079	train-rmse:4.24447	eval-rmspe:0.984528	train-rmspe:0.984556
[3]	eval-rmse:3.3977	train-rmse:3.40168	eval-rmspe:0.963958	train-rmspe:0.964074
[4]	eval-rmse:2.72493	train-rmse:2.72867	eval-rmspe:0.929398	train-rmspe:0.929646
[5]	eval-rmse:2.18739	train-rmse:2.19172	eval-rmspe:0.879389	train-rmspe:0.879932
[6]	eval-rmse:1.75924	train-rmse:1.76412	eval-rmspe:0.815323	train-rmspe:0.816368
[7]	eval-rmse:1.41836	train-rmse:1.42379	eval-rmspe:0.741201	train-rmspe:0.743047
[8]	eval-rmse:1.14816	train-rmse:1.15409	eval-rmspe:0.662235	train-rmspe:0.665122
[9]	eval-rmse:0.934722	train-rmse:0.941006	eval-rmspe:0.584023	train-rmspe:0.588323
[10]	eval-rmse:0.767723	train-r

[96]	eval-rmse:0.146598	train-rmse:0.159209	eval-rmspe:0.161798	train-rmspe:0.202618
[97]	eval-rmse:0.145949	train-rmse:0.158681	eval-rmspe:0.161001	train-rmspe:0.20212
[98]	eval-rmse:0.145194	train-rmse:0.157933	eval-rmspe:0.16011	train-rmspe:0.201736
[99]	eval-rmse:0.144536	train-rmse:0.157314	eval-rmspe:0.159504	train-rmspe:0.201247
[100]	eval-rmse:0.143872	train-rmse:0.156627	eval-rmspe:0.158871	train-rmspe:0.200702
[101]	eval-rmse:0.1433	train-rmse:0.156062	eval-rmspe:0.158357	train-rmspe:0.200233
[102]	eval-rmse:0.142367	train-rmse:0.155141	eval-rmspe:0.157333	train-rmspe:0.199394
[103]	eval-rmse:0.141648	train-rmse:0.154506	eval-rmspe:0.156758	train-rmspe:0.198622
[104]	eval-rmse:0.141004	train-rmse:0.153966	eval-rmspe:0.156121	train-rmspe:0.198172
[105]	eval-rmse:0.140469	train-rmse:0.15337	eval-rmspe:0.155454	train-rmspe:0.197584
[106]	eval-rmse:0.140083	train-rmse:0.152998	eval-rmspe:0.155035	train-rmspe:0.197275
[107]	eval-rmse:0.139719	train-rmse:0.152606	eval-rmspe:0.15452

[192]	eval-rmse:0.117854	train-rmse:0.13061	eval-rmspe:0.133395	train-rmspe:0.176121
[193]	eval-rmse:0.117681	train-rmse:0.130428	eval-rmspe:0.133217	train-rmspe:0.176002
[194]	eval-rmse:0.117554	train-rmse:0.130293	eval-rmspe:0.133093	train-rmspe:0.17588
[195]	eval-rmse:0.117392	train-rmse:0.130103	eval-rmspe:0.132987	train-rmspe:0.175536
[196]	eval-rmse:0.117093	train-rmse:0.129827	eval-rmspe:0.132694	train-rmspe:0.175327
[197]	eval-rmse:0.117053	train-rmse:0.129785	eval-rmspe:0.132657	train-rmspe:0.175253
[198]	eval-rmse:0.116984	train-rmse:0.12968	eval-rmspe:0.132595	train-rmspe:0.175516
[199]	eval-rmse:0.116944	train-rmse:0.129625	eval-rmspe:0.132563	train-rmspe:0.173737
[200]	eval-rmse:0.116849	train-rmse:0.129461	eval-rmspe:0.132489	train-rmspe:0.173593
[201]	eval-rmse:0.116701	train-rmse:0.129309	eval-rmspe:0.13232	train-rmspe:0.173403
[202]	eval-rmse:0.116611	train-rmse:0.129204	eval-rmspe:0.132242	train-rmspe:0.173705
[203]	eval-rmse:0.116538	train-rmse:0.129062	eval-rmspe:0.

[288]	eval-rmse:0.108222	train-rmse:0.119493	eval-rmspe:0.117647	train-rmspe:0.160018
[289]	eval-rmse:0.108202	train-rmse:0.119438	eval-rmspe:0.117757	train-rmspe:0.160007
[290]	eval-rmse:0.108133	train-rmse:0.119322	eval-rmspe:0.117693	train-rmspe:0.159893
[291]	eval-rmse:0.108019	train-rmse:0.119223	eval-rmspe:0.117545	train-rmspe:0.159827
[292]	eval-rmse:0.107941	train-rmse:0.119113	eval-rmspe:0.117462	train-rmspe:0.15974
[293]	eval-rmse:0.10784	train-rmse:0.119038	eval-rmspe:0.117332	train-rmspe:0.159692
[294]	eval-rmse:0.107766	train-rmse:0.118984	eval-rmspe:0.117285	train-rmspe:0.159563
[295]	eval-rmse:0.107667	train-rmse:0.118892	eval-rmspe:0.117181	train-rmspe:0.159468
[296]	eval-rmse:0.1076	train-rmse:0.118809	eval-rmspe:0.11712	train-rmspe:0.160723
[297]	eval-rmse:0.107554	train-rmse:0.11877	eval-rmspe:0.117075	train-rmspe:0.16069
[298]	eval-rmse:0.10749	train-rmse:0.118629	eval-rmspe:0.117019	train-rmspe:0.160772
[299]	eval-rmse:0.107421	train-rmse:0.118597	eval-rmspe:0.1169

[384]	eval-rmse:0.102617	train-rmse:0.112884	eval-rmspe:0.109316	train-rmspe:0.145068
[385]	eval-rmse:0.102529	train-rmse:0.112756	eval-rmspe:0.109209	train-rmspe:0.144767
[386]	eval-rmse:0.102483	train-rmse:0.112688	eval-rmspe:0.109155	train-rmspe:0.144696
[387]	eval-rmse:0.10245	train-rmse:0.112634	eval-rmspe:0.109106	train-rmspe:0.144623
[388]	eval-rmse:0.102378	train-rmse:0.112586	eval-rmspe:0.10901	train-rmspe:0.144564
[389]	eval-rmse:0.102312	train-rmse:0.11251	eval-rmspe:0.108932	train-rmspe:0.144492
[390]	eval-rmse:0.102304	train-rmse:0.112392	eval-rmspe:0.108927	train-rmspe:0.144474
[391]	eval-rmse:0.102257	train-rmse:0.112347	eval-rmspe:0.108879	train-rmspe:0.144418
[392]	eval-rmse:0.10222	train-rmse:0.112302	eval-rmspe:0.108841	train-rmspe:0.144364
[393]	eval-rmse:0.102178	train-rmse:0.11224	eval-rmspe:0.10881	train-rmspe:0.144309
[394]	eval-rmse:0.102144	train-rmse:0.112158	eval-rmspe:0.10878	train-rmspe:0.144274
[395]	eval-rmse:0.102116	train-rmse:0.112102	eval-rmspe:0.108

[480]	eval-rmse:0.0995	train-rmse:0.107728	eval-rmspe:0.106101	train-rmspe:0.126518
[481]	eval-rmse:0.099464	train-rmse:0.107519	eval-rmspe:0.106064	train-rmspe:0.126519
[482]	eval-rmse:0.099452	train-rmse:0.107497	eval-rmspe:0.106117	train-rmspe:0.126491
[483]	eval-rmse:0.099422	train-rmse:0.107471	eval-rmspe:0.106024	train-rmspe:0.126477
[484]	eval-rmse:0.099416	train-rmse:0.107446	eval-rmspe:0.106018	train-rmspe:0.126402
[485]	eval-rmse:0.099415	train-rmse:0.107415	eval-rmspe:0.106015	train-rmspe:0.126381
[486]	eval-rmse:0.099385	train-rmse:0.10738	eval-rmspe:0.105975	train-rmspe:0.126353
[487]	eval-rmse:0.099361	train-rmse:0.107342	eval-rmspe:0.105949	train-rmspe:0.12632
[488]	eval-rmse:0.099342	train-rmse:0.107292	eval-rmspe:0.105922	train-rmspe:0.126355
[489]	eval-rmse:0.099346	train-rmse:0.10727	eval-rmspe:0.105928	train-rmspe:0.126336
[490]	eval-rmse:0.099342	train-rmse:0.107233	eval-rmspe:0.105909	train-rmspe:0.12629
[491]	eval-rmse:0.099294	train-rmse:0.107199	eval-rmspe:0.10

[576]	eval-rmse:0.09734	train-rmse:0.103908	eval-rmspe:0.102233	train-rmspe:0.121856
[577]	eval-rmse:0.097333	train-rmse:0.103843	eval-rmspe:0.102231	train-rmspe:0.121843
[578]	eval-rmse:0.097325	train-rmse:0.103819	eval-rmspe:0.102229	train-rmspe:0.121832
[579]	eval-rmse:0.097324	train-rmse:0.103794	eval-rmspe:0.102218	train-rmspe:0.121813
[580]	eval-rmse:0.097315	train-rmse:0.103777	eval-rmspe:0.102203	train-rmspe:0.121768
[581]	eval-rmse:0.097299	train-rmse:0.10375	eval-rmspe:0.102186	train-rmspe:0.121735
[582]	eval-rmse:0.097265	train-rmse:0.103718	eval-rmspe:0.102166	train-rmspe:0.121682
[583]	eval-rmse:0.097251	train-rmse:0.103684	eval-rmspe:0.102153	train-rmspe:0.121669
[584]	eval-rmse:0.097234	train-rmse:0.103647	eval-rmspe:0.102142	train-rmspe:0.121646
[585]	eval-rmse:0.097185	train-rmse:0.103605	eval-rmspe:0.102103	train-rmspe:0.121604
[586]	eval-rmse:0.097165	train-rmse:0.103508	eval-rmspe:0.102077	train-rmspe:0.121555
[587]	eval-rmse:0.09715	train-rmse:0.103401	eval-rmspe:0

[672]	eval-rmse:0.09618	train-rmse:0.100977	eval-rmspe:0.101045	train-rmspe:0.117045
[673]	eval-rmse:0.096175	train-rmse:0.100879	eval-rmspe:0.101044	train-rmspe:0.11694
[674]	eval-rmse:0.096161	train-rmse:0.100858	eval-rmspe:0.101035	train-rmspe:0.116937
[675]	eval-rmse:0.096147	train-rmse:0.100845	eval-rmspe:0.101036	train-rmspe:0.116928
[676]	eval-rmse:0.096133	train-rmse:0.100822	eval-rmspe:0.101041	train-rmspe:0.116948
[677]	eval-rmse:0.096123	train-rmse:0.100787	eval-rmspe:0.101037	train-rmspe:0.116919
[678]	eval-rmse:0.096113	train-rmse:0.10077	eval-rmspe:0.101028	train-rmspe:0.116909
[679]	eval-rmse:0.096132	train-rmse:0.100713	eval-rmspe:0.101036	train-rmspe:0.116871
[680]	eval-rmse:0.096122	train-rmse:0.100698	eval-rmspe:0.101037	train-rmspe:0.116856
[681]	eval-rmse:0.095863	train-rmse:0.100631	eval-rmspe:0.100242	train-rmspe:0.116581
[682]	eval-rmse:0.095867	train-rmse:0.100618	eval-rmspe:0.10025	train-rmspe:0.116541
[683]	eval-rmse:0.095874	train-rmse:0.100585	eval-rmspe:0.

In [24]:
gbm.best_score

0.109785