In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
import xgboost as xgb
%matplotlib inline

In [2]:
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w


def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe


def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

def toBinary(featureCol, df):
    values = set(df[featureCol].unique())
    newCol = [featureCol + '_' + val for val in values]
    for val in values:
        df[featureCol + '_' + val] = df[featureCol].map(lambda x: 1 if x == val else 0)
    return newCol

# Gather some features
def build_features(features, data):
    # remove NaNs
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
                     'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear'])

    # add some more with a bit of preprocessing
    features.append('SchoolHoliday')
    data['SchoolHoliday'] = data['SchoolHoliday'].astype(float)
    #
    #features.append('StateHoliday')
    #data.loc[data['StateHoliday'] == 'a', 'StateHoliday'] = '1'
    #data.loc[data['StateHoliday'] == 'b', 'StateHoliday'] = '2'
    #data.loc[data['StateHoliday'] == 'c', 'StateHoliday'] = '3'
    #data['StateHoliday'] = data['StateHoliday'].astype(float)

    features.append('DayOfWeek')
    features.append('month')
    features.append('day')
    features.append('year')
    data['year'] = data.Date.apply(lambda x: x.split('-')[0])
    data['year'] = data['year'].astype(float)
    data['month'] = data.Date.apply(lambda x: x.split('-')[1])
    data['month'] = data['month'].astype(float)
    data['day'] = data.Date.apply(lambda x: x.split('-')[2])
    data['day'] = data['day'].astype(float)

    # features.append('StoreType')
    for x in ['a', 'b', 'c', 'd']:
        features.append('StoreType_' + x)
        data['StoreType_' + x] = data['StoreType'].map(lambda y: 1 if y == x else 0)
        
    # data.loc[data['StoreType'] == 'a', 'StoreType'] = '1'
    # data.loc[data['StoreType'] == 'b', 'StoreType'] = '2'
    # data.loc[data['StoreType'] == 'c', 'StoreType'] = '3'
    # data.loc[data['StoreType'] == 'd', 'StoreType'] = '4'
    # data['StoreType'] = data['StoreType'].astype(float)

    newCol = toBinary('Assortment', data)
    features += newCol
    # features.append('Assortment')
    # data.loc[data['Assortment'] == 'a', 'Assortment'] = '1'
    # data.loc[data['Assortment'] == 'b', 'Assortment'] = '2'
    # data.loc[data['Assortment'] == 'c', 'Assortment'] = '3'
    # data['Assortment'] = data['Assortment'].astype(float)


In [3]:
print("Load the training, test and store data using pandas")
train = pd.read_csv("train.csv",dtype={'StateHoliday': str})
test = pd.read_csv("test.csv")
store = pd.read_csv("store.csv")

Load the training, test and store data using pandas


In [4]:
print("Assume store open, if not provided")
test.fillna(1, inplace=True)

Assume store open, if not provided


In [5]:
print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]

Consider only open stores for training. Closed stores wont count into the score.


In [6]:
print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

Join with store


In [7]:
features = []

print("augment features")
build_features(features, train)
build_features([], test)
print(features)

augment features
['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'DayOfWeek', 'month', 'day', 'year', 'StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_b', 'Assortment_a', 'Assortment_c']


In [8]:
params = {"objective": "reg:linear",
          "eta": 0.3,
          "max_depth": 8,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "silent": 1
          }
num_trees = 300

In [9]:
print("Train a XGBoost model")
val_size = 100000
#train = train.sort(['Date'])
print(train.tail(1)['Date'])
X_train, X_test = cross_validation.train_test_split(train, test_size=0.01)
#X_train, X_test = train.head(len(train) - val_size), train.tail(val_size)
dtrain = xgb.DMatrix(X_train[features], np.log(X_train["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[features], np.log(X_test["Sales"] + 1))
dtest = xgb.DMatrix(test[features])
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True)

Will train until train error hasn't decreased in 50 rounds.
[0]	eval-rmspe:0.996821	train-rmspe:0.996811
[1]	eval-rmspe:0.981414	train-rmspe:0.981505
[2]	eval-rmspe:0.937583	train-rmspe:0.937983
[3]	eval-rmspe:0.855412	train-rmspe:0.856349
[4]	eval-rmspe:0.742078	train-rmspe:0.743698
[5]	eval-rmspe:0.616666	train-rmspe:0.619244
[6]	eval-rmspe:0.501116	train-rmspe:0.504874
[7]	eval-rmspe:0.411137	train-rmspe:0.415988
[8]	eval-rmspe:0.349545	train-rmspe:0.356177
[9]	eval-rmspe:0.318484	train-rmspe:0.325083
[10]	eval-rmspe:0.307534	train-rmspe:0.314209
[11]	eval-rmspe:0.291402	train-rmspe:0.301123
[12]	eval-rmspe:0.287980	train-rmspe:0.298636
[13]	eval-rmspe:0.285262	train-rmspe:0.294740
[14]	eval-rmspe:0.287124	train-rmspe:0.296820
[15]	eval-rmspe:0.287741	train-rmspe:0.297490
[16]	eval-rmspe:0.287304	train-rmspe:0.296737
[17]	eval-rmspe:0.286897	train-rmspe:0.296737
[18]	eval-rmspe:0.286049	train-rmspe:0.294298
[19]	eval-rmspe:0.285580	train-rmspe:0.294367
[20]	eval-rmspe:0.284825	train

Train a XGBoost model
844391    2013-01-02
Name: Date, dtype: object


[299]	eval-rmspe:0.141263	train-rmspe:0.124010


In [10]:
print("Validating")
train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
indices = train_probs < 0
train_probs[indices] = 0
error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
print('error', error)

Validating
error 0.141262991418


In [11]:
print("Make predictions on the test set")
test_probs = gbm.predict(xgb.DMatrix(test[features]))
indices = test_probs < 0
test_probs[indices] = 0
submission = pd.DataFrame({"Id": test["Id"], "Sales": np.exp(test_probs) - 1})
submission.to_csv("xgboost_kscript_submission.csv", index=False)

Make predictions on the test set
