In [29]:
%matplotlib inline

import matplotlib.pyplot as plt
import graphviz
import csv
from datetime import datetime, timedelta
from dateutil.relativedelta import *
from sklearn import preprocessing
import numpy as np
from scipy import sparse
from sklearn.cross_validation import KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn.svm import NuSVR
import pandas as pd
import xgboost as xgb
import time

In [30]:
dtypes = {"Date": datetime, "StateHoliday": np.dtype(str), "SchoolHoliday": np.dtype(int)}

# The read_csv returns an error when reading the stores data because of missing values but
# works when I don't specify the dtypes
store_dtypes = {"CompetitionSinceYear": np.dtype(int), "CompetitionSinceMonth": np.dtype(int), 
                "Promo2SinceYear": np.dtype(int), "Promo2SinceWeek": np.dtype(int)}
data = pd.read_csv('data/train.csv', dtype=dtypes, parse_dates=[2])
stores = pd.read_csv('data/store.csv')

test = pd.read_csv('data/test.csv', dtype=dtypes, parse_dates=[3])

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
Store            1017209 non-null int64
DayOfWeek        1017209 non-null int64
Date             1017209 non-null datetime64[ns]
Sales            1017209 non-null int64
Customers        1017209 non-null int64
Open             1017209 non-null int64
Promo            1017209 non-null int64
StateHoliday     1017209 non-null object
SchoolHoliday    1017209 non-null int64
dtypes: datetime64[ns](1), int64(7), object(1)
memory usage: 77.6+ MB


In [32]:
# remove rows with sales of 0
data = data[data.Sales > 0]

# extract sales to separate Series
sales = data[['Store','Sales']]

# remove Sales and Customers columns
data.drop(['Sales','Customers'], axis=1, inplace=True)

# check Open column and set to open if NaN
data.Open.fillna(1, inplace=True)

In [33]:
# Check all the values for StateHoliday
pd.unique(data.StateHoliday.ravel())

array(['0', 'a', 'b', 'c'], dtype=object)

In [34]:
# Check all the values for SchoolHoliday
pd.unique(data.SchoolHoliday.ravel())

array([1, 0])

In [35]:
# recode StateHoliday as dummy variable
def stateHoliday_asDummy(df):
    sh = pd.get_dummies(df['StateHoliday'])

    sh.rename(columns={'0':'noHoliday','a':'PublicHoliday'}, inplace=True)
    sh[['noHoliday','PublicHoliday']] = sh[['noHoliday','PublicHoliday']].astype(int)
    try:
        sh.rename(columns={'b':'EasterHoliday','c':'XmasHoliday'}, inplace=True)
        sh[['EasterHoliday','XmasHoliday']] = sh[['EasterHoliday','XmasHoliday']].astype(int)
    except:
        pass

    df = pd.concat([df,sh], axis=1)
    return df

In [36]:
# recode SchoolHoliday as binary instead of categorical
def schoolHoliday_asDummy(df):
    df.SchoolHoliday = df.SchoolHoliday.apply(int)
    return df

In [37]:
def recodeCategorical_asInt(df, fn, newfn):
    df[newfn] = pd.Categorical.from_array(df[fn]).codes
    return df

In [38]:
# Dummy variable approach
data = stateHoliday_asDummy(data)
data = schoolHoliday_asDummy(data)

# Categorical variable approach
data = recodeCategorical_asInt(data, 'SchoolHoliday', 'SchoolHolidayVal')
data = recodeCategorical_asInt(data, 'StateHoliday', 'StateHolidayVal')

In [39]:
# find NaN values in test data
#test.isnull().sum().sum()  # indicates 11 missing values
# test.Open.isnull().sum()  # they are all in the Open column
test.Open.fillna(1, inplace=True)
test.Open = test.Open.astype(int)

In [40]:
test = stateHoliday_asDummy(test)
test = schoolHoliday_asDummy(test)

test = recodeCategorical_asInt(test, 'StateHoliday', 'StateHolidayVal')
test = recodeCategorical_asInt(test, 'SchoolHoliday', 'SchoolHolidayVal')

In [41]:
# recode StoreType and Assortment as dummy variables
def storeType_asDummy(df):
    st = pd.get_dummies(df['StoreType'])    
    st.rename(columns={'a':'StoreTypeA','b':'StoreTypeB','c':'StoreTypeC','d':'StoreTypeD'}, inplace=True)
    df = pd.concat([df,st], axis=1)    
    df[['StoreTypeA','StoreTypeB','StoreTypeC','StoreTypeD']] = df[['StoreTypeA','StoreTypeB','StoreTypeC','StoreTypeD']].astype(int)
    return df
    
def assortment_asDummy(df):
    ass = pd.get_dummies(df['Assortment'])
    ass.rename(columns={'a':'BasicAssortment','b':'ExtraAssortment','c':'ExtendedAssortment'}, inplace=True)
    df = pd.concat([df,ass], axis=1)
    df[['BasicAssortment','ExtraAssortment','ExtendedAssortment']] = df[['BasicAssortment','ExtraAssortment','ExtendedAssortment']].astype(int)
    return df

In [42]:
# Convert PromoInterval to dummy variable
def promoInterval_asDummy(df):
    pi = pd.get_dummies(df['PromoInterval'])
    pi.rename(columns={'Feb,May,Aug,Nov':'PromoIntFebMayAugNov','Jan,Apr,Jul,Oct':'PromoIntJanAprJulOct',
                       'Mar,Jun,Sept,Dec':'PromoIntMarJunSeptDec','None':'PromoIntNone'}, inplace=True)
    df = pd.concat([df,pi], axis=1)
    df[['PromoIntFebMayAugNov','PromoIntJanAprJulOct','PromoIntMarJunSeptDec','PromoIntNone']] = \
        df[['PromoIntFebMayAugNov','PromoIntJanAprJulOct','PromoIntMarJunSeptDec','PromoIntNone']].astype(int)
    return df

In [43]:
# Clean up NaN values

# Set competition distance to 0 if NaN
stores.CompetitionDistance.fillna(0, inplace=True)

# Set CompetitionOpenSince values to 0 if NaN
stores.CompetitionOpenSinceMonth.fillna(0, inplace=True)
stores.CompetitionOpenSinceYear.fillna(0, inplace=True)

# Set Promo2Since values to 0 if NaN
stores.Promo2SinceWeek.fillna(0, inplace=True)
stores.Promo2SinceYear.fillna(0, inplace=True)

# Set PromoInterval value to the string 'None' if NaN
stores.PromoInterval.fillna('None', inplace=True)

In [44]:
#Dummy variables
stores = storeType_asDummy(stores)
stores = assortment_asDummy(stores)
stores = promoInterval_asDummy(stores)

In [45]:
# Categorical variables
stores = recodeCategorical_asInt(stores, 'StoreType', 'StoreTypeVal')
stores = recodeCategorical_asInt(stores, 'Assortment', 'AssortmentVal')
stores = recodeCategorical_asInt(stores, 'PromoInterval', 'PromoIntervalVal')

In [46]:
# perform a left outer join of the data and stores dataframes
all_data = pd.merge(data, stores, on='Store', how='left')

# same for the test data
all_test = pd.merge(test, stores, on='Store', how='left')

In [47]:
all_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41088 entries, 0 to 41087
Data columns (total 35 columns):
Id                           41088 non-null int64
Store                        41088 non-null int64
DayOfWeek                    41088 non-null int64
Date                         41088 non-null datetime64[ns]
Open                         41088 non-null int64
Promo                        41088 non-null int64
StateHoliday                 41088 non-null object
SchoolHoliday                41088 non-null int64
noHoliday                    41088 non-null int64
PublicHoliday                41088 non-null int64
StateHolidayVal              41088 non-null int8
SchoolHolidayVal             41088 non-null int8
StoreType                    41088 non-null object
Assortment                   41088 non-null object
CompetitionDistance          41088 non-null float64
CompetitionOpenSinceMonth    41088 non-null float64
CompetitionOpenSinceYear     41088 non-null float64
Promo2                   

In [48]:
# Now that we have the observation date combined with store metadata, we can recompute 
# the promosince... and competitionsince... year/week values as weeks before observation date

# This takes a really long time to execute...

def monthsPromo(x):
    months = 0
    if x.Promo2SinceYear > 0:
        d1s = str(int(x.Promo2SinceYear)) + '-' + str(int(x.Promo2SinceWeek))
        d1 = datetime.strptime(d1s + '-1', "%Y-%W-%w")
        months = (x.Date.year - d1.year) * 12 + x.Date.month - d1.month
        if months < 0:
            months = 0
    return months

def weeksPromo(x):
    weeks = 0
    if x.Promo2SinceYear > 0:
        d1s = str(int(x.Promo2SinceYear)) + '-' + str(int(x.Promo2SinceWeek))
        d1 = datetime.strptime(d1s + '-1', "%Y-%W-%w")
        m1 = (d1 - timedelta(days=d1.weekday()))
        m2 = (x.Date - timedelta(days=x.Date.weekday()))
        weeks = (m2 - m1).days / 7
        if weeks < 0:
            weeks = 0
    return weeks

def monthsComp(x):
    months = 0
    if x.CompetitionOpenSinceYear > 0:
        d1s = str(int(x.CompetitionOpenSinceYear)) + '-' + str(int(x.CompetitionOpenSinceMonth))
        d1 = datetime.strptime(d1s, "%Y-%m")
        r = relativedelta(x.Date,d1)
        months = r.years*12 + r.months
        if months < 0:
            months = 0
    return months

def calculateCompPromoFeatures(df):
    # Calculate the PromoWeeks and CompetitionMonths as single Series
    df['Promo2SinceWeeks'] = df.apply(weeksPromo, axis=1)
    df['Promo2SinceMonths'] = df.apply(monthsPromo, axis=1)
    df['CompetitionOpenSinceMonths'] = df.apply(monthsComp, axis=1)
    return df

In [49]:
all_data = calculateCompPromoFeatures(all_data)
all_test = calculateCompPromoFeatures(all_test)

In [50]:
all_data['day'] = all_data.Date.dt.day
all_data['month'] = all_data.Date.dt.month
all_data['year'] = all_data.Date.dt.year

all_test['day'] = all_test.Date.dt.day
all_test['month'] = all_test.Date.dt.month
all_test['year'] = all_test.Date.dt.year

In [51]:
# Calculate Day of Year from Date
#all_data['DayOfYear'] = all_data.Date.dt.dayofyear
#all_test['DayOfYear'] = all_test.Date.dt.dayofyear

In [52]:
# Construct a feature list for the features we want to include in the models
feature_list_1 = ['Store','DayOfWeek','DayOfYear', 'CompetitionDistance', 'CompetitionOpenSinceMonths',
                'Promo', 'Promo2', 'Promo2SinceWeeks', 'noHoliday','PublicHoliday','EasterHoliday','XmasHoliday'
                'StoreTypeA', 'StoreTypeB', 'StoreTypeC', 'StoreTypeD', 'BasicAssortment', 'ExtraAssortment',
                'ExtendedAssortment', 'PromoIntNone', 'PromoIntFebMayAugNov', 'PromoIntJanAprJulOct',
                'PromoIntMarJunSeptDec']

feature_list_2 = ['Store','DayOfWeek','DayOfYear', 'CompetitionDistance', 'CompetitionOpenSinceMonths',
                'Promo', 'Promo2', 'Promo2SinceMonths', 'SchoolHolidayVal','StateHolidayVal',
                'StoreTypeVal', 'AssortmentVal', 'PromoIntervalVal']


feature_list_3 = ['Store','DayOfWeek','day', 'month', 'year', 'CompetitionDistance', 'CompetitionOpenSinceMonths',
                'Promo', 'Promo2', 'Promo2SinceMonths', 'SchoolHolidayVal','StateHolidayVal',
                'StoreTypeVal', 'AssortmentVal', 'PromoIntervalVal']

In [57]:
pdata = all_data[feature_list_3].as_matrix()
ptest = all_data[feature_list_3].as_matrix()

In [59]:
pdata.shape

(844338, 15)

In [62]:
# turn everything into categorical variables

le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
le3 = preprocessing.LabelEncoder()
le4 = preprocessing.LabelEncoder()
le5 = preprocessing.LabelEncoder()
le6 = preprocessing.LabelEncoder()
le7 = preprocessing.LabelEncoder()
le8 = preprocessing.LabelEncoder()
le9 = preprocessing.LabelEncoder()
le10 = preprocessing.LabelEncoder()
le11 = preprocessing.LabelEncoder()
le12 = preprocessing.LabelEncoder()
le13 = preprocessing.LabelEncoder()
le14 = preprocessing.LabelEncoder()
le15 = preprocessing.LabelEncoder()


le1.fit([i[0] for i in pdata])
le2.fit([i[1] for i in pdata])
le3.fit([i[2] for i in pdata])
le4.fit([i[3] for i in pdata])
le5.fit([i[4] for i in pdata])
le6.fit([i[5] for i in pdata])
le7.fit([i[6] for i in pdata])
le8.fit([i[7] for i in pdata])
le9.fit([i[8] for i in pdata])
le10.fit([i[9] for i in pdata])
le11.fit([i[10] for i in pdata])
le12.fit([i[11] for i in pdata])
le13.fit([i[12] for i in pdata])
le14.fit([i[13] for i in pdata])
le15.fit([i[14] for i in pdata])

X_cat1 = le1.transform([i[0] for i in pdata])
X_cat2 = le2.transform([i[1] for i in pdata])
X_cat3 = le3.transform([i[2] for i in pdata])
X_cat4 = le4.transform([i[3] for i in pdata])
X_cat5 = le5.transform([i[4] for i in pdata])
X_cat6 = le6.transform([i[5] for i in pdata])
X_cat7 = le7.transform([i[6] for i in pdata])
X_cat8 = le8.transform([i[7] for i in pdata])
X_cat9 = le9.transform([i[8] for i in pdata])
X_cat10 = le10.transform([i[9] for i in pdata])
X_cat11 = le11.transform([i[10] for i in pdata])
X_cat12 = le12.transform([i[11] for i in pdata])
X_cat13 = le13.transform([i[12] for i in pdata])
X_cat14 = le14.transform([i[13] for i in pdata])
X_cat15 = le15.transform([i[14] for i in pdata])

X_cat = [[X_cat1[i],X_cat2[i],X_cat3[i],X_cat4[i],X_cat5[i],
          X_cat6[i],X_cat7[i],X_cat8[i],X_cat9[i],X_cat10[i],
          X_cat11[i],X_cat12[i],X_cat13[i],X_cat14[i],X_cat15[i]] for i in range(len(X_cat1))]


#transform test data
test_cat1 = le1.transform([i[0] for i in ptest])
test_cat2 = le2.transform([i[1] for i in ptest])
test_cat3 = le3.transform([i[2] for i in ptest])
test_cat4 = le4.transform([i[3] for i in ptest])
test_cat5 = le5.transform([i[4] for i in ptest])
test_cat6 = le6.transform([i[5] for i in ptest])
test_cat7 = le7.transform([i[6] for i in ptest])
test_cat8 = le8.transform([i[7] for i in ptest])
test_cat9 = le9.transform([i[8] for i in ptest])
test_cat10 = le10.transform([i[9] for i in ptest])
test_cat11 = le11.transform([i[10] for i in ptest])
test_cat12 = le12.transform([i[11] for i in ptest])
test_cat13 = le13.transform([i[12] for i in ptest])
test_cat14 = le14.transform([i[13] for i in ptest])
test_cat15 = le15.transform([i[14] for i in ptest])

test_cat = [[test_cat1[i],test_cat2[i],test_cat3[i],test_cat4[i],test_cat5[i],
             test_cat6[i],test_cat7[i],test_cat8[i],test_cat9[i],test_cat10[i],
             test_cat11[i],test_cat12[i],test_cat13[i],test_cat14[i],test_cat15[i]] for i in range(len(X_cat1))]


In [65]:
#create dummy vars
enc = preprocessing.OneHotEncoder(sparse=True)
enc.fit(X_cat)
X = enc.transform(X_cat)
Xt = enc.transform(test_cat)

Y = sales.as_matrix()

In [None]:
#Do some cross val testing
kf = KFold(np.shape(X)[0], n_folds=2)
i=0
rmspe=[]
t1=time.time()
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    #Scale Data
    #Scale X
    scaler = preprocessing.StandardScaler(with_mean=False)
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    #train model
    #mod = NuSVR(nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, tol=0.001, cache_size=200, verbose=False, max_iter=-1)
    mod = RandomForestRegressor(n_estimators=50,n_jobs=-1) #very slow, especially on all data
    #mod = GradientBoostingRegressor(n_estimators=50)
    #mod = linear_model.SGDRegressor()#~500 error requires scaling
    #mod = naive_bayes.GaussianNB() #doesn't work with sparse
    mod.fit(X_train_scaled,Y_train)
    #make predictions
    preds = np.exp(mod.predict(X_test_scaled))
    Y_test = np.exp(Y_test)
    #score
    rmspe.append((np.mean(((preds-Y_test)/(Y_test+1))**2))**0.5)
    print i+1,rmspe[i]
    i=i+1
print 'Time =',int(time.time()-t1),'s'
print 'RMSPE avg =',np.mean(rmspe)

In [26]:
# from the xgboost source code it looks like the custom feval function is passed two parameters: y_hat and y
# where the y is a DMatrix object and the Y-hat the output of the prediction for that Y value.
# The return is supposed to be (in this case, I believe) a string, value tuple
# https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/core.py
# Also, how to use this function was not very clear but this example gives a clue:
# https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py

def rmspe(y_hat, dmat):
    # we need to reverse the log(Y)
    y = np.exp(dmat.get_label())
    return "rmspe", np.sqrt(np.mean(((np.exp(y_hat)-y)/(y-1))**2))

In [27]:
def prepDataframe(df, f):
    # take the data from the all_data dataframe and sales dataframe
    X_train, X_test, Y_train, Y_test = train_test_split(df, df.Sales, test_size=0.20, random_state=42)
    X_train = X_train[f]
    X_test  = X_test[f]

    # take the log of the Y values
    Ylog_train = np.log(Y_train)
    Ylog_test  = np.log(Y_test)
    
    # scale the data
    scaler = preprocessing.StandardScaler().fit(X_train)
    scaled_trainX = scaler.transform(X_train)
    scaled_testX = scaler.transform(X_test, len(X_test))
    
    return(scaled_trainX, Ylog_train, scaled_testX, Ylog_test, scaler)

In [55]:

# [546]	eval-rmspe:0.443694	train-rmspe:0.472175  (score 0.39930)
param_1 = {'max_depth':10, 'objective':'reg:linear', 'silent':1, 'eta': 0.5,
         'booster': 'gblinear', 'alpha' : 0.01, 'lambda' : 1}


In [56]:
X, Ylog, X_test, Ylog_test, scaler = prepDataframe(all_data, feature_list_3)

estimators = [("Tree", DecisionTreeRegressor()),
              ("RandomForest", RandomForestRegressor())]

n_estimators = len(estimators)


# bagging ensemble regressor



Will train until train error hasn't decreased in 150 rounds.
[0]	eval-rmse:5.793334	train-rmse:5.793037
[1]	eval-rmse:4.061806	train-rmse:4.061370
[2]	eval-rmse:2.850458	train-rmse:2.849940
[3]	eval-rmse:2.003362	train-rmse:2.002872
[4]	eval-rmse:1.413055	train-rmse:1.412534
[5]	eval-rmse:1.000253	train-rmse:0.999536
[6]	eval-rmse:0.714765	train-rmse:0.713646
[7]	eval-rmse:0.512279	train-rmse:0.510537
[8]	eval-rmse:0.376222	train-rmse:0.372849
[9]	eval-rmse:0.283114	train-rmse:0.276961
[10]	eval-rmse:0.223176	train-rmse:0.212552
[11]	eval-rmse:0.183050	train-rmse:0.167778
[12]	eval-rmse:0.154486	train-rmse:0.135918
[13]	eval-rmse:0.139396	train-rmse:0.117448
[14]	eval-rmse:0.130775	train-rmse:0.106168
[15]	eval-rmse:0.120684	train-rmse:0.094852
[16]	eval-rmse:0.116905	train-rmse:0.090088
[17]	eval-rmse:0.113727	train-rmse:0.086299
[18]	eval-rmse:0.111754	train-rmse:0.083956
[19]	eval-rmse:0.110421	train-rmse:0.082393
[20]	eval-rmse:0.108835	train-rmse:0.080716
[21]	eval-rmse:0.108330	t

In [57]:
test_data = scaler.transform(all_test[feature_list_3])
test_preds = np.exp(bst.predict(xgb.DMatrix(test_data)))

outF = open('subxgb10-trees.csv','wb')
fwriter = csv.writer(outF,delimiter=',')
fwriter.writerow(['Id','Sales'])
for i in range(len(test_preds)):
    fwriter.writerow([i+1,int(test_preds[i])])

outF.close()