In [1]:
import pandas as pd
import numpy as np
import os

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.neighbors import KNeighborsRegressor

In [2]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)

In [3]:
train = pd.read_csv('train.csv', infer_datetime_format=True,
                  parse_dates=[2], usecols=[0, 1, 2, 3, 4, 5, 6])

In [4]:
train.dtypes

Store                 int64
DayOfWeek             int64
Date         datetime64[ns]
Sales                 int64
Customers             int64
Open                  int64
Promo                 int64
dtype: object

In [5]:
train['Month'] = train.Date.dt.month
train['Year'] = train.Date.dt.year
train['Trend'] = train.sort_values(['Store','Date']).groupby('Store').cumcount()+1

In [6]:
d_cols = ['Month','Year','DayOfWeek']

In [7]:
train = pd.get_dummies(data=train, columns=d_cols, drop_first=True)

In [8]:
train.sort_values(['Store','Date'], inplace=True)
train.head()

Unnamed: 0,Store,Date,Sales,Customers,Open,Promo,Trend,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,Year_2014,Year_2015,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,DayOfWeek_7
1016095,1,2013-01-01,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1014980,1,2013-01-02,5530,668,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1013865,1,2013-01-03,4327,578,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1012750,1,2013-01-04,4486,619,1,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1011635,1,2013-01-05,4997,635,1,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [9]:
test = pd.read_csv('test.csv', infer_datetime_format=True,
                  parse_dates=[3], usecols=[0, 1, 2, 3, 4, 5])

In [10]:
test.dtypes

Id                    int64
Store                 int64
DayOfWeek             int64
Date         datetime64[ns]
Open                float64
Promo                 int64
dtype: object

In [11]:
test['Month'] = test.Date.dt.month
test['Year'] = test.Date.dt.year
test['Trend'] = test.sort_values(['Store','Date']).groupby('Store').cumcount()+1

In [12]:
test.isna().sum()  # NA values in the Open column -- shift to

Id            0
Store         0
DayOfWeek     0
Date          0
Open         11
Promo         0
Month         0
Year          0
Trend         0
dtype: int64

In [13]:
test.fillna(0, inplace=True)

In [14]:
test = pd.get_dummies(data=test, columns=d_cols, drop_first=True)

In [15]:
test.sort_values(['Store','Date'], inplace=True)
test.head()

Unnamed: 0,Id,Store,Date,Open,Promo,Trend,Month_9,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,DayOfWeek_7
40232,40233,1,2015-08-01,1.0,0,1,0,0,0,0,0,1,0
39376,39377,1,2015-08-02,0.0,0,2,0,0,0,0,0,0,1
38520,38521,1,2015-08-03,1.0,1,3,0,0,0,0,0,0,0
37664,37665,1,2015-08-04,1.0,1,4,0,1,0,0,0,0,0
36808,36809,1,2015-08-05,1.0,1,5,0,0,1,0,0,0,0


In [16]:
# Get missing columns in the training test
missing_cols = set( train.columns ) - set( test.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    test[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
train_cols = train.columns.append(pd.Index(['Id']))
test = test[train_cols]

In [17]:
pred_cols = list(np.setdiff1d(train_cols, ['Store','Date','Sales','Customers','Id']))
for i in pred_cols:
    print(i, end=' ')

DayOfWeek_2 DayOfWeek_3 DayOfWeek_4 DayOfWeek_5 DayOfWeek_6 DayOfWeek_7 Month_10 Month_11 Month_12 Month_2 Month_3 Month_4 Month_5 Month_6 Month_7 Month_8 Month_9 Open Promo Trend Year_2014 Year_2015 

In [18]:
Sales = []
Id = []
retailers = list(test.Store.unique())

In [24]:
p_grid = {'n_neighbors':[4,10,20], 
          'weights':['uniform','distance']}

knnr = KNeighborsRegressor(algorithm='auto', n_jobs=-1)

tscv = TimeSeriesSplit(n_splits=2)

flag = True
for i in retailers[0:10]:
    preds = []
    
    # I don't think I need to normalize
    store_train_X = train[train.Store==i].loc[:,pred_cols]
    store_test_X = test[test.Store==i].loc[:,pred_cols]
    
    Id.extend(test[test.Store==i].loc[:,'Id'])
    
    store_train_y = train[train.Store==i].loc[:, 'Sales']
    
    gs = GridSearchCV(knnr, param_grid = p_grid, 
                    scoring = 'neg_mean_squared_error', cv = tscv, n_jobs = -1)
     
    gs.fit(store_train_X, store_train_y)
    
    if flag:
        print(gs.scorer_)
    flag = False
    
    print('Store number:',i, gs.best_params_, round((-1 * gs.best_score_)**0.5,2))
    
    preds = gs.predict(store_test_X)
    preds[preds < 500] = 0
    
    Sales.extend(list(preds))

make_scorer(mean_squared_error, greater_is_better=False)
Store number: 1 {'n_neighbors': 20, 'weights': 'uniform'} 2029.6
Store number: 3 {'n_neighbors': 10, 'weights': 'uniform'} 3333.16
Store number: 7 {'n_neighbors': 10, 'weights': 'uniform'} 4086.41
Store number: 8 {'n_neighbors': 20, 'weights': 'uniform'} 2762.68
Store number: 9 {'n_neighbors': 20, 'weights': 'distance'} 3069.69
Store number: 10 {'n_neighbors': 20, 'weights': 'uniform'} 2359.83
Store number: 11 {'n_neighbors': 10, 'weights': 'uniform'} 3812.13
Store number: 12 {'n_neighbors': 10, 'weights': 'uniform'} 3510.27
Store number: 13 {'n_neighbors': 20, 'weights': 'uniform'} 2458.26
Store number: 14 {'n_neighbors': 20, 'weights': 'distance'} 2538.89
