In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance
import matplotlib.pyplot as plt
import holidays
import seaborn as sns
from sklearn import linear_model
sns.set()
%matplotlib inline 

  import pandas.util.testing as tm


### Feature engineering

Since we want to apply classification model, we will create new attributes from the dataset so that the models can learn. 
<br>

Based on the exploratory analysis, the following features would be relevant:
- month
- dayofweek
- holiday vs normal day
- product category
- shop id


However, the test set for the submission is asking us to submit at total month level, so the dayofweek and holiday features would be applicable. We left with month, product category and shop id

In [2]:
raw_df = pd.read_csv('data/train_df.csv', parse_dates=['date'])
# outlier removal steps as identified in exploratory analysis
id_out = [11, 14, 15, 12]
raw_df.loc[(raw_df.date == '2013-11-29') & (raw_df.item_category_id.isin(id_out)), 'sales'] = np.nan
raw_df.loc[(raw_df.date_block_num == 10) & (raw_df.item_category_id.isin(id_out))].interpolate(method='linear');

In [3]:
raw_df['month'] = raw_df['date'].dt.month

In [4]:
# daily level info would not match the test data set, hence groupby month
month_df = raw_df.groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day'].sum().reset_index()

In [5]:
month_df['shop_item'] = month_df['item_id'].astype('str') + '_' + month_df['shop_id'].astype('str')

In [6]:
month_df.sort_values(['shop_item', 'date_block_num'], ascending=['True', 'True'], inplace=True)

In [7]:
month_df['lag_1'] = month_df['date_block_num'] -1
month_df['lag_3'] = month_df['date_block_num'] -3
month_df['lag_6'] = month_df['date_block_num'] -6

In [8]:
month_df_helper = month_df[['date_block_num', 'item_cnt_day', 'shop_item']]

In [9]:
month_combine = pd.merge(month_df, month_df_helper, left_on=['lag_1', 'shop_item'], right_on=['date_block_num','shop_item'], how='left', suffixes=['', '_lag1'])
month_combine = pd.merge(month_combine, month_df_helper, left_on=['lag_3', 'shop_item'], right_on=['date_block_num','shop_item'], how='left', suffixes=['', '_lag3'])
month_combine = pd.merge(month_combine, month_df_helper, left_on=['lag_6', 'shop_item'], right_on=['date_block_num','shop_item'], how='left', suffixes=['', '_lag6'])

In [10]:
month_combine['lag_1_pct'] = month_combine['item_cnt_day'] - month_combine['item_cnt_day_lag1']
month_combine['lag_3_pct'] = month_combine['item_cnt_day'] - month_combine['item_cnt_day_lag3']
month_combine['lag_6_pct'] = month_combine['item_cnt_day'] - month_combine['item_cnt_day_lag6']

In [11]:
month_combine.dropna(axis=0, how='any', inplace=True)

In [12]:
final = month_combine[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day','lag_1_pct','lag_3_pct', 'lag_6_pct']]

In [13]:
final.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day,lag_1_pct,lag_3_pct,lag_6_pct
146,24,31,10003,1.0,-1.0,0.0,-1.0
219,6,31,10004,2.0,1.0,-1.0,1.0
220,7,31,10004,1.0,-1.0,-1.0,-1.0
221,8,31,10004,1.0,0.0,0.0,-1.0
225,18,31,10004,1.0,0.0,0.0,0.0


In [14]:
train = final[final.date_block_num != 33]
test =final[final.date_block_num == 33]

In [15]:
train_f = train.drop(['item_cnt_day'], axis=1)
train_t = train['item_cnt_day']

In [16]:
test_f = test.drop(['item_cnt_day'], axis=1)
test_t = test['item_cnt_day']

In [17]:
DM_train = xgb.DMatrix(data=train_f, label=train_t)
DM_test =  xgb.DMatrix(data=test_f, label=test_t)

In [18]:
params = {"booster":"gblinear", "objective":"reg:squarederror", 'max_depth':5, 'n_estimators':1000, 'min_child_weight':30, 'colsample_bytree': 0.8, 
          'subsample':0.8, 'eta' :0.3, 'seed':42}

In [19]:
xg_reg = xgb.train(params = params, dtrain=DM_train, num_boost_round=5)

In [20]:
preds = xg_reg.predict(DM_test)

rmse = np.sqrt(mean_squared_error(test_t,preds))
print("RMSE: %f" % (rmse))

RMSE: 24.348302


In [21]:
rfc = RandomForestRegressor()

In [22]:
rfc.fit(train_f, train_t)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [23]:
pred_rfc = rfc.predict(test_f)

In [24]:
regr = linear_model.LinearRegression()
regr.fit(train_f, train_t)
pred_reg = regr.predict(test_f)
rmse_reg = np.sqrt(mean_squared_error(test_t, pred_reg))
print("RMSE: %f" % (rmse_reg))

RMSE: 24.504742
