## Library imports

In [35]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

## Helper functions

In [36]:
def custom_log_loss_scorer(estimator,X,y):
    weight_array = np.where(X.IsHoliday_num==1,5,1)
    log_preds = estimator.predict(X)
    org_preds = np.exp(log_preds)-4990
    org_y = np.exp(y)-4990
    error = np.mean(weight_array*np.abs(org_y-org_preds))
    
    return -(error)

In [37]:
def custom_loss_scorer(estimator,X,y):
    weight_array = np.where(X.IsHoliday==True,5,1)
    preds = estimator.predict(X)
    error = np.mean(weight_array*np.abs(y-preds))
    
    return -(error)

In [38]:
def log_error(preds,org,weights):
    org_preds = np.exp(preds)-4990
    error = np.mean(weights*np.abs(org-org_preds))
    return error

In [39]:
def imputing(train,test,num_features):
    imputer = SimpleImputer(strategy='constant',fill_value=0)
    

    imp_train = pd.DataFrame(imputer.fit_transform(train[num_features]),columns=num_features)
    train[num_features] = imp_train
    
    test.reset_index(drop=True,inplace=True)
    imp_test = pd.DataFrame(imputer.transform(test[num_features]),columns=num_features)
    test[num_features] = imp_test
    
    return train,test

In [40]:
def create_date_features(df):
    
    df['quarter'] = df['Date'].dt.quarter
    df['month'] = df['Date'].dt.month
    df['year'] = df['Date'].dt.year
    #df['dayofyear'] = df['Date'].dt.dayofyear
    df['weekofyear'] = df['Date'].dt.weekofyear
    df['day'] = df['Date'].dt.day
    df['days'] = (df.month-1) * 30 + df.day
    df['tDays'] = (df.year-2010)*360 + df.days

    def week_of_month(dt):
        """ Returns the week of the month for the specified date.
        """

        first_day = dt.replace(day=1)

        dom = dt.day
        adjusted_dom = dom + first_day.weekday()

        return int(ceil(adjusted_dom/7.0))
    
    df['weekofmonth'] = df['Date'].apply(week_of_month)
    
    return df

In [41]:
def one_hot_encoding(train,test,cat_features):
    ohe = OneHotEncoder()
    
    ohe_train = ohe.fit_transform(train[cat_features]).toarray()
    ohe_train = pd.DataFrame(ohe_train,columns=ohe.get_feature_names())
    train.drop(cat_features,axis=1,inplace=True)
    train.reset_index(drop=True,inplace=True)
    train = pd.concat([train,ohe_train],axis=1)
    
    ohe_test = ohe.transform(test[cat_features]).toarray()
    ohe_test = pd.DataFrame(ohe_test,columns=ohe.get_feature_names())
    test.drop(cat_features,axis=1,inplace=True)
    test.reset_index(drop=True,inplace=True)
    test = pd.concat([test,ohe_test],axis=1)
    
    return (train,test)

In [42]:
def create_lag_features(train,test,lags):
    
    train['set']='Train'
    test['set']='Test'
    test['Weekly_Sales']=0
    
    one_df=pd.concat([train,test],ignore_index=True)
    
    lags = range(39, lags+1)

    df = one_df.assign(**{'Weekly_sales(t-{})'.format(t): one_df.groupby(['Store','Dept']).Weekly_Sales.shift(t) for t in lags})
    
    train_set = df[df.set=='Train']
    test_set = df[df.set=='Test']

    
    return train_set,test_set

In [43]:
def create_other_features(df):
    df['IsHoliday_bin']=np.where(df.IsHoliday,1,0)
    
    #type_dict = {'A':1,'B':2,'C':3}
    #df['Type_num']=df.Type.map(type_dict)
    
    return df

## Data imports and manipulation

In [44]:
features = pd.read_csv('walmart-recruiting-store-sales-forecasting/features.csv',parse_dates=[1])
stores = pd.read_csv('walmart-recruiting-store-sales-forecasting/stores.csv')
test = pd.read_csv('walmart-recruiting-store-sales-forecasting/test.csv',parse_dates=[2])
train = pd.read_csv('walmart-recruiting-store-sales-forecasting/train.csv',parse_dates=[2])
submission = pd.read_csv('walmart-recruiting-store-sales-forecasting/sampleSubmission.csv')

In [45]:
train_data = pd.merge(left=pd.merge(left=train,right=stores,how='left'),right=features,how='left',on=['Store','Date','IsHoliday'])
test_data = pd.merge(left = pd.merge(left=test,right=stores,how='left'),right=features,how='left',on = ['Store','Date','IsHoliday'])

## Feature engineering

In [46]:
train_data = create_date_features(train_data)
test_data = create_date_features(test_data)

In [47]:
train_data,test_data = one_hot_encoding(train_data,test_data,['Type'])

In [48]:
train_data,test_data = create_lag_features(train_data,test_data,52)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [49]:
train_data = create_other_features(train_data)
test_data = create_other_features(test_data)

In [50]:
num_features = list(train_data.columns[np.isin(train_data.dtypes,['int64','float64'])])


In [51]:
num_features = list(train_data.columns[np.isin(train_data.dtypes,['int32','float64'])])
train_data,test_data = imputing(train_data,test_data,num_features)

In [52]:
train_data['log_sales'] = np.log(4990+train_data.Weekly_Sales)

In [53]:
validation=False

features_list = ['Size','x0_A','x0_B','x0_C','year','month','day','days','IsHoliday_bin','tDays','Weekly_sales(t-47)', 'Weekly_sales(t-48)', 'Weekly_sales(t-49)',
       'Weekly_sales(t-50)', 'Weekly_sales(t-51)', 'Weekly_sales(t-52)']

train_stores = train_data.Store.unique()
test_stores = test_data.Store.unique()

detailed_data = dict()

for test_store in test_stores:
    
    print("")
    
    train_depts = train_data.loc[train_data.Store==test_store].Dept.unique()
    test_depts = test_data.loc[test_data.Store==test_store].Dept.unique()
    
    print('Store: ' + str(test_store))
    
    for test_dept in test_depts:
        
        ##print(str(test_dept),end = " ")        
        less_than_10 = False

        train_store_dept_data = train_data.loc[(train_data.Store==test_store) & (train_data.Dept == test_dept)]
        test_store_dept_data = test_data[(test_data.Store==test_store) & (test_data.Dept==test_dept)]
        
        if (len(train_store_dept_data)<10):
            
            train_store_dept_data = train_data.loc[train_data.Dept == test_dept]
            test_store_dept_data = test_data.loc[test_data.Dept == test_dept]        
            less_than_10 = True
        
        X = train_store_dept_data[features_list]
        y = train_store_dept_data.log_sales
        weights = np.where(X.IsHoliday_bin==1,5,1)
        
        #model = XGBRegressor(verbosity=0,seed=28)
        #model.fit(X,y,sample_weight=weights)
        
        model = RandomForestRegressor(n_estimators=500,n_jobs=-1)
        model.fit(X,y)
        
        train_preds = model.predict(X)
        test_preds = model.predict(test_store_dept_data[features_list])
        
        if (less_than_10):
            
            if(len(train_data.loc[(train_data.Store==test_store) & (train_data.Dept == test_dept)][features_list])>0):
                train_preds = model.predict(train_data.loc[(train_data.Store==test_store) & (train_data.Dept == test_dept)][features_list])
            else:
                train_preds = None
            
            if(len(test_data.loc[(test_data.Store==test_store) & (test_data.Dept==test_dept)][features_list])>0):
                test_preds = model.predict(test_data[(test_data.Store==test_store) & (test_data.Dept==test_dept)][features_list])
            
            else:
                test_preds = None
            
        current_data = dict()
        current_data['train_X'] = X
        current_data['train_y'] = y
        current_data['test_X'] = test_store_dept_data[features_list]
        if validation:
            test_y = test_dept_store_data.log_sales
            current_data['test_y'] = test_y
        current_data['model'] = None
        current_data['less_data'] = less_than_10
        current_data['train_preds'] = train_preds
        current_data['test_preds'] = test_preds
        
        detailed_data[str(test_store) + '_' + str(test_dept)] = current_data


Store: 1
1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 40 41 42 44 45 46 47 48 49 52 54 55 56 58 59 60 67 71 72 74 79 80 81 82 83 85 87 90 91 92 93 94 95 96 97 98 99 
Store: 2
1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 40 41 42 44 45 46 47 48 49 52 54 55 56 58 59 60 67 71 72 74 77 79 80 81 82 83 85 87 90 91 92 93 94 95 96 97 98 99 
Store: 3
1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 38 40 41 42 44 46 47 49 52 54 55 56 59 60 67 71 72 74 79 80 81 82 85 87 90 91 92 94 95 96 97 98 
Store: 4
1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 44 45 46 47 48 49 52 54 55 56 58 59 60 67 71 72 74 79 80 81 82 83 85 87 90 91 92 93 94 95 96 97 98 99 
Store: 5
1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 38 40 41 42 44 45

1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 38 40 41 42 44 45 46 49 52 54 55 56 58 59 60 67 71 72 74 77 79 80 81 82 83 85 87 90 91 92 93 94 95 96 97 98 99 
Store: 40
1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 40 41 42 44 45 46 47 48 49 52 54 55 56 58 59 60 67 71 72 74 79 80 81 82 83 85 87 90 91 92 93 94 95 96 97 98 99 
Store: 41
1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 38 40 41 42 44 45 46 47 48 49 52 54 55 56 58 59 60 67 71 72 74 79 80 81 82 83 85 87 90 91 92 93 94 95 96 97 98 99 
Store: 42
1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18 20 21 22 23 24 25 26 27 28 30 31 32 33 38 40 42 44 46 49 52 55 56 59 60 67 72 74 79 80 81 82 83 85 87 90 91 92 93 94 95 96 97 98 
Store: 43
1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18 20 21 22 23 25 26 28 31 32 33 38 40 42 46 49 52 55 56 59 60 67 71 72 74 79 80 81 82 83 85 87 90 91 92 93 94 95 96

In [54]:
all_train_preds = [preds for dept_data in detailed_data.values() if dept_data['train_preds'] is not None 
                   for preds in dept_data['train_preds']]

In [55]:
all_test_preds = [preds for dept_data in detailed_data.values() for preds in dept_data['test_preds']]

In [56]:
redundant_pairs_mask = (~(train_data.Store.map(str) +'-'+ train_data.Dept.map(str)).isin(test_data.Store.map(str) +'-'+ test_data.Dept.map(str)))
train_data_red = train_data[~redundant_pairs_mask]
log_error(all_train_preds,train_data_red.Weekly_Sales,weights=train_data_red.IsHoliday_bin)

81.56268203915204

In [57]:
submission['Weekly_Sales'] = np.exp(all_test_preds)-4990

In [58]:
test_data['preds']=np.exp(all_test_preds)-4990

In [59]:
t = test_data.copy()

In [60]:
for store in t.Store.unique():
    print(store,end=" ")
    for dept in t.Dept.unique():
        subset = t.loc[(t.Store==store)&(t.Dept==dept)&(t.weekofyear.isin([48,49,50,51,52]))]
        if (len(subset)<5):
            continue

        old_preds = np.array(subset.preds)
        pre_mean = old_preds[1:4].mean()
        post_mean = (old_preds[0]+old_preds[-1])/2
        
        if ((pre_mean/post_mean)>1.1):
        
            shifted = old_preds * (7-2.5)/7
            shifted[1:] = np.array(shifted[1:5]) + np.array(old_preds[0:4]) * (2.5/7)
            shifted[0] = old_preds[0]
            t.loc[(t.Store==store)&(t.Dept==dept)&(t.weekofyear.isin([48,49,50,51,52])),'preds']=shifted
            

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 

In [61]:
submission['Weekly_Sales'] = t.reset_index().preds

In [62]:
submission.to_csv('multiple_models_rf.csv',index=False)

In [None]:
submission.head()

In [64]:
sm_preds = pd.read_csv('xgboost_post_adjustment.csv')
mmdf_preds = pd.read_csv('multiple_models_rf.csv')
mmxgb_preds = pd.read_csv('multiple_models_xgboost.csv')

In [66]:
new_sub = sm_preds.Weekly_Sales*0.3333 + mmdf_preds.Weekly_Sales*0.3333 + mmxgb_preds.Weekly_Sales*0.3333

In [67]:
weighted_submission = pd.DataFrame({'Id':submission.Id,'Weekly_Sales':new_sub})

In [68]:
weighted_submission.head()

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,37748.475241
1,1_1_2012-11-09,20945.650544
2,1_1_2012-11-16,20395.303113
3,1_1_2012-11-23,21073.160761
4,1_1_2012-11-30,27822.922654


In [69]:
weighted_submission.to_csv('weighted_sub.csv',index=False)