In [1]:
# Using Ridge regression and Decision tree as base and test with 10-fold cross validation
# Test with MAE
# build model for each Store and Dept

In [2]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import KFold
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
train = pd.read_csv('./train.csv')
feature = pd.read_csv('./features.csv')
test = pd.read_csv('./test.csv')
train['Date'] = pd.to_datetime(train['Date'])
train['Week'] = train['Date'].dt.week
test['Date'] = pd.to_datetime(test['Date'])
test['Week'] = test['Date'].dt.week
feature['Date'] = pd.to_datetime(feature['Date'])
train['IsHoliday'] = (train[['IsHoliday']]== True).astype(int)
test['IsHoliday'] = (test[['IsHoliday']]== True).astype(int)
train_with_feature = pd.merge_asof(train, feature, on='Store',by='Date')
test_with_feature = pd.merge_asof(test, feature, on='Store',by='Date')

In [4]:
test_with_feature

Unnamed: 0,Store,Dept,Date,IsHoliday_x,Week,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y
0,1,1,2012-11-02,0,44,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,False
1,1,1,2012-11-09,0,45,61.24,3.314,11421.32,3370.89,40.28,4646.79,6154.16,223.481307,6.573,False
2,1,1,2012-11-16,0,46,52.92,3.252,9696.28,292.10,103.78,1133.15,6612.69,223.512911,6.573,False
3,1,1,2012-11-23,1,47,56.23,3.211,883.59,4.17,74910.32,209.91,303.32,223.561947,6.573,True
4,1,1,2012-11-30,0,48,52.34,3.207,2460.03,,3838.35,150.57,6966.34,223.610984,6.573,False
5,1,1,2012-12-07,0,49,64.12,3.198,6343.16,,270.00,2928.90,10147.90,223.660021,6.573,False
6,1,1,2012-12-14,0,50,48.89,3.168,3504.83,,73.26,1636.80,2779.60,223.719277,6.573,False
7,1,1,2012-12-21,0,51,56.02,3.098,8231.71,,274.00,358.15,2834.02,223.839845,6.573,False
8,1,1,2012-12-28,1,52,44.79,3.108,12659.55,37101.13,174.78,74.46,1208.86,223.960414,6.573,True
9,1,1,2013-01-04,0,1,41.73,3.161,1214.08,25366.33,15.01,72.36,3940.02,224.080983,6.525,False


In [5]:
train_with_feature.isna().any()

Store           False
Dept            False
Date            False
Weekly_Sales    False
IsHoliday_x     False
Week            False
Temperature     False
Fuel_Price      False
MarkDown1        True
MarkDown2        True
MarkDown3        True
MarkDown4        True
MarkDown5        True
CPI             False
Unemployment    False
IsHoliday_y     False
dtype: bool

In [6]:
# Markdown selection
mean_approach = train_with_feature.fillna(train_with_feature.mean())
remove_approach = train_with_feature.dropna()

In [7]:
def MAE_score(real,predict):
    N = len(predict)
    score = real - predict
    score = abs(score)
    return(sum(score)/N)

In [8]:
def Ridge_KfoldCross(K,dataset):
    total_score = 0
    for store in dataset.Store.unique():
        store_score = 0
        select_store = dataset.loc[dataset['Store'] == store]
        for dept in select_store.Dept.unique():
            dept_score = 0
            select_dept = select_store.loc[select_store['Dept'] == dept]
            select_dept = select_dept.drop(columns=['Store', 'Dept','Date','IsHoliday_y'])
            select_dept = select_dept.reset_index()
            train = select_dept.drop(columns=['Weekly_Sales'])
            test = select_dept['Weekly_Sales']
            kf = KFold(n_splits=K)
            if len(select_dept)>10: # handle some dept have record less than 10 ex. dept 77
                for train_index, test_index in kf.split(select_dept):
                    X_train, X_test = train.loc[train_index], train.loc[test_index]
                    Y_train, Y_test = test[train_index], test[test_index]
                    clf = Ridge(alpha=1.0)
                    clf.fit(X_train,Y_train)
                    test_y = clf.predict(X_test)
                    score = MAE_score(Y_test,test_y)
                    dept_score += score/K
                store_score += dept_score/len(select_store.Dept.unique())
        total_score += store_score/len(select_store.Store.unique())
    return(total_score)

In [9]:
def Tree_KfoldCross(K,dataset):
    total_score = 0
    for store in dataset.Store.unique():
        store_score = 0
        select_store = dataset.loc[dataset['Store'] == store]
        for dept in select_store.Dept.unique():
            dept_score = 0
            select_dept = select_store.loc[select_store['Dept'] == dept]
            select_dept = select_dept.drop(columns=['Store', 'Dept','Date','IsHoliday_y'])
            select_dept = select_dept.reset_index()
            train = select_dept.drop(columns=['Weekly_Sales'])
            test = select_dept['Weekly_Sales']
            kf = KFold(n_splits=K)
            if len(select_dept)>10: # handle some dept have record less than 10 ex. dept 77
                for train_index, test_index in kf.split(select_dept):
                    X_train, X_test = train.loc[train_index], train.loc[test_index]
                    Y_train, Y_test = test[train_index], test[test_index]
                    clf = DecisionTreeRegressor()
                    clf.fit(X_train,Y_train)
                    test_y = clf.predict(X_test)
                    score = MAE_score(Y_test,test_y)
                    dept_score += score/K
                store_score += dept_score/len(select_store.Dept.unique())
        total_score += store_score/len(select_store.Store.unique())
    return(total_score)

In [10]:
print(Ridge_KfoldCross(10,mean_approach)) # 125048.60083556036
print(Ridge_KfoldCross(10,remove_approach)) # 489180.6785732042
print(Tree_KfoldCross(10,mean_approach)) # 94008.56579762329
print(Tree_KfoldCross(10,remove_approach)) # 114060.74665986476

125048.60083556036
489180.6785732042
94330.99489049517
113543.27917536224


In [11]:
# 10-fold result has MEA vary high and I don't sure MEA compare to weight MEA
# so I decide to test 10-fold have same direction with test result or not

In [13]:
def Ridge_Realtest(name,train,test):
    name = './'+name+'_ridge.csv'
    f = open(name,'w')
    f.write('Id,Weekly_Sales\n')
    for store in test.Store.unique():
        select_store_test = test.loc[test['Store'] == store]
        select_store_train = train.loc[train['Store'] == store]
        for dept in select_store_test.Dept.unique():
            select_dept_test = select_store_test.loc[select_store_test['Dept'] == dept]
            dates = select_dept_test['Date'].apply(lambda x: x.strftime('%Y-%m-%d')).values
            select_dept_test = select_dept_test.drop(columns=['Store', 'Dept','Date','IsHoliday_y']).reset_index()
            select_dept_train = select_store_train.loc[select_store_train['Dept'] == dept]
            select_dept_train = select_dept_train.drop(columns=['Store', 'Dept','Date','IsHoliday_y']).reset_index()
            if len(select_dept_train.index) > 0: # handle no train data for test dept ex. store 5 dept 99
                X_train = select_dept_train.drop(columns=['Weekly_Sales'])
                Y_train = select_dept_train['Weekly_Sales']
                X_test = select_dept_test
                clf = Ridge(alpha=1.0)
                clf.fit(X_train,Y_train)
                test_y = clf.predict(X_test)
                for i in range(len(test_y)):
                    Id = str(store)+'_'+str(dept)+'_'+str(dates[i])
                    sales = test_y[i]
                    f.write('%s,%s\n'%(Id,sales))
            else:
                for i in range(len(select_dept_test)):
                    Id = str(store)+'_'+str(dept)+'_'+str(dates[i])
                    sales = 0
                    f.write('%s,%s\n'%(Id,sales))
    f.close()

In [14]:
def Tree_Realtest(name,train,test):
    name = './'+name+'_Tree.csv'
    f = open(name,'w')
    f.write('Id,Weekly_Sales\n')
    for store in test.Store.unique():
        select_store_test = test.loc[test['Store'] == store]
        select_store_train = train.loc[train['Store'] == store]
        for dept in select_store_test.Dept.unique():
            select_dept_test = select_store_test.loc[select_store_test['Dept'] == dept]
            dates = select_dept_test['Date'].apply(lambda x: x.strftime('%Y-%m-%d')).values
            select_dept_test = select_dept_test.drop(columns=['Store', 'Dept','Date','IsHoliday_y']).reset_index()
            select_dept_train = select_store_train.loc[select_store_train['Dept'] == dept]
            select_dept_train = select_dept_train.drop(columns=['Store', 'Dept','Date','IsHoliday_y']).reset_index()
            if len(select_dept_train.index) > 0: # handle no train data for test dept ex. store 5 dept 99
                X_train = select_dept_train.drop(columns=['Weekly_Sales'])
                Y_train = select_dept_train['Weekly_Sales']
                X_test = select_dept_test
                clf = DecisionTreeRegressor()
                clf.fit(X_train,Y_train)
                test_y = clf.predict(X_test)
                for i in range(len(test_y)):
                    Id = str(store)+'_'+str(dept)+'_'+str(dates[i])
                    sales = test_y[i]
                    f.write('%s,%s\n'%(Id,sales))
            else:
                for i in range(len(select_dept_test)):
                    Id = str(store)+'_'+str(dept)+'_'+str(dates[i])
                    sales = 0
                    f.write('%s,%s\n'%(Id,sales))
    f.close()

In [15]:
mean_approach = train_with_feature.fillna(train_with_feature.mean())
remove_approach = train_with_feature.dropna()
test = test_with_feature.fillna(test_with_feature.mean())
Ridge_Realtest('mean',mean_approach,test) #10232151.48932
Ridge_Realtest('remove',remove_approach,test) #26599520.91667
Tree_Realtest('mean',mean_approach,test) #4772.18232
Tree_Realtest('remove',remove_approach,test) #4869.18242

In [16]:
# Weighted MEA has same direction with normal MEA but Decision Tree result has much less Weight MEA compare to MEA
# Decision Tree may be handle many value than Ridge
# so I decide drop some feature then I check some feature to drop start at CPI and Unemployment

In [17]:
# Base Ridge 125048
# Base Tree 94008
drop_CPI_unemploy = mean_approach.drop(columns=['CPI', 'Unemployment'])
drop_CPI = mean_approach.drop(columns=['CPI'])
drop_unemploy = mean_approach.drop(columns=['Unemployment'])
print(Ridge_KfoldCross(10,drop_CPI_unemploy))
print(Tree_KfoldCross(10,drop_CPI_unemploy))
print(Ridge_KfoldCross(10,drop_CPI))
print(Tree_KfoldCross(10,drop_CPI))
print(Ridge_KfoldCross(10,drop_unemploy))
print(Tree_KfoldCross(10,drop_unemploy))

123060.92425443465
92695.14821916597
123342.74436538141
92908.25600205472
124426.84671453196
93456.4683554392


In [20]:
# Drop both CPI and Employment have best MEA

# So i decide to test with both fuel price and temp.

In [21]:
# Base Ridge 123060
# Base Tree 92659
drop_Temp_Fuel = drop_CPI_unemploy.drop(columns=['Temperature', 'Fuel_Price'])
drop_Temp = drop_CPI_unemploy.drop(columns=['Temperature'])
drop_Fuel = drop_CPI_unemploy.drop(columns=['Fuel_Price'])
print(Ridge_KfoldCross(10,drop_Temp_Fuel))
print(Tree_KfoldCross(10,drop_Temp_Fuel))
print(Ridge_KfoldCross(10,drop_Temp))
print(Tree_KfoldCross(10,drop_Temp))
print(Ridge_KfoldCross(10,drop_Fuel))
print(Tree_KfoldCross(10,drop_Fuel))

132788.21989472196
85944.1898711757
139170.80784491656
90704.70564170498
115290.44725082867
89294.62625224702
