# A simple linear baseline for the Walmart challenge
This notebook shows how you load the data, prepare it for usage with Keras and then create a submission file. The model is a small fully connected neural network (two hidden layers with ReLU activations) trained with a mean absolute error loss.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
# Apply seaborn's default plot styling to every figure in this notebook.
sns.set(color_codes=True)

## Loading the data
In Kaggle, data that can be accessed by a Kernel is saved under ``../input/``.
From there we can load it with pandas:

In [None]:
# Read the training and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

We are going to do some data preparation. It is easiest to do this for training and test set combined so we have to do all these steps only once. It is good to know where to split the set afterwards though!

In [None]:
train.shape[0]  # Number of rows in the training set

In [None]:
test.shape[0]  # Number of rows in the test set

In [None]:
# Stack train on top of test so every preparation step only has to be written once.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# default row labels of the two files repeat, and duplicate labels make any later
# label-based alignment fragile.
df = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
# Show the first rows of the combined train + test frame as a quick overview.
df.head()

In [None]:
# Summary statistics of the numeric columns of the combined frame.
df.describe()

There seem to be some missing values in the data. We have to make sure to deal with them before feeding anything into the network.

In [None]:
# Count the missing values in each column.
df.isna().sum()

We will do a bit of very basic feature engineering here by creating a feature which indicates whether a certain markdown was active at all.

In [None]:
# One boolean flag per markdown column: True where a markdown value was recorded.
# These must be created before the missing values are filled in.
markdown_flags = {
    'md{}_present'.format(i): df['MarkDown{}'.format(i)].notnull()
    for i in range(1, 6)
}
df = df.assign(**markdown_flags)



In [None]:
# Re-check the missing-value counts; the new flag columns should contain none.
df.isna().sum()

We can probably safely fill all missing values with zero. For the markdowns this means that there was no markdown. For the weekly sales, the missing values are the ones we have to predict, so it does not really matter what we fill in there.

In [None]:
# Replace every remaining missing value with zero.
df = df.fillna(0)

In [None]:
# Z-score each continuous feature: subtract the column mean and divide by the
# column standard deviation, both computed over the combined train + test frame.
for column in (
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'Unemployment', 'CPI', 'Fuel_Price',
    'Size',
):
    values = df[column]
    df[column] = (values - values.mean()) / values.std()




In [None]:

# Plot the standardised store size to see where the bin edges below fall.
sns.distplot(df['Size'])

# One-hot size bins on the standardised scale:
#   small: Size < -1.0, medium: -1.0 <= Size < 0.5, large: Size >= 0.5
df['SSize'] = np.where(df["Size"] < -1.0, 1, 0)
df['MSize'] = np.where((df["Size"] >= -1.0) & (df["Size"] < 0.5), 1, 0)
# Use >= so a value exactly at 0.5 lands in the large bin instead of in no bin.
df['LSize'] = np.where(df["Size"] >= 0.5, 1, 0)


In [None]:
# Plot the standardised fuel price to see where the split point below falls.
sns.distplot(df['Fuel_Price'])

# Two complementary indicators split at -0.2 on the standardised scale.
df['SFuel_Price'] = np.where(df["Fuel_Price"] < -0.2, 1, 0)
# Use >= so a value exactly at -0.2 is counted as "large" rather than as neither.
df['LFuel_Price'] = np.where(df["Fuel_Price"] >= -0.2, 1, 0)

In [None]:
# Plot the standardised CPI to see where the bin edges below fall.
sns.distplot(df['CPI'])

# One-hot CPI bins on the standardised scale:
#   small: CPI < -0.5, medium: -0.5 <= CPI < 0.75, large: CPI >= 0.75
df['SCPI'] = np.where(df["CPI"] < -0.5, 1, 0)
# Both bounds of the medium bin must test CPI; the upper bound previously tested
# Size, which made this flag depend on an unrelated column.
df['MCPI'] = np.where((df["CPI"] >= -0.5) & (df["CPI"] < 0.75), 1, 0)
# Use >= so a value exactly at 0.75 lands in the large bin instead of in no bin.
df['LCPI'] = np.where(df["CPI"] >= 0.75, 1, 0)

In [None]:
# Distribution of the standardised MarkDown1 column.
sns.distplot(df['MarkDown1'])

In [None]:
# Distribution of the standardised Unemployment column.
sns.distplot(df['Unemployment'])

In [None]:
#df.fillna(0, inplace=True)

In [None]:
# Inspect column dtypes to spot the non-numeric columns that still need encoding.
df.dtypes

Now we have to create some dummy variables for the categorical data.

In [None]:
# Make sure we can later recognize what a dummy once belonged to
df['Type'] = 'Type_' + df['Type'].map(str)
df['Store'] = 'Store_' + df['Store'].map(str)
df['Dept'] = 'Dept_' + df['Dept'].map(str)

In [None]:
# Create dummies
type_dummies = pd.get_dummies(df['Type'])
store_dummies = pd.get_dummies(df['Store'])
dept_dummies = pd.get_dummies(df['Dept'])

In [None]:
# Add dummies
df = pd.concat([df,type_dummies,store_dummies,dept_dummies],axis=1)

In [None]:
# Dates flagged as Christmas and Thanksgiving, in the same string format as 'Date'.
ChristmasDays = ['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27']
ThanksgivingDays = ['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29']
# Every date that belongs to either holiday list.
SpecialDays = ChristmasDays + ThanksgivingDays

# 0/1 indicator per holiday, plus one for rows that fall on neither.
date_strings = df['Date']
df['Christmas'] = np.where(date_strings.isin(ChristmasDays), 1, 0)
df['Thanksgiving'] = np.where(date_strings.isin(ThanksgivingDays), 1, 0)
df['NotSpecial'] = np.where(date_strings.isin(SpecialDays), 0, 1)

In [None]:
# 0/1 indicator for rows whose date falls in January.
sale_month = pd.to_datetime(df['Date']).dt.month
df['January'] = np.where(sale_month == 1, 1, 0)


In [None]:
# Remove originals
del df['Type']
del df['Store']
del df['Dept']

In [None]:
# The raw date string is no longer needed once the date features exist.
df = df.drop('Date', axis=1)

In [None]:
# Re-check the column dtypes after encoding and dropping the original columns.
df.dtypes

Now we can split the combined frame back into the training and test sets.

In [None]:
# Split the combined frame back at the train/test boundary.
# The combined frame was built as train followed by test, so the boundary is the
# number of training rows. It is taken from the training frame itself rather than
# a hard-coded row count, so the split stays correct if the input files change.
# Re-running this cell is safe: the sliced `train` keeps the same length.
n_train = len(train)
train = df.iloc[:n_train]
test = df.iloc[n_train:]

In [None]:
# The target column of the test rows only holds the zero placeholders; discard it.
test = test.drop(labels='Weekly_Sales', axis='columns')

To get numpy arrays out of the pandas data frame, we can ask for the ``values`` of a column or of a whole data frame.

In [None]:
# Regression target as a numpy array.
y = train.loc[:, 'Weekly_Sales'].values

In [None]:
# Feature matrix: every column except the target.
# The frame mixes boolean flag columns with integer and float columns, and for
# such a mix `.values` returns an object-dtype array. Cast explicitly to float32
# so the network receives a plain numeric array.
X = train.drop('Weekly_Sales', axis=1).values.astype(np.float32)

In [None]:
# (rows, features): the second entry is the input width the model must accept.
X.shape

Now we create the baseline model

In [None]:
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.layers import BatchNormalization

In [None]:
# Feed-forward regressor:
#   Dense(74) -> BatchNormalization -> ReLU -> Dense(36) -> ReLU -> Dense(1)
model = Sequential()

# Take the input width from the feature matrix instead of hard-coding it, so the
# model keeps matching the data when features or dummy columns are added or removed.
model.add(Dense(74, input_dim=X.shape[1]))
model.add(BatchNormalization())
model.add(Activation('relu'))

# Only the first layer of a Sequential model needs an input size; later layers
# take theirs from the layer before them.
model.add(Dense(36))
model.add(Activation('relu'))

# Single linear output unit for the sales value.
model.add(Dense(1))

# Train with Adam against mean absolute error.
model.compile(optimizer='adam', loss='mae')

In [None]:
# Train on the full training set; no validation split is held out here.
model.fit(
    x=X,
    y=y,
    batch_size=2048,
    epochs=100,
)

After we have created our model, we can predict things with it on the test set

In [None]:
# Test feature matrix. The frame mixes boolean and numeric columns, for which
# `.values` returns an object-dtype array, so cast explicitly to float32 before
# handing it to the model.
X_test = test.values.astype(np.float32)

In [None]:
# Predict the target for every test row.
y_pred = model.predict(x=X_test, batch_size=2048)

To create the ids required for the submission we need the original test file one more time

In [None]:
# Reload the untouched test file: the Store, Dept and Date columns needed for the
# submission ids were removed from the prepared frame.
testfile = pd.read_csv('../input/test.csv')

Now we create the submission. Once you run the kernel you can download the submission from its outputs and upload it to the Kaggle InClass competition page.

In [None]:
# Row identifiers of the form "<Store>_<Dept>_<Date>", built from the raw test file.
submission_ids = (
    testfile['Store'].map(str)
    + '_' + testfile['Dept'].map(str)
    + '_' + testfile['Date'].map(str)
)

# Pair each identifier with its prediction, flattened to one dimension.
submission = pd.DataFrame({
    'id': submission_ids,
    'Weekly_Sales': y_pred.flatten(),
})

In [None]:
# Write the submission to the working directory, without the row index column.
submission.to_csv('submission.csv',index=False)