In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [7]:
# Relative locations of the three raw Rossmann CSV files used below.
rossmann_train_path, rossmann_test_path, rossmann_store_path = (
    './kaggle-data/rossmann/train.csv',
    './kaggle-data/rossmann/test.csv',
    './kaggle-data/rossmann/store.csv',
)

In [8]:
# Both holiday flags are forced to strings on load; they are converted to
# integer codes in later cells.
holiday_dtypes = {'StateHoliday': str, 'SchoolHoliday': str}
# Categorical store attributes are likewise kept as strings at load time.
store_dtypes = {'StoreType': str, 'Assortment': str, 'PromoInterval': str}

train = pd.read_csv(
    rossmann_train_path,
    sep=',',
    parse_dates=['Date'],
    dtype=holiday_dtypes,
)

# The test file is indexed by its 'Id' column.
test = pd.read_csv(
    rossmann_test_path,
    sep=",",
    index_col='Id',
    parse_dates=['Date'],
    dtype=holiday_dtypes,
)

store = pd.read_csv(rossmann_store_path, sep=",", dtype=store_dtypes)

In [16]:
# Derive calendar features from the sale date; the DatetimeIndex is built
# once and both components are read from it.
train_dates = pd.DatetimeIndex(train['Date'])
train['Year'] = train_dates.year
train['Month'] = train_dates.month

In [17]:
# Put the columns in a fixed order, with the target ('Sales') last.
train_column_order = [
    'Store', 'DayOfWeek', 'Date', 'Year', 'Month', 'Customers', 'Open',
    'Promo', 'StateHoliday', 'SchoolHoliday', 'Sales',
]
train = train[train_column_order]

In [18]:
# Recode the StateHoliday labels to fixed integer codes:
# '0' -> 0, 'a' -> 1, 'b' -> 2, 'c' -> 3, then cast the column to int.
for holiday_label, holiday_code in (('0', 0), ('a', 1), ('b', 2), ('c', 3)):
    train.loc[train['StateHoliday'] == holiday_label, 'StateHoliday'] = holiday_code
train['StateHoliday'] = train['StateHoliday'].astype(int, copy=False)

In [19]:
def factor_to_integer(df, colname, start_value=0):
    """Replace a string/object column of ``df`` with integer codes, in place.

    Distinct values are numbered in order of first appearance, beginning at
    ``start_value``. A column that is not object- or string-typed is left
    unchanged. In either case the resulting levels and dtype are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``colname`` is overwritten.
    colname : str
        Name of the column to encode.
    start_value : int, optional
        Code assigned to the first distinct value (default 0).

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    ValueError
        If the column to encode contains missing values, which have no
        integer code.
    """
    column = df[colname]
    if column.dtype == object or pd.api.types.is_string_dtype(column.dtype):
        # All codes are computed in one pass from the untouched column.
        # Overwriting values one level at a time would let a raw value that
        # happens to equal an already-assigned code (e.g. a literal 0 next
        # to strings) be matched again and merged with another level.
        codes, _ = pd.factorize(column)
        if (codes < 0).any():
            # factorize marks missing values with -1.
            raise ValueError(
                "column %r contains missing values and cannot be converted "
                "to integer codes" % (colname,)
            )
        df[colname] = (codes + start_value).astype(int)
    print('levels :', df[colname].unique(), '; data type :', df[colname].dtype)

In [21]:
# factor_to_integer numbers levels by order of first appearance, so the code
# a given flag receives would depend on which value happens to come first in
# this file -- and could differ from the test set. SchoolHoliday was loaded
# as strings (presumably the digits '0'/'1' -- TODO confirm), so cast the
# literal value instead; a non-numeric label fails loudly here.
train['SchoolHoliday'] = train['SchoolHoliday'].astype(int)
# The column is now integer-typed, so this call only reports levels/dtype.
factor_to_integer(train, 'SchoolHoliday')

levels : [0 1] ; data type : int64


In [22]:
# Re-apply the fixed column order (target 'Sales' last) after the recoding
# steps above.
ordered_train_columns = [
    'Store', 'DayOfWeek', 'Date', 'Year', 'Month', 'Customers', 'Open',
    'Promo', 'StateHoliday', 'SchoolHoliday', 'Sales',
]
train = train[ordered_train_columns]

In [23]:
# Same calendar features as for the training frame; the DatetimeIndex is
# built once and both components are read from it.
test_dates = pd.DatetimeIndex(test['Date'])
test['Year'] = test_dates.year
test['Month'] = test_dates.month

In [24]:
# Same column order as the training frame, minus 'Customers' and 'Sales'.
test_column_order = [
    'Store', 'DayOfWeek', 'Date', 'Year', 'Month', 'Open',
    'Promo', 'StateHoliday', 'SchoolHoliday',
]
test = test[test_column_order]

In [25]:
# Use the same fixed label -> code table as the training frame
# ('0' -> 0, 'a' -> 1, 'b' -> 2, 'c' -> 3). Numbering levels by order of
# first appearance would make the codes depend on row order in this file and
# could disagree with the training encoding.
for holiday_label, holiday_code in (('0', 0), ('a', 1), ('b', 2), ('c', 3)):
    test.loc[test['StateHoliday'] == holiday_label, 'StateHoliday'] = holiday_code
test['StateHoliday'] = test['StateHoliday'].astype(int)

# SchoolHoliday was loaded as strings (presumably the digits '0'/'1' --
# TODO confirm); cast the literal value so it does not depend on row order.
test['SchoolHoliday'] = test['SchoolHoliday'].astype(int)

# Both columns are integer-typed now, so these calls only report
# levels/dtype.
factor_to_integer(test, 'StateHoliday')
factor_to_integer(test, 'SchoolHoliday')

levels : [0 1] ; data type : int64
levels : [0 1] ; data type : int64


In [26]:
# Keep only rows whose StateHoliday code is below 2 (codes 0 and 1).
# `.ix` was removed in pandas 1.0; `.loc` with a boolean mask selects the
# same rows.
train = train.loc[train['StateHoliday'] < 2]

In [27]:
# `.ix` was removed in pandas 1.0; `.loc` with boolean masks is equivalent.
no_promo2 = store['Promo2'] == 0
has_promo2 = store['Promo2'] != 0

# Stores without Promo2: set the Promo2 detail columns to 0.
store.loc[no_promo2, ['Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']] = 0

# Stores with Promo2: replace week/year by their distance from the column
# maximum.
store.loc[has_promo2, 'Promo2SinceWeek'] = (
    store['Promo2SinceWeek'].max() - store.loc[has_promo2, 'Promo2SinceWeek']
)
store.loc[has_promo2, 'Promo2SinceYear'] = (
    store['Promo2SinceYear'].max() - store.loc[has_promo2, 'Promo2SinceYear']
)

# Encode the remaining interval labels (and the 0 placeholder) as integers.
factor_to_integer(store, 'PromoInterval', start_value=0)

levels : [0 1 2 3] ; data type : int64


In [28]:
# Integer-encode the remaining categorical store attributes.
for store_factor in ('StoreType', 'Assortment'):
    factor_to_integer(store, store_factor)

levels : [0 1 2 3] ; data type : int64
levels : [0 1 2] ; data type : int64


In [29]:
# `sklearn.preprocessing.Imputer` was removed from scikit-learn (0.22);
# `sklearn.impute.SimpleImputer` is its replacement and has the same default
# strategy (column mean). Fall back to the old class on old installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Fill missing values in every store column with that column's mean.
imputer = Imputer().fit(store)
store_imputed = imputer.transform(store)

In [31]:
# The imputer returns a bare array; wrap it in a DataFrame again using the
# original store column names.
store_column_names = store.columns.values
store2 = pd.DataFrame(data=store_imputed, columns=store_column_names)

In [32]:
# Replace the competition opening month/year by their distance from the
# respective column maximum.
for competition_col in ('CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'):
    store2[competition_col] = store2[competition_col].max() - store2[competition_col]

In [33]:
# Attach the store attributes to every training row (left join on 'Store').
train_store = train.merge(store2, on='Store', how='left')

In [34]:
# Left-join the store attributes onto the test rows. 'Id' is moved out of
# the index for the merge and restored afterwards, since merging does not
# keep the left frame's index.
test_with_id = test.reset_index()
test_store = pd.merge(test_with_id, store2, how='left', on='Store')
test_store = test_store.set_index('Id')

In [35]:
# Modelling frame: drop 'Customers' and 'Date', and express Year as the
# number of years before the latest year in the training data.
train_model = train_store.drop(['Customers', 'Date'], axis=1)
latest_train_year = train_model['Year'].max()
train_model['Year'] = latest_train_year - train_model['Year']

In [36]:
# Test modelling frame: drop 'Date' and express Year as the number of years
# before the latest year in the test data.
test_model = test_store.drop(['Date'], axis=1)
latest_test_year = test_model['Year'].max()
test_model['Year'] = latest_test_year - test_model['Year']

In [43]:
# Split the test rows by the 'Open' flag. The closed subset gets its own
# name: assigning it to `test_model_open` as well would discard the
# open-store frame built just above and leave the closed rows under the
# "open" name. `.ix` was removed in pandas 1.0, so `.loc` is used.
# Rows whose 'Open' value is missing match neither mask.
is_open = test_model['Open'] == 1
is_closed = test_model['Open'] == 0

# 'Open' is constant within the open subset, so it is dropped there.
test_model_open = test_model.loc[is_open].drop('Open', axis=1)

test_model_closed = test_model.loc[is_closed]

In [44]:
# Move the target column 'Sales' to the last position.
SalesDF = train_model['Sales']
train_model = train_model.drop(['Sales'], axis=1).assign(Sales=SalesDF)

In [45]:
# Standardise every numeric column: (value - mean) / standard deviation.
# The statistics are taken from describe() and matched to columns by label,
# so a non-numeric column (which describe() omits) cannot shift the
# statistics onto the wrong column. The whole frame is transformed in one
# vectorised step, and the float results replace the columns outright rather
# than being written element-wise into integer-typed columns.
summary = train_model.describe()
ncols = len(train_model.columns)

numeric_columns = summary.columns
train_normalized = train_model.copy()
train_normalized[numeric_columns] = (
    (train_model[numeric_columns] - summary.loc['mean']) / summary.loc['std']
)

In [46]:
# Keep a handle on the normalised target and make sure 'Sales' is the last
# column of the normalised frame.
sales_normalized = train_normalized['Sales']
normalized_feature_columns = [
    column for column in train_normalized.columns if column != 'Sales'
]
train_normalized = train_normalized[normalized_feature_columns + ['Sales']]

In [47]:
# `sklearn.cross_validation` was removed from scikit-learn (0.20);
# `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the old module on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Features are every column except the target; the split is seeded so it is
# reproducible.
X = train_model.drop('Sales', axis=1)
y = train_model['Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [55]:
# Persist the processed frames for the modelling notebooks.
for processed_frame, csv_target in (
    (train_model, "processed-data/rossmann/train_model.csv"),
    (test_model_open, "processed-data/rossmann/test_model.csv"),
):
    processed_frame.to_csv(csv_target)