In [82]:
# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from sklearn import datasets, linear_model, cross_validation, grid_search

In [83]:
# Load the Rossmann training, store-metadata and test CSV files as DataFrames.
# StateHoliday holds the number 0 next to the letters a/b/c, so letting pandas
# infer its type yields a column mixing 0 and '0' (and a mixed-type warning).
# Reading it as text makes every value a string ('0', 'a', 'b', 'c') up front.
train_df = pd.read_csv("../input/train.csv", dtype={'StateHoliday': str})
store_df = pd.read_csv("../input/store.csv")
test_df = pd.read_csv("../input/test.csv", dtype={'StateHoliday': str})

In [84]:
def ToWeight(y):
    """Return the element-wise weights ``1 / y**2`` as a float array.

    Positions where ``y`` equals zero keep a weight of 0, so they contribute
    nothing to a weighted error and no division by zero takes place.
    """
    weights = np.zeros(y.shape, dtype=float)
    nonzero = y != 0
    weights[nonzero] = 1.0 / np.square(y[nonzero])
    return weights

def RMSPE(y, yhat):
    """Root mean squared percentage error of ``yhat`` against ``y``.

    Both arguments are cast to float. Each squared error is weighted by
    ``ToWeight(y)`` (1 / y**2, or 0 where y is zero), so rows with a zero
    actual add 0 to the sum but still count in the mean's denominator.
    """
    actual = y.astype('float')
    predicted = yhat.astype('float')
    squared_error = (actual - predicted) ** 2
    return np.sqrt(np.mean(ToWeight(actual) * squared_error))

def rmspe(Y_test, preds):
    """Root mean squared percentage error, computed directly.

    Both arguments are cast to float and the relative error
    ``(preds - Y_test) / Y_test`` is squared, averaged and square-rooted.
    Unlike ``RMSPE`` there is no guard for zeros in ``Y_test``: the division
    is performed as-is.
    """
    actual = Y_test.astype('float')
    predicted = preds.astype('float')
    relative_error = (predicted - actual) / actual
    return np.mean(relative_error ** 2) ** 0.5

# Sanity check: the two RMSPE implementations must agree on a small example.
y1 = np.array([4,5,6,7]).astype('float')
yhat1 = np.array([3,7,7,8]).astype('float')
# A parenthesised single-argument print gives the same output on Python 2 and 3.
print(RMSPE(y1, yhat1))
print(rmspe(y1, yhat1))
# Fail loudly if the implementations ever diverge instead of relying on
# someone comparing the two printed numbers by eye.
assert np.isclose(RMSPE(y1, yhat1), rmspe(y1, yhat1))

0.260137435331
0.260137435331


### Data Preprocessing ###

In [85]:
# read_csv can return StateHoliday as a mix of the number 0 and the string '0'
# (pandas warns about mixed types in column 7). Normalise every 0 to '0'.
train_df.loc[train_df['StateHoliday'] == 0, 'StateHoliday'] = '0'

# Convert the date column in train and test data
train_df['Date'] = pd.to_datetime(train_df['Date'], format="%Y-%m-%d")
test_df['Date'] = pd.to_datetime(test_df['Date'], format="%Y-%m-%d")

# Keep only records where the store is open
train_df = train_df[train_df['Open'] == 1]

# Keep only records with non-zero sales
train_df = train_df[train_df['Sales'] > 0]

# For each store, take the earliest remaining training date (open, with sales)
# as its opening day and merge it into store_df.
# NOTE: the default inner join keeps only stores that still have training rows.
storeOpenDays = train_df.groupby('Store')['Date'].min()
d = pd.DataFrame(storeOpenDays)
d = d.rename(columns = {'Date':'OpenDay'})
store_df = pd.merge(store_df, d, left_on = 'Store', right_index = True)

# For each store, turn the competitor's opening year/month into a date (the
# first day of that month); leave it missing when either part is missing.
competitorOpenDay = store_df.apply(
    lambda row: np.nan
        if pd.isnull(row['CompetitionOpenSinceMonth']) or pd.isnull(row['CompetitionOpenSinceYear'])
        else np.datetime64("%d-%02d-01" % (int(row['CompetitionOpenSinceYear']), int(row['CompetitionOpenSinceMonth']))),
    axis=1)

store_df["CompetitorOpenDay"] = competitorOpenDay

# For stores without a competitor opening date, fall back to the store's own OpenDay.
# Plain column assignment replaces the deprecated `.ix` indexer.
store_df['CompetitorOpenDay'] = store_df["CompetitorOpenDay"].fillna(store_df['OpenDay'])

In [86]:
def convertColToCategory(data, colName, knownCategories):
    """Convert ``data[colName]`` in place to a categorical column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; the column is replaced on this object.
    colName : str
        Name of the column to convert. A missing column raises ``KeyError``.
    knownCategories : list
        The complete set of categories for the column. Values not in this
        list become missing (NaN).

    Returns
    -------
    None
    """
    # pd.Categorical works across pandas versions, whereas
    # astype('category', categories=...) was removed from pandas.
    # Assigning with data[colName] replaces the whole column, so the category
    # dtype is not coerced back to the column's previous dtype.
    data[colName] = pd.Categorical(data[colName], categories=knownCategories)

def transformData(inputData):
    """Return a one-hot encoded copy of ``inputData``.

    The input frame is left untouched. On the copy, integer 0 values in
    StateHoliday are replaced by the string '0', the categorical columns are
    converted to category dtype with a fixed list of categories, and the
    result is passed through ``pd.get_dummies``.

    DayOfWeek, Promo, SchoolHoliday and StateHoliday must be present;
    StoreType and Assortment are converted only when present.
    """
    # Columns that are always converted, with every value they may take.
    required = [
        ('DayOfWeek', list(range(1, 8))),
        ('Promo', list(range(2))),
        ('SchoolHoliday', list(range(2))),
        ('StateHoliday', list('0abc')),
    ]
    # Columns that only exist once the store metadata has been merged in.
    optional = [
        ('StoreType', list('abcd')),
        ('Assortment', list('abc')),
    ]

    frame = inputData.copy()

    # StateHoliday holds both '0' and 0; turn every 0 into '0'.
    is_int_zero = frame['StateHoliday'] == 0
    frame.loc[is_int_zero, 'StateHoliday'] = '0'

    # Idea for later: add an "IsDec" column.

    for column, categories in required:
        convertColToCategory(frame, column, categories)

    for column, categories in optional:
        if column in frame.columns:
            convertColToCategory(frame, column, categories)

    return pd.get_dummies(frame)

In [87]:
# Attach each store's StoreType and Assortment to its training rows.
# Plain column selection replaces the deprecated `.ix` indexer.
train_full_df = pd.merge(train_df, store_df[['StoreType', 'Assortment', 'Store']], on='Store')

In [88]:
# One-hot encode the categorical columns of the merged training frame.
train_data = transformData(train_full_df)

In [89]:
# Use Date as the index and order the rows by it, so the frame can be split
# by date later on.
train_data = train_data.set_index('Date').sort_index()

In [90]:
# Display the column names of the encoded training frame.
train_data.columns

Index([u'Store', u'Sales', u'Customers', u'Open', u'DayOfWeek_1',
       u'DayOfWeek_2', u'DayOfWeek_3', u'DayOfWeek_4', u'DayOfWeek_5',
       u'DayOfWeek_6', u'DayOfWeek_7', u'Promo_0', u'Promo_1',
       u'StateHoliday_0', u'StateHoliday_a', u'StateHoliday_b',
       u'StateHoliday_c', u'SchoolHoliday_0', u'SchoolHoliday_1',
       u'StoreType_a', u'StoreType_b', u'StoreType_c', u'StoreType_d',
       u'Assortment_a', u'Assortment_b', u'Assortment_c'],
      dtype='object')

In [91]:
def getX(data, *args):
    """Return a copy of ``data`` reduced to its feature columns.

    'Store' and 'Open' are always dropped, together with any extra column
    names given in ``args``. 'Sales' and 'Customers' are dropped as well when
    they are present, so the same helper works on frames with and without
    those columns. The input frame is not modified.
    """
    targets_present = [col for col in ('Sales', 'Customers') if col in data.columns]
    non_features = ['Store', 'Open'] + list(args) + targets_present
    return data.drop(non_features, axis=1, inplace=False)

In [92]:
def trainModels(n_cutoff_months):
    """Fit one linear regression per store and report RMSPE on a hold-out window.

    The global ``train_data`` frame (indexed by date) is split at
    ``max date - n_cutoff_months`` months: rows up to and including the cutoff
    form the training set, later rows form the dev set. For every store in the
    training set a ``LinearRegression`` is fitted on the columns returned by
    ``getX`` with 'Sales' as the target. The dev-set predictions of all stores
    are pooled and scored with ``RMSPE``; the sizes and the score are printed.

    Parameters
    ----------
    n_cutoff_months : int
        Number of most recent months to hold out as the dev set.

    Returns
    -------
    dict
        Maps each store id to its fitted ``LinearRegression``.
    """
    test_cutoff_date = train_data.index.max() - pd.DateOffset(months = n_cutoff_months)

    train_set = train_data[train_data.index <= test_cutoff_date]
    dev_set = train_data[train_data.index > test_cutoff_date]
    # Single-argument, pre-formatted prints give the same output on Python 2 and 3.
    print("train_set size: %d" % len(train_set))
    print("dev_set size: %d" % len(dev_set))

    dev_set_grouped = dev_set.groupby('Store')
    # Stores with at least one dev row; get_group raises KeyError for the rest.
    dev_stores = set(dev_set['Store'].unique())

    all_y = []
    all_yhat = []
    lms = {}

    # Iterating the groupby yields each store's rows once, so get_group is not
    # needed for the training side.
    for store, store_train in train_set.groupby('Store'):
        # One model per store
        lm = linear_model.LinearRegression()
        lm.fit(X=getX(store_train), y=store_train['Sales'])
        lms[store] = lm

        # A store without rows in the dev window still gets a model; it simply
        # contributes nothing to the dev score.
        if store not in dev_stores:
            continue

        store_dev = dev_set_grouped.get_group(store)
        yhat = lm.predict(getX(store_dev))

        all_y.extend(store_dev['Sales'].tolist())
        all_yhat.extend(yhat.tolist())

    print("Training model - dev data RMSPE= %s" % RMSPE(np.array(all_y), np.array(all_yhat)))
    return lms

In [93]:
# Train one model per store, holding out the most recent 2 months as the dev set.
models = trainModels(2)

train_set size: 785727
dev_set size: 58611
Training model - dev data RMSPE= 0.151001136238


## Make Predictions on Test data ##

In [60]:
# Attach each store's StoreType and Assortment to its test rows.
# Plain column selection replaces the deprecated `.ix` indexer.
test_full_df = pd.merge(test_df, store_df[['StoreType', 'Assortment', 'Store']], on='Store')

In [61]:
# One-hot encode the categorical columns of the merged test frame.
test_data = transformData(test_full_df)

In [62]:
# Display the column names of the encoded test frame.
test_data.columns

Index([u'Id', u'Store', u'Date', u'Open', u'DayOfWeek_1', u'DayOfWeek_2',
       u'DayOfWeek_3', u'DayOfWeek_4', u'DayOfWeek_5', u'DayOfWeek_6',
       u'DayOfWeek_7', u'Promo_0', u'Promo_1', u'StateHoliday_0',
       u'StateHoliday_a', u'StateHoliday_b', u'StateHoliday_c',
       u'SchoolHoliday_0', u'SchoolHoliday_1', u'StoreType_a', u'StoreType_b',
       u'StoreType_c', u'StoreType_d', u'Assortment_a', u'Assortment_b',
       u'Assortment_c'],
      dtype='object')

In [78]:
# Predict sales for every test row with the model trained for that row's store.
# The previous version of this cell was a copy of the training loop and used
# names that exist only inside trainModels (train_set, dev_set_grouped, lms,
# all_y), so it failed on a fresh kernel and never used the test data.
all_ids = []
all_yhat = []

test_set_grouped = test_data.groupby('Store')

for store, store_rows in test_set_grouped:
    # Id and Date are not model inputs; dropping them (plus Store and Open,
    # which getX always drops) leaves the dummy columns the models were fit on.
    test_X = getX(store_rows, 'Id', 'Date')

    # A store with no trained model raises KeyError here.
    yhat = models[store].predict(test_X)

    # Keep the Ids in the same order as the predictions so they can be paired.
    all_ids.extend(store_rows['Id'].tolist())
    all_yhat.extend(yhat.tolist())

# NOTE(review): the models were fitted on open days only, but the test rows are
# not filtered on Open; consider forcing the prediction to 0 where Open == 0.