In [224]:
# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame
import random

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from sklearn import datasets, linear_model, cross_validation, grid_search

In [225]:
# Read the train, store and test CSV files, one DataFrame per file.
train_df, store_df, test_df = map(
    pd.read_csv,
    ("../input/train.csv", "../input/store.csv", "../input/test.csv"),
)

In [226]:
def rmspe(y, yhat):
    """Root mean squared percentage error between actual and predicted values.

    Parameters
    ----------
    y : array-like
        Actual values.
    yhat : array-like
        Predicted values, same length as ``y``; paired with ``y`` by position.

    Returns
    -------
    float
        ``sqrt(mean(((y - yhat) / y) ** 2))`` taken over the entries whose
        actual value is non-zero, or NaN when there are no such entries.
    """
    # np.asarray accepts plain lists as well as arrays / Series.
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(yhat, dtype=float)

    # A zero actual would divide by zero and turn the whole score into
    # inf/nan, so those observations are left out of the average.
    scored = actual != 0
    if not scored.any():
        return np.nan

    pct_error = (actual[scored] - predicted[scored]) / actual[scored]
    return (np.mean(pct_error ** 2)) ** 0.5

### Data Preprocessing ###

In [227]:
# Parse the Date column of both the test and the train frames into datetimes.
test_df['Date'] = pd.to_datetime(test_df['Date'], format="%Y-%m-%d")
train_df['Date'] = pd.to_datetime(train_df['Date'], format="%Y-%m-%d")

# The CSV import produced a warning about column 7.
# Work around it by rewriting every integer 0 in StateHoliday as the string '0'.
train_df.loc[train_df['StateHoliday'] == 0, 'StateHoliday'] = '0'

# Keep only the records where the store is open and sales are non-zero.
train_df = train_df.loc[(train_df['Open'] == 1) & (train_df['Sales'] > 0)]

In [228]:
def convertColToCategory(data, colName, knownCategories):
    """Convert one column of ``data`` to a categorical dtype, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; the column is replaced on this object.
    colName : str
        Name of the column to convert.
    knownCategories : list
        The full set of categories for the column. Values not in this list
        become NaN.

    Returns
    -------
    None
    """
    # Build the categorical explicitly and assign it as a whole column:
    # ``astype('category', categories=...)`` is not accepted by newer pandas,
    # and ``data.loc[:, col] = ...`` may write into the existing column while
    # keeping its old dtype.
    data[colName] = pd.Categorical(data[colName], categories=knownCategories)

def transformData(inputData):
    """Return a one-hot encoded copy of ``inputData``.

    The input frame is not modified. On the copy, integer 0 entries of
    StateHoliday are rewritten as the string '0', the categorical columns are
    converted to category dtype with a fixed category list, and the result is
    passed through ``pd.get_dummies``.
    """
    frame = inputData.copy()

    # StateHoliday holds both '0' and 0; normalise every 0 to '0'.
    frame.loc[frame['StateHoliday'] == 0, 'StateHoliday'] = '0'

    # Idea for later: add an "IsDec" column.

    # Columns that must be present, with their known categories.
    required = [
        ('DayOfWeek', list(range(1, 8))),
        ('Promo', list(range(2))),
        ('SchoolHoliday', list(range(2))),
        ('StateHoliday', list('0abc')),
    ]
    # Store-level columns, converted only when the frame carries them.
    optional = [
        ('StoreType', list('abcd')),
        ('Assortment', list('abc')),
    ]

    for column, categories in required:
        convertColToCategory(frame, column, categories)

    for column, categories in optional:
        if column in frame.columns:
            convertColToCategory(frame, column, categories)

    return pd.get_dummies(frame)

In [229]:
# Attach each store's StoreType and Assortment to the training rows.
# Plain column-list selection replaces the `.ix` indexer, which newer pandas no longer provides.
train_full_df = pd.merge(train_df, store_df[['StoreType', 'Assortment', 'Store']], on='Store')

In [230]:
# Encode the merged training frame: categorical columns become one-hot dummy columns.
train_data = transformData(train_full_df)

In [231]:
# Index the training data by Date and order it chronologically so it can be
# split on a cutoff date.
train_data = train_data.set_index('Date').sort_index()

In [232]:
def trainModels(n_cutoff_months):
    """Fit one linear regression per store and report RMSPE on a hold-out window.

    The module-level ``train_data`` frame (indexed by date) is split at
    ``train_data.index.max()`` minus ``n_cutoff_months`` months: rows up to and
    including that date are used for fitting, later rows for evaluation.

    Parameters
    ----------
    n_cutoff_months : int
        Length, in months, of the hold-out window at the end of the data.

    Returns
    -------
    dict
        Maps every store id found in the fitting window to its fitted
        ``LinearRegression`` model.
    """
    # Identifier, target and columns not used as predictors.
    x_drop_cols = ['Store', 'Sales', 'Customers', 'Open']

    all_y = []
    all_yhat = []
    lms = {}

    test_cutoff_date = train_data.index.max() - pd.DateOffset(months=n_cutoff_months)

    train_set = train_data[train_data.index <= test_cutoff_date]
    dev_set = train_data[train_data.index > test_cutoff_date]
    print("train_set size: %d" % len(train_set))
    print("dev_set size: %d" % len(dev_set))

    dev_set_grouped = dev_set.groupby('Store')

    for group, store_train in train_set.groupby('Store'):
        # One model per store
        lm = linear_model.LinearRegression()
        lm.fit(X=store_train.drop(x_drop_cols, axis=1), y=store_train['Sales'])
        lms[group] = lm

        # A store can have no rows in the hold-out window; keep its model but
        # skip scoring instead of failing in get_group().
        if group not in dev_set_grouped.groups:
            continue

        store_dev = dev_set_grouped.get_group(group)
        yhat = lm.predict(store_dev.drop(x_drop_cols, axis=1))

        all_y.extend(store_dev['Sales'].tolist())
        all_yhat.extend(yhat.tolist())

    # With no hold-out rows at all there is nothing to score.
    if all_y:
        score = rmspe(np.array(all_y), np.array(all_yhat))
    else:
        score = float('nan')
    print("Training model - dev data RMSPE= %s" % score)

    return lms

In [233]:
# Fit the per-store models, holding out the last 2 months of data for evaluation.
models = trainModels(2)

train_set size: 785727
dev_set size: 58611
Training model - dev data RMSPE= 0.150966292575


## Make Predictions on Test data ##

In [234]:
def makePredictions(data):
    """Predict sales for every row of ``data`` using the per-store models.

    ``data`` is grouped by Store and each group is scored with the matching
    entry of the module-level ``models`` dict. Rows whose Open value is 0 get
    a prediction of zero.

    Returns a list of ``[Id, prediction]`` pairs sorted by Id.
    """
    non_feature_cols = ['Id', 'Open', 'Store', 'Date']
    pairs = []

    for store_id, rows in data.groupby('Store'):
        row_ids = rows['Id']
        features = rows.drop(non_feature_cols, axis=1)

        predicted = models[store_id].predict(features)

        # Ignore the prediction and set Sales to zero when the store is closed
        closed = np.array(rows['Open'] == 0)
        predicted[closed] = 0

        pairs.extend(
            [row_id, value]
            for row_id, value in zip(row_ids.tolist(), predicted.tolist())
        )

    pairs.sort(key=lambda pair: pair[0])
    return pairs

In [235]:
# Attach store metadata to the test rows, encode them the same way as the
# training data, then predict with the per-store models.
# Plain column-list selection replaces the `.ix` indexer, which newer pandas no longer provides.
test_full_df = pd.merge(test_df, store_df[['StoreType', 'Assortment', 'Store']], on='Store')
test_data = transformData(test_full_df)
result = makePredictions(test_data)

In [236]:
# Write the predictions as a two-column CSV. The `with` block guarantees the
# file is flushed and closed, even if a row fails to format.
with open("submission.csv", "w") as f:
    f.write('"Id","Sales"\n')

    for r in result:
        # Predictions are truncated to whole numbers by int().
        f.write("%d,%d\n" % (r[0], int(r[1])))


#### Misc stuff ####

In [237]:
def littleTest(n_samples=100):
    """Sanity-check the prediction pipeline on rows sampled from the training data.

    Draws up to ``n_samples`` random rows from the module-level ``train_df``,
    gives them synthetic Ids, runs them through ``transformData`` and
    ``makePredictions`` as if they were test rows, and prints the RMSPE of the
    predictions against the known Sales.

    Parameters
    ----------
    n_samples : int, optional
        Number of rows to sample (default 100); capped at the number of rows
        available.

    Returns
    -------
    None
    """
    candidate_idx = train_df.index.tolist()
    random_idx = random.sample(candidate_idx, min(n_samples, len(candidate_idx)))

    # Work on a copy so nothing below writes into a slice of train_df.
    try_df = train_df.loc[random_idx].copy()
    try_df["Id"] = list(range(1, len(try_df) + 1))

    # Remember the true Sales per Id so actuals and predictions are paired by
    # Id rather than by whatever row order the merge happens to produce.
    actual_by_id = try_df.set_index("Id")["Sales"]

    features = try_df.drop(['Sales', 'Customers'], axis=1)
    store_cols = store_df[['StoreType', 'Assortment', 'Store']]
    try_data = transformData(pd.merge(features, store_cols, on='Store'))

    result = makePredictions(try_data)
    ids = [r[0] for r in result]
    try_yhat = np.array([r[1] for r in result])
    try_y = actual_by_id.loc[ids].values

    print(rmspe(try_y, try_yhat))