In [1]:
import itertools
import pandas as pd
import numpy as np

# Mean as a Baseline

In [72]:
# Columns: date - store_nbr - item_nbr - unit_sales - onpromotion
# Index: 0 - 1461581


def _non_negative_sales(raw):
    """Parse a raw unit_sales field; any value that is not > 0 becomes 0."""
    parsed = float(raw)
    return parsed if parsed > 0 else 0


df_train = pd.read_csv(
    'Data/train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': str},
    converters={'unit_sales': _non_negative_sales},
    # Line 0 (the header) is kept; lines 1..124035459 are skipped.
    skiprows=range(1, 124035460),
)

In [73]:
# Work on log1p(unit_sales): brings the unit_sales distribution closer to
# normal. Undone later with expm1.
df_train["unit_sales"] = np.log1p(df_train["unit_sales"])

In [75]:
# Fill gaps in dates (approach suggested by Paulo Pinto).
# Distinct dates present in the loaded rows: 8/2/2017 to 8/15/2017
u_dates = df_train["date"].unique()

In [76]:
# Distinct store numbers present in the loaded rows (stores 1 to 54)
u_stores = df_train["store_nbr"].unique()

In [77]:
# Distinct item numbers present in the loaded rows
u_items = df_train["item_nbr"].unique()

In [78]:
# After this step:
#   Index:   date, store_nbr, item_nbr
#   Columns: unit_sales, onpromotion
train_keys = ["date", "store_nbr", "item_nbr"]
df_train = df_train.set_index(train_keys)

In [79]:
# Expand df_train to the full date x store x item grid so that every
# combination has a row; combinations absent from the data get NaN values.
full_index = pd.MultiIndex.from_product(
    [u_dates, u_stores, u_items],
    names=["date", "store_nbr", "item_nbr"],
)
df_train = df_train.reindex(full_index)

In [80]:
# Fill NAs in unit_sales with 0.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the result of .loc[...]: that result can be a
# temporary object, so the in-place fill may never reach df_train (and never
# does when pandas Copy-on-Write is enabled).
df_train["unit_sales"] = df_train["unit_sales"].fillna(0)

In [81]:
# Assume missing entries imply no promotion.
# onpromotion is held as strings, so the fill value is the string "False".
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the result of .loc[...]: that result can be a
# temporary object, so the in-place fill may never reach df_train (and never
# does when pandas Copy-on-Write is enabled). Rows left with a NaN
# onpromotion would be excluded by a default groupby on that column.
df_train["onpromotion"] = df_train["onpromotion"].fillna("False")

In [82]:
# Mean unit_sales for every item / store / promotion-flag group.
# The result is a one-column frame ("unit_sales") indexed by the group keys.
group_keys = ['item_nbr', 'store_nbr', 'onpromotion']
df_train = df_train.groupby(group_keys)[['unit_sales']].mean()

In [83]:
# Inverse transform: expm1 undoes the earlier log1p
df_train["unit_sales"] = np.expm1(df_train["unit_sales"])

In [85]:
# Create submission
# Only positional columns 0, 2, 3 and 4 of the test file are read;
# onpromotion is read as str.
test_columns = [0, 2, 3, 4]
df_test = pd.read_csv(
    "Data/test.csv",
    usecols=test_columns,
    dtype={'onpromotion': str},
)

In [86]:
# Index the test rows by item, store and promotion flag so they can be
# joined on the index.
join_keys = ['item_nbr', 'store_nbr', 'onpromotion']
df_test = df_test.set_index(keys=join_keys)

In [87]:
# Left join on the index: every test row is kept, and rows with no matching
# entry in df_train get NaN in the joined columns.
df_test = df_test.join(other=df_train, how="left")

In [90]:
# Replace every remaining missing value with 0
df_test = df_test.fillna(value=0)

In [91]:
# Write the gzip-compressed submission file, omitting the index and
# formatting floats to two decimals.
# index=False is the documented boolean for "do not write the index";
# the previous index=None only had that effect because None is falsy.
df_test.to_csv('mean.csv.gz', float_format='%.2f', index=False, compression="gzip")

In [92]:
# Preview the first rows only rather than rendering the whole frame
df_test.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,unit_sales
item_nbr,store_nbr,onpromotion,Unnamed: 3_level_1,Unnamed: 4_level_1
96995,1,False,125497040,0.397155
96995,1,False,125707694,0.397155
96995,1,False,125918348,0.397155
96995,1,False,126129002,0.397155
96995,1,False,126339656,0.397155
96995,1,False,126550310,0.397155
96995,1,False,126760964,0.397155
96995,1,False,126971618,0.397155
96995,1,False,127182272,0.397155
96995,1,False,127392926,0.397155
