In [7]:
import pandas as pd
import numpy as np

train = pd.read_csv('data/sales_train.csv.gz')
train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


## There are no missing values

In [8]:
train.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

## Predicting only mean values

In [2]:
# sample_sub = pd.read_csv('data/sample_submission.csv.gz')
# sample_sub.head()

In [3]:
test = pd.read_csv('data/test.csv.gz')
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [4]:
mean_per_item_month = train.groupby(['date_block_num','shop_id','item_id']).sum() \
                           .groupby('item_id').mean()['item_cnt_day']

In [5]:
submission = pd.DataFrame({'ID': test.ID,
                           'item_cnt_month': mean_per_item_month.loc[test.item_id].values})
submission.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0,ID,item_cnt_month
0,0,2.873303
1,1,
2,2,2.668421
3,3,1.855263
4,4,


Things to consider

- Not all the items and shops from the test set are also in the train set

So we fill those NaN values with 0

In [6]:
## IMPUTER
submission["item_cnt_month"].fillna(0, inplace=True)


## EXPORT SUBMISSION
submission.to_csv('output/output.csv', index=False)
submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,2.873303
1,1,0.0
2,2,2.668421
3,3,1.855263
4,4,0.0
