In [None]:
import pandas as pd
import sklearn 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

from subprocess import check_output

In [None]:
# Load the raw CSV tables from ../input into DataFrames. Later cells join
# items_data onto the sales rows and take the submission IDs from test_raw_data.
train_raw_data = pd.read_csv('../input/sales_train.csv')
test_raw_data = pd.read_csv('../input/test.csv')
items_data = pd.read_csv('../input/items.csv')
item_categories_data = pd.read_csv('../input/item_categories.csv')
shops_data = pd.read_csv('../input/shops.csv')

In [None]:
# Preview the first rows of the raw training table.
train_raw_data.head()

In [None]:
# Preview the first rows of the raw test table.
test_raw_data.head()

In [None]:
# Preview the first rows of the items table.
items_data.head()

In [None]:
# Preview the first rows of the item categories table.
item_categories_data.head()

In [None]:
# Preview the first rows of the shops table.
shops_data.head()

In [None]:
# Print the (rows, columns) shape of the raw train and test tables, one per line.
print(train_raw_data.shape, test_raw_data.shape, sep='\n')

In [None]:
# Row counts of the raw train and test tables.
n_train, n_test = len(train_raw_data), len(test_raw_data)

Join the training data with the items table to add each item's category id.

In [None]:
# Left-join the items table onto the sales rows through a shared item_id index
# (every sales row is kept), then turn item_id back into a regular column.
train_data = train_raw_data.set_index('item_id').join(items_data.set_index('item_id'), how='left').reset_index()

In [None]:
# Remove the item_name column brought in by the join with the items table.
train_data = train_data.drop(columns=['item_name'])

Convert the date column into a usable datetime format.

In [None]:
# Parse the 'dd.mm.YYYY' date strings into a datetime64 column in one vectorised call.
# Plain column assignment replaces the object-dtype column outright (an in-place
# `.loc[:, 'date'] = ...` set can keep the object dtype on recent pandas), and
# pd.to_datetime leaves an already-parsed column unchanged, so the cell can be re-run.
train_data['date'] = pd.to_datetime(train_data['date'], format='%d.%m.%Y')

In [None]:
# Preview the first ten rows of the joined training data.
train_data.head(10)

## Some analysis of various columns

In [None]:
# Count the rows recorded for each item_id (non-null shop_id entries) and plot the counts.
n_ids = train_data.groupby('item_id')['shop_id'].count()
n_ids.plot()

In [None]:
# Count the rows recorded for each item category and draw them as a bar chart.
n_cats = train_data.groupby('item_category_id')['item_id'].count()
plt.bar(n_cats.index, n_cats.values)

In [None]:
# Count the rows recorded in each month index (date_block_num) and draw them as a bar chart.
n_items_month = train_data.groupby(['date_block_num'])['item_id'].count()
n_items_month.plot(kind='bar')

In [None]:
# Number of distinct item ids that appear for each shop, drawn as a bar chart.
unique_items_shops = train_data.groupby('shop_id')['item_id'].nunique()
unique_items_shops.plot(kind='bar')

## Check for null values

In [None]:
def has_null(data):
    """Print a table listing, for every column of ``data``, how many null values it holds.

    The table has two columns, 'column' and 'total null values'. Nothing is returned.
    """
    per_column_nulls = [data[name].isnull().sum() for name in data.columns]
    summary = pd.DataFrame({
        'column': data.columns,
        'total null values': per_column_nulls,
    })
    print(summary)

In [None]:
# Print the per-column null counts for the joined training data.
has_null(train_data)

In [None]:
# Print the smallest and largest item_id present in the training data.
print('min item id {}'.format(train_data['item_id'].min()))
print('max item id {}'.format(train_data['item_id'].max()))

In [None]:
# Count how often each item_id occurs, print the counts, and plot count against
# item_id as blue dots.
item_id_vc = train_data['item_id'].value_counts()
print(item_id_vc)
plt.plot(item_id_vc.index, item_id_vc, 'b.')

item_id 20949 appears a very large number of times.

In [None]:
# Select every training row for item_id 20949 and display the selection.
train_data_20949 = train_data.loc[train_data['item_id'] == 20949]
train_data_20949

In [None]:
# Number of rows for item 20949 in each month index (date_block_num).
train_data_20949['date_block_num'].value_counts()

In [None]:
# Group the rows by (item, shop, month); a later cell counts item_cnt_day entries per group.
itemid_shopid_month = train_data.groupby(['item_id', 'shop_id', 'date_block_num'])

In [None]:
# Summary statistics of the daily item count column.
train_data['item_cnt_day'].describe()


Items with a very large item_cnt_day value (possible outliers).

In [None]:
# Rows whose daily item count exceeds 500.
train_data.loc[train_data['item_cnt_day'] > 500]

For item_id 11373 in shop_id 12 on 2015-10-28, the price is very low, which may be the reason for the high sales.

items that are returned many times

In [None]:
# Rows whose daily item count is below -10.
train_data.loc[train_data['item_cnt_day'] < -10]

number of negative + positive values of item cnt day of an item in a shop in a month

In [None]:
# Number of item_cnt_day entries in each (item, shop, month) group, followed by
# summary statistics of those per-group counts.
n_itemid_shopid_month = itemid_shopid_month['item_cnt_day'].count()
n_itemid_shopid_month.describe()

number of positive values of item cnt day of an item in a shop in a month

In [None]:
# Group only the rows with a positive item_cnt_day by (item, shop, month).
itemid_shopid_month_pos = train_data.loc[train_data['item_cnt_day'] > 0].groupby(['item_id', 'shop_id', 'date_block_num'])

In [None]:
# Display the key columns and the daily count for the rows with a positive item_cnt_day.
train_data.loc[train_data['item_cnt_day'] > 0, ['item_id', 'shop_id', 'date_block_num', 'item_cnt_day']]

In [None]:
# Number of positive item_cnt_day entries in each (item, shop, month) group,
# followed by summary statistics of those per-group counts.
n_itemid_shopid_month_pos = itemid_shopid_month_pos['item_cnt_day'].count()
n_itemid_shopid_month_pos.describe()

In [None]:
# Display the per-group counts of positive entries.
n_itemid_shopid_month_pos

The monthly sum is the quantity that we want to predict.

In [None]:
# Sum of the positive item_cnt_day values in each (item, shop, month) group,
# followed by summary statistics of those totals.
tot_itemid_shopid_month_pos = itemid_shopid_month_pos['item_cnt_day'].sum()
tot_itemid_shopid_month_pos.describe()

In [None]:
# Display the per-group totals of positive entries.
tot_itemid_shopid_month_pos

### is there any item_id, shop_id combination in test data not in train data?

In [None]:
# Summary statistics of the item and shop ids in the training data.
print(train_data['item_id'].describe())
print(train_data['shop_id'].describe())

In [None]:
# Summary statistics of the item and shop ids in the test data.
print(test_raw_data['item_id'].describe())
print(test_raw_data['shop_id'].describe())

In [None]:
def find_unique_item_shops(data):
    """Return the set of distinct (item_id, shop_id) pairs found in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'item_id' and 'shop_id' columns.

    Returns
    -------
    set of tuple
        One ``(item_id, shop_id)`` tuple per distinct combination; an empty set
        when ``data`` has no rows.
    """
    # Zip the two columns directly instead of walking the frame with iterrows(),
    # which builds a Series object for every row. The set removes duplicates, so
    # no explicit membership check is needed before adding.
    return set(zip(data['item_id'].tolist(), data['shop_id'].tolist()))

In [None]:
# Distinct (item_id, shop_id) pairs present in the training data.
train_item_shop_ids = find_unique_item_shops(train_data)

In [None]:
# Distinct (item_id, shop_id) pairs present in the test data.
test_item_shop_ids = find_unique_item_shops(test_raw_data)

In [None]:
# Number of distinct pairs in the training data.
len(train_item_shop_ids)

In [None]:
# Number of distinct pairs in the test data.
len(test_item_shop_ids)

In [None]:
# Collect the test pairs that never occur in the training data (in the set's own
# iteration order), print each one, then print how many there are.
unseen_pairs = [pair for pair in test_item_shop_ids if pair not in train_item_shop_ids]
for pair in unseen_pairs:
    print(pair)
cnt = len(unseen_pairs)

print(cnt)

## previous value benchmark
Establish a baseline prediction against which models can be compared. Here I use each item's sales in the previous month as the prediction for the test month. For item id and shop id combinations that have no sales in the previous month, I predict 0.

In [None]:
# Sort the monthly totals by their (item_id, shop_id, date_block_num) index in place,
# then keep a flat copy in which the index levels are ordinary columns.
tot_itemid_shopid_month_pos.sort_index(inplace=True)
tot_itemid_shopid_month_pos_df = tot_itemid_shopid_month_pos.reset_index()

In [None]:
def get_33_sales(r):
    """Return the month-33 positive sales total for the item/shop pair in ``r``.

    ``r`` is indexed by 'item_id' and 'shop_id'. The value is looked up in the
    module-level ``tot_itemid_shopid_month_pos`` series under the key
    (item_id, shop_id, 33); when the lookup fails with a KeyError or an
    IndexingError the result is 0.
    """
    try:
        lookup_key = (r['item_id'], r['shop_id'], 33)
        sales = tot_itemid_shopid_month_pos.loc[lookup_key]
    except (pd.core.indexing.IndexingError, KeyError):
        sales = 0
    return sales

# Look up the month-33 total for every test row (axis=1 passes each row to get_33_sales).
pred_prev_value_bench = test_raw_data.apply(get_33_sales, axis=1)

In [None]:
# Summary statistics of the unclipped benchmark predictions.
pred_prev_value_bench.describe()

The previous month can contain very large values, which would badly hurt the score, so I clip the predictions to the range 0 to 20.

In [None]:
# Bound the benchmark predictions to the interval [0, 20].
pred_prev_value_bench_clipped = pred_prev_value_bench.clip(lower=0, upper=20)

In [None]:
# Assemble the submission table: the test IDs next to the clipped predictions.
prev_value_bench_df = pd.DataFrame({
    'ID': test_raw_data['ID'],
    'item_cnt_month': pred_prev_value_bench_clipped,
})

In [None]:
# Write the benchmark predictions to submission1.csv with a header row and without the index.
prev_value_bench_df.to_csv('submission1.csv', header=True, index=False)

This gives a score of 1.16813 on the public leaderboard.

In [None]:
# Run `ls` on ../working and print its decoded output.
print(check_output(["ls", "../working"]).decode("utf8"))

## Feature Generation

### Day of the week on which this item was sold

In [None]:
# train_data['week_day'] = train_data['date'].dt.weekday

### Number of times this item was sold this month (target)

In [None]:
# Monthly target: the sum of the positive daily counts for every
# (date_block_num, item_id, shop_id) key, kept as a one-column frame.
train_data_final = (
    train_data.loc[train_data['item_cnt_day'] > 0]
    .groupby(['date_block_num', 'item_id', 'shop_id'])['item_cnt_day']
    .sum()
    .to_frame(name='item_cnt_month')
)

### Number of times this item in this shop was returned last month

In [None]:
# Sum the negative daily counts per (date_block_num, item_id, shop_id), flip the
# sign so the feature is a positive amount, and move every row one month forward
# so that it lines up with the month that follows it.
returned_items_df = (
    train_data.loc[train_data['item_cnt_day'] < 0]
    .groupby(['date_block_num', 'item_id', 'shop_id'])['item_cnt_day']
    .sum()
    .mul(-1)
    .rename('item_cnt_rtn_last_month')
    .reset_index()
    .assign(date_block_num=lambda frame: frame['date_block_num'] + 1)
)

In [None]:
# Attach the shifted return totals by the (date_block_num, item_id, shop_id) key;
# the left join keeps every target row and leaves NaN where there is no match.
train_data_final = train_data_final.join(returned_items_df.set_index(['date_block_num', 'item_id', 'shop_id']), how='left')

In [None]:
# Keys without a matching return total get 0 instead of NaN.
train_data_final.loc[:, 'item_cnt_rtn_last_month'] = train_data_final.loc[:, 'item_cnt_rtn_last_month'].fillna(0)

### This month is first month or not

In [None]:
# Sort the rows by the (date_block_num, item_id, shop_id) index, in place.
train_data_final.sort_index(inplace=True)

In [None]:
# Flag column: 0 everywhere, then 1 for the rows of month 0.
# `(0)` is the plain scalar 0 (not a tuple), so `.loc` matches it against the first
# index level, date_block_num.
train_data_final.loc[:, 'first_month'] = 0
train_data_final.loc[(0), 'first_month'] = 1

### Number of times this item was sold in last month

In [None]:
# Sum the positive daily counts per (date_block_num, item_id, shop_id) and move
# every row one month forward so that it lines up with the month that follows it.
total_items_df = (
    train_data.loc[train_data['item_cnt_day'] > 0]
    .groupby(['date_block_num', 'item_id', 'shop_id'])['item_cnt_day']
    .sum()
    .rename('item_cnt_sold_last_month')
    .reset_index()
    .assign(date_block_num=lambda frame: frame['date_block_num'] + 1)
)

In [None]:
# Attach the shifted sales totals by the (date_block_num, item_id, shop_id) key;
# the left join keeps every target row and leaves NaN where there is no match.
train_data_final = train_data_final.join(total_items_df.set_index(['date_block_num', 'item_id', 'shop_id']), how='left')

In [None]:
# Keys without a matching previous-month total get 0 instead of NaN.
# The filled column is assigned back explicitly: an in-place fillna on a `.loc`
# selection may operate on a temporary copy (notably with pandas Copy-on-Write)
# and leave train_data_final unchanged.
train_data_final.loc[:, 'item_cnt_sold_last_month'] = train_data_final.loc[:, 'item_cnt_sold_last_month'].fillna(0)

### Number of items sold in same month last year
If there are specific occasions during a month that affect the sales of an item in some region, they will be captured here.

In [None]:
# Build the "same month last year" feature: shift each monthly total forward by
# 12 months, keep only shifted months up to 34, discard the other feature
# columns, and rename the total to item_cnt_prev_year.
items_sold_year_ago_df = (
    train_data_final.reset_index()
    .assign(date_block_num=lambda frame: frame['date_block_num'] + 12)
    .loc[lambda frame: frame['date_block_num'] <= 34]
    .drop(columns=['item_cnt_rtn_last_month', 'first_month', 'item_cnt_sold_last_month'])
    .rename(columns={'item_cnt_month': 'item_cnt_prev_year'})
)

In [None]:
# Attach the 12-month-shifted totals by the (date_block_num, item_id, shop_id) key;
# the left join keeps every target row and leaves NaN where there is no match.
train_data_final = train_data_final.join(items_sold_year_ago_df.set_index(['date_block_num', 'item_id', 'shop_id']), how='left')

In [None]:
# Replace every remaining NaN in the feature table with 0, in place.
train_data_final.fillna(0, inplace=True)

In [None]:
# Display the assembled feature table.
train_data_final

This notebook is a work in progress...
Thank you for reading.