In [None]:

import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas deprecation/future warnings about APIs used further down.
warnings.filterwarnings('ignore')
%matplotlib inline 

# Notebook-wide display settings: show up to 600 rows / 50 columns when a
# DataFrame is rendered, and use a 20x10 default figure size for seaborn plots.
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)
sns.set(rc={'figure.figsize':(20, 10)})

In [None]:
# Record the versions of the main libraries so the run can be reproduced.
for p in (np, pd, sklearn, scipy, lgb, sns):
    print(p.__name__, p.__version__)

In [None]:
# All competition CSVs are read from one input folder; keeping the location in
# a single constant lets the notebook be pointed at another copy of the data
# by editing one line.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

items_cat = pd.read_csv(DATA_DIR + '/item_categories.csv')
items = pd.read_csv(DATA_DIR + '/items.csv')
sales = pd.read_csv(DATA_DIR + '/sales_train.csv')
shops = pd.read_csv(DATA_DIR + '/shops.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

# EDA

In [None]:
# Total number of items sold per month (date_block_num), drawn as bars with a
# line overlay of the same values.
sns.set_context("talk", font_scale=1.4)
# Select item_cnt_day *before* summing so pandas only aggregates the one
# column that is plotted instead of every column of `sales`.
sales_month = (
    sales.groupby('date_block_num')['item_cnt_day']
    .sum()
    .rename('sum_items_sold')
    .reset_index()
)
sns.barplot(x='date_block_num', y='sum_items_sold', data=sales_month);
plt.plot(sales_month.sum_items_sold)
plt.title('Distribution of the sum of sales per month')
# The helper frame is only needed for this figure.
del sales_month

In [None]:
# For every month, count the distinct (shop_id, item_id) pairs that recorded
# at least one sales row, then plot the counts as bars with a line overlay.
comb_shop_item = (
    sales[['date_block_num', 'shop_id', 'item_id']]
    .drop_duplicates()
    .groupby('date_block_num')
    .size()
    .reset_index(name='item-shop_comb')
)
sns.barplot(x='date_block_num', y='item-shop_comb', data=comb_shop_item);
plt.plot(comb_shop_item['item-shop_comb']);
plt.title('Number of combinations shop-it with sales per month')
# The helper frame is only needed for this figure.
del comb_shop_item

In [None]:
# Total number of units sold per item_id over the whole training period.
sns.set_context("talk", font_scale=1.4)
# Pick item_cnt_day before summing so only the plotted column is aggregated.
sales_item_id = sales.groupby('item_id')['item_cnt_day'].sum().to_frame()
plt.plot(sales_item_id)
plt.xlabel('item id')
plt.ylabel('sales')
plt.title('Total sales per item id');

# Data Leakage

In [None]:
# Share of training sales rows whose (item_id, shop_id) pair also occurs in
# the test set.
tuples_df = pd.Series(list(sales[['item_id', 'shop_id']].itertuples(index=False, name=None)))
tuples_test = pd.Series(list(test[['item_id', 'shop_id']].itertuples(index=False, name=None)))
share_in_test = tuples_df.isin(tuples_test).mean()
# Use the percent presentation type rather than round(x, 2) * 100, which can
# print float artefacts such as "56.99999999999999%".
print('{:.0%}'.format(share_in_test))

 Only 42% of the training sales rows belong to an item-shop combination that also appears in the test set.

In [None]:
# Plot the raw daily counts against the row index; presumably used to eyeball
# extreme values before the outlier filter applied in the next cell.
sales['item_cnt_day'].plot()

In [None]:
# Drop outlier rows: keep only sales with a price below 100000 and a daily
# count below 1000.
is_regular_price = sales['item_price'] < 100000
is_regular_count = sales['item_cnt_day'] < 1000
sales = sales[is_regular_price & is_regular_count]

In [None]:
from itertools import product
index_cols = ['shop_id', 'item_id', 'date_block_num']

# Build, month by month, the full cartesian product of the shops and the items
# that had at least one sales row in that month.
monthly_grids = []
for block_num in sales['date_block_num'].unique():
    month_sales = sales[sales['date_block_num'] == block_num]
    cur_shops = month_sales['shop_id'].unique()
    cur_items = month_sales['item_id'].unique()
    combos = list(product(cur_shops, cur_items, [block_num]))
    monthly_grids.append(np.array(combos, dtype='int32'))

# Stack the per-month blocks into one int32 frame with columns index_cols.
grid = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols, dtype=np.int32)

In [None]:
# Preview the first rows of the (shop_id, item_id, date_block_num) grid.
grid.head()

In [None]:
# Collapse the daily rows to one row per (month, shop, item): total units sold
# and average price within the month.
sales_in_month = (
    sales
    .groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False)
    .agg({'item_cnt_day': 'sum', 'item_price': 'mean'})
)

In [None]:
# Preview the monthly aggregates (summed item_cnt_day, mean item_price).
sales_in_month.head()

In [None]:
# For every item_id keep the last row of the monthly aggregates. The frame
# comes straight from groupby(['date_block_num', 'shop_id', 'item_id']), so
# "last" presumably means the most recent month in which the item was sold
# (and, within that month, the highest shop_id) -- this relies on that row
# order, TODO confirm if the frame is ever re-sorted before this cell.
sales_by_item_id_for_last_val = sales_in_month.groupby('item_id').last()

In [None]:
# Start the rows for the month to predict from a copy of the test set so that
# `test` itself is left untouched by the column edits below.
month_34_sales = test.copy()

In [None]:
def fill_price(item_id):
    """Return a price estimate for ``item_id``.

    Looks the item up in the module-level ``sales_by_item_id_for_last_val``
    frame (indexed by item id). If the item is present, its ``item_price``
    value is returned; otherwise the median ``item_price`` over all items in
    that frame is used as a fallback.
    """
    last_known = sales_by_item_id_for_last_val
    if item_id not in last_known.index:
        # Unknown item: fall back to the median of the known prices.
        return last_known['item_price'].median()
    return last_known.at[item_id, 'item_price']

In [None]:
# Turn the test rows into placeholder rows for month 34: a price estimate per
# item and a zero sales count that will be predicted later.
month_34_sales['date_block_num'] = 34
# Vectorized price lookup: map each item_id to its last known monthly price
# and give items without history the median of the known prices. This yields
# the same values as applying fill_price row by row, but computes the median
# once instead of once per unseen item.
last_known_price = sales_by_item_id_for_last_val['item_price']
month_34_sales['item_price'] = (
    month_34_sales['item_id']
    .map(last_known_price)
    .fillna(last_known_price.median())
)
month_34_sales['item_cnt_day'] = 0
month_34_sales = month_34_sales.drop(columns=['ID'])

In [None]:
# Preview the month-34 placeholder rows (price filled in, count set to 0).
month_34_sales.head()

In [None]:
# Left-join the monthly aggregates onto the full grid. Combinations without a
# recorded sale get 0 for both item_cnt_day and item_price.
sales_in_month = (
    grid
    .merge(sales_in_month, how='left', on=['date_block_num', 'shop_id', 'item_id'])
    .fillna(0)
)

In [None]:
# The summed daily counts are monthly totals: rename the column in both
# frames, then give the month-34 rows the same column order as the history.
target_rename = {'item_cnt_day': 'item_cnt_month'}
sales_in_month = sales_in_month.rename(columns=target_rename)
month_34_sales = (
    month_34_sales
    .rename(columns=target_rename)
    .reindex(columns=sales_in_month.columns)
)

In [None]:
# Check that month_34_sales now has the same columns, in the same order, as
# sales_in_month.
month_34_sales.head()

In [None]:
# Stack the month-34 placeholder rows under the historical rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index=True gives the combined frame a clean RangeIndex
# instead of repeating index labels from both inputs.
sales_in_month = pd.concat([sales_in_month, month_34_sales], ignore_index=True)

In [None]:
# Preview the combined frame (history plus month-34 rows).
sales_in_month.head()

In [None]:
# Attach the columns of `items` to every row via a left join on item_id.
sales_in_month = sales_in_month.merge(items, how='left', on='item_id')

In [None]:
# Preview the frame after joining the item information.
sales_in_month.head()

In [None]:
# Monthly aggregate features: for each id column (item, shop, category) and
# each (column, aggregation) pair, compute the statistic per (id, month) on
# the daily sales and join it onto sales_in_month as
# '<id>_<agg name>_<column>'.
id_types = ['item_id', 'shop_id', 'item_category_id']
# (source column, aggregation passed to pandas, name used in the feature).
# String aggregator names are used instead of numpy callables; pandas resolves
# them to its own groupby implementations.
encoding_methods = [('item_price', 'mean', 'mean'), ('item_cnt_day', 'sum', 'sum'), ('item_cnt_day', 'mean', 'mean')]
sales_with_item_info = pd.merge(sales, items, on=['item_id'], how='left')

for id_type in id_types:
    for column_id, agg, agg_type in encoding_methods:
        feature_name = id_type + '_' + agg_type + '_' + column_id
        # Select the target column before aggregating: aggregating the whole
        # frame is wasteful and fails on non-numeric columns in pandas >= 2.0.
        new_df = (
            sales_with_item_info
            .groupby([id_type, 'date_block_num'])[column_id]
            .agg(agg)
            .rename(feature_name)
            .reset_index()
        )
        sales_in_month = pd.merge(sales_in_month, new_df, on=['date_block_num', id_type], how='left')

In [None]:
# Preview the frame with the per-item / per-shop / per-category aggregates.
sales_in_month.head()

In [None]:
# Columns that get lagged copies: the three monthly statistics for each id
# column, followed by the monthly target itself (kept last on purpose, since
# later code slices it off with lag_features[:-1]).
lag_features = [
    id_col + '_' + stat
    for id_col in ('item_id', 'shop_id', 'item_category_id')
    for stat in ('mean_item_price', 'sum_item_cnt_day', 'mean_item_cnt_day')
] + ['item_cnt_month']

In [None]:
# Offsets, in date_block_num units, at which past values of the lag features
# are attached to each row.
lags = [1, 6, 12]

In [None]:
# Attach lagged copies of every lag feature: shifting date_block_num forward
# by `lag` and joining on (month, shop, item) gives each row the values the
# same shop/item pair had `lag` months earlier (NaN when there is no such row).
lag_key_cols = ['date_block_num', 'shop_id', 'item_id']
for lag in lags:
    # Copy only the key and lag-feature columns. Copying the whole frame first
    # would also duplicate every lag column added by earlier iterations.
    sales_lags = sales_in_month[lag_key_cols + lag_features].copy()
    sales_lags['date_block_num'] += lag
    sales_lags.columns = lag_key_cols + [
        lag_feat + '_' + str(lag) + 'months_ago' for lag_feat in lag_features
    ]
    sales_in_month = pd.merge(sales_in_month, sales_lags, on=lag_key_cols, how='left')

In [None]:
# Fill the gaps left by the left joins: count-based columns get 0, price-based
# columns get their own column median. Other columns are left as they are.
for feature in sales_in_month.columns:
    if 'item_cnt' in feature:
        fill_value = 0
    elif 'item_price' in feature:
        fill_value = sales_in_month[feature].median()
    else:
        continue
    sales_in_month[feature] = sales_in_month[feature].fillna(fill_value)

In [None]:
# Preview the frame after adding the lag columns and filling missing values.
sales_in_month.head()

In [None]:
# Keep only months after block 12. NOTE(review): 12 equals the largest entry
# of `lags`, and month 12 itself is excluded by the strict comparison --
# confirm that dropping it is intended.
is_after_first_year = sales_in_month['date_block_num'].gt(12)
sales_in_month = sales_in_month[is_after_first_year]

In [None]:
# Columns removed before modelling: every un-lagged aggregate (all lag
# features except the last one, item_cnt_month), plus item_name and item_price.
unused_cols = [*lag_features[:-1], 'item_name', 'item_price']

In [None]:
# Limit the monthly count to the range [0, 40].
clipped_cnt_month = sales_in_month['item_cnt_month'].clip(lower=0, upper=40)
sales_in_month['item_cnt_month'] = clipped_cnt_month

In [None]:
# Time-based split on date_block_num: blocks before 33 for training, block 33
# for validation, block 34 (the rows built from the test set) for prediction.
# The unused columns are dropped from each part.
date_block = sales_in_month['date_block_num']
x_train = sales_in_month.loc[date_block < 33].drop(columns=unused_cols)
x_cv = sales_in_month.loc[date_block == 33].drop(columns=unused_cols)
x_test = sales_in_month.loc[date_block == 34].drop(columns=unused_cols)

In [None]:
# Preview the training part (rows with date_block_num below 33).
x_train.head()