In [None]:
ls ../input/competitive-data-science-predict-future-sales

In [None]:
import numpy as np
import pandas as pd
import sklearn
import gc

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from xgboost import plot_importance

## Load all files

In [None]:
%%time
# Directory that holds the competition CSV files.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

# The 'date' column becomes a DatetimeIndex. dayfirst=True makes ambiguous values
# such as 02.01.2013 parse as 2 January rather than 1 February.
# NOTE(review): assumes the file stores dates as dd.mm.yyyy, as stated in the
# competition data description - confirm against the raw CSV.
sales_train = pd.read_csv(f'{DATA_DIR}/sales_train.csv',
                          index_col='date', parse_dates=True, dayfirst=True)
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
item_categories = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Number of distinct items that appear in the sales history.
print(sales_train['item_id'].nunique())  # unique items
sales_train.index

In [None]:
sales_train.info()

## Explore daily sales of items

In [None]:
# Daily units sold of item 7409 (all shops), plotted against the date index.
sales_train.loc[sales_train['item_id'] == 7409, 'item_cnt_day'].plot(figsize=(12, 5))

In [None]:
# Daily units sold of item 2552 (all shops), plotted against the date index.
sales_train.loc[sales_train['item_id'] == 2552, 'item_cnt_day'].plot(figsize=(12, 5))

In [None]:
# Daily units sold of item 7460 (all shops), plotted against the date index.
sales_train.loc[sales_train['item_id'] == 7460, 'item_cnt_day'].plot(figsize=(12, 5))

## Explore daily sales of items by shop

In [None]:
# Daily units sold of item 2555 in shop 25 only.
sales_train.loc[
    (sales_train['item_id'] == 2555) & (sales_train['shop_id'] == 25),
    'item_cnt_day',
].plot(figsize=(12, 5))

In [None]:
sales_train[['shop_id', 'item_id', 'item_cnt_day']]

In [None]:
sales_train.groupby('shop_id')['item_id'].count()

In [None]:
# Number of sales records per (shop, item) pair.
df = sales_train.groupby(['shop_id', 'item_id'])[['item_cnt_day']].count()
df.reset_index()

In [None]:
# Total units sold per (shop, item) pair over the whole history.
df = sales_train.groupby(['shop_id', 'item_id'])[['item_cnt_day']].sum()
df.reset_index()

In [None]:
df.loc[df['item_cnt_day']==df['item_cnt_day'].max()]

In [None]:
sales_train[sales_train['shop_id']==28]

In [None]:
df.plot(figsize=(20,5))

## Look for outliers and remove them

In [None]:
sales_train.describe()

In [None]:
f1, axes = plt.subplots(1, 2, figsize=(12,5))
f1.subplots_adjust(hspace=0.4, wspace=0.2)
sns.boxplot(x=sales_train['item_price'], ax=axes[0])
sns.boxplot(x=sales_train['item_cnt_day'], ax=axes[1])

In [None]:
## price outlier
print(sales_train.loc[sales_train['item_price']==sales_train['item_price'].max()])
print(sales_train.loc[sales_train['item_price']>20000])
print(sales_train.loc[sales_train['item_price']>30000])
print(sales_train.loc[sales_train['item_price']>50000])

In [None]:
# sales_train is indexed by date and many rows share a date, so drop(<index labels>)
# would delete every sale made on the outlier's date. Filter with a row mask so
# only the outlier row itself is removed.
sales_train = sales_train[~(sales_train['item_price'] == 307980)]

In [None]:
## item count outlier
print(sales_train.loc[sales_train['item_cnt_day']==sales_train['item_cnt_day'].max()])
print(sales_train.loc[sales_train['item_cnt_day']>2000])
print(sales_train.loc[sales_train['item_cnt_day']>1500])
print(sales_train.loc[sales_train['item_cnt_day']>800])

In [None]:
# sales_train is indexed by date and many rows share a date, so drop(<index labels>)
# would delete every sale made on the same dates as the outliers. Filter with a
# row mask so only rows with item_cnt_day >= 1000 are removed.
sales_train = sales_train[~(sales_train['item_cnt_day'] >= 1000)]

In [None]:
f1, axes = plt.subplots(1, 2, figsize=(12,5))
f1.subplots_adjust(hspace=0.4, wspace=0.2)
sns.boxplot(x=sales_train['item_price'], ax=axes[0])
sns.boxplot(x=sales_train['item_cnt_day'], ax=axes[1])

### Handle duplicate shops (see https://www.kaggle.com/dlarionov/feature-engineering-xgboost)

In [None]:
# Duplicate shop ids: rewrite each duplicate id to the id that is kept, in both
# the sales history and the test set so the two stay consistent.
duplicate_shop_ids = {
    57: 0,    # Yakutsk, Ordzhonikidze 56
    58: 1,    # Yakutsk, "Tsentralny" shopping mall
    11: 10,   # Zhukovsky, Chkalova st. 39m²
}
for frame in (sales_train, test):
    for duplicate_id, kept_id in duplicate_shop_ids.items():
        frame.loc[frame.shop_id == duplicate_id, 'shop_id'] = kept_id

In [None]:
## The city is the first space-separated token of the shop name
from sklearn.preprocessing import LabelEncoder
shops['city'] = shops['shop_name'].map(lambda name: name.split(' ', 1)[0])
# One spelling carries a stray leading '!'; fold it into the regular city name.
shops['city'] = shops['city'].replace('!Якутск', 'Якутск')
encoder = LabelEncoder()
shops['city_label'] = encoder.fit_transform(shops['city'])

# removing city dummies, after looking at feature importance plot
# shops = pd.concat((shops, pd.get_dummies(shops['city_label'],prefix='city')), axis=1)
shops

In [None]:
## add city labels in test dataset
# Merge on the shop_id column itself. DataFrame.join(on='shop_id') matches against
# the *index* of shops, which only lines up with shop_id while shops keeps its
# default 0..N-1 index in shop_id order.
test = test.merge(shops.drop(columns=['shop_name', 'city']), on='shop_id', how='left')
test

## Extract main category and subcategory from category name

In [None]:
## Split category names on '-' into a main part and an optional secondary part
categories_split = item_categories['item_category_name'].str.split('-')
item_categories['main_category'] = categories_split.str[0].str.strip()
item_categories['secondary_category'] = categories_split.map(
    lambda parts: parts[1].strip() if len(parts) > 1 else 'N/A'
)

## Encode categories as integer labels (the shared encoder is refit for each column)
for label_col, source_col in [('main_cat_label', 'main_category'),
                              ('sec_cat_label', 'secondary_category')]:
    item_categories[label_col] = encoder.fit_transform(item_categories[source_col])
# One-hot dummies for both labels were removed after looking at the important features plot:
# item_categories = pd.concat((item_categories, pd.get_dummies(item_categories['main_cat_label'], prefix='main_cat')), axis=1)
# item_categories = pd.concat((item_categories, pd.get_dummies(item_categories['sec_cat_label'], prefix='sec_cat')), axis=1)
item_categories

In [None]:
# Attach the encoded category labels to every item. Merge on the item_category_id
# column itself: DataFrame.join(on=...) matches against the *index* of
# item_categories, which only equals item_category_id while that frame keeps its
# default 0..N-1 index in id order.
item_and_cat = items.drop(columns=['item_name']).merge(
    item_categories.drop(columns=['item_category_name', 'main_category', 'secondary_category']),
    on='item_category_id',
    how='left',
)
# Resulting columns: 'item_id', 'item_category_id', 'main_cat_label', 'sec_cat_label'
item_and_cat

In [None]:
price_data = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'])['item_price'].median().rename('item_price_median').reset_index()
price_data

In [None]:
# Price categories by median price:
# 1: <5, 2: 5-10, 3: 10-100, 4: 100-200, 5: 200-300, 6: 300-500, 7: 500-1000, 8: >=1000
def price_category(row):
    """Map a row's ``item_price_median`` to an ordinal price category.

    Parameters
    ----------
    row : object exposing an ``item_price_median`` attribute
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    int
        1-8 for the bands listed above the function (each upper bound is
        exclusive, the last band is open-ended), or 0 when the price is NaN.
    """
    price = row.item_price_median
    # Exclusive upper bounds of categories 1..7, in ascending order.
    upper_bounds = (5., 10., 100., 200., 300., 500., 1000.)
    for category, bound in enumerate(upper_bounds, start=1):
        if price < bound:
            return category
    # '>=' rather than '>' so that a price of exactly 1000 lands in the top band
    # instead of falling through to the "missing" category.
    if price >= 1000.:
        return 8
    # Only NaN reaches this point: it fails every comparison above.
    return 0

In [None]:
%%time
price_data['price_cat'] = price_data.apply(price_category, axis=1)
price_data

## Merge all datasets to form the training data

In [None]:
# Units sold per (shop, item, month).
monthly_sales = (
    sales_train
    .groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .rename('item_cnt_month')
    .reset_index()
)

# Enrich the monthly totals with the price category, item/category labels and
# shop attributes; the raw median price is dropped once price_cat is attached.
train = (
    monthly_sales
    .merge(price_data, on=['shop_id', 'item_id', 'date_block_num'], how='left')
    .merge(item_and_cat, on=['item_id'], how='left')
    .merge(shops, on='shop_id', how='left')
    .drop(['item_price_median'], axis=1)
)
train

In [None]:
# One price category per (shop_id, item_id), independent of month, for use on the test set.
# After dropping price_cat and date_block_num, drop_duplicates() removes repeated
# (shop_id, item_id, item_price_median) rows, so the median below is taken over the
# *distinct* monthly median prices of each pair rather than over all months.
pcat = price_data.drop(['price_cat', 'date_block_num'], axis=1).drop_duplicates().groupby(['shop_id', 'item_id'])['item_price_median'].median().rename('item_price_median').reset_index()
# Re-derive the category from the pair-level median price.
pcat['price_cat'] = pcat.apply(price_category, axis=1)
pcat

In [None]:
# pcat_uniq = price_data.drop(['item_price_median', 'date_block_num'], axis=1).drop_duplicates().reset_index().drop(['index'], axis=1)
# pcat_uniq

In [None]:
# Attach the per-(shop, item) price category; pairs without any sales history stay NaN.
test = pd.merge(test, pcat, on=['shop_id', 'item_id'], how='left').reset_index(drop=True)
test

In [None]:
test = test.drop(['ID', 'item_price_median'], axis=1)
test.columns

## Impute missing price categories in test data using the median item price of each item category

In [None]:
df = pd.merge(items, sales_train[['item_id', 'item_price']], on='item_id', how='left')

In [None]:
df = df.groupby('item_category_id')['item_price'].median().rename('item_price_median').reset_index()

In [None]:
test = pd.merge(test, items[['item_id', 'item_category_id']], on='item_id', how='left')

In [None]:
test = pd.merge(test, df[['item_category_id', 'item_price_median']], on='item_category_id', how='left')
test

In [None]:
%%time
test['price_cat_med'] = test.apply(price_category, axis=1)
test

In [None]:
test.columns

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is not guaranteed to write through
# to `test` (it does not under pandas Copy-on-Write), which would leave NaNs
# behind for the later integer cast of price_cat.
test['price_cat'] = test['price_cat'].fillna(test['price_cat_med'])
# Sanity check: this should display no rows.
test.loc[test['price_cat'].isna()]

In [None]:
test.drop(['price_cat_med', 'item_price_median'], axis=1, inplace=True)
test.columns

In [None]:
test = pd.merge(test, item_categories.drop(['item_category_name', 'main_category', 'secondary_category'], axis=1), on='item_category_id', how='left')
##, 'main_cat_label', 'sec_cat_label'
test.columns

In [None]:
[item for item in train.columns if item not in test.columns]

In [None]:
train.drop(['shop_name', 'city'], axis=1, inplace=True)

In [None]:
test['date_block_num']=34
test['item_cnt_month']=0
[item for item in train.columns if item not in test.columns]

In [None]:
# Downcast the small-range columns to save memory.
test['price_cat'] = test['price_cat'].astype(np.int8)
test['date_block_num'] = test['date_block_num'].astype(np.int8)

train['price_cat'] = train['price_cat'].astype(np.int8)
train['date_block_num'] = train['date_block_num'].astype(np.int8)
# The target is a monthly sum of daily counts, and daily counts of up to 999 are
# kept by the outlier filter, so it does not fit int8 (-128..127): casting to
# int8 would silently corrupt large monthly totals. Use int32 instead.
train['item_cnt_month'] = train['item_cnt_month'].astype(np.int32)

In [None]:
## prepare lag columns
def lag_feature(df, lags, col):
    """Add ``<col>_lag_<n>`` columns holding ``col`` from ``n`` months earlier.

    For every ``n`` in ``lags`` a copy of the (date_block_num, shop_id, item_id, col)
    columns is moved ``n`` months forward and left-merged back onto ``df`` by
    month, shop and item. Rows with no matching earlier row get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date_block_num', 'shop_id', 'item_id' and ``col``.
    lags : iterable of int
        Month offsets to generate.
    col : str
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame with one extra column per lag; the input is not modified.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    for lag in lags:
        lagged = base.rename(columns={col: col + '_lag_' + str(lag)})
        # Moving the source month forward by `lag` makes it line up with the
        # target month during the merge.
        lagged['date_block_num'] += lag
        df = df.merge(lagged, on=keys, how='left')
    return df

In [None]:
## prepare lag columns with item_cnt aggregate
def prepare_lag_columns(df, lag, column_list, name):
    """Add lagged group means of ``item_cnt_month`` to ``df``.

    The mean of ``item_cnt_month`` is computed per ``column_list`` group, merged
    onto every row as ``name`` (stored as float16), lagged with ``lag_feature``
    and then removed again, so only the ``<name>_lag_<n>`` columns remain.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'item_cnt_month', the ``column_list`` columns and the key
        columns that ``lag_feature`` needs.
    lag : iterable of int
        Month offsets passed to ``lag_feature``.
    column_list : list of str
        Grouping columns for the mean.
    name : str
        Name of the temporary aggregate column and prefix of the lag columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` extended with the lag columns.
    """
    group_mean = (
        df.groupby(column_list)['item_cnt_month']
        .mean()
        .rename(name)
        .reset_index()
    )
    enriched = pd.merge(df, group_mean, on=column_list, how='left')
    enriched[name] = enriched[name].astype(np.float16)
    enriched = lag_feature(enriched, lag, name)
    # Only the lagged values are kept; the same-month aggregate is discarded.
    return enriched.drop([name], axis=1)

In [None]:
## prepare lags with item_price aggregation
def prepare_lag_columns_price(df, column_list, name):
    """Attach the mean ``item_price`` per ``column_list`` group to ``df``.

    The mean is computed from the module-level ``sales_train`` frame (not from
    ``df``) and left-merged onto ``df`` as a float16 column called ``name``.
    No lagging happens here; callers lag the new column separately.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``column_list`` columns.
    column_list : list of str
        Grouping / merge keys.
    name : str
        Name of the new mean-price column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the extra ``name`` column.
    """
    mean_price = (
        sales_train.groupby(column_list)['item_price']
        .mean()
        .rename(name)
        .reset_index()
    )
    merged = pd.merge(df, mean_price, on=column_list, how='left')
    merged[name] = merged[name].astype(np.float16)
    return merged

In [None]:
# Stack train and test rows into one frame so lag features can be built across both
# (test rows carry date_block_num 34 and a placeholder item_cnt_month of 0).
all_data = pd.concat([train, test], axis = 0, sort=False)
# Any value still missing after the concat becomes 0.
all_data.fillna(0, inplace=True)
# reset_index() without drop=True keeps the old row labels in an 'index' column;
# that column is dropped again before modelling.
all_data = all_data.reset_index()
all_data

In [None]:
%%time
all_data = lag_feature(all_data, [1,2,3,4,5,6,12], 'item_cnt_month')
all_data

In [None]:
%%time
# Lag-1 of the mean item_cnt_month per (date_block_num, item_id).
# NOTE(review): this per-item aggregate is named 'total_avg_month_cnt', while the
# month-only aggregate built in the following cell is named 'item_avg_month_cnt' -
# the two names look swapped; confirm before renaming either.
all_data = prepare_lag_columns(all_data, [1], ['date_block_num', 'item_id'], 'total_avg_month_cnt')
all_data

In [None]:
%%time
# Lags 1-6 and 12 of the mean item_cnt_month per date_block_num (all shops and items).
# NOTE(review): the name 'item_avg_month_cnt' suggests a per-item aggregate, but the
# grouping is by month only - confirm the intended name.
all_data = prepare_lag_columns(all_data, [1,2,3,4,5,6,12], ['date_block_num'], 'item_avg_month_cnt')
all_data

In [None]:
%%time
all_data = prepare_lag_columns(all_data, [1,2,3,4,5,6,12], ['date_block_num', 'shop_id'], 'shop_avg_month_cnt')
all_data

In [None]:
all_data.columns

In [None]:
%%time
all_data = prepare_lag_columns(all_data, [1], ['date_block_num','city_label'], 'city_avg_month_cnt')
all_data = prepare_lag_columns(all_data, [1], ['date_block_num','item_id','city_label'], 'item_city_avg_month_cnt')
all_data = prepare_lag_columns(all_data, [1], ['date_block_num', 'item_category_id'], 'category_id_avg_month_cnt')
all_data = prepare_lag_columns(all_data, [1], ['date_block_num', 'main_cat_label'], 'main_category_avg_month_cnt')
all_data = prepare_lag_columns(all_data, [1], ['date_block_num', 'sec_cat_label'], 'secondary_category_avg_month_cnt')
all_data = prepare_lag_columns(all_data, [1], ['date_block_num','shop_id','item_category_id'], 'shop_category_id_avg_month_cnt')
all_data = prepare_lag_columns(all_data, [1], ['date_block_num','shop_id','main_cat_label'], 'shop_main_category_avg_month_cnt')
all_data = prepare_lag_columns(all_data, [1], ['date_block_num','shop_id','sec_cat_label'], 'shop_secondary_category_avg_month_cnt')
all_data

In [None]:
%%time
all_data = prepare_lag_columns_price(all_data, ['item_id'], 'item_avg_price')
all_data = prepare_lag_columns_price(all_data, ['date_block_num','item_id'], 'item_avg_price_month')
all_data = lag_feature(all_data, [1,2,3,4,5,6], 'item_avg_price_month')
all_data

In [None]:
%%time
for lag in [1,2,3,4,5,6]:
    all_data['trend_price_lag_'+str(lag)] = (all_data['item_avg_price_month_lag_'+str(lag)] - all_data['item_avg_price']) / all_data['item_avg_price']

all_data    

In [None]:
def clean_trend_price_lag(row):
    """Return the most recent usable price trend of a row.

    Scans ``trend_price_lag_1`` .. ``trend_price_lag_6`` in that order and
    returns the first value that is neither missing nor zero.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the six ``trend_price_lag_<n>`` entries.

    Returns
    -------
    float or int
        The first non-missing, non-zero trend, or 0 when there is none.
    """
    for l in [1,2,3,4,5,6]:
        value = row['trend_price_lag_'+str(l)]
        # NaN is truthy, so a bare `if value:` would return a missing lag
        # instead of falling back to an older one; test for NaN explicitly.
        if pd.notna(value) and value != 0:
            return value
    return 0

In [None]:
%%time
# Collapse the six per-lag trends into one column with a single row-wise pass.
# This replaces four fixed 500000-row slices (and the placeholder frames that were
# overwritten immediately), so it no longer depends on hard-coded row offsets.
trend_price_lag = all_data.apply(clean_trend_price_lag, axis=1)
# Assign the cleaned column back in one step rather than calling
# fillna(inplace=True) on a column selection, which is not guaranteed to write
# through to all_data.
all_data['trend_price_lag'] = trend_price_lag.fillna(0).astype(np.float16)
all_data

In [None]:
# The per-lag price columns have been folded into 'trend_price_lag'; remove them in one call.
per_lag_columns = [
    prefix + str(i)
    for i in [1,2,3,4,5,6]
    for prefix in ('item_avg_price_month_lag_', 'trend_price_lag_')
]
all_data.drop(per_lag_columns, axis=1, inplace=True)

In [None]:
'''
# Correlation matrix for monthly sales
all_data_2 = all_data[all_data['date_block_num']<34]

# Correlation matrix
f = plt.figure(figsize=(9, 5))
plt.matshow(all_data_2.corr(), fignum=f.number)
plt.xticks(range(all_data_2.shape[1]),all_data_2.columns, fontsize=7, rotation=90)
plt.yticks(range(all_data_2.shape[1]), all_data_2.columns, fontsize=7)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
'''

In [None]:
all_data.head()

In [None]:
all_data.drop(
    ['price_cat', 'item_avg_price', 'item_avg_price_month', 'index'], 
    inplace=True, axis=1)

#'sec_cat_label', main_cat_label, city_label

In [None]:
all_data.fillna(0, inplace=True)
all_data.to_pickle('data.pkl')
data = pd.read_pickle('data.pkl')

In [None]:
data = data[data.date_block_num > 11]

In [None]:
# Time-based split: months before 33 train, month 33 validates, month 34 is the test month.
is_train = data.date_block_num < 33
is_valid = data.date_block_num == 33
is_test = data.date_block_num == 34

X_train = data[is_train].drop(['item_cnt_month'], axis=1)
Y_train = data[is_train]['item_cnt_month']
X_valid = data[is_valid].drop(['item_cnt_month'], axis=1)
Y_valid = data[is_valid]['item_cnt_month']
X_test = data[is_test].drop(['item_cnt_month'], axis=1)

gc.collect();

In [None]:
X_test

In [None]:
%%time
# LightGBM regressor: up to 5000 trees at learning rate 0.3, depth capped at 8.
# NOTE(review): LightGBM documents that row subsampling (`subsample`) only takes
# effect when `subsample_freq` is set to a non-zero value - confirm whether
# bagging was intended here.
model=lgb.LGBMRegressor(
        n_estimators=5000,
        learning_rate=0.3,
        min_child_weight=300,
        #num_leaves=32,
        colsample_bytree=0.8,
        subsample=0.8,
        max_depth=8,
        #reg_alpha=0.04,
        #reg_lambda=0.073,
        #min_split_gain=0.0222415,
        verbose=1,
        seed=21)

# RMSE is tracked on both the training and the validation set; early stopping is
# requested with a patience of 10 rounds.
# NOTE(review): newer LightGBM releases no longer accept `verbose` and
# `early_stopping_rounds` as fit() keywords and expect the lgb.early_stopping /
# lgb.log_evaluation callbacks instead - confirm against the installed version.
model.fit(X_train, Y_train,eval_metric="rmse", eval_set=[(X_train, Y_train), (X_valid, Y_valid)], verbose=1, early_stopping_rounds = 10)

In [None]:
print(*(all_data.columns), sep='\n')

In [None]:
[item for item in train.columns if item not in test.columns]

In [None]:
lgb.plot_importance(model, figsize=(10,14))

In [None]:
import pickle
# Predictions are clipped to the [0, 20] range before being written out.
Y_pred = model.predict(X_valid).clip(0, 20)
Y_test = model.predict(X_test).clip(0, 20)

# NOTE(review): the ID column was dropped from `test` earlier, so the positional
# index stands in for it - this assumes `test` still has its original row order
# and a 0..N-1 index; confirm.
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": Y_test
})
submission.to_csv('submission.csv', index=False)

# save predictions for an ensemble
# (file names keep the historical 'xgb_' prefix although the model above is LightGBM)
# Context managers close the files even if pickling fails; a bare open() passed
# straight to pickle.dump leaves the handle open until garbage collection.
with open('xgb_train.pickle', 'wb') as train_pred_file:
    pickle.dump(Y_pred, train_pred_file)
with open('xgb_test.pickle', 'wb') as test_pred_file:
    pickle.dump(Y_test, test_pred_file)

In [None]:
def plot_features(booster, figsize):
    """Plot feature importances of ``booster`` on a new figure.

    Parameters
    ----------
    booster : object
        Model passed through to xgboost's ``plot_importance``.
    figsize : tuple
        Figure size handed to ``plt.subplots``.

    Returns
    -------
    The value returned by ``plot_importance`` (drawn on the new axes).
    """
    _, axis = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=axis)

In [None]:
# model_xgb = XGBRegressor(
#     max_depth=8,
#     n_estimators=1000,
#     min_child_weight=300, 
#     colsample_bytree=0.8, 
#     subsample=0.8, 
#     eta=0.3,    
#     seed=21)

# model_xgb.fit(
#     X_train, 
#     Y_train, 
#     eval_metric="rmse", 
#     eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
#     verbose=True, 
#     early_stopping_rounds = 10)

In [None]:
# plot_features(model_xgb, (10,14))

In [None]:
# Y_pred_xg = model.predict(X_valid).clip(0, 20)
# Y_test_xg = model.predict(X_test).clip(0, 20)

# submission = pd.DataFrame({
#     "ID": test.index, 
#     "item_cnt_month": Y_test_xg
# })
# submission.to_csv('xgb_submission.csv', index=False)

# # save predictions for an ensemble
# # pickle.dump(Y_pred, open('xgb_train.pickle', 'wb'))
# # pickle.dump(Y_test, open('xgb_test.pickle', 'wb'))