## Modules import, reading data

In [None]:
import pandas as pd
import numpy as np

from itertools import product
import gc

import matplotlib.pyplot as plt
import seaborn as sns
from multiprocessing import Pool

import lightgbm as lgb

In [None]:
from matplotlib import style

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer ship the old 'seaborn' name, so use whichever
# of the two is available in the installed version.
style.use('seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn')

# Show up to 100 columns / rows when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# Load the competition tables from the shared input folder.
# `test` is indexed by its 'ID' column.
input_dir = '../input/competitive-data-science-predict-future-sales'

train = pd.read_csv(f'{input_dir}/sales_train.csv')
test = pd.read_csv(f'{input_dir}/test.csv').set_index('ID')
items = pd.read_csv(f'{input_dir}/items.csv')
shops = pd.read_csv(f'{input_dir}/shops.csv')
categories = pd.read_csv(f'{input_dir}/item_categories.csv')

In [None]:
print(train.shape)
train.head(10)

## Data investigation

First, let's drop duplicate rows.

In [None]:
# Remove exact duplicate rows and renumber the index 0..n-1.
train = train.drop_duplicates(ignore_index=True)

Plot features 'item_cnt_day' and 'item_price' to see their distributions and find outliers

In [None]:
sns.boxplot(x=train['item_cnt_day'])
plt.show()

In [None]:
sns.boxplot(x=train['item_price'])
plt.show()

Let's inspect rows with item_price 300,000 and -1 and rows with item_cnt_day 1000 and 2000+

In [None]:
# Series.argmax() returns a *position*, so index with .iloc (as the price
# lookups below do); .loc only worked because the index is a fresh RangeIndex.
train.iloc[train['item_cnt_day'].argmax()]

In [None]:
items[items['item_id'] == 11373]

In [None]:
# Plot the distribution of daily sale counts recorded for item 11373.
# NOTE(review): sns.distplot is reported as deprecated in newer seaborn
# releases — confirm against the installed version before upgrading.
sns.distplot(train[train['item_id']==11373]['item_cnt_day'].values)
plt.show()

It seems to be an outlier. The item name is something like "Delivery to post office", and an item_cnt_day of 2169 is far too large a value for it, so we will delete this row.

In [None]:
train[train['item_cnt_day'] == 1000]

In [None]:
items[items['item_id'] == 20949]

In [None]:
# Plot the distribution of daily sale counts recorded for item 20949.
# NOTE(review): sns.distplot is reported as deprecated in newer seaborn
# releases — confirm against the installed version before upgrading.
sns.distplot(train[train['item_id']==20949]['item_cnt_day'].values)
plt.show()

The product is a branded package (a bag with a print on it). An item_cnt_day of 1000 seems large even for this item, but selling 1000 of them in one day is plausible, so I'll keep this row.

In [None]:
train.iloc[train['item_price'].argmax()]

In [None]:
items[items['item_id'] == 6066]

In [None]:
test[test['item_id'] == 6066]

So this item really does cost that much, because it is some kind of corporate software. But this item is not present in the test set, so we can delete it from the train set.

In [None]:
train.iloc[train['item_price'].argmin()]

Fill -1 price with mean item price

In [None]:
# Replace the lowest (negative, -1) price with the mean price of the same item.
# - idxmin() yields an index *label*, which is what .loc expects (argmin()
#   yields a position and only matches on a freshly reset index).
# - The mean is taken over strictly positive prices only, so the invalid -1
#   entry no longer drags down its own replacement value.
# NOTE(review): the item id is read from the offending row instead of the
# previously hard-coded 2973 — presumably the same item; confirm.
bad_row = train['item_price'].idxmin()
bad_item = train.at[bad_row, 'item_id']
valid_prices = train.loc[(train['item_id'] == bad_item) & (train['item_price'] > 0), 'item_price']
train.loc[bad_row, 'item_price'] = valid_prices.mean()

### Deleting outliers

In [None]:
# Keep only rows whose daily count is at most 1000 (the 1000 row itself stays).
count_ok = train['item_cnt_day'].le(1000)
train = train[count_ok]

In [None]:
# Drop rows priced at 300000 or more.
# `train` is modified in place further down (shop_id remapping, the revenue
# column); taking an explicit copy here keeps those assignments from raising
# SettingWithCopyWarning on a filtered frame.
train = train[train['item_price'] < 300000].copy()

In [None]:
# Key columns identifying one (month block, shop, item) row; reused below for
# building the grid, sorting it and merging the aggregated sales onto it.
cols = ['date_block_num', 'shop_id', 'item_id']

## Creating training set

But first, let's take a look at the shops info

In [None]:
shops

Before we generate the train set, note that some shop names look very similar (likely the same shop listed twice). So let's unify the shop_id values in both the train and test data.

In [None]:
# Re-label shop ids 0, 1, 10 and 40 as 57, 58, 11 and 39 respectively,
# applying every rename to both the train and the test frame.
for old_id, new_id in ((0, 57), (1, 58), (10, 11), (40, 39)):
    for frame in (train, test):
        frame.loc[frame['shop_id'] == old_id, 'shop_id'] = new_id

Generate training dataframe

In [None]:
%%time
# For each of the 34 month blocks, enumerate every combination of the shops
# and the items that appear in that block, then stack the per-month grids
# into one frame with the key columns.
monthly_grids = []
for month_idx in range(34):
    month_sales = train[train['date_block_num'] == month_idx]
    combos = product(
        [month_idx],
        month_sales['shop_id'].unique(),
        month_sales['item_id'].unique(),
    )
    monthly_grids.append(np.array(list(combos), dtype='int16'))

del month_sales

data = pd.DataFrame(data=np.vstack(monthly_grids), columns=cols)
del monthly_grids

Always downcast datatypes if possible

In [None]:
# Shrink the month and shop key columns to 8-bit integers, then show dtypes.
data = data.astype({'date_block_num': 'int8', 'shop_id': 'int8'})
data.dtypes

In [None]:
# Order the grid by month block, then shop, then item.
data = data.sort_values(by=cols)

In [None]:
# Sum the daily counts per (month, shop, item) to get the monthly target.
group = (
    train.groupby(cols)['item_cnt_day']
    .sum()
    .rename('target')
    .reset_index()
)

# Attach the target to the grid; combinations without sales get 0, and the
# result is clipped to the [0, 20] range and stored as float16.
data = data.merge(group, how='left', on=cols)
filled_target = data['target'].fillna(0)
data['target'] = filled_target.clip(0, 20).astype('float16')

# TO HDF FILE
Very basic version of dataset is ready. Saving it to hdf file for fast backup.

In [None]:
# Checkpoint the base dataset. `key` is passed by keyword: recent pandas
# releases deprecate positional arguments to to_hdf other than the path.
data.to_hdf('data.hdf5', key='df')

In [None]:
data = pd.read_hdf('data.hdf5', 'df')
data

## Some explorations of other files

In [None]:
shops

Let's extract city names from shop names

In [None]:
# The first space-separated token of the shop name is used as the city name.
shops['city_name'] = shops['shop_name'].map(lambda name: name.split(' ')[0])
# Normalise the one city token that carries a leading '!'.
shops.replace('!Якутск', 'Якутск', inplace=True)
# Encode the city names as small integer codes.
city_codes, _ = pd.factorize(shops['city_name'])
shops['city_name'] = city_codes.astype('int8')

shops.head(10)

Also let's extract the first words from categories names

In [None]:
# The first space-separated token of the category name is used as the
# general category, which is then encoded as a small integer code.
first_tokens = categories['item_category_name'].map(lambda name: name.split(' ')[0])
general_codes, _ = pd.factorize(first_tokens)
categories['category_general_name'] = general_codes.astype('int8')

categories.head()

## Adding test data to train data

In [None]:
# Give the test rows month block 34 and cast their key columns to the same
# compact integer types used for the training grid.
test['date_block_num'] = 34
test = test.astype({
    'shop_id': 'int8',
    'date_block_num': 'int8',
    'item_id': 'int16',
})

In [None]:
# Append the test rows after the training grid; columns missing on either
# side (e.g. the target of the test rows) are filled with -1.
data = pd.concat(objs=[data, test], ignore_index=True)
data = data.fillna(value=-1)
data

# MERGE SHOPS AND CATEGORIES WITH TRAIN DF

In [None]:
# Bring the encoded city of every shop onto the main frame.
shop_city = shops[['shop_id', 'city_name']]
data = pd.merge(data, shop_city, how='left', on='shop_id')

In [None]:
# Build an item -> (category id, general category code) lookup, attach it to
# the main frame by item id, then drop the temporary lookup.
item_keys = items[['item_id', 'item_category_id']]
category_keys = categories[['item_category_id', 'category_general_name']]
a = item_keys.merge(category_keys, how='left', on='item_category_id')
data = pd.merge(data, a, how='left', on='item_id')
del a

In [None]:
# Store the category id as an 8-bit integer.
data = data.astype({'item_category_id': 'int8'})

### Month feature

In [None]:
# Map the running month-block index onto a 1..12 value.
# NOTE(review): this treats block 0 as month 1 (January) — confirm against the
# dates in the raw sales file.
data['month'] = data['date_block_num'] % 12 + 1

## Generating mean encoded features and adding lag

In [None]:
def lag_generator(df, col, lags):
    """Append lagged copies of ``col`` to ``df``.

    For every ``lag`` in ``lags`` a column named ``f'{col}_lag_{lag}'`` is
    added, holding the value ``col`` had ``lag`` month blocks earlier for the
    same ``shop_id`` / ``item_id``. Rows without a matching earlier row get
    NaN.

    Parameters
    ----------
    df : DataFrame with 'date_block_num', 'shop_id', 'item_id' and ``col``.
    col : name of the column to lag.
    lags : iterable of month offsets.

    Returns
    -------
    A new DataFrame (left-merged, same row order) with the lag columns added.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    for lag in lags:
        shifted = base.copy()
        # Moving the source rows forward by `lag` months lines each value up
        # with the row that should see it as "lag months ago".
        shifted['date_block_num'] += lag
        shifted = shifted.rename(columns={col: f'{col}_lag_{lag}'})
        df = df.merge(shifted, how='left', on=keys)
    return df

In [None]:
%%time
# Add the target of the previous 1, 2 and 3 month blocks; rows with no
# matching earlier row get -1.
lags = [1, 2, 3]
data = lag_generator(data, 'target', lags).fillna(-1)

In [None]:
%%time
# Mean target per (month, shop), kept only as a 1-month lag feature.
shop_keys = ['date_block_num', 'shop_id']
group = (
    data.groupby(shop_keys)['target']
    .mean()
    .rename('target_mean_date_shop')
    .reset_index()
)

data = data.merge(group, how='left', on=shop_keys).fillna(-1)

# Only the lagged value is kept; the same-month mean is dropped again.
data = lag_generator(data, 'target_mean_date_shop', [1])
data = data.drop(columns='target_mean_date_shop')

data = data.fillna(-1)

In [None]:
%%time
# Mean target per (month, item), kept only as 1-, 2- and 3-month lag features.
item_keys = ['date_block_num', 'item_id']
group = (
    data.groupby(item_keys)['target']
    .mean()
    .rename('target_mean_date_item')
    .reset_index()
)

data = data.merge(group, how='left', on=item_keys).fillna(-1)

# Only the lagged values are kept; the same-month mean is dropped again.
data = lag_generator(data, 'target_mean_date_item', [1, 2, 3])
data = data.drop(columns='target_mean_date_item')

data = data.fillna(-1)

In [None]:
%%time
# Mean target per (month, item category), kept only as a 1-month lag feature.
category_keys = ['date_block_num', 'item_category_id']
group = (
    data.groupby(category_keys)['target']
    .mean()
    .rename('target_mean_date_category')
    .reset_index()
)

data = data.merge(group, how='left', on=category_keys).fillna(-1)

# Only the lagged value is kept; the same-month mean is dropped again.
data = lag_generator(data, 'target_mean_date_category', [1])
data = data.drop(columns='target_mean_date_category')

data = data.fillna(-1)

In [None]:
%%time
# Mean target per (month, city), kept only as a 1-month lag feature.
city_keys = ['date_block_num', 'city_name']
group = (
    data.groupby(city_keys)['target']
    .mean()
    .rename('target_mean_date_city')
    .reset_index()
)

data = data.merge(group, how='left', on=city_keys).fillna(-1)

# Only the lagged value is kept; the same-month mean is dropped again.
data = lag_generator(data, 'target_mean_date_city', [1])
data = data.drop(columns='target_mean_date_city')

data = data.fillna(-1)

In [None]:
%%time
# Mean target per (month, general category), kept only as a 1-month lag feature.
gencategory_keys = ['date_block_num', 'category_general_name']
group = (
    data.groupby(gencategory_keys)['target']
    .mean()
    .rename('target_mean_date_gencategory')
    .reset_index()
)

data = data.merge(group, how='left', on=gencategory_keys).fillna(-1)

# Only the lagged value is kept; the same-month mean is dropped again.
data = lag_generator(data, 'target_mean_date_gencategory', [1])
data = data.drop(columns='target_mean_date_gencategory')

data = data.fillna(-1)

### Checkpoint

In [None]:
# Checkpoint after the lag / mean-encoding features. `key` is passed by
# keyword: recent pandas releases deprecate positional arguments to to_hdf
# other than the path.
data.to_hdf('data1.hdf5', key='df')

In [None]:
data = pd.read_hdf('data1.hdf5', 'df')
data

In [None]:
# Mean price of every item over the whole training period, as float32.
group = (
    train.groupby('item_id')['item_price']
    .mean()
    .astype('float32')
    .rename('item_mean_price')
    .reset_index()
)

data = data.merge(group, how='left', on='item_id')

Calculate time from last sale

In [None]:
%%time
# For every (shop, item) pair collect the array of month blocks in which it
# was sold; the resulting Series is named 'last_sales'.
pair_groups = train.groupby(['shop_id', 'item_id'], sort=False)
group = pair_groups['date_block_num'].unique().rename('last_sales')

In [None]:
# Attach the per-(shop, item) arrays of sale months to the main frame.
sale_months = group.reset_index()
data = pd.merge(data, sale_months, how='left', on=['shop_id', 'item_id'])

In [None]:
def find_prev_sel(arr):
    """Return the latest month block with a sale strictly before the row's month.

    Parameters
    ----------
    arr : two-element sequence ``(date_block, sale_blocks)`` where
        ``sale_blocks`` is a NumPy array of month blocks, or a non-array
        placeholder (e.g. NaN) when no sales history is available.

    Returns
    -------
    The largest element of ``sale_blocks`` that is smaller than
    ``date_block``, or ``np.nan`` when there is no history or no earlier sale.
    """
    date_block, sale_blocks = arr[0], arr[1]

    # No history attached to this row (the left merge leaves NaN here).
    if not isinstance(sale_blocks, np.ndarray):
        return np.nan

    earlier = sale_blocks[sale_blocks < date_block]
    # History exists but nothing precedes the current month.
    if earlier.size == 0:
        return np.nan
    return earlier.max()

In [None]:
%%time
# Compute, for every row, the most recent earlier month with a sale, using two
# worker processes. The context manager releases the workers even if map()
# raises, which the manual close()/join() pair did not.
with Pool(2) as pool:
    data['last_sale'] = pool.map(find_prev_sel, data[['date_block_num', 'last_sales']].values)

In [None]:
# Mean item price per (item, month). The month column is renamed to
# 'last_sale' so the merge picks the price of the month of the previous sale.
group = (
    train.groupby(['item_id', 'date_block_num'], as_index=False, sort=False)
    .agg({'item_price': 'mean'})
)
group = group.rename(columns={
    'date_block_num': 'last_sale',
    'item_price': 'item_date_mean_price_prev_sale',
})

data = pd.merge(data, group, how='left', on=['item_id', 'last_sale'])

In [None]:
# Difference between the item's overall mean price and its mean price in the
# month of the previous sale.
data['delta_item_prev_price'] = data['item_mean_price'] - data['item_date_mean_price_prev_sale']
# Number of month blocks elapsed since the previous sale of this shop/item pair.
data['prev_sold_delta'] = data['date_block_num'] - data['last_sale']

In [None]:
# Remove the intermediate columns that were only needed to build the deltas.
helper_columns = ['last_sale', 'item_mean_price', 'item_date_mean_price_prev_sale', 'last_sales']
data = data.drop(columns=helper_columns)

In [None]:
# Mark every remaining missing value with -1.
data = data.fillna(-1)

In [None]:
# Store the two delta features in compact dtypes.
data = data.astype({
    'prev_sold_delta': 'int8',
    'delta_item_prev_price': 'float32',
})

### SHOP REVENUE LAG

In [None]:
# Revenue of each sale row, summed per (month, shop).
train['revenue'] = train['item_cnt_day'] * train['item_price']
group = (
    train.groupby(['date_block_num', 'shop_id'])['revenue']
    .sum()
    .rename('revenue_lag_1')
    .reset_index()
)
# Shift the totals forward one month so each row sees last month's revenue.
group['date_block_num'] = group['date_block_num'] + 1

data = pd.merge(data, group, how='left', on=['date_block_num', 'shop_id'])

data = data.fillna(-1)

In [None]:
# Checkpoint the full feature set. `key` is passed by keyword: recent pandas
# releases deprecate positional arguments to to_hdf other than the path.
data.to_hdf('data2.hdf5', key='df')

In [None]:
data = pd.read_hdf('data2.hdf5', 'df')
data

Deleting garbage

In [None]:
# Release the raw tables and the last temporary aggregate, then ask the
# garbage collector to reclaim the memory.
del train, items, test, shops, categories, group

gc.collect()

## LightGBM Model

In [None]:
# Keep month blocks 3 and later (the longest lag used above is 3 months).
from_block_3 = data['date_block_num'].ge(3)
data = data[from_block_3]

Train/val

In [None]:
# Split by month block: < 33 for training, 33 for validation, 34 for test.
# The month index and the target are excluded from the feature matrix.
categorical = ['shop_id', 'item_id', 'city_name', 'item_category_id', 'category_general_name', 'month', 'prev_sold_delta']
non_features = ['date_block_num', 'target']

train_rows = data[data['date_block_num'] < 33]
val_rows = data[data['date_block_num'] == 33]

train_data = lgb.Dataset(
    train_rows.drop(columns=non_features),
    label=train_rows.target.values,
    categorical_feature=categorical,
)
val_data = lgb.Dataset(
    val_rows.drop(columns=non_features),
    label=val_rows.target.values,
    categorical_feature=categorical,
    reference=train_data,
)
test_data = data[data['date_block_num'] == 34].drop(columns=non_features)

del train_rows, val_rows

In [None]:
%%time
# LightGBM settings: RMSE metric, up to 500 iterations, early stopping after
# 12 rounds.
params = dict(
    metric='rmse',
    learning_rate=0.01,
    max_depth=13,
    num_leaves=1673,
    random_state=42,
    num_iterations=500,
    early_stopping_round=12,
    num_threads=2,
)

# Evaluate on both the validation month and the training data.
eval_sets = [val_data, train_data]
model = lgb.train(params, train_data, valid_sets=eval_sets)

# Alternative kept from the original: evaluate on the training data only.
#model = lgb.train(params, train_data, valid_sets=train_data)

In [None]:
lgb.plot_importance(model)

In [None]:
# The training target was clipped to [0, 20] when the dataset was built, so
# predictions are clipped to the same range; values outside it can only add
# error against a target that never leaves that interval.
preds = model.predict(test_data).clip(0, 20)

In [None]:
submission = pd.read_csv('../input/competitive-data-science-predict-future-sales/sample_submission.csv')

In [None]:
submission['item_cnt_month'] = preds
submission

In [None]:
submission.to_csv('best_lgb.csv', index=False)

## The next step is to download the data2.hdf5 file and train other models on it