In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Read data

In [None]:
# Load every competition table from the shared input directory.
DATA_DIR = "../input/competitive-data-science-predict-future-sales"

# `date` is parsed day-first: sales_train.csv stores dates as dd.mm.yyyy (e.g. 02.01.2013).
# Without dayfirst=True, ambiguous dates are read month-first, which silently corrupts
# the `month` feature derived from this column.
# NOTE(review): confirm with df_train_sales['date'].min()/.max() after loading.
df_train_sales = pd.read_csv(f"{DATA_DIR}/sales_train.csv", parse_dates=['date'], dayfirst=True)
df_item_cat = pd.read_csv(f"{DATA_DIR}/item_categories.csv")
df_items = pd.read_csv(f"{DATA_DIR}/items.csv")
df_shops = pd.read_csv(f"{DATA_DIR}/shops.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Submission template; its item_cnt_month column is overwritten with predictions at the end.
df_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Summary of the raw sales table: column dtypes, non-null counts and memory usage.
df_train_sales.info()

In [None]:
# Preview the first rows of the raw sales table.
df_train_sales.head()

In [None]:
# Derive calendar features (year and month number) from the parsed sale date.
sale_date = df_train_sales['date'].dt
df_train_sales['year'] = sale_date.year
df_train_sales['month'] = sale_date.month

### Merging tables to form a complete dataset

In [None]:
# Enrich every sales row with item, item-category and shop attributes.
# Left joins keep all sales rows even when a lookup key has no match.
df_train_sales_temp = df_train_sales.merge(df_items, how='left', on='item_id')
df_train_sales_temp2 = df_train_sales_temp.merge(df_item_cat, how='left', on='item_category_id')
df_train_sales_temp3 = df_train_sales_temp2.merge(df_shops, how='left', on='shop_id')

df_train_sales_temp3.head()

 No need to train model for shop and items missing in test data?

In [None]:
# Keep only sales rows whose shop_id AND item_id both occur in the test set.
# Each membership is checked independently, not as a (shop, item) pair.
shop_in_test = df_train_sales_temp3['shop_id'].isin(df_test['shop_id'])
item_in_test = df_train_sales_temp3['item_id'].isin(df_test['item_id'])
df_train_sales_eda = df_train_sales_temp3.loc[shop_in_test & item_in_test].copy()

# Drop the intermediate merge results to free kernel memory.
del df_train_sales_temp, df_train_sales_temp2, df_train_sales_temp3

### Overall stats

In [None]:
# Descriptive statistics of the filtered sales table, transposed so each column is a row.
df_train_sales_eda.describe().T

##### Checking diversity of categorical feature item_name and shop_name

In [None]:
print("No. of unique items sold:", df_train_sales_eda['item_name'].nunique())

# Number of sales rows per item name. `x` is read again by the following cells.
x = df_train_sales_eda['item_name'].value_counts()

# Histogram of log10(row count): how many item names occur at each order of magnitude.
np.log10(x).hist(bins=25)
plt.xlabel('log10(number of sales rows per item name)')
plt.ylabel('number of item names');

# The reported threshold matches the `x > 10` cut-off used to build only_use_item_names.
print(f"Item names that repeat more than 10 times: {100*x[x>10].shape[0]/x.shape[0]:1.0f}%")

In [None]:
# Item names that occur in more than 10 sales rows; all other names are later
# replaced by 'Unknown'. Relies on `x` still holding the item_name value counts
# from the previous cell, so the cells must be run in order.
only_use_item_names = x[x>10].index

In [None]:
# The 15 item names with the most sales rows (row counts, not summed quantities).
x.nlargest(15)

In [None]:
print("No. of unique item category:", df_train_sales_eda['item_category_name'].nunique())

# Collapse each category name to its first whitespace-separated token, then count
# the sales rows per collapsed category. `x` is read again by the next cell.
df_train_sales_eda['item_category_name'] = df_train_sales_eda['item_category_name'].str.split().str[0]
x = df_train_sales_eda['item_category_name'].value_counts()

np.log10(x).hist(bins=25)
plt.xlabel('log10(number of sales rows per item category)')
plt.ylabel('number of item categories');

# The reported threshold matches the `x > 5` cut-off used to build only_use_item_cat_name.
print(f"Item categories that repeat more than 5 times: {100*x[x>5].shape[0]/x.shape[0]:1.0f}%")

In [None]:
# Collapsed category names that occur in more than 5 sales rows; the rest are later
# replaced by 'Unknown'. Relies on `x` holding the category value counts from the
# previous cell, so the cells must be run in order.
only_use_item_cat_name = x[x>5].index

In [None]:
print("No. of unique shop names:", df_train_sales_eda['shop_name'].nunique())

# Number of sales rows per shop name. `x` is read again by the following cells.
x = df_train_sales_eda['shop_name'].value_counts()
x.hist(bins=25)
plt.xlabel('number of sales rows per shop name')
plt.ylabel('number of shops');

# Uses the same 300-row threshold as the only_use_shop_names filter in the next cell.
print(f"Shop names that repeat more than 300 times: {100*x[x>300].shape[0]/x.shape[0]:1.0f}%")

In [None]:
# Shop names that occur in more than 300 sales rows; the rest are later replaced by
# 'Unknown'. Relies on `x` holding the shop_name value counts from the previous cell,
# so the cells must be run in order.
only_use_shop_names = x[x>300].index

In [None]:
# The 5 shop names with the most sales rows (row counts, not summed quantities).
x.nlargest(5)

### Purchase trend by year

In [None]:
# Distribution of log10(item_price) per year. The +0.1 offset keeps log10 finite for
# zero prices. `showfliers` is a boxplot option that violinplot does not support, so it
# is not passed here. The axes created above are passed explicitly.
fig, ax = plt.subplots(figsize=(5,4))
sns.violinplot(x=df_train_sales_eda["year"],
               y=np.log10(df_train_sales_eda['item_price'] + 0.1),
               ax=ax)
ax.set_ylabel('log10(item_price + 0.1)');

In [None]:
# Item price per year on the original scale, with outlier points hidden.
fig, ax = plt.subplots(figsize=(5,4))
sns.boxplot(data=df_train_sales_eda, x="year", y="item_price", showfliers=False, ax=ax)

### What is most selling item/shop by year ? 

In [None]:
# For each calendar year, print the five item names with the most sales rows.
for year in (2013, 2014, 2015):
    rows_in_year = df_train_sales_eda['year'] == year
    x = df_train_sales_eda.loc[rows_in_year, 'item_name'].value_counts().nlargest(5)
    print(f"Year: {year}\n", x, '\n')

In [None]:
# For each calendar year, print the five shop names with the most sales rows.
for year in (2013, 2014, 2015):
    rows_in_year = df_train_sales_eda['year'] == year
    x = df_train_sales_eda.loc[rows_in_year, 'shop_name'].value_counts().nlargest(5)
    print(f"Year: {year}\n", x, '\n')

In [None]:
# Number of sales rows per shop, split by year, for every shop in the filtered data.
fig, ax = plt.subplots(figsize=(15,5))
sns.countplot(data=df_train_sales_eda, x='shop_name', hue='year', ax=ax)
plt.xticks(rotation=90);

In [None]:
# Sales-row counts by year, restricted to the 25 shops with the most sales rows overall.
top_shop_names = df_train_sales_eda['shop_name'].value_counts().nlargest(25).index
top_shop_rows = df_train_sales_eda.loc[df_train_sales_eda['shop_name'].isin(top_shop_names)]

fig, ax = plt.subplots(figsize=(15,5))
sns.countplot(x='shop_name', hue='year', data=top_shop_rows, ax=ax)
plt.xticks(rotation=90);

All shop seems to have increasing sales trend with increasing years.

In [None]:
# Top 50 items by number of sales rows, split by year.
# NOTE(review): this comment previously said "Top 25" while the code selects
# nlargest(50); confirm which of the two was intended.

fig, ax = plt.subplots(figsize=(18,5))
sns.countplot(x='item_name', hue='year'
              ,data = df_train_sales_eda.loc[df_train_sales_eda['item_name'].\
                                             isin(df_train_sales_eda['item_name'].\
                                                  value_counts().nlargest(50).index)], ax=ax)
plt.xticks(rotation=90);

### Item count

In [None]:
# Total units sold per (shop, item) pair, largest totals first.
(
    df_train_sales_eda
    .groupby(['shop_name', 'item_name'])[['item_cnt_day']]
    .sum()
    .sort_values('item_cnt_day', ascending=False)
)

### Data cleaning

In [None]:
# Inspect the distribution of log10(item_price) before negative or extremely high
# prices are removed in the next cell. This cell only plots; it removes nothing.
# log10 of a negative price is NaN, so such rows do not appear in the histogram.
np.log10(df_train_sales_eda['item_price']).hist(bins=25)

In [None]:
# Treat prices outside the open interval (10, 10000) as outliers; both bounds are excluded.
price = df_train_sales_eda['item_price']
df_train_sales_eda = df_train_sales_eda.loc[price.gt(10) & price.lt(1e4)].copy()

In [None]:
# Keep rows whose daily count lies in [1, 10] (both ends included): this drops
# non-positive counts as well as unusually large daily quantities.
daily_count_ok = df_train_sales_eda['item_cnt_day'].between(1, 10)
df_train_sales_eda = df_train_sales_eda.loc[daily_count_ok].copy()

np.log10(df_train_sales_eda['item_cnt_day']).hist(bins=25)

In [None]:
# List the distinct shop names that remain after the price and count filters.
df_train_sales_eda['shop_name'].unique()

In [None]:
# Replace every label that is not in its column's keep-list with the single
# placeholder 'Unknown'. The keep-lists were built from value counts earlier.
for column, kept_labels in (('shop_name', only_use_shop_names),
                            ('item_name', only_use_item_names),
                            ('item_category_name', only_use_item_cat_name)):
    is_rare = ~df_train_sales_eda[column].isin(kept_labels)
    df_train_sales_eda.loc[is_rare, column] = 'Unknown'

## Preparation for model fitting

In [None]:
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [None]:
# Label encoding: turn the string labels into integer codes.
# A fresh encoder is fitted per column so the two code spaces are independent.
for column in ('shop_name', 'item_category_name'):
    label_encoder = LabelEncoder()
    df_train_sales_eda[column] = label_encoder.fit_transform(df_train_sales_eda[column])

In [None]:
# Build the table of distinct (date_block_num, shop_id, item_id) combinations found
# in the cleaned sales data, ordered by those keys with a fresh 0..n-1 index.

meta_features = ['date_block_num', 'shop_id', 'item_id']

train_agg_set = (
    df_train_sales_eda[meta_features]
    .drop_duplicates()
    .sort_values(meta_features)
    .reset_index(drop=True)
)

train_agg_set

#### Generating group features

In [None]:
# Columns available for building the group features.
df_train_sales_eda.columns

In [None]:
# Per (date_block_num, shop_id, item_id): mean item price and total units sold.
monthly_aggregations = {'item_price': 'mean', 'item_cnt_day': 'sum'}
group1 = df_train_sales_eda.groupby(meta_features, as_index=False).agg(monthly_aggregations)

# Attach the aggregates to the key table and give them names that reflect the aggregation.
train_agg_set = (
    train_agg_set
    .merge(group1, how='left', on=meta_features)
    .rename(columns={'item_price': 'item_price_mean', 'item_cnt_day': 'item_cnt_month'})
)

In [None]:
# Sanity-check the value ranges of the aggregated price and count columns.
train_agg_set.describe()

#### Train/Test

In [None]:
# Display the aggregated training table (keys plus item_price_mean / item_cnt_month).
train_agg_set

In [None]:
# From here heavily influenced by https://www.kaggle.com/werooring/top-3-5-lightgbm-with-feature-engineering
# Merging train/test into one

# Tag the test rows with date_block_num 34 so they can be selected again after the concat.
df_test['date_block_num'] = 34

# Stack train and test rows with a fresh 0..n-1 index. `keys=` is not passed: it labels
# the concatenated pieces in a hierarchical index, which ignore_index=True discards, and
# the three-element list did not match the two frames being concatenated.
df_combo = pd.concat([train_agg_set, df_test.drop('ID', axis=1)], ignore_index=True)

# Test rows have no item_price_mean / item_cnt_month, so both are filled with 0.
# NOTE(review): item_price_mean is therefore always 0 for the test month while it holds
# real prices during training; consider dropping it or imputing a last-known price.
df_combo = df_combo.fillna(0)

In [None]:
# Time-based split on date_block_num: blocks below 33 train the model, block 33 is
# the validation month and block 34 holds the test rows to predict.
block_num = df_combo['date_block_num']
train_rows = block_num < 33
valid_rows = block_num == 33
test_rows = block_num == 34

# Feature matrices: every column except the target.
X_train = df_combo[train_rows].drop(columns='item_cnt_month')
X_valid = df_combo[valid_rows].drop(columns='item_cnt_month')
X_test = df_combo[test_rows].drop(columns='item_cnt_month')

# Target vectors (no target exists for the test rows).
y_train = df_combo.loc[train_rows, 'item_cnt_month']
y_valid = df_combo.loc[valid_rows, 'item_cnt_month']

In [None]:

# lgb hyper-parameters (no objective is set, so LightGBM's default objective applies)
params = {'metric': 'rmse',
          'num_leaves': 200,
          'learning_rate': 0.003, 'feature_fraction': 0.75,
          'bagging_fraction': 0.75,
          'bagging_freq': 5, 'force_col_wise' : True,
          'random_state': 42}

# lgb train and valid dataset
dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid)

# Early stopping (250 rounds) and logging (every 250 rounds) are configured through
# callbacks where available: the `early_stopping_rounds` / `verbose_eval` keyword
# arguments of lgb.train() were deprecated in LightGBM 3.3 and removed in 4.0.
# Releases without `log_evaluation` keep the original keyword arguments.
if hasattr(lgb, 'log_evaluation'):
    stopping_and_logging = {'callbacks': [lgb.early_stopping(stopping_rounds=250),
                                          lgb.log_evaluation(period=250)]}
else:
    stopping_and_logging = {'early_stopping_rounds': 250, 'verbose_eval': 250}

# Train LightGBM model; the metric is reported on both the training and validation sets.
lgb_model = lgb.train(params=params,
                      train_set=dtrain,
                      num_boost_round=2500,
                      valid_sets=[dtrain, dvalid],
                      **stopping_and_logging)

In [None]:
# Predict the test month and clamp the predictions to the range [0, 20].
raw_preds = lgb_model.predict(X_test)
preds = np.clip(raw_preds, 0, 20)

# Predictions are assigned by position.
# NOTE(review): this assumes X_test rows are in the same order as sample_submission.csv.
df_submission['item_cnt_month'] = preds
df_submission.to_csv('submission.csv', index=False)