In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import datetime as dt
import holidays
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

import gc

In [None]:
# Load every competition table from the read-only input directory.
PATH = 'competitive-data-science-predict-future-sales'
input_dir = f'../input/{PATH}'

item_categories = pd.read_csv(f'{input_dir}/item_categories.csv')
items = pd.read_csv(f'{input_dir}/items.csv')
sales_train = pd.read_csv(f'{input_dir}/sales_train.csv')
submission = pd.read_csv(f'{input_dir}/sample_submission.csv')
shops = pd.read_csv(f'{input_dir}/shops.csv')
test = pd.read_csv(f'{input_dir}/test.csv')

In [None]:
# Parse the "dd.mm.yyyy" date strings once, then append the parsed timestamp
# and its day / month / year parts as new columns.
parsed_dates = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')
sales_train = sales_train.assign(
    datetime=parsed_dates,
    day=parsed_dates.dt.day,
    month=parsed_dates.dt.month,
    year=parsed_dates.dt.year,
)

In [None]:
# Scatter plot of daily item count against item price, used to spot outliers.
figure, axe = plt.subplots(figsize=(8, 6))
plot = axe.scatter(sales_train['item_price'], sales_train['item_cnt_day'])
axe.set_title(" EDA Item Price VS  Sales Day", weight="bold")
axe.set_xlabel('Item Price')
axe.set_ylabel('Sales Day')

I remove outliers with a very large `item_price` or `item_cnt_day`.

In [None]:
# Keep only rows whose price and daily count are both below the outlier thresholds.
within_limits = (sales_train['item_price'] < 300000) & (sales_train['item_cnt_day'] < 2000)
sales_train = sales_train[within_limits]

In [None]:
# Show the rows whose item_price is negative.
sales_train.loc[sales_train['item_price'].lt(0)]

One `item_price` is below zero. I replace it with the median of the positive prices.

In [None]:
# Median over the strictly positive prices, written into every negative-price row.
positive_prices = sales_train.loc[sales_train['item_price'] > 0, 'item_price']
median = positive_prices.median()
negative_price = sales_train['item_price'] < 0
sales_train.loc[negative_price, 'item_price'] = median

In [None]:
# Re-run the check: this should now display no rows.
sales_train.loc[sales_train['item_price'].lt(0)]

Several shops are duplicates of each other (according to their names). I merge their ids in both the train and test sets.

In [None]:
# Duplicate shop id -> id it is merged into.
duplicate_shop_ids = {
    0: 57,   # Yakutsk, Ordzhonikidze 56
    1: 58,   # Yakutsk, "Tsentralny" shopping centre
    10: 11,  # Zhukovsky, Chkalova St. 39m²
}
# Apply the same remapping to the train and test frames so their ids stay consistent.
for duplicate_id, kept_id in duplicate_shop_ids.items():
    sales_train.loc[sales_train['shop_id'] == duplicate_id, 'shop_id'] = kept_id
    test.loc[test['shop_id'] == duplicate_id, 'shop_id'] = kept_id

In [None]:
# Revenue of each sales record: units sold that day times the unit price.
sales_train['revenue'] = sales_train['item_price'].mul(sales_train['item_cnt_day'])

In [None]:
# Display the training frame after cleaning and after the `revenue` column was added.
sales_train

# LightGBM

For each month I count the working days (`weekdays_in_month`) and the non-working days, i.e. weekends plus Russian public holidays (`holidays_in_month`).

In [None]:
# Holiday calendar for Russia; dates are tested for membership with `date in ru_holidays`.
ru_holidays = holidays.Russia()

In [None]:
# Calendar facts for every month from January 2013 to November 2015:
# total days, working days, and non-working days (weekends + Russian holidays).
# Rows are collected in a list and turned into a DataFrame once, so the columns
# get integer dtypes instead of the object dtype produced by row-by-row `.loc` growth.
day_records = []
for year, last_month in [(2013, 12), (2014, 12), (2015, 11)]:
    for month in range(1, last_month + 1):
        # Let pandas supply the month length rather than hard-coding it,
        # so a leap-year February would be handled as well.
        days = int(pd.Timestamp(year=year, month=month, day=1).days_in_month)
        count = 0
        for day in range(1, days + 1):
            date = dt.datetime(year, month, day)
            # weekday() is 5 for Saturday and 6 for Sunday.
            if date in ru_holidays or date.weekday() >= 5:
                count += 1
        day_records.append({
            'year': year,
            'month': month,
            'days_in_month': days,
            'weekdays_in_month': days - count,
            'holidays_in_month': count,
        })

df_day_information = pd.DataFrame(
    day_records,
    columns=['year', 'month', 'days_in_month', 'weekdays_in_month', 'holidays_in_month'],
)

df_day_information

The first word of `item_category_name` represents the broad category of the item, so I use it as a feature.

In [None]:
# Take the first space-separated word of each category name.
category_words = item_categories['item_category_name'].str.split(" ")
item_categories["large_category"] = category_words.str[0]

In [None]:
# Display the category table with the new `large_category` column (still text at this point).
item_categories

In [None]:
# Replace the text labels in `large_category` with integer codes.
large_category_encoder = LabelEncoder()
item_categories["large_category"] = large_category_encoder.fit_transform(
    item_categories["large_category"]
)

The first word of `shop_name` represents the city, so I use it as a feature.

In [None]:
# This city name has two words; join them so that the first-word split keeps it whole.
is_sergiev_posad = shops['shop_name'] == 'Сергиев Посад ТЦ "7Я"'
shops.loc[is_sergiev_posad, 'shop_name'] = 'СергиевПосад ТЦ "7Я"'

# City = first space-separated word of the shop name.
shops['city'] = shops['shop_name'].str.split(" ").str[0]

# Fold the variant spelled with a leading "!" into the regular city label.
shops.loc[shops['city'] == "!Якутск", 'city'] = "Якутск"

In [None]:
# Display the shop table with the new `city` column (still text at this point).
shops

In [None]:
# Replace the city names with integer codes.
city_encoder = LabelEncoder()
shops["city"] = city_encoder.fit_transform(shops["city"])

The features are added by the following function.

In [None]:
def create_df_month(year, month, sales_train, test, df_day_information, items, item_categories, shops):
    """Build the LightGBM feature table for one calendar month.

    The result has one row per (shop_id, item_id) pair of `test`, in `test` order.

    Parameters
    ----------
    year, month : int
        Calendar month to build. (2015, 11) is treated as the prediction month:
        no `item_cnt_month` target column is produced for it.
    sales_train : pandas.DataFrame
        Daily sales records. The columns read here are shop_id, item_id, year,
        month, date_block_num, item_cnt_day, item_price and revenue.
    test : pandas.DataFrame
        Must contain shop_id and item_id. It is not modified.
    df_day_information : pandas.DataFrame
        One row per (year, month) with days_in_month, weekdays_in_month and
        holidays_in_month.
    items, item_categories, shops : pandas.DataFrame
        Lookup tables supplying item_category_id (by item_id), large_category
        (by item_category_id) and city (by shop_id).

    Returns
    -------
    pandas.DataFrame
        shop_id, item_id, [item_cnt_month], year, month, the three lookup
        features, the three calendar features, lag features for 1, 2, 3 and 12
        months back, and rolling mean / std features over the last 6 and 12 months.

    Raises
    ------
    IndexError
        If `df_day_information` has no row for (year, month).
    """
    keys = ['shop_id', 'item_id']
    value_cols = ['item_cnt_day', 'revenue']
    test_keys = test.loc[:, keys]

    def _month_rows(y, m):
        """Daily sales rows of calendar month (y, m)."""
        return sales_train[(sales_train.year == y) & (sales_train.month == m)]

    def _aligned(rows, columns, how):
        """Aggregate `columns` of `rows` per (shop_id, item_id) and left-join onto the test pairs.

        Only the requested numeric columns are aggregated. Aggregating the whole
        frame would also hit the string `date` and the `datetime` columns, which
        raises on recent pandas versions.
        """
        stats = rows.groupby(keys)[columns].agg(how).reset_index()
        return pd.merge(test_keys, stats, on=keys, how='left')

    if year == 2015 and month == 11:
        # Prediction month: there is no target. Copy so that `test` is never mutated.
        df_month = test_keys.copy()
    else:
        # Target: monthly total per pair, 0 where nothing was sold, clipped to [0, 20].
        df_month = _aligned(_month_rows(year, month), ['item_cnt_day'], 'sum')
        df_month = df_month.rename(columns={'item_cnt_day': 'item_cnt_month'})
        df_month['item_cnt_month'] = df_month['item_cnt_month'].fillna(0.0).clip(0, 20)

    # add year and month
    df_month['year'] = year
    df_month['year'] = df_month['year'].astype('int16')
    df_month['month'] = month
    df_month['month'] = df_month['month'].astype('int8')

    # add item_category_id, large_category and city from the lookup tables
    lookups = [
        (items, 'item_id', 'item_category_id'),
        (item_categories, 'item_category_id', 'large_category'),
        (shops, 'shop_id', 'city'),
    ]
    for table, key, feature in lookups:
        df_month = pd.merge(df_month, table[[key, feature]], on=key, how='left')
        df_month[feature] = df_month[feature].astype('int8')

    # add days_in_month, weekdays_in_month and holidays_in_month
    is_month = (df_day_information.year == year) & (df_day_information.month == month)
    day_row = df_day_information[is_month].iloc[0]
    for col in ['days_in_month', 'weekdays_in_month', 'holidays_in_month']:
        df_month[col] = int(day_row[col])
        df_month[col] = df_month[col].astype('int8')

    # Fallback price for pairs with no sales in a lag month; computed once per call.
    price_fill = sales_train['item_price'].median()

    # add item_cnt_month, revenue and item_price for past three months and a year ago
    month_index = year * 12 + (month - 1)
    for lag in [1, 2, 3, 12]:
        lag_year, lag_month0 = divmod(month_index - lag, 12)
        rows = _month_rows(lag_year, lag_month0 + 1)
        totals = _aligned(rows, value_cols, 'sum').fillna(0.0)
        df_month['item_cnt_month_lag_' + str(lag)] = totals['item_cnt_day'].clip(0, 20)
        df_month['revenue_lag_' + str(lag)] = totals['revenue']
        if lag <= 3:
            prices = _aligned(rows, ['item_price'], 'mean')
            df_month['item_price_lag_' + str(lag)] = prices['item_price'].fillna(price_fill)

    # date_block_num of the target month; the formula assumes block 0 is January 2013.
    block = (year - 2013) * 12 + (month - 1)

    def _window_rows(span):
        """Daily sales rows of the `span` months immediately before the target month."""
        blocks = sales_train.date_block_num
        return sales_train[(blocks >= block - span) & (blocks <= block - 1)]

    # NOTE(review): the mean / std below are taken over the individual daily records
    # in the window, not over monthly totals, although the feature names say
    # "item_cnt_month". Kept as in the original; confirm the intended definition.

    # add item_cnt_month_mean and revenue_mean
    for span in [6, 12]:
        means = _aligned(_window_rows(span), value_cols, 'mean').fillna(0.0)
        df_month['item_cnt_month_mean_last_' + str(span)] = means['item_cnt_day'].clip(0, 20)
        df_month['revenue_mean_last_' + str(span)] = means['revenue']

    # add item_cnt_month_std and revenue_std (a single record gives NaN, stored as 0)
    for span in [6, 12]:
        stds = _aligned(_window_rows(span), value_cols, 'std').fillna(0.0)
        df_month['item_cnt_month_std_last_' + str(span)] = stds['item_cnt_day']
        df_month['revenue_std_last_' + str(span)] = stds['revenue']

    return df_month

In [None]:
# Build one feature table per month from January 2014 to November 2015.
# October 2015 is held out for validation, November 2015 is the prediction month,
# and every other month goes into the training frame.
# The per-year month limit is unpacked from the tuple list, so the builtin `max`
# is no longer shadowed, and the training months are concatenated once at the
# end instead of re-copying the growing frame on every iteration.
train_months = []
for year, last_month in [(2014, 12), (2015, 11)]:
    for month in range(1, last_month + 1):
        df_month = create_df_month(year, month, sales_train, test, df_day_information, items, item_categories, shops)
        if (year, month) == (2015, 10):
            df_val_lgbm = df_month
        elif (year, month) == (2015, 11):
            df_test_lgbm = df_month
        else:
            train_months.append(df_month)

df_train_lgbm = pd.concat(train_months, ignore_index=True)
del train_months

In [None]:
# Display the concatenated LightGBM training frame.
df_train_lgbm

Shuffle the rows so that the cross-validation folds are random samples.

In [None]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
df_train_lgbm = (
    df_train_lgbm
    .sample(frac=1.0, random_state=0)
    .reset_index(drop=True)
)

In [None]:
# Separate the target column from the feature columns (training set).
y_train_lgbm = df_train_lgbm['item_cnt_month']
X_train_lgbm = df_train_lgbm.drop(columns=['item_cnt_month'])

In [None]:
# Separate the target column from the feature columns (validation set).
y_val_lgbm = df_val_lgbm['item_cnt_month']
X_val_lgbm = df_val_lgbm.drop(columns=['item_cnt_month'])

In [None]:
# The prediction-month frame has no target column, so it is copied as-is.
X_test_lgbm = df_test_lgbm.copy(deep=True)

In [None]:
# Drop the intermediate frames (the X_/y_ variables remain) and run the garbage collector.
del df_train_lgbm
del df_val_lgbm
del df_test_lgbm
gc.collect()

Grid search (cv = 5 for cross validation)

In [None]:
# Search grid: only max_depth and num_leaves vary (3 x 3 combinations);
# every other hyper-parameter is pinned to a single value.
# NOTE(review): LightGBM reportedly applies bagging_fraction only when bagging_freq > 0,
# which is not set here - confirm whether bagging is intended.
params = {
    "learning_rate": [0.1],
    "max_depth": [6, 8, 10],
    "num_leaves": [32, 64, 128],
    "n_estimators": [100],
    "bagging_fraction": [0.5],
    "feature_fraction": [1.0],
    "min_data_in_leaf": [5],
    "reg_alpha": [0.1],
    "reg_lambda": [1],
    "random_state": [42],
}

lgbm_model = LGBMRegressor()

# 5-fold cross-validated grid search scored by (negated) RMSE, using all cores.
cv = GridSearchCV(
    estimator=lgbm_model,
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
cv.fit(X_train_lgbm, y_train_lgbm)

# Estimator refitted by GridSearchCV with the best parameter combination.
best = cv.best_estimator_

In [None]:
# Report the winning parameter combination, then tabulate every grid point.
print(cv.best_params_)

cv_results = pd.DataFrame.from_dict(cv.cv_results_)
cv_results

In [None]:
# The scorer returns negated RMSE; flip the sign to plot the RMSE of each grid point.
mean_rmse = -cv_results['mean_test_score']
plt.plot(mean_rmse)

I refit LGBMRegressor with the best parameters to show the feature importance.

In [None]:
# Refit a fresh model with the best grid-search parameters, monitoring RMSE on
# the training data and on the October 2015 validation month.
params = cv.best_params_

lgbm_model = LGBMRegressor(**params)

# Early stopping and periodic logging are passed as callbacks: the `verbose` and
# `early_stopping_rounds` keyword arguments of `fit` are no longer accepted by
# current LightGBM releases.
lgbm_model.fit(
    X_train_lgbm,
    y_train_lgbm,
    eval_metric="rmse",
    eval_set=[(X_train_lgbm, y_train_lgbm), (X_val_lgbm, y_val_lgbm)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=40),
        lgb.log_evaluation(period=10),
    ],
)

lgb.plot_importance(lgbm_model)

"best" is used for prediction. 

In [None]:
# Predict the test month with `best`, the estimator refitted by the grid search
# (not the early-stopped `lgbm_model` fitted for the importance plot).
y_test_lgbm = best.predict(X_test_lgbm)

# LSTM

In [None]:
def create_df_month_lstm(year, month, sales_train, test, n_lags=33):
    """Build the LSTM table for one month: the target plus its monthly-count history.

    The result has one row per (shop_id, item_id) pair of `test`, in `test` order.

    Parameters
    ----------
    year, month : int
        Calendar month to build. (2015, 11) is treated as the prediction month:
        no `item_cnt_month` target column is produced for it.
    sales_train : pandas.DataFrame
        Daily sales records. The columns read here are shop_id, item_id, year,
        month and item_cnt_day.
    test : pandas.DataFrame
        Must contain shop_id and item_id. It is not modified.
    n_lags : int, default 33
        Number of preceding months to include.

    Returns
    -------
    pandas.DataFrame
        shop_id, item_id, [item_cnt_month], then lag_<n_lags> ... lag_1 (oldest
        month first). Every count is a monthly total clipped to [0, 20], and 0
        where the pair had no sales.
    """
    keys = ['shop_id', 'item_id']
    test_keys = test.loc[:, keys].reset_index(drop=True)

    def _monthly_counts(y, m):
        """Clipped monthly item count of (y, m) for every test pair, in test order."""
        rows = sales_train[(sales_train.year == y) & (sales_train.month == m)]
        # Sum only item_cnt_day: summing the whole frame would also hit
        # non-numeric columns such as `date` / `datetime`.
        totals = rows.groupby(keys)[['item_cnt_day']].sum().reset_index()
        merged = pd.merge(test_keys, totals, on=keys, how='left')
        return merged['item_cnt_day'].fillna(0.0).clip(0, 20)

    # Independent copy, so adding columns never touches `test`.
    df_month = test_keys.copy()
    if not (year == 2015 and month == 11):
        # add item_cnt_month
        df_month['item_cnt_month'] = _monthly_counts(year, month)

    # add item_cnt_month for the past n_lags months.
    # Work on a running month index so that stepping back across any number of
    # year boundaries gives the right year. The previous two-step wrap always
    # used `year - 1`, so lags of 22 months or more read the wrong year.
    month_index = year * 12 + (month - 1)
    for lag in range(n_lags, 0, -1):
        lag_year, lag_month0 = divmod(month_index - lag, 12)
        df_month['lag_' + str(lag)] = _monthly_counts(lag_year, lag_month0 + 1)

    return df_month

In [None]:
# October 2015 (the last month with known sales) is the LSTM training frame;
# November 2015 is the frame to predict.
df_train_lstm = create_df_month_lstm(2015, 10, sales_train, test).copy()
df_test_lstm = create_df_month_lstm(2015, 11, sales_train, test).copy()

In [None]:
# Display the LSTM training frame.
df_train_lstm

Shuffle the rows before training, so that the hold-out validation split is a random sample.

In [None]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
df_train_shuffle_lstm = (
    df_train_lstm
    .sample(frac=1.0, random_state=0)
    .reset_index(drop=True)
)

In [None]:
# Convert to the 3-D (samples, timesteps, 1) input and 2-D (samples, 1) target arrays.
# NOTE(review): only the target column is dropped, so shop_id and item_id remain in
# the feature matrix and are fed to the LSTM as if they were timesteps - confirm
# that this is intended.
lstm_features = df_train_shuffle_lstm.drop(columns=['item_cnt_month'])
X_train_lstm = lstm_features.to_numpy()[:, :, np.newaxis]
y_train_lstm = df_train_shuffle_lstm['item_cnt_month'].to_numpy().reshape(-1, 1)

print(X_train_lstm.shape, y_train_lstm.shape)

In [None]:
# Single-layer LSTM regressor: 64 units -> dropout -> one linear output.
# The sequence length is read from the training array instead of being hard-coded:
# the previous fixed value of 30 did not match the number of timesteps that
# X_train_lstm actually carries.
n_timesteps = X_train_lstm.shape[1]

model_lstm = Sequential()
model_lstm.add(LSTM(units=64, input_shape=(n_timesteps, 1)))
model_lstm.add(Dropout(0.4))
model_lstm.add(Dense(1))

model_lstm.compile(loss='mse', optimizer='adam', metrics=['mean_squared_error'])

In [None]:
# Train for a fixed number of epochs, holding out 5% of the samples for validation.
EPOCHS = 10
history = model_lstm.fit(
    x=X_train_lstm,
    y=y_train_lstm,
    epochs=EPOCHS,
    batch_size=4096,
    validation_split=0.05,
)

In [None]:
# Learning curves: training and validation MSE per epoch.
epochs_axis = range(1, EPOCHS + 1)
plt.plot(epochs_axis, history.history['mean_squared_error'], label="mean_squared_error")
plt.plot(epochs_axis, history.history['val_mean_squared_error'], label="val_mean_squared_error")
plt.xlabel('Epochs')
# The plotted metric is the mean squared error, so the axis is labelled as such
# (it used to read 'Accuracy').
plt.ylabel('Mean squared error')
plt.legend()
plt.show()

In [None]:
# Shape the prediction-month frame like the training input, predict, and clip to [0, 20].
X_test_lstm = df_test_lstm.to_numpy()[:, :, np.newaxis]
y_test_lstm = np.clip(model_lstm.predict(X_test_lstm), 0, 20)

# Ensemble

In [None]:
# Weighted blend of the two models: 90% LightGBM + 10% LSTM, clipped to [0, 20].
model_weights = [0.9, 0.1]

predicted_values = pd.DataFrame(index=test.index)
predicted_values['LGBM'] = y_test_lgbm
predicted_values['LSTM'] = y_test_lstm

blended = np.dot(predicted_values, model_weights)
submission['item_cnt_month'] = blended.clip(0, 20)

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)