In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
import plotly.graph_objects as go
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)
from pandas.tseries.offsets import MonthEnd
from pandas import Grouper
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import warnings
warnings.simplefilter(action='ignore', category= FutureWarning)

In [None]:
# Directory holding the competition CSV files; change it here to run the
# notebook outside of the Kaggle environment.
INPUT_DIR = '../input/competitive-data-science-predict-future-sales'

# Every file has a header row (header=0).
item_cat = pd.read_csv(f'{INPUT_DIR}/item_categories.csv', header=0)
items = pd.read_csv(f'{INPUT_DIR}/items.csv', header=0)
sales_train = pd.read_csv(f'{INPUT_DIR}/sales_train.csv', header=0)
sample_sub = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv', header=0)
shops = pd.read_csv(f'{INPUT_DIR}/shops.csv', header=0)
test = pd.read_csv(f'{INPUT_DIR}/test.csv', header=0)

# 1. Exploratory Data Analysis

In [None]:
def check_data(data):
    """Print a quick overview of a DataFrame.

    Four sections are printed, each preceded by a dashed banner: the first
    three rows, the shape, the column dtypes and the per-column count of
    missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
    """
    # (banner title, number of dashes to the right of the title, summary to print)
    sections = (
        ('Head', 39, lambda frame: frame.head(3)),
        ('Shape', 38, lambda frame: frame.shape),
        ('Types', 38, lambda frame: frame.dtypes),
        ('Na', 41, lambda frame: frame.isnull().sum()),
    )
    for title, right_dashes, summarise in sections:
        print('-' * 38 + title + '-' * right_dashes)
        print(summarise(data))

# 1. 1. Check basic information

In [None]:
# Overview of the daily sales records: head, shape, dtypes and missing values.
check_data(sales_train)

In [None]:
# Same overview for the test set, for comparison with the training set above.
check_data(test)

In [None]:
# Compare the shop_id / item_id universes of the training and testing sets:
# list the shop ids and report how many distinct shops/items each set has.
train_shop_ids = sales_train.shop_id.unique()
train_item_ids = sales_train.item_id.unique()
test_shop_ids = test.shop_id.unique()
test_item_ids = test.item_id.unique()

print('training set:\n shop_id:', sorted(list(train_shop_ids)),
      '\n shop_id size:', train_shop_ids.size,
      '\n item_id size:', train_item_ids.size,
      '\n item_id max:', train_item_ids.max(),
      '\n testing set:\n shop_id:', sorted(list(test_shop_ids)),
      '\n shop_id size:', test_shop_ids.size,
      '\n item_id size:', test_item_ids.size,
      '\n item_id max:', test_item_ids.max())

In [None]:
# Build one composite key per shop-item pair in both frames:
# shop_id * 100000 + item_id.
for frame in (sales_train, test):
    frame["shop_item_id"] = frame["shop_id"] * 100000 + frame["item_id"]

# For each key, count how many test rows do (True) / do not (False) have a
# matching value somewhere in the training set.
pair_overlap = test["shop_item_id"].isin(sales_train['shop_item_id']).value_counts()
item_overlap = test["item_id"].isin(sales_train['item_id']).value_counts()
shop_overlap = test["shop_id"].isin(sales_train['shop_id']).value_counts()

print('\n\n shop_item_id in testing set and also in training set:\n',
      pair_overlap,
      '\n\n item_id in testing set and also in training set:\n',
      item_overlap,
      '\n\n shop_id in testing set and also in training set:\n',
      shop_overlap)

In [None]:
# Items that appear in the testing set but were never sold in the training set.
item_id_train = list(sales_train.item_id.unique())
item_id_test = list(test.item_id.unique())
# Membership is checked against a set: testing every test item against the
# training *list* rescans that list for each item.
seen_item_ids = set(item_id_train)
item_id_new = [item for item in item_id_test if item not in seen_item_ids]
print('The number of new item_id in the testing set:\n',len(item_id_new))

We find that some shop-item combinations in the testing set also appear in the training set, which means those shops will continue to sell those items (scenario 1). Other shop-item combinations in the testing set do not appear in the training set: either the item was sold in other shops in the past and will be sold in this shop in the future (scenario 2), or the item is a new product that has never been sold before (scenario 3).
   
**Solutions:**

For scenario 1, we generate some lag features considering the shop-item combination to capture the temporal dynamics and treat the joint multivariate time series forecast as a regression.  

For scenario 2, the basic idea is the same as scenario 1, but the lag features generated only depend on the monthly sales of items. 

For scenario 3, we have no historical sales information to use for a time series model. Instead, we can cluster the items in each category, calculate the sales per cluster, and then map each new product to the nearest category (covered in the next notebook).

#  1. 2. Group observations and aggregate monthly sales per shop-item combination  

In [None]:
# Collapse the daily records into one row per (month, shop, item) and store
# the summed daily counts under the name item_cnt_month.
monthly_keys = ['date_block_num', 'shop_id', 'item_id']
monthly_df = (
    sales_train
    .groupby(monthly_keys)
    .agg({'item_cnt_day': 'sum'})
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
    .reset_index()
)
monthly_df.head(3)

In [None]:
# The daily records are no longer needed once monthly_df exists; drop the
# reference so the memory can be reclaimed.
del sales_train

#  1. 3. Reduce memory & Free up Ram

Before starting to work, we run the `reduce_mem_usage` function (reference: [load data (reduce memory usage)](https://www.kaggle.com/code/gemartin/load-data-reduce-memory-usage)) on the sales dataset to save memory and free up some RAM, because a Kaggle notebook only provides 16GB of RAM. You can skip this step if you have no trouble handling a large dataset.

In [None]:
def reduce_mem_usage(df):
    """Shrink a DataFrame's memory footprint by narrowing column dtypes in place.

    Per column:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned NumPy integer columns are cast to the narrowest
      signed / unsigned integer type whose range contains the column's
      min and max (bounds inclusive).
    * NumPy float columns are cast to ``float16`` only when every value
      survives the round trip unchanged; otherwise to ``float32`` when the
      value range fits, else ``float64``.
    * Every other dtype (bool, datetime, timedelta, categorical and other
      pandas extension dtypes) as well as empty or all-NaN columns are left
      untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ladder = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ladder = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable Int64, ...) are not NumPy
        # dtypes; min()/astype below are not safe for them, so skip.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.signedinteger):
            int_ladder = signed_ladder
        elif np.issubdtype(col_type, np.unsignedinteger):
            # Kept separate so unsigned columns stay integers instead of
            # falling through to the float branch.
            int_ladder = unsigned_ladder
        elif np.issubdtype(col_type, np.floating):
            int_ladder = None
        else:
            # bool, datetime64, timedelta64, complex, ...: nothing to narrow.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no value range to inspect.
            continue

        if int_ladder is not None:
            for candidate in int_ladder:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            continue

        values = df[col].to_numpy()
        half = np.finfo(np.float16)
        single = np.finfo(np.float32)
        fits_half = half.min <= c_min and c_max <= half.max
        if fits_half:
            # float16 keeps only ~3 significant decimal digits, so a range
            # check alone is not enough: accept it only if nothing is rounded.
            narrowed = values.astype(np.float16)
            fits_half = bool(((narrowed == values) | np.isnan(values)).all())

        if fits_half:
            df[col] = df[col].astype(np.float16)
        elif single.min <= c_min and c_max <= single.max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio against a zero-sized frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Banner, then shrink the dtypes of the monthly sales table.
print('-' * 80, 'monthly_df', sep='\n')
monthly_df = reduce_mem_usage(monthly_df)

#  1. 4. Impute missing values of the monthly sales per shop-item combination

The target variable can be used for feature engineering when working on a time series problem. The previous sales per shop-item combination are a critical variable in prediction. If the present value is at time t, then past values are known as lags, so t-1 is lag 1, and t-2 is lag 2. However, not all shop-item combinations have sales every month in this dataset. Therefore, we need to impute the missing values and generate lag features for our series.

In [None]:
# Composite key shop_id * 100000 + item_id (assumes item_id < 100000).
# Cast to int64 first: reduce_mem_usage may have narrowed shop_id to a small
# integer type, and multiplying that by 100000 can overflow or raise
# depending on the NumPy version.
monthly_df["shop_item_id"] = (monthly_df["shop_id"].astype("int64") * 100000
                              + monthly_df["item_id"].astype("int64"))
month_size = monthly_df['date_block_num'].nunique()
shop_item_id = pd.Series(monthly_df['shop_item_id'].unique())
size = shop_item_id.size

# One row per (month, shop-item) pair. Months are taken as 0 .. month_size-1;
# pairs without recorded sales in a month get item_cnt_month = 0.
# A single reindex over the full grid replaces the per-month
# DataFrame.append loop, which copied the growing frame on every iteration
# and no longer exists in pandas 2.0.
full_grid = pd.MultiIndex.from_product(
    [range(month_size), shop_item_id],
    names=['date_block_num', 'shop_item_id'])
data = (
    monthly_df
    .assign(date_block_num=monthly_df['date_block_num'].astype('int64'))
    .set_index(['date_block_num', 'shop_item_id'])['item_cnt_month']
    .astype('float64')  # undo any float downcast before later sums / rolling means
    .reindex(full_grid, fill_value=0)
    .reset_index()
)
# Recover the ids from the key, shop_id before item_id, so the column order
# used later for the model features is unchanged.
data['shop_id'] = data['shop_item_id'] // 100000
data['item_id'] = data['shop_item_id'] % 100000

del monthly_df

In [None]:
# Sanity check: the number of rows should equal
# (number of unique shop_item_id values) * month_size,
# i.e. one row per shop-item combination per month.
data.shape

In [None]:
# Turn the month counter into a YYYYMM integer (block 0 -> 201301), parse it
# and move to the last day of that month. Vectorized arithmetic gives the
# same values as the former row-by-row apply.
month_index = data['date_block_num'].astype('int64')
data['Date'] = (month_index // 12 + 2013) * 100 + month_index % 12 + 1
data['Date'] = pd.to_datetime(data['Date'], format='%Y%m') + MonthEnd(1)

# Split the composite key back into its parts (key = shop_id * 100000 + item_id).
shop_item_key = data['shop_item_id'].astype('int64')
data['item_id'] = shop_item_key % 100000
data['shop_id'] = shop_item_key // 100000

data.drop(['date_block_num'], axis=1, inplace=True)
data.head(3)

In [None]:
# Wide layout: one row per month-end date, one column per shop-item key.
# Keyword arguments are used because pandas 2.0 no longer accepts
# index/columns/values positionally.
data_pivot = data.pivot(index='Date', columns='shop_item_id', values='item_cnt_month')
data_pivot.head(3)

#  1. 5. Visualize the top 10 shop-item combinations

In [None]:
# Rank the shop-item keys by total sales. Only item_cnt_month is summed:
# summing every column would also try to add up the Date and id columns.
df_2id = (
    data.groupby(['shop_item_id'])
    .agg({'item_cnt_month': 'sum'})
    .sort_values(by='item_cnt_month', ascending=False)
)
# Keys of the ten best-selling shop-item combinations.
best_sold_id = list(df_2id.index[0:10])
# Decode key = shop_id * 100000 + item_id with integer arithmetic only.
best_sold_itemid = [int(key) % 100000 for key in best_sold_id]
best_sold_shopid = [int(key) // 100000 for key in best_sold_id]
print('Best sold shop&item id:', best_sold_id, '\nitem_id:', best_sold_itemid, '\nshop_id:', best_sold_shopid)

In [None]:
# Keep only the monthly series of the ten best-selling shop-item keys.
data_top10 = data_pivot.loc[:, best_sold_id]
data_top10.head(3)

In [None]:
# Line chart: one line per top-10 shop-item key over time.
rcParams['figure.figsize'] = (12, 6)
data_top10.plot()
plt.legend(fontsize=11, loc='upper left')
plt.show()

In [None]:
# One box-plot panel per calendar year for the top-10 shop-item keys.
# (Pass rot=45 to boxplot() to rotate the x tick labels by 45 degrees.)
rcParams['figure.figsize'] = (20, 8)
yearly_groups = data_top10.groupby(Grouper(freq='A'))
yearly_groups.boxplot()
plt.show()

We observe that monthly sales differ widely between shop-item combinations. This is very common for retailers, especially across departments: at Walmart, for example, the sales of milk might be ten thousand times the sales of TVs. The sales of some shop-item combinations also show an obvious seasonal trend.

In [None]:
# Drop the wide tables used only for the plots above.
del data_pivot, data_top10

#  1.6. Visualize the best sales item and the worst sales item

In [None]:
# Total sales per item, best seller first.
df_item = (
    data.groupby(['item_id'])
    .agg({'item_cnt_month': 'sum'})
    .sort_values(by='item_cnt_month', ascending=False)
)
# Rows of the best-selling (first) and worst-selling (last) item.
best_item_id, worst_item_id = df_item.index[0], df_item.index[-1]
item_t1th = data.loc[data['item_id'] == best_item_id]
item_l1th = data.loc[data['item_id'] == worst_item_id]

In [None]:
# Horizontal box plots of the monthly counts of the best and worst item.
best_box = go.Box(x=np.array(item_t1th['item_cnt_month']),
                  name=f'The best item id:{df_item.index[0]}')
worst_box = go.Box(x=np.array(item_l1th['item_cnt_month']),
                   name=f'The worst item id:{df_item.index[-1]}')

fig = go.Figure()
fig.add_trace(best_box)
fig.add_trace(worst_box)
fig.show()

In [None]:
# The per-item totals were only needed for the box plots above.
del df_item

#  1.7. Statistics summary per shop-item combination

In [None]:
# Sum, mean, median and standard deviation of the monthly counts per shop-item key.
summary_stats = ['sum', 'mean', 'median', 'std']
data_summary = data.groupby(['shop_item_id']).agg({'item_cnt_month': summary_stats})
data_summary.head(3)

In [None]:
# The summary table was for inspection only; release it.
del data_summary

# 2. Feature Engineering

Reference :
['6 Powerful Feature Engineering Techniques For Time Series Data (using Python)'](https://www.analyticsvidhya.com/blog/2019/12/6-powerful-feature-engineering-techniques-time-series/#h2_11)

# 2. 1. Lag Features

In [None]:
# Lag features per shop-item series. This relies on the rows of each series
# being in chronological order, which is how `data` was assembled.
grouped = data.groupby('shop_item_id')['item_cnt_month']
# lag_1: sales of the previous month (t-1).
data['lag_1'] = grouped.shift(1)
# rmean_3: mean of the three previous months (t-1, t-2, t-3). The window
# must end at t-1 because the forecast cells compute rmean_3 as the mean of
# the last three known months; shifting by 2 would train on t-2..t-4 and
# feed the model a differently defined feature at prediction time.
data['rmean_3'] = grouped.transform(lambda x: x.shift(1).rolling(3).mean())
# The first months of every series have no complete history; drop them.
data.dropna(inplace=True)

# 2. 2. Date-Related Features

In [None]:
# Calendar features derived from the month-end date, stored as int16.
date_parts = data['Date'].dt
for column, values in (('Month', date_parts.month),
                       ('Quarter', date_parts.quarter),
                       ('Year', date_parts.year)):
    data[column] = values.astype('int16')
data.head(1)

# 2. 3. encode categorical features

In [None]:
# These id / calendar columns are already integers, so they are kept at
# their raw values and only given one common integer dtype.
#
# Re-coding them with LabelEncoder (rank of each value) would not help the
# model -- they are passed as plain numeric columns -- but it breaks the
# forecast cells, which rely on raw values: they match `data` against the
# raw ids in `test` (isin / merge on item_id and shop_id) and set
# Month=11, Quarter=4, Year=2015 directly.
cat_feats = ['item_id', 'shop_id', 'Year', 'Quarter', 'Month']
data[cat_feats] = data[cat_feats].astype('int32')

# 3. LightGBM forecast 

# 3. 1. Training & validation

In [None]:
from dateutil.relativedelta import relativedelta

# Time-based split: rows dated on or after the cutoff (three months before
# the last date) form the validation set, earlier rows the training set.
cutoff = data.Date.max() - relativedelta(months=3)
before_cutoff = data.Date < cutoff
from_cutoff = data.Date >= cutoff
xtrain = data.loc[before_cutoff].copy()
xvalid = data.loc[from_cutoff].copy()

In [None]:
# Separate the target from the features and remove the columns that must
# not be fed to the model.
non_feature_cols = ['Date', 'item_cnt_month', 'shop_item_id']
ytrain, yvalid = xtrain['item_cnt_month'], xvalid['item_cnt_month']

for split in (xtrain, xvalid):
    split.drop(non_feature_cols, axis=1, inplace=True)

In [None]:
# Wrap both splits as LightGBM datasets, keeping the raw data attached.
dtrain, dvalid = (
    lgb.Dataset(features, label=target, free_raw_data=False)
    for features, target in ((xtrain, ytrain), (xvalid, yvalid))
)

In [None]:
# Hyper-parameters for the validation run.
lgb_params = dict(
    objective='regression',
    metric='rmse',
    boosting='goss',            # gradient-based one-side sampling
    num_leaves=12,
    learning_rate=0.02,
    feature_fraction=0.8,       # speeds up training and limits over-fitting, range (0, 1]
    max_depth=5,                # limits over-fitting when #data is small
    verbosity=1,
    force_row_wise=True,
    early_stopping_rounds=100,  # stop when a validation metric has not improved in this many rounds
)

# Train for at most 1500 rounds, evaluating on both splits and logging
# every 100 rounds.
model = lgb.train(lgb_params, dtrain,
                  num_boost_round=1500,
                  valid_sets=[dtrain, dvalid],
                  verbose_eval=100)

# 3. 2. Plot important features

In [None]:
def plot_lgb_importances(model, plot=True, num=10):
    """Show the feature importances of a trained LightGBM booster.

    Builds a table with one row per feature holding its split count and its
    share of the total gain (in percent), sorted by gain in descending order.

    Parameters
    ----------
    model : object
        Trained booster exposing ``feature_importance(kind)`` and
        ``feature_name()``.
    plot : bool, default True
        If true, draw a horizontal bar chart of the gain share for up to the
        25 most important features.
    num : int, default 10
        Number of top rows of the table to print.

    Returns
    -------
    None
    """
    gain = model.feature_importance('gain')
    feat_imp = pd.DataFrame({'feature': model.feature_name(),
                             'split': model.feature_importance('split'),
                             'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
    if plot:
        plt.figure(figsize=(10, 4))
        sns.set(font_scale=1)
        sns.barplot(x="gain", y="feature", data=feat_imp[0:25])
        plt.title('feature')
        plt.tight_layout()
        plt.show()
    # Print the table exactly once, whether or not the chart was drawn
    # (with plot=False it used to be printed twice).
    print(feat_imp.head(num))

In [None]:
# Show the chart and print the 7 most important features. `num` is passed
# by keyword: the second positional parameter is `plot`, so a bare 7 would
# be taken as the plot flag and leave num at its default.
plot_lgb_importances(model, num=7)

# 3. 3. Forecast

In [None]:
# Banner, then shrink the dtypes of the test table.
print('-' * 80, 'test', sep='\n')
test = reduce_mem_usage(test)

**Scenario 1:** 

The shop-item combinations in the testing set also exist in the training set. We can directly reuse the last month's row of the identical shop-item combination: its monthly sales are the lag-1 feature of the testing month.

In [None]:
# Scenario 1 rows: last known month of every shop-item key that is also
# present in the test set.
in_test_set = data['shop_item_id'].isin(test['shop_item_id'])
is_last_month = data.Date == data.Date.max()
scenario1_cols = ['shop_item_id', 'shop_id', 'item_id', 'item_cnt_month',
                  'Month', 'Quarter', 'Year']
test1 = data.loc[in_test_set & is_last_month, scenario1_cols]
# The last month's sales become the lag feature of the month to predict ...
test1.rename(columns={'item_cnt_month': 'lag1'}, inplace=True)
# ... and the month value is advanced by one.
test1['Month'] += 1

In [None]:
# Mean monthly sales over the rows dated after (last date - 3 months),
# per shop-item key present in the test set.
cutoff2 = data.Date.max() - relativedelta(months=3)
recent_and_in_test = data['shop_item_id'].isin(test['shop_item_id']) & (data.Date > cutoff2)
L3months = data.loc[recent_and_in_test]
rmean_3 = (
    L3months
    .groupby(['shop_item_id'])
    .agg({'item_cnt_month': 'mean'})
    .reset_index()
)

In [None]:
# Attach the three-month mean to each scenario 1 row and name it rmean_3.
rmean_by_key = rmean_3.set_index('shop_item_id')
test1 = (
    test1
    .join(rmean_by_key, on='shop_item_id')
    .rename(columns={'item_cnt_month': 'rmean_3'})
)

In [None]:
# The composite key was only needed for the join; it is not a model feature.
test1 = test1.drop(columns=['shop_item_id'])

In [None]:
# Hyper-parameters for the final model: same settings as the validation
# run, without early stopping.
lgb_params = dict(
    objective='regression',
    metric='rmse',
    boosting='goss',  # gradient-based one-side sampling
    num_leaves=12,
    learning_rate=0.02,
    feature_fraction=0.8,
    max_depth=5,
    force_row_wise=True,
    verbosity=1,
)

In [None]:
# Final model: retrain on all available data for the number of rounds
# selected during validation, then predict scenario 1.
X_train = data.drop(['Date', 'item_cnt_month','shop_item_id'], axis = 1)
y_train = data['item_cnt_month']
lgbtrain_all = lgb.Dataset(data=X_train, label=y_train)
final_model = lgb.train(lgb_params, lgbtrain_all, num_boost_round=model.best_iteration)

# predict() consumes columns by position, not by name. test1 was assembled
# in a different column order than X_train (rmean_3 was joined last) and
# calls its lag column 'lag1', so rename it and reorder to the training
# layout before predicting.
test1_features = test1.rename(columns={'lag1': 'lag_1'})[X_train.columns.tolist()]
test_preds1 = final_model.predict(test1_features, num_iteration=model.best_iteration)

In [None]:
# Scenario 1 predictions keyed by shop and item.
scenario1_result = {
    'shop_id': test1['shop_id'].values,
    'item_id': test1['item_id'].values,
    'item_cnt_month': test_preds1,
}
test_pre1 = pd.DataFrame(scenario1_result)
test_pre1.head(5)

**Scenario 2:** 

The shop-item combinations in the testing set do not exist in the training set, but the items do. The lag features are therefore derived from the monthly sales of the item only.

In [None]:
# Scenario 2 rows: the item is known, the shop-item key is not.
item_is_known = test['item_id'].isin(data['item_id'])
pair_is_known = test['shop_item_id'].isin(data['shop_item_id'])
test2 = test[item_is_known & ~pair_is_known]

# Calculate lag1: mean sales per item over the rows of the last month.
l1month_sales = data.loc[data.Date == data.Date.max()]
lag1 = (
    l1month_sales
    .groupby(['item_id'])
    .agg({'item_cnt_month': 'mean'})
    .reset_index()
    .rename(columns={'item_cnt_month': 'lag1'})
)

In [None]:
# Mean monthly sales per item over the rows dated after (last date - 3 months).
cutoff2 = data.Date.max() - relativedelta(months=3)
L3months = data.loc[data.Date > cutoff2]
rmean_3 = (
    L3months
    .groupby(['item_id'])
    .agg({'item_cnt_month': 'mean'})
    .reset_index()
    .rename(columns={'item_cnt_month': 'rmean_3'})
)

In [None]:
# Attach the item-level lag features to the scenario 2 rows.
test2 = (
    test2
    .merge(lag1, how='left', on='item_id')
    .merge(rmean_3, how='left', on='item_id')
)
# Constant calendar features for the month to predict, then drop the
# columns that are not model features.
n_rows = test2.shape[0]
test2 = (
    test2
    .assign(Month=[11] * n_rows, Quarter=[4] * n_rows, Year=[2015] * n_rows)
    .drop(['ID', 'shop_item_id'], axis=1)
)

In [None]:
# predict() consumes columns by position, so line the scenario 2 features
# up with the names the model was trained on ('lag1' is called 'lag_1'
# there) instead of relying on how test2 happened to be assembled.
test2_features = test2.rename(columns={'lag1': 'lag_1'})[final_model.feature_name()]
test_preds2 = final_model.predict(test2_features, num_iteration=model.best_iteration)

In [None]:
# Scenario 2 predictions keyed by shop and item.
scenario2_result = {
    'shop_id': test2['shop_id'].values,
    'item_id': test2['item_id'].values,
    'item_cnt_month': test_preds2,
}
test_pre2 = pd.DataFrame(scenario2_result)
test_pre2.head(5)

In [None]:
# Bring both prediction sets onto the test rows. The second merge creates
# item_cnt_month_x (scenario 1) and item_cnt_month_y (scenario 2).
pair_keys = ['shop_id', 'item_id']
test = (
    test
    .merge(test_pre1, how='left', on=pair_keys)
    .merge(test_pre2, how='left', on=pair_keys)
)
# Prefer the scenario 1 value and fall back to scenario 2 where it is missing.
test['item_cnt_month'] = test['item_cnt_month_x'].fillna(test['item_cnt_month_y'])

In [None]:
# Keep only the columns that remain after removing the ids and both
# per-scenario prediction columns. 'item_cnt_month_y' has to be listed
# here; listing 'item_cnt_month_x' twice leaves the scenario 2 column in
# the submission file.
# NOTE(review): rows covered by neither scenario still have a missing
# item_cnt_month at this point -- confirm how they are filled before submitting.
submission_df = test.drop(['shop_id','item_id','shop_item_id','item_cnt_month_x','item_cnt_month_y'], axis=1)
submission_df.to_csv('submission.csv', index=False)

**Scenario 3:** 

See the next notebook.