In [None]:
# Basic packages
import numpy as np 
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 20)
import pandas_profiling as pp
from pandas import Series as Series
import datatable as dt
import random as rd 
import datetime
import gc
import os
from tqdm import tqdm
import time
import sys


# Visualization
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
py.offline.init_notebook_mode(connected = True)

# For time series analysis
from datetime import datetime
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic, pacf
! pip install pmdarima
from pmdarima import auto_arima
from pandas.plotting import autocorrelation_plot
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
from fbprophet import Prophet

# Model Development Related
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error, median_absolute_error, mean_squared_error, mean_squared_log_error
from xgboost import XGBRegressor
from xgboost import plot_importance
import pickle

# Ignore warning 
import warnings
warnings.filterwarnings("ignore")

**Overview About Competition :**
1. This time-series dataset consists of daily sales data from one of the largest Russian software firms, 1C Company
2. We have to predict total sales for every product and store in the next month

#### Import Python Packages


#### Import All Files

In [None]:
# Collect the full path of every file under the Kaggle input directory, then print one per line
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

In [None]:
# Load the competition CSV files; all of them live in the same Kaggle input directory
competition_dir = '/kaggle/input/competitive-data-science-predict-future-sales/'
train_df = pd.read_csv(competition_dir + 'sales_train.csv')
test_df = pd.read_csv(competition_dir + 'test.csv')
items_df = pd.read_csv(competition_dir + 'items.csv')
items_cat_df = pd.read_csv(competition_dir + 'item_categories.csv')
shops_df = pd.read_csv(competition_dir + 'shops.csv')

# Parse the `date` column into datetime values, reading the day before the month
train_df['date'] = pd.to_datetime(train_df['date'], dayfirst=True)

In [None]:
# Print a concise summary of the training frame (columns, dtypes, memory usage)
train_df.info()

In [None]:
# Preview the first rows of the training data (date already parsed to datetime above)
train_df.head()

#### Basic Exploratory Data Analysis (EDA)

In [None]:
# Size and date coverage of the training data
print("Total Number Of row : ", train_df.shape[0])
print("Total Unique Days : ", train_df['date'].nunique())
print("Starting Days : ", train_df['date'].min())
print("Ending Days : ", train_df['date'].max())
print('---------------------------')

# Per-column null counts, keeping only the columns that actually contain nulls
null_counts = train_df.isna().sum()
missing_values_count = null_counts[null_counts > 0].sort_values(ascending=False)

# DataFrame.size is rows * columns; it replaces np.product(train_df.shape),
# which is deprecated and no longer available in recent NumPy releases.
total_cells = train_df.size
total_missing = missing_values_count.sum()
print('NAN Valued Columns: %d' % train_df.isna().any().sum())
print ("Missing data = ",str(round((total_missing/total_cells) * 100, 2))+'%')

*Quick Observations:*
> 1. A total of 1034 unique days means almost 3 (2.8) years of data are given
> 2. Data entry starts from January(2013) to October(2015)

In [None]:
# Cardinality of the main keys
print("Total Unique Date Block : ", train_df['date_block_num'].nunique())
print("Total Unique Shop : ", shops_df['shop_id'].nunique())
print("Total Unique Item Category : ", items_df['item_category_id'].nunique())
print("Total Unique Item : ", train_df['item_id'].nunique())

# Fully duplicated rows in the training data
print('Total Number of duplicates:', train_df.duplicated().sum())

# Shops of the training set that also appear in the test set
shop_in_test = train_df['shop_id'].isin(test_df['shop_id'])
print('Total Number of Common Unique Shops in Training&Test Set:',train_df.loc[shop_in_test, 'shop_id'].nunique())

# Items of the training set that also appear in the test set
item_in_test = train_df['item_id'].isin(test_df['item_id'])
print('Total Number of Common Unique Items in Training&Test Set:',train_df.loc[item_in_test, 'item_id'].nunique())

# Rows whose price is zero or negative
non_positive_price = train_df['item_price'] <= 0
print("Number Of Items with Price 0 : ", train_df.loc[non_positive_price, 'item_id'].count())

*Quick Observations:*
> 1. The 1034 days are also divided into 34 date blocks (2 years 10 months = 34 months)
2. Only 6 duplicate rows : <br>
a. We can remove them <br>
b. But I think only 6 duplicate rows will not make any difference

In [None]:
# Aggregate the daily records to one row per (month block, shop, item):
# first/last sale date, mean price and total units sold within that month.
# The columns are selected with a list (double brackets): selecting several columns
# after groupby with a bare tuple of names is deprecated and rejected by newer pandas.
monthly_sales = (
    train_df
    .groupby(["date_block_num", "shop_id", "item_id"])[["date", "item_price", "item_cnt_day"]]
    .agg({"date": ["min", "max"], "item_price": "mean", "item_cnt_day": "sum"})
)

In [None]:
# Preview the monthly aggregate (indexed by date_block_num, shop_id, item_id)
monthly_sales.head()

In [None]:
# Understand item category and items per category:
# count the items in every category and sort categories from largest to smallest
items_per_category = items_df.groupby(['item_category_id']).count()
items_per_category = items_per_category.sort_values(by='item_id', ascending=False)
print("Understand distribution Of product in each category :\n", items_per_category['item_id'].describe())

# Keep only the 15 largest categories for the chart
items_per_category = items_per_category.iloc[0:15].reset_index()

# Visualize top item category (categories with the greatest number of items).
# x and y are passed as keywords: newer seaborn releases no longer accept them positionally.
plt.figure(figsize=(14,6))
ax = sns.barplot(x=items_per_category.item_category_id, y=items_per_category.item_id, alpha=0.8)
plt.title("Items per Category")
plt.ylabel('Number of items', fontsize=10)
plt.xlabel('Item Category', fontsize=10)
plt.show()

# Counted within the 15 categories kept above
print("Category with more than 800 items : ", items_per_category[items_per_category['item_id']>800]['item_id'].count())

*Quick Observations:*
> 1. Some categories have a very large number of items (max - 5k+)
2. 50% of the categories have fewer than 50 items 
3. 75% of the categories have fewer than 300 items
4. On average, a category has fewer than 263 items
5. Only 4 item categories have a very large number of items (more than 800)

In [None]:
# Percentage of daily records whose sold quantity is negative
(train_df['item_cnt_day'] < 0).mean() * 100

*Quick Observations:*
> 1. item_cnt_day : number of products sold. You are predicting a monthly amount of this measure
2. We can see that almost 0.25% of the values are negative. Is that possible?
3. Maybe it happened by mistake, so convert those negative values to positive 

In [None]:
# Replace every negative "item_cnt_day" with its absolute value (vectorised)
# NOTE(review): negative counts could also be returned goods rather than typos - confirm
train_df['item_cnt_day'] = train_df['item_cnt_day'].abs()

##### Time Series Analysis

*Quick Observations:*
> First of all, we have to understand the time series data <br>
> Then we will apply the following models:
1. AR (Autoregressive Model)
2. MA (Moving Average Model)
3. ARIMA (Autoregressive Integrated Moving Average)
4. SARIMA (Seasonal Autoregressive Integrated Moving Average)

##### Item Count and Price 

In [None]:
# Total units sold in each month block (date_block_num)
item_sales_per_month = train_df.groupby(by='date_block_num')['item_cnt_day'].sum()

sales_fig, sales_ax = plt.subplots(figsize=(16, 4))
sales_ax.plot(item_sales_per_month, color='blue', linewidth=2, markersize=12)
sales_ax.set_title('Total item sales of the company')
sales_ax.set_xlabel('Month Number')
sales_ax.set_ylabel('Item Sales')

# Shade the two highlighted windows (month blocks 10-12 and 22-24)
highlighted_spans = (
    (10, 12, 'First Year Peak', 'darkorange'),
    (22, 24, 'Second Year Peak', 'green'),
)
for span_start, span_end, span_label, span_color in highlighted_spans:
    sales_ax.axvspan(span_start, span_end, linestyle=':', linewidth=2,
                     label=span_label, color=span_color, alpha=.2)
sales_ax.legend(fontsize=12, ncol=1, loc='upper right');

In [None]:
# Rolling mean and standard deviation of monthly sales over a trailing 15-month window
rolling_sales = item_sales_per_month.rolling(window=15, center=False)

plt.figure(figsize=(16,4))
plt.plot(rolling_sales.mean(), label='Rolling: Mean Item Sales');
plt.plot(rolling_sales.std(), label='Rolling: Standard Deviation');
plt.legend();

In [None]:
# Year & month wise item count and price
# Calendar year and month are derived from the parsed sale date
train_df['year'] = train_df['date'].dt.year
train_df['month'] = train_df['date'].dt.month

# Two stacked panels: units sold on top, average price below
ax1, ax2 = plt.subplots(nrows=2, ncols=1, figsize=(16, 10))[1]

# Total units sold per (year, month), one line per year
grouped_item_count = train_df.groupby(['year', 'month'], as_index=False)['item_cnt_day'].sum()
sns.pointplot(x='month', y='item_cnt_day', hue='year', data=grouped_item_count, ax=ax1)

# Mean item price per (year, month), one line per year
grouped_item_price = train_df.groupby(['year', 'month'], as_index=False)['item_price'].mean()
sns.pointplot(x='month', y='item_price', hue='year', data=grouped_item_price, ax=ax2)

In [None]:
# Multiplicative decomposition of monthly sales, assuming a 12-month seasonal cycle.
# `period` is the current name of the argument formerly called `freq`, which newer
# statsmodels releases no longer accept.
res = sm.tsa.seasonal_decompose(item_sales_per_month.values, period=12, model="multiplicative")
res.plot()
plt.show()

In [None]:
# Additive decomposition of monthly sales, assuming a 12-month seasonal cycle.
# `period` is the current name of the argument formerly called `freq`, which newer
# statsmodels releases no longer accept.
res = sm.tsa.seasonal_decompose(item_sales_per_month.values, period=12, model="additive")
res.plot()
plt.show()

*Quick Observations:*
> 1. Overall item sales follow a decreasing trend (after 5 months).
> 2. Clearly showing "seasonality" (peak sales around a time of year, mostly in Q4 of a year)
> 3. Last two months of the year having more sales.
> 4. 2015, expecting more sales.

##### Outliers

In [None]:
# Outdated items: catalogue items with no sale in the last 6 month blocks (date_block_num > 27)
recently_sold_ids = train_df.loc[train_df['date_block_num'] > 27, 'item_id']
out_dated_items = items_df.loc[~items_df['item_id'].isin(recently_sold_ids), 'item_id']

outdated_count = out_dated_items.nunique()
print("Outdated items in last 6 months (training set): ", outdated_count)
print("Outdated items in last 6 months % (training set): ", (outdated_count/train_df['item_id'].nunique())*100)

# How many of the outdated items are requested in the test set
outdated_in_test_count = test_df.loc[test_df['item_id'].isin(out_dated_items), 'item_id'].nunique()
print("Outdated items in last 6 months (test set): ", outdated_in_test_count)
print("Outdated items in last 6 months %(test set): ", (outdated_in_test_count/test_df['item_id'].nunique())*100)

*Quick Observations:*
> 1. Almost 60% items didn't sell in last 6 months
> 2. But in test set it's only 10%

In [None]:
# Distribution of the daily sold quantity, x-axis limited to [-100, 3000]
plt.figure(figsize=(16, 4))
plt.xlim(-100, 3000)
sns.violinplot(x=train_df['item_cnt_day'])

# Distribution of the item price, x-axis running from the minimum to 110% of the maximum
lowest_price = train_df['item_price'].min()
highest_price = train_df['item_price'].max()
plt.figure(figsize=(16, 4))
plt.xlim(lowest_price, highest_price*1.1)
sns.violinplot(x=train_df['item_price'])

In [None]:
# Item ids behind the extreme values: more than 500 units in a day, or a price above 50000
extreme_count_rows = train_df['item_cnt_day'] > 500
extreme_price_rows = train_df['item_price'] > 50000
print('Sale item outliers:',train_df.loc[extreme_count_rows, 'item_id'].unique())
print('Item price outliers:',train_df.loc[extreme_price_rows, 'item_id'].unique())

In [None]:
# Joint distribution of the daily sold quantity versus the item price.
# sns.jointplot is a figure-level function: it builds its own figure (sized through `height`),
# so no plt.figure() is created beforehand - that call only left an empty extra figure behind.
sns.jointplot(x="item_cnt_day", y="item_price", data=train_df, height=8)
plt.show()

*Quick Observations:*
> 1. Removing outliers may increase model performance.
> 2. Item sales and prices have some outliers.
> 3. Remove items with price > 100000 and item sales > 1000

##### Stationarity Checking

> Time series is stationary:- easier to model. <br>
*Stationarity Checking Methods:*

>1. ADF (Augmented Dickey-Fuller Test)
>2. KPSS
>3. PP (Phillips-Perron test)


###### ADF (Augmented Dickey-Fuller Test)

In [None]:
# Stationarity tests
def test_stationarity(timeseries):
    """Run the Augmented Dickey-Fuller test on `timeseries` and print a labelled report.

    The report lists the test statistic, the p-value, the number of lags used, the number
    of observations used and one critical value per significance level returned by the test.
    Nothing is returned.
    """
    print('Results of Dickey-Fuller Test:')
    print('-'*50)

    adf_result = adfuller(timeseries, autolag='AIC')

    # The first four entries of the result are scalars; the fifth maps level -> critical value
    headline_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    report = dict(zip(headline_labels, adf_result[0:4]))
    report.update(
        ('Critical Value (%s)' % level, critical_value)
        for level, critical_value in adf_result[4].items()
    )
    print (pd.Series(report))

test_stationarity(item_sales_per_month)

*Quick Observations:*
> 1. test-statistics : More  negative means more likely to be stationary
> 2. p-value (small): reject null-hypothesis, reject non-stationary
> 3. Here the p-value is high (not below 5%)
> 4. Non-Stationary Series : not suitable for time series model 
> 5. Convert Non Stationary : Stationary
> 6. To make it Stationary : remove trends and seasonality (p value below 5%)

In [None]:
# Create a differenced series
# Differencing is used to turn a non-stationary series into a stationary one:
# difference(t) = Y(t) - Y(t - interval)
def difference(dataset, interval=1):
    """Return the lag-`interval` differences of `dataset` as a pandas Series.

    Parameters
    ----------
    dataset : sequence, numpy array or pandas Series of numbers
        Observations in time order. Values are read by position, so a Series with any
        index (dates, labels not starting at 0, ...) is handled correctly.
    interval : int, default 1
        Lag between the two observations that are subtracted (e.g. 12 for a yearly lag
        on monthly data). Must be non-negative.

    Returns
    -------
    pandas.Series
        `len(dataset) - interval` values with a fresh 0..n-1 index; empty when
        `interval` is at least the length of `dataset`.

    Raises
    ------
    ValueError
        If `interval` is negative.
    """
    if interval < 0:
        raise ValueError("interval must be non-negative, got {}".format(interval))

    # Positional view of the data: indexing a Series with an integer looks up a label,
    # which fails or picks the wrong element whenever the index is not exactly 0..n-1.
    values = np.asarray(dataset)
    lagged_count = max(len(values) - interval, 0)
    return Series(values[interval:] - values[:lagged_count])

# Undo a differencing step for a single forecast value
def inverse_difference(last_ob, value):
    """Return `value` added back onto `last_ob`, the observation it was differenced against."""
    restored = value + last_ob
    return restored

In [None]:
# Compare the raw monthly series with its lag-1 and lag-12 differenced versions.
# Every panel title reports the ADF p-value of the series drawn in that panel.
detrended_ts = difference(item_sales_per_month)      # lag 1: month-over-month change
new_ts = difference(item_sales_per_month,12)         # lag 12: assumes a 12-month season

panels = (
    (311, 'Original', item_sales_per_month),
    (312, 'After De-trend', detrended_ts),
    (313, 'After De-seasonalization', new_ts),
)

plt.figure(figsize=(16,16))
for panel_position, panel_label, panel_series in panels:
    panel_p_value = round(adfuller(panel_series, autolag='AIC')[1], 4)
    plt.subplot(panel_position)
    plt.title(panel_label + ' (p-vale : ' + str(panel_p_value) + ')')
    plt.xlabel('Time')
    plt.ylabel('Sales')
    plt.plot(panel_series)
plt.show()

In [None]:
# Re-run the Dickey-Fuller report on new_ts, which at this point holds the
# lag-12 differenced (de-seasonalized) series computed in the previous cell
test_stationarity(new_ts)

> * Data Series becomes stationary 

##### Moving Average Model(MA) For Smoothing

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) between two series, in percent.

    Parameters
    ----------
    y_true : sequence, numpy array or pandas Series of numbers
        Observed values. A zero entry makes its term infinite or NaN, because each
        error is divided by the corresponding observed value.
    y_pred : sequence, numpy array or pandas Series of numbers
        Predicted values, in the same order and of the same length as `y_true`.

    Returns
    -------
    float
        mean(|(y_true - y_pred) / y_true|) * 100
    """
    # Compare element by element, by position: plain lists cannot be subtracted, and two
    # Series with different indexes would be aligned on labels and produce NaN.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [None]:
def plot_moving_average(series, window, plot_intervals=False, scale=1.96):
    """Plot the rolling mean of `series` together with the actual values.

    series         -- pandas Series to smooth
    window         -- rolling window size (number of observations)
    plot_intervals -- when True, also draw an upper and a lower bound around the rolling mean
    scale          -- multiplier applied to the residual standard deviation in the bounds
    """
    smoothed = series.rolling(window=window).mean()
    observed_tail = series[window:]

    plt.figure(figsize=(16,4))
    plt.title('Moving average\n window size = {}'.format(window))
    plt.plot(smoothed, 'g', label='Rolling mean trend')

    if plot_intervals:
        # Bounds = rolling mean +/- (MAE + scale * std of the residuals)
        smoothed_tail = smoothed[window:]
        mae = mean_absolute_error(observed_tail, smoothed_tail)
        deviation = np.std(observed_tail - smoothed_tail)
        margin = mae + scale * deviation
        plt.plot(smoothed + margin, 'r--', label='Upper bound / Lower bound')
        plt.plot(smoothed - margin, 'r--')

    plt.plot(observed_tail, label='Actual values')
    plt.legend(loc='best')
    plt.grid(True)

# Window of 1 month
plot_moving_average(item_sales_per_month, 1)

# Window of 1 quarter (3 months)
plot_moving_average(item_sales_per_month, 3)

# Window of 6 months, with the bounds drawn
plot_moving_average(item_sales_per_month, 6, plot_intervals=True)


##### Exponential Smoothing

In [None]:
def exponential_smoothing(series, alpha):
    """Return the simple exponential smoothing of `series` as a list.

    Parameters
    ----------
    series : sequence, numpy array or pandas Series of numbers
        Observations in time order. Values are read by position, so a Series with any
        index is handled correctly.
    alpha : float
        Smoothing factor: weight given to the newest observation; (1 - alpha) is the
        weight given to the previous smoothed value.

    Returns
    -------
    list
        Same length as `series`; the first element equals the first observation.
        An empty input yields an empty list.
    """
    # Positional view of the data: indexing a Series with an integer looks up a label,
    # which fails whenever the index is not exactly 0..n-1.
    values = np.asarray(series)
    if len(values) == 0:
        return []

    result = [values[0]]  # first value is same as series
    for observation in values[1:]:
        result.append(alpha * observation + (1 - alpha) * result[-1])
    return result
  
def plot_exponential_smoothing(series, alphas):
    """Plot `series` with one exponentially smoothed curve for every value in `alphas`."""
    plt.figure(figsize=(17, 8))

    smoothed_curves = [
        (exponential_smoothing(series, alpha), "Alpha {}".format(alpha))
        for alpha in alphas
    ]
    for curve, curve_label in smoothed_curves:
        plt.plot(curve, label=curve_label)

    plt.plot(series.values, "c", label = "Actual")
    plt.legend(loc="best")
    plt.axis('tight')
    plt.title("Exponential Smoothing")
    plt.grid(True);

# Compare a small and a larger smoothing factor
plot_exponential_smoothing(item_sales_per_month, [0.05, 0.3])


##### Double exponential smoothing

In [None]:
def double_exponential_smoothing(series, alpha, beta):
    """Return the double (level + trend) exponential smoothing of `series` as a list.

    Parameters
    ----------
    series : sequence, numpy array or pandas Series of numbers
        At least two observations in time order. Values are read by position, so a
        Series with any index is handled correctly.
    alpha : float
        Smoothing factor of the level.
    beta : float
        Smoothing factor of the trend.

    Returns
    -------
    list
        `len(series) + 1` values: the first observation, then one smoothed value per
        step, the last of which is a one-step-ahead forecast.

    Raises
    ------
    ValueError
        If `series` has fewer than two observations (the initial trend needs two).
    """
    # Positional view of the data: indexing a Series with an integer looks up a label,
    # which fails whenever the index is not exactly 0..n-1.
    values = np.asarray(series)
    n_obs = len(values)
    if n_obs < 2:
        raise ValueError("double_exponential_smoothing needs at least two observations")

    # Initial state: level is the first observation, trend is the first increment
    level, trend = values[0], values[1] - values[0]
    result = [values[0]]
    for n in range(1, n_obs + 1):
        # Past the last observation (forecasting) the previous result is fed back in
        value = values[n] if n < n_obs else result[-1]
        last_level, level = level, alpha * value + (1 - alpha) * (level + trend)
        trend = beta * (level - last_level) + (1 - beta) * trend
        result.append(level + trend)
    return result

def plot_double_exponential_smoothing(series, alphas, betas):
    """Plot `series` with one double-smoothed curve for every (alpha, beta) combination."""
    plt.figure(figsize=(17, 8))

    parameter_pairs = ((alpha, beta) for alpha in alphas for beta in betas)
    for alpha, beta in parameter_pairs:
        curve = double_exponential_smoothing(series, alpha, beta)
        plt.plot(curve, label="Alpha {}, beta {}".format(alpha, beta))

    plt.plot(series.values, label = "Actual")
    plt.legend(loc="best")
    plt.axis('tight')
    plt.title("Double Exponential Smoothing")
    plt.grid(True)

# Try every combination of a strong and a weak smoothing factor for level and trend
plot_double_exponential_smoothing(item_sales_per_month, alphas=[0.9, 0.02], betas=[0.9, 0.02])

##### Dickey-Fuller Test : Stationarity 

In [None]:
def tsplot(y, lags=None, figsize=(10, 8), style='bmh',title=''):
    """Draw a five-panel diagnostic figure for a time series.

    Panels: the series itself (top row, full width), its ACF and PACF (middle row),
    a QQ plot and a probability plot (bottom row).

    y       -- the series; anything that is not a pandas Series is wrapped in one
    lags    -- number of lags passed to the ACF/PACF plots
    figsize -- figure size passed to matplotlib
    style   -- matplotlib style context used while drawing
    title   -- title of the top panel
    """
    series = y if isinstance(y, pd.Series) else pd.Series(y)
    grid_shape = (3, 2)

    with plt.style.context(style):
        plt.figure(figsize=figsize)

        # Row 0: the series itself; rows 1 and 2: two side-by-side panels each
        ts_ax = plt.subplot2grid(grid_shape, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(grid_shape, (1, 0))
        pacf_ax = plt.subplot2grid(grid_shape, (1, 1))
        qq_ax = plt.subplot2grid(grid_shape, (2, 0))
        pp_ax = plt.subplot2grid(grid_shape, (2, 1))

        series.plot(ax=ts_ax)
        ts_ax.set_title(title)

        smt.graphics.plot_acf(series, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(series, lags=lags, ax=pacf_ax, alpha=0.5)

        sm.qqplot(series, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')

        distribution_params = (series.mean(), series.std())
        scs.probplot(series, sparams=distribution_params, plot=pp_ax)

        plt.tight_layout()
     

In [None]:
# Diagnostics of the raw monthly series
tsplot(item_sales_per_month, lags=3)

# First-order difference (each month minus the previous month) to make the process stationary
data_diff = item_sales_per_month.diff()

# The first element has no predecessor and is NaN, so it is skipped when plotting
tsplot(data_diff.iloc[1:], lags=3)

##### Data Cleaning

In [None]:
# Preview the training data again; by now it also carries the `year` and `month`
# columns and the absolute-valued `item_cnt_day` assigned in earlier cells
train_df.head()

##### ARIMA Model

In [None]:
# Finding best ARIMA parameter p,d,q
# auto_arima is run on the monthly total sales series with trace output enabled and
# warnings suppressed; the summary of the selected model is the cell output.
# NOTE(review): no seasonal period is passed although the analysis above points to a
# 12-month seasonality - confirm whether the default settings are intended here.
stepwise_fit = auto_arima(item_sales_per_month, trace=True, suppress_warnings=True)
stepwise_fit.summary()

In [None]:
# Build the two-column frame used for Prophet below:
# `ds` = first day of each month from 2013-01 to 2015-10, `y` = total units sold that month
month_starts = pd.date_range(start = '2013-01-01',end='2015-10-01', freq = 'MS')
item_sales_per_month_with_date = pd.DataFrame({
    'ds': month_starts,
    'y': item_sales_per_month.values,
})
item_sales_per_month_with_date.head()

In [None]:
# Prophet Model Development
# Create a Prophet model with the yearly seasonality component switched on
model = Prophet(yearly_seasonality=True) 
# Fit the model on the monthly frame built above (columns `ds` and `y`).
# NOTE(review): algorithm='Newton' is forwarded as an extra fitting option - presumably
# it selects the optimizer used by the backend; confirm against the Prophet version in use.
model.fit(item_sales_per_month_with_date, algorithm='Newton') 