In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading Inputs

In [None]:
# Load the item table and show transposed summary statistics of its numeric columns.
ITEMS_CSV = '/kaggle/input/competitive-data-science-predict-future-sales/items.csv'
items_df = pd.read_csv(ITEMS_CSV)
items_df.describe().T

In [None]:
# Load the item-category and shop tables from the competition input folder.
item_categories_df, shops_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv',
        '/kaggle/input/competitive-data-science-predict-future-sales/shops.csv',
    )
)


In [None]:
# Load the training sales records and display the frame.
SALES_TRAIN_CSV = '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
sales_train_df = pd.read_csv(SALES_TRAIN_CSV)
sales_train_df

## Exploratory Data Analysis

In [None]:
# Distribution of item_id across the rows of the training sales data.
import matplotlib.pyplot as plt
plt.figure(figsize=(6,5))
plt.hist(sales_train_df['item_id'])
plt.title('Distribution of item_id')
plt.xlabel('item_id')
plt.ylabel('Number of rows')
# Call the function: a bare `plt.show` only echoes the function object and renders nothing itself.
plt.show()

In [None]:
# For every item_id, count the distinct values seen in each of the other columns.
sales_train_df.groupby('item_id').nunique()

There are 21807 unique items

In [None]:
# Pairwise correlation between the ID, price and daily-count columns.
corr_columns = ['shop_id', 'item_id', 'item_price', 'item_cnt_day']
sales_train_df[corr_columns].corr()

No significant correlation between any 2 columns!

In [None]:
# EDA on the training data: for every shop_id, count the distinct values in each other column.
sales_train_df.groupby('shop_id').nunique()

In [None]:
# Distribution of shop_id across the rows of the training sales data.
plt.figure(figsize=(6,5))
plt.hist(sales_train_df['shop_id'])
plt.title('Distribution of shop_id')
plt.xlabel('shop_id')
plt.ylabel('Number of rows')
# Call the function: a bare `plt.show` only echoes the function object and renders nothing itself.
plt.show()

There are 59 unique shops

In [None]:
# Total item count sold by each shop in each date block.
# The column is selected explicitly before summing: `sum('item_cnt_day')` handed the
# string to the first positional parameter of `sum` (`numeric_only`) instead of
# choosing a column, so the other columns were aggregated too.
sales_train_df.groupby(['shop_id','date_block_num'])['item_cnt_day'].sum()

In [None]:
# Collapse the sales records to one row per (shop_id, item_id, date_block_num),
# where item_cnt_day holds the summed count. This is the table later used for training.
group_keys = ['shop_id', 'item_id', 'date_block_num']
sales_train_df_groupby = sales_train_df.groupby(group_keys, as_index=False)['item_cnt_day'].sum()

In [None]:
# Display the aggregated table: summed item_cnt_day per shop_id, item_id and date_block_num.
sales_train_df_groupby

**Inputs of Test.csv:shop_id,item_id** . 
**Output of Test.csv: item_cnt**. 

We'll use "sales_train_df_groupby" to train the model, splitting it into training and validation sets to measure model accuracy. For now, I'm thinking of using a multiple linear regression or a decision-tree-based regressor. We shall test the model's RMSE; the aim is to get a score as close to 1.00 as possible.

## Linear Regression for Sales Prediction

In [None]:
#importing needed packages
import matplotlib.pyplot as plt  # To visualize
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 
from sklearn import metrics

In [None]:
# Features: only the shop and item identifiers (date_block_num is deliberately left out).
X = sales_train_df_groupby[['shop_id', 'item_id']]
# Target: the summed item count of each aggregated row.
Y = sales_train_df_groupby.loc[:, 'item_cnt_day']

In [None]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
X_train.iloc[:2]

In [None]:
# Fit ordinary least squares on the training split, then predict the validation split.
linear_regressor = LinearRegression().fit(X_train, y_train)
Y_pred = linear_regressor.predict(X_test)

In [None]:
# Report MAE, MSE and RMSE of the linear model on the validation split.
mae_linear = metrics.mean_absolute_error(y_test, Y_pred)
mse_linear = metrics.mean_squared_error(y_test, Y_pred)
for label, score in (
    ('Mean Absolute Error:', mae_linear),
    ('Mean Squared Error:', mse_linear),
    ('Root Mean Squared Error:', np.sqrt(mse_linear)),
):
    print(label, score)

The RMSE is 7.5, which means the model's predictions deviate substantially from the actual sales values. So, let's try a Random Forest regressor and XGBoost to predict the sales values.

## **XGBoost Model**

In [None]:
# Attach the category table to each item via item_category_id.
item_cat_joined_df = pd.merge(items_df, item_categories_df, on='item_category_id')

In [None]:
# Enrich the aggregated sales rows with the item and category columns, matched on item_id.
sales_train_df_groupby_cat_join = pd.merge(sales_train_df_groupby, item_cat_joined_df, on='item_id')

In [None]:
# Display the aggregated sales table after joining the item and category columns.
sales_train_df_groupby_cat_join

In [None]:
# Feature matrix for the tree models: date block, shop, item and item category.
xgb_feature_columns = ['date_block_num', 'shop_id', 'item_id', 'item_category_id']
Xgboost_X_train = sales_train_df_groupby_cat_join[xgb_feature_columns]
Xgboost_X_train

In [None]:
# Target for the tree models, kept as a one-column DataFrame.
Xgboost_Y_train = sales_train_df_groupby_cat_join.loc[:, ['item_cnt_day']]
Xgboost_Y_train

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 80/20 train/validation split of the XGBoost features and target, with a fixed seed.
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(
    Xgboost_X_train,
    Xgboost_Y_train,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Gradient-boosted tree regressor: 100 trees, each up to depth 10, squared-error loss.
# 'reg:squarederror' is the current name of the objective formerly spelled 'reg:linear',
# which XGBoost has deprecated.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', max_depth=10, n_estimators=100)
xg_reg.fit(X_train_xgb, y_train_xgb)

In [None]:
# Predict the target for the held-out validation rows with the fitted XGBoost model.
preds = xg_reg.predict(X_test_xgb)

In [None]:
# Root mean squared error of the XGBoost predictions on the validation split.
mse_xgb = mean_squared_error(y_test_xgb, preds)
rmse = np.sqrt(mse_xgb)
print("RMSE: %f" % rmse)

With XGBoost, the RMSE has dropped to 4.68 from linear regression's 7.65. So XGBoost predicts the sales counts much more accurately than linear regression.

Now let's try a Random Forest regressor, deep learning methods and time series analysis to improve accuracy further.

## **Random Forest Regressor to predict sales**

In [None]:
# Random forest regressor trained on the same features and target as the XGBoost model.
from sklearn.ensemble import RandomForestRegressor
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(Xgboost_X_train, Xgboost_Y_train, test_size=0.2, random_state=123)
# Seed the forest so that re-running the notebook reproduces the same model and score.
RF_model = RandomForestRegressor(random_state=123)
# The target is a one-column DataFrame; flatten it to 1-D, the shape scikit-learn
# expects for a single-output regressor (a column vector triggers a conversion warning).
RF_model.fit(X_train_rf, y_train_rf.values.ravel())

In [None]:
# Predict the target for the held-out validation rows with the fitted random forest.
RF_pred=RF_model.predict(X_test_rf)

In [None]:
# Root mean squared error of the random forest predictions on the validation split.
rmse_rf = np.sqrt(mean_squared_error(y_test_rf, RF_pred))
# Report the random forest score itself (`rmse_rf`), not `rmse`, which holds the XGBoost score.
print("RMSE: %f" % (rmse_rf))

Random forest and XGBoost have almost the same prediction error.

## **Time Series Analysis**

In [None]:
# Total item count per date block, as a float series.
# The cast is chained into the assignment: `astype` returns a new Series, so calling
# `ts.astype('float')` on its own line left `ts` unchanged.
ts = sales_train_df.groupby(["date_block_num"])["item_cnt_day"].sum().astype('float')
plt.figure(figsize=(16,8))
plt.title('Total Sales of the company')
plt.xlabel('Time')
plt.ylabel('Sales')
plt.plot(ts);

In [None]:
# 12-period rolling mean and standard deviation of the total-sales series.
rolling_window = ts.rolling(window=12, center=False)
plt.figure(figsize=(16,6))
for rolling_stat, legend_label in (
    (rolling_window.mean(), 'Rolling Mean'),
    (rolling_window.std(), 'Rolling sd'),
):
    plt.plot(rolling_stat, label=legend_label);
plt.legend();

**Quick observations:** There is an obvious "seasonality" (Eg: peak sales around a time of year) and a decreasing "Trend".

Let's check that with a quick decomposition into Trend, seasonality and residuals.

In [None]:
import statsmodels.api as sm
# Multiplicative decomposition of the total-sales series with a period of 12.
res = sm.tsa.seasonal_decompose(ts.to_numpy(), model="multiplicative", period=12)
fig = res.plot()

In [None]:
# Additive decomposition of the total-sales series with a period of 12.
res = sm.tsa.seasonal_decompose(ts.to_numpy(), model="additive", period=12)
fig = res.plot()

There is seasonality, and the residual plot is not random: it seems to follow a pattern. Let's check whether the series is non-stationary using the Augmented Dickey-Fuller (ADF) stationarity test.

In [None]:
# TIME SERIES
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

In [None]:
# Stationarity tests
def test_stationarity(timeseries):
    """Run the augmented Dickey-Fuller test on `timeseries` and print the results.

    The test is run with `autolag='AIC'`. The printed summary lists the test
    statistic, the p-value, the number of lags used, the number of observations
    used, and one critical value per significance level returned by the test.
    Nothing is returned.
    """
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')
    # The first four entries of the result are scalars; the fifth maps level -> critical value.
    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    summary = dict(zip(labels, adf_result[0:4]))
    for level, critical_value in adf_result[4].items():
        summary['Critical Value (%s)'%level] = critical_value
    dfoutput = pd.Series(summary)
    print (dfoutput)

test_stationarity(ts)

##### Our p-value for the ADF test is below 5%, so we can reject the unit-root null hypothesis and treat the series as stationary.

### ARMA models:

MA - The next value in the series is a function of the errors (deviations from the mean) in the previous n values. AR - The next value in the series is a function of the previous n values themselves. ARMA - a mixture of both.

Now, How do we find out, if our time-series in AR process or MA process?

Let's find out!

In [None]:
def tsplot(y, lags=None, figsize=(10, 8), style='bmh',title=''):
    """Draw a five-panel diagnostic figure for a series.

    Parameters
    ----------
    y : pd.Series or array-like
        The series to plot; anything that is not a `pd.Series` is wrapped in one.
    lags : int or None
        Forwarded to the ACF and PACF plots.
    figsize : tuple
        Size of the figure.
    style : str
        Matplotlib style applied while the figure is drawn.
    title : str
        Title of the top (series) panel.

    The layout is 3x2: the series across the top row, ACF and PACF in the
    middle row, and a QQ plot plus a probability plot in the bottom row.
    Returns None.
    """
    series = y if isinstance(y, pd.Series) else pd.Series(y)
    with plt.style.context(style):
        plt.figure(figsize=figsize)
        grid = (3, 2)
        series_ax = plt.subplot2grid(grid, (0, 0), colspan=2)
        acf_ax, pacf_ax = plt.subplot2grid(grid, (1, 0)), plt.subplot2grid(grid, (1, 1))
        qq_ax, pp_ax = plt.subplot2grid(grid, (2, 0)), plt.subplot2grid(grid, (2, 1))

        # Top row: the raw series.
        series.plot(ax=series_ax)
        series_ax.set_title(title)

        # Middle row: autocorrelation and partial autocorrelation.
        smt.graphics.plot_acf(series, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(series, lags=lags, ax=pacf_ax, alpha=0.5)

        # Bottom row: QQ plot and probability plot parameterised by the sample mean and std.
        sm.qqplot(series, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')
        scs.probplot(series, sparams=(series.mean(), series.std()), plot=pp_ax)

        plt.tight_layout()
    return None

In [None]:
# Simulate an AR(1) process with alpha = 0.6
np.random.seed(1)
n_samples = int(1000)
a = 0.6
# White-noise innovations. `x` starts as an independent copy so that writing the
# simulated values into `x` never touches the noise array `w`.
w = np.random.normal(size=n_samples)
x = w.copy()

# Start at t=1 so x[0] is simply the first innovation; starting at t=0 would read
# x[-1], which wraps around to the last sample of the array.
for t in range(1, n_samples):
    x[t] = a*x[t-1] + w[t]
limit=12
_ = tsplot(x, lags=limit,title="AR(1)process")

## From the graph, it is visible that the ACF tails off and the PACF cuts off after lag 1

In [None]:
# Simulate an AR(2) process
n = 1000
alphas = np.array([.444, .333])
betas = np.array([0.])

# The sample generator takes lag polynomials: each starts with the zero-lag
# coefficient 1, and the AR coefficients are negated. A single zero MA
# coefficient makes this a pure AR(p) model.
# For more information see the examples at statsmodels.org
ar = np.concatenate(([1], -alphas))
ma = np.concatenate(([1], betas))

ar2 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n)
_ = tsplot(ar2, lags=12,title="AR(2) process")

In [None]:
# Simulate an MA(1) process
n = 1000
# A single zero AR coefficient makes this a pure MA model.
alphas = np.array([0.])
betas = np.array([0.8])
# Lag polynomials: prepend the zero-lag coefficient 1 and negate the AR coefficients.
ar = np.concatenate(([1], -alphas))
ma = np.concatenate(([1], betas))
ma1 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n)
limit = 12
_ = tsplot(ma1, lags=limit,title="MA(1) process")

In [None]:
# Simulate an ARMA(2, 2) model with alphas=[0.8,-0.65] and betas=[0.5,-0.7]
max_lag = 12

n = 5000 # lots of samples to help estimates
burn = n // 10 # number of initial samples passed to the generator as burn-in

alphas = np.array([0.8, -0.65])
betas = np.array([0.5, -0.7])
# Lag polynomials: prepend the zero-lag coefficient 1 and negate the AR coefficients.
ar = np.concatenate(([1], -alphas))
ma = np.concatenate(([1], betas))

arma22 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n, burnin=burn)
_ = tsplot(arma22, lags=max_lag,title="ARMA(2,2) process")

## **DEEP LEARNING METHOD**

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from tensorflow import keras
import tensorflow as tf

from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler,ReduceLROnPlateau

from sklearn.model_selection import KFold,GroupKFold
from tensorflow.keras import layers

In [None]:
from sklearn.preprocessing import RobustScaler,StandardScaler

# Standard scaler instance.
# NOTE(review): `sc` is not referenced by any later cell visible in this notebook — confirm whether it is still needed.
sc = StandardScaler()

In [None]:
# Wide sales table: one row per (shop_id, item_id), one column per date_block_num,
# each cell holding the summed item_cnt_day (0 where there were no sales).
# `values` is passed as a plain string so the result has single-level columns; a
# one-element list would produce a two-level column index, which does not merge
# cleanly with the flat columns of test.csv later on.
dataset = sales_train_df.pivot_table(index=['shop_id','item_id'], values='item_cnt_day', columns=['date_block_num'], fill_value=0, aggfunc='sum')

In [None]:
# Load the test set and display it.
TEST_CSV = '/kaggle/input/competitive-data-science-predict-future-sales/test.csv'
test_df = pd.read_csv(TEST_CSV)
test_df

In [None]:
# Move shop_id and item_id out of the index and back into ordinary columns.
dataset = dataset.reset_index()


In [None]:
# Keep exactly the (shop_id, item_id) pairs listed in the test set, in test-set order;
# pairs with no sales history get missing values from the left join.
dataset = test_df.merge(dataset, how='left', on=['item_id','shop_id'])

In [None]:
# Replace the missing values with 0.
dataset = dataset.fillna(0)

In [None]:
# Drop the identifier columns so only the per-date-block count columns remain.
dataset = dataset.drop(columns=['shop_id','item_id','ID'])

In [None]:
# Show (rows, columns) of the prepared table; the model below is built for 33 input time steps.
dataset.shape

In [None]:
# Each row of `dataset` is treated as one sequence.
sequence_matrix = dataset.values

# Training inputs: every column except the last, with a trailing axis of size 1 added.
X_train_deep = sequence_matrix[:, :-1, np.newaxis]
# Training label: the last column, kept two-dimensional.
y_train_deep = sequence_matrix[:, -1:]

# Prediction inputs: every column except the first, with the same trailing axis.
X_test_deep = sequence_matrix[:, 1:, np.newaxis]

# Check the resulting shapes.
print(X_train_deep.shape,y_train_deep.shape,X_test_deep.shape)

In [None]:
# Checkpoint callback: writes the model to Model.h5 only when val_loss improves.
save_best = tf.keras.callbacks.ModelCheckpoint(
    filepath="Model.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)

In [None]:
def build_model(n_timesteps=33, learning_rate=0.002):
    """Build, compile and summarise the bidirectional-LSTM regression model.

    Parameters
    ----------
    n_timesteps : int, default 33
        Length of each input sequence; the model expects inputs of shape
        (n_timesteps, 1). The default keeps the shape previously hard-coded.
    learning_rate : float, default 0.002
        Learning rate handed to the Adam optimizer.

    Returns
    -------
    The compiled `tf.keras` Sequential model. Its summary is printed as a
    side effect.
    """
    model = tf.keras.models.Sequential()

    # Two stacked bidirectional LSTMs; both return the full sequence so that the
    # Flatten layer below receives every time step.
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True), input_shape=(n_timesteps, 1)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
    model.add(tf.keras.layers.Dropout(0.2))

    model.add(tf.keras.layers.Flatten())

    # Dense head ending in a single linear output unit.
    model.add(tf.keras.layers.Dense(32, activation='relu', kernel_initializer='uniform'))
    model.add(tf.keras.layers.Dense(1))

    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate), loss = 'mse', metrics=['mse'])

    model.summary()

    return model

In [None]:
# Build and compile a fresh (untrained) model; build_model also prints its summary.
model = build_model()

In [None]:
# Fit the model only the first time; the checkpoint callback saves it to Model.h5 for future use,
# because fitting takes around 4-5 hours.
# On later runs, use the saved Model.h5 to predict future sales.
##model.fit(X_train_deep, y_train_deep, validation_split=0.2, epochs=40, batch_size=512, verbose=1, callbacks=[save_best])

In [None]:
#run this for first time while building the model for first time
#model.evaluate(X_train_deep, y_train_deep)

In [None]:
# Replace the freshly built model with the previously saved one.
# NOTE(review): this reads from '../input/saved-model/', a different location from the competition data — confirm that this input is attached before running.
model = tf.keras.models.load_model('../input/saved-model/Model.h5')

In [None]:
# Predict for the test sequences and limit every prediction to the range [0, 20].
raw_predictions = model.predict(X_test_deep, verbose=1)
submission = np.clip(raw_predictions, 0, 20)


In [None]:
# Pair each test ID with its flattened prediction in the two columns of the submission file.
submission = test_df[['ID']].assign(item_cnt_month=submission.ravel())
# Write the submission without the index column.
submission.to_csv('submission.csv', index=False)