**Import**

In [None]:
import os

# Lazily build the full path of every file found anywhere below the Kaggle
# input directory, then print one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
import pandas as pd # Data handling and managing
import numpy as np  # Handiling linear Algera
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline


# Load the historical demand records; the 'Date' column is parsed into
# datetimes while reading.
df = pd.read_csv(
    '../input/productdemandforecasting/Historical Product Demand.csv',
    parse_dates=['Date'],
)
# Preview the first 100 records.
df.head(100)

In [None]:
# Show the dtype pandas inferred for every column of the loaded frame.
df.dtypes

In [None]:
# Build the null mask once and summarise it along both axes.
null_mask = df.isnull()
columns_with_nan = null_mask.any().sum()
rows_with_nan = null_mask.any(axis=1).sum()

# Number of columns containing at least one NaN, out of all columns.
print(columns_with_nan, ' / ', len(df.columns))
# Number of rows containing at least one NaN, out of all rows.
print(rows_with_nan, '/', len(df))



In [None]:
# Remove every row that contains a null value and renumber the index so it is
# contiguous again. The result of reset_index() must be assigned: it returns a
# new frame and does not modify df in place.
df = df.dropna(axis=0).reset_index(drop=True)

# Preview rows 1-49 of the frame ordered by Date (the sorted order is only
# displayed, it is not stored back into df).
df.sort_values('Date')[1:50]

In [None]:
#df.dropna().sum()

In [None]:
# Order_Demand is read as text; some values are wrapped in parentheses, which
# blocks the conversion to integers. Strip them and convert to int64.
#  - regex=False makes '(' and ')' literal characters regardless of the pandas
#    version's default for `regex`.
#  - astype(str) first keeps the cell re-runnable after the column has already
#    been converted to integers.
# NOTE(review): parentheses may denote negative quantities (accounting
# notation); stripping them keeps only the magnitude - confirm this is intended.
df['Order_Demand'] = (
    df['Order_Demand']
    .astype(str)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .astype('int64')
)

# Preview the converted column (last expression, so the cell renders it).
df.head(100)

In [None]:
# Display rows 10-19 of the frame ordered by Date; the sorted copy is not
# assigned, so df itself keeps its current row order.
df.sort_values('Date')[10:20]

In [None]:
# Earliest and latest dates present in the dataset, shown as a (min, max) tuple.
df['Date'].min() , df['Date'].max()

In [None]:
from scipy.stats import norm, skew #Import Norm and skew for some statistics
from scipy import stats #Import stats
import statsmodels.api as sm #for decomposing the trends, seasonality etc.

from statsmodels.tsa.statespace.sarimax import SARIMAX #for the Seasonal Forecast


# Distribution of the target variable (Order_Demand).
from matplotlib import rcParams

# Default figure size, in inches.
rcParams['figure.figsize'] = (10, 5)

demand = df['Order_Demand']

# Histogram of the demand values with a fitted normal curve overlaid.
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm the
# installed version before relying on it.
sn.distplot(demand, fit=norm)

# Q-Q plot of the demand values against the normal distribution, on a new figure.
fig = plt.figure()
res = stats.probplot(demand, plot=plt)
plt.show()

In [None]:
# Number of rows recorded for each warehouse, largest count first.
df['Warehouse'].value_counts().sort_values(ascending=False)

In [None]:
# Total Order_Demand per warehouse, largest first.
# Only Order_Demand is aggregated: summing the whole frame would also try to
# sum the datetime 'Date' column, which recent pandas versions reject with a
# TypeError instead of silently dropping it.
df.groupby('Warehouse')[['Order_Demand']].sum().sort_values('Order_Demand', ascending=False)

In [None]:
# Make sure 'Date' is datetime-typed, then derive the calendar year of each
# record into a new 'Year' column.
order_dates = pd.to_datetime(df['Date'])
df['Date'] = order_dates
df['Year'] = order_dates.dt.year

In [None]:
# Count the Order_Demand records for every (Year, Warehouse) pair; the keys are
# kept as regular columns (as_index=False).
group_keys = ['Year', 'Warehouse']
selected_columns = group_keys + ['Order_Demand']
df2 = df[selected_columns].groupby(group_keys, as_index=False).count()

In [None]:
# Reshape to a wide table: one row per Year, one column per Warehouse, with the
# Order_Demand counts as cell values.
# Note: df2 is overwritten, so re-running this cell on its own fails because the
# pivoted frame no longer has 'Year' / 'Warehouse' columns.
df2  = df2.pivot(index='Year', columns='Warehouse', values='Order_Demand')

In [None]:
# Plotting expects integer years on the x axis, so cast the index values first.
df2.index = df2.index.map(int)

# Unstacked area chart: one translucent area per warehouse column.
area_options = dict(kind='area', stacked=False, figsize=(20, 10))
df2.plot(**area_options)

plt.xlabel('Years')
plt.ylabel('Number of Order_Demand')
plt.title('Order_Demand Trend')
plt.show()

In [None]:
# Per-year total across the warehouse columns.
# An existing 'Total' column is excluded from the sum so that re-running this
# cell does not add the previous total into the new one.
df2['Total'] = df2.loc[:, df2.columns != 'Total'].sum(axis=1)

In [None]:
# Pie chart of the per-year totals (one wedge per row of df2).
colors_list = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'lightgreen', 'pink', 'red']

# Offset ratio per wedge: pull out the first and the last year, leave the rest
# in place. matplotlib requires exactly one entry per wedge, so the list is
# derived from the number of rows instead of being hard-coded to seven years.
n_years = len(df2)
explode_list = [0.2 if position in (0, n_years - 1) else 0 for position in range(n_years)]

df2['Total'].plot(kind='pie',
                  figsize=(15, 6),
                  autopct='%1.1f%%',
                  startangle=90,
                  shadow=True,
                  labels=None,         # turn off labels on the pie itself; the legend carries them
                  pctdistance=1.12,    # ratio between the pie centre and the start of the autopct text
                  colors=colors_list,  # custom wedge colours
                  explode=explode_list
                  )

# Raise the title by the same 12% so it clears the percentage labels.
plt.title('Order_Demand Trend [2011 - 2017]', y=1.12)

# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')

# Legend entries are the years (the index of df2).
plt.legend(labels=df2.index, loc='upper left')

plt.show()


In [None]:
# Side-by-side box plots of Order_Demand per warehouse: raw values on the left,
# log1p-transformed values on the right.
rcParams['figure.figsize'] = 20, 5  # figure size in inches
f, axes = plt.subplots(1, 2)

# x / y are passed by keyword: recent seaborn releases no longer accept them as
# positional arguments.
normalDW = sn.boxplot(x=df['Warehouse'], y=df['Order_Demand'], ax=axes[0])

logWH = sn.boxplot(x=df['Warehouse'], y=np.log1p(df['Order_Demand']), ax=axes[1])


In [None]:
# Step 01/02: total Order_Demand per date, with Date as the index.
# Note: this overwrites df with the aggregated frame.
df = (
    df.groupby('Date')['Order_Demand']
    .sum()
    .reset_index()
    .set_index('Date')
)

# Step 03: mean of the daily totals within each month, stamped with the first
# day of that month ('MS' = month start).
daily_demand = df['Order_Demand']
monthly_avg_sales = daily_demand.resample('MS').mean()

# Months without any value are filled from the next available month (back-fill).
monthly_avg_sales = monthly_avg_sales.bfill()

# Visualise the monthly series.
monthly_avg_sales.plot(figsize=(20, 10))
plt.show()

**Displaying the trends with their seasons**

In [None]:
# Decompose the monthly series into its components using an additive model and
# plot the result.
# rcParams is imported from matplotlib directly: pylab is a legacy interface
# that only re-exports the same object.
from matplotlib import rcParams
import statsmodels.api as sm

rcParams['figure.figsize'] = 20, 10  # figure size in inches
decomposition = sm.tsa.seasonal_decompose(monthly_avg_sales, model='additive')
fig = decomposition.plot()
plt.show()

In [None]:
# First rows of df, which by this point holds the summed Order_Demand per Date
# (Date is the index) rather than the raw records.
df.head()

In [None]:
# First rows of df2: the Year x Warehouse table of record counts, including the
# 'Total' column added earlier.
df2.head()

**Creating the seasonal ARIMA (SARIMA) model**

In [None]:
import itertools

# Step 01: candidate orders. Each of p, d and q is tried at 0 and 1.
p = d = q = range(0, 2)

# Every (p, d, q) combination for the non-seasonal part ...
pdq = list(itertools.product(p, d, q))
# ... and the same combinations with a seasonal period of 12 appended.
seasonal_pdq = [(sp, sd, sq, 12) for sp, sd, sq in itertools.product(p, d, q)]

# Print a few example pairings of non-seasonal and seasonal orders.
examples = (
    ('SARIMAX1: {} x {}', 1, 1),
    ('SARIMAX2: {} x {}', 1, 2),
    ('SARIMAX3: {} x {}', 2, 3),
    ('SARIMAX4: {} x {}', 2, 4),
)
for template, order_idx, seasonal_idx in examples:
    print(template.format(pdq[order_idx], seasonal_pdq[seasonal_idx]))

# Step 02: grid-search the (p, d, q) x (P, D, Q, 12) combinations and report the
# AIC of every fitted model; the combination with the lowest AIC is preferred.
#
# The Akaike information criterion (AIC) estimates the relative quality of
# statistical models fitted to the same data. It rewards goodness of fit and
# penalises the number of estimated parameters, so between two models that fit
# about equally well the simpler one gets the lower AIC.
# Hence: the lower the AIC, the better.

best_aic = float('inf')
best_order = None
best_seasonal_order = None

for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(monthly_avg_sales,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
        except Exception as exc:
            # Still best-effort (the search continues), but the failure is
            # reported instead of silently swallowed, and KeyboardInterrupt is
            # no longer caught.
            print('SARIMA{}x{} - failed: {}'.format(param, param_seasonal, exc))
            continue

        # param_seasonal already contains the period (12), so it is not
        # appended to the label a second time.
        print('SARIMA{}x{} - AIC:{}'.format(param, param_seasonal, results.aic))

        # A NaN AIC never compares as smaller, so it cannot become the best.
        if results.aic < best_aic:
            best_aic = results.aic
            best_order = param
            best_seasonal_order = param_seasonal

print('Lowest AIC: SARIMA{}x{} - AIC:{}'.format(best_order, best_seasonal_order, best_aic))

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Fit the chosen specification: order (1, 1, 1) with seasonal order
# (0, 1, 1, 12), with the stationarity and invertibility constraints disabled
# as in the grid search.
model_options = dict(
    order=(1, 1, 1),
    seasonal_order=(0, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
mod = SARIMAX(monthly_avg_sales, **model_options)
results = mod.fit()

# Table 1 of the summary holds the coefficient estimates.
coefficient_table = results.summary().tables[1]
print(coefficient_table)