In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARIMA

import warnings

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# Plotly and Cufflinks setup

#!pip install plotly
#!pip install cufflinks
#!pip install chart_studio

import chart_studio.plotly as py
import plotly.graph_objs as go

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
print(__version__) #requires version >= 1.9.0

import cufflinks as cf

#for Notebooks
init_notebook_mode(connected = True)

#for offline use
cf.go_offline()

# Loading Data

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
path = "../input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv"

In [None]:
df = pd.read_csv(path,  parse_dates = ["M-Y"], low_memory = False, na_values = ["NaN", 'NaT', ' -   '])

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df.head(5)

In [None]:
len(df)

In [None]:
df.info()

# Data Pre-Processing

### Dropping Unnecessary Columns

In [None]:
df.columns

#### Dropping Columns = ['Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25']

In [None]:
# Pick the placeholder "Unnamed: N" columns by name instead of by position.
# A positional slice (the last five columns) is not idempotent: re-running this
# cell after the drop below would select five real columns instead.
drop_columns_0 = df.columns[df.columns.str.startswith('Unnamed')]
drop_columns_0

In [None]:
df.drop(drop_columns_0, axis = 1, inplace = True)

#### Dropping Columns "item_id", "created_at", "increment_id", "sales_commission_code", "Working Date", "BI Status", "Year", "Month", "Customer Since", "FY"

In [None]:
drop_columns_1 = ["item_id", "created_at", "increment_id", "sales_commission_code", "Working Date" , "BI Status" , "Year", "Month", "Customer Since", "FY"]

In [None]:
df.drop(drop_columns_1, axis = 1, inplace = True)

### Re arranging Columns

In [None]:
df = df[['Customer ID', 'sku', 'category_name_1', 'status', 'qty_ordered',  'price', 'grand_total',
       'discount_amount', ' MV ', 'payment_method', 'M-Y']]

#### Check For Missing Data

In [None]:
missing = pd.DataFrame(df.isnull().sum())
missing.transpose()

In [None]:
total_missing_values = df.isnull().sum().sum()
total_missing_values

### Dropping Rows with Missing Values

In [None]:
df.dropna(axis = 0, how = 'any', inplace = True)

In [None]:
#Checking again for missing values
missing = pd.DataFrame(df.isnull().sum())
missing.transpose()

In [None]:
#Checking again
total_missing_values = df.isnull().sum().sum()
total_missing_values

In [None]:
#Checking again for missing values in ' MV ' col
df[' MV '].isnull().sum()#.sum()

In [None]:
df.head(5)

In [None]:
len(df)

In [None]:
df['M-Y'].unique()

In [None]:
df['M-Y'].nunique()

In [None]:
df['M-Y'].value_counts()

### Looking for 0 (zeros) in integer or float value columns

In [None]:
df[df['qty_ordered'] == 0]['qty_ordered'].value_counts()

In [None]:
df[df['price'] == 0]['price'].value_counts()

In [None]:
df[df[' MV '] == '0'][' MV '].value_counts()

In [None]:
df[df['grand_total'] == 0]['grand_total'].value_counts()

In [None]:
# Found 9465 cells containing 0.0 in column 'grand_total'
# Replace each 0.0 with the corresponding value from the ' MV ' column

In [None]:
def replace_zeros(x, y):
    """Return ``y`` when ``x`` equals zero; otherwise return ``x`` unchanged."""
    return y if x == 0 else x

In [None]:
df.columns

In [None]:
# Vectorised form of applying replace_zeros row by row: wherever grand_total is 0
# take the value from ' MV ', every other row keeps its grand_total.
# NOTE(review): ' MV ' is compared against the string '0' earlier in this notebook, so it
# may hold text rather than numbers - confirm its dtype before doing arithmetic on grand_total.
df['grand_total'] = df['grand_total'].mask(df['grand_total'] == 0, df[' MV '])

In [None]:
#Checking again
df[df['grand_total'] == 0]['grand_total'].value_counts()

### Now DataFrame Without 0(Zeros) and Null Values

In [None]:
df.head(5)

In [None]:
len(df)

In [None]:
df.columns

**Task 6: Predict and Forecasting**

In [None]:
df_no_of_orders_cat = pd.crosstab(df['M-Y'], df['category_name_1']) #, margins = True
df_no_of_orders_cat

In [None]:
df[df['M-Y'] == '2016-08-01']['category_name_1'].value_counts()

In [None]:
df_no_of_orders_cat.plot(figsize = (12, 8), legend = True)
plt.show()

In [None]:
# layout = go.Layout(title = "Number of Orders Per Category", 
#                    xaxis = {'title': 'Month-Year'}, 
#                    yaxis = {'title': 'No. of Orders'}, 
#                    showlegend = True, 
#                    width = 1000, 
#                    height = 600,)

# df_no_of_orders_cat.iplot(kind = 'line', layout = layout) 

In [None]:
# import plotly.express as px

# fig = px.line(df_no_of_orders_cat)

# fig.update_xaxes(title_text='Month-Year')
# fig.update_yaxes(title_text='No. of Orders')

# fig.show()

In [None]:
# NOTE: from this cell on `df` no longer refers to the cleaned order-level frame but to the
# month-by-category order-count table built with pd.crosstab above. Re-running any of the
# pre-processing cells after this point would act on the count table instead.
df = df_no_of_orders_cat

In [None]:
df.head(2)

In [None]:
#df['Appliances']

In [None]:
df_new = df["Men's Fashion"]

In [None]:
df_new.plot(figsize = (12, 8), legend = True)
plt.show()

In [None]:
df.index

### Determine the rolling statistics

In [None]:
rolmean = df_new.rolling(window = 12).mean()
rolstd = df_new.rolling(window = 12).std()

In [None]:
#rolmean, rolstd

### Plot rolling statistics

In [None]:
plt.figure(figsize = (10, 6))
orig = plt.plot(df_new, color = 'blue', label = 'Original')
mean = plt.plot(rolmean, color = 'red', label = 'Rolling Mean')
std = plt.plot(rolstd, color = 'green', label = 'Rolling Std')
plt.legend(loc = 'best')
plt.title("Rolling Mean & Standard Deviation")
plt.show(block = False)

### Perform Dickey-Fuller Test

In [None]:
# The p-value must be about 0.05 or less to reject the unit-root null hypothesis, i.e. to treat the data as stationary

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Run the augmented Dickey-Fuller test on the selected category series.
# The heading spelling now matches the one printed by test_stationarity.
print("Results of Dickey-Fuller Test:\n")

dftest = adfuller(df_new, autolag = 'AIC')

# The first four entries of the result tuple are labelled in this order.
dfoutput = pd.Series(dftest[0:4], index = ['Test Statistics', 'p-Value', '#Lags Used', 'No. of Obeservations Used'])

# Entry 4 is a mapping whose keys are used to label each critical value.
for key, value in dftest[4].items():
    dfoutput['Critical Value (%s)'%key] = value

dfoutput

### Estimate the Trend

In [None]:
# How to make data stationary depends on the data: sometimes you need to take the log, the square, or the cube of the data
# In this case the data is already stationary, as seen in the 'Dickey-Fuller Test' and in the plot of 'Rolling Statistics'
# The following steps are done only for illustration

In [None]:
plt.figure(figsize = (10, 6))
data_logScale = np.log(df_new)
plt.plot(data_logScale)
plt.show()

In [None]:
data_logScale

### Calculate Moving Average with the Same Window

In [None]:
plt.figure(figsize = (10, 6))
movingAverage = data_logScale.rolling(window = 12).mean()
movingSTD = data_logScale.rolling(window = 12).std()
plt.plot(data_logScale, color = 'blue')
plt.plot(movingAverage, color = 'red', label = 'Moving Average')
plt.plot(movingSTD, color = 'green', label = 'Moving STD')
plt.legend(loc = 'best')
plt.title('Moving Average and Moving STD Or Plotting with log of Data')
plt.show()

In [None]:
movingAverage

### Calculate difference b/w Log Scale Data and Moving Average

In [None]:
# Subtract the moving average from the log-scale series, discard the NaN entries
# (positions where the rolling window has no value yet) and keep the result as a
# single-column DataFrame.
data_LogScale_Minus_movingAverage = pd.DataFrame(
    (data_logScale - movingAverage).dropna()
)
data_LogScale_Minus_movingAverage

### Creating a function to perform test on data to check its stationarity and plot of 'Rolling statistics' and 'Dickey-Fuller Test'

In [None]:
#this step is the copy of above steps, just putting all together in a function

In [None]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(timeseries, window = 12):
    """Plot rolling statistics of a series and run the augmented Dickey-Fuller test.

    Parameters
    ----------
    timeseries : pandas.Series or single-column pandas.DataFrame
        Observations to inspect; must support ``.rolling`` and ``.dropna``.
    window : int, default 12
        Number of observations in each rolling window used for the plotted
        mean and standard deviation.

    Returns
    -------
    pandas.Series
        Test statistic, p-value, number of lags used and number of observations
        used, followed by one ``Critical Value (<level>)`` entry per level
        reported by ``adfuller``.

    Side effects
    ------------
    Shows a matplotlib figure and prints a heading to stdout.
    """

    # Determine Rolling Statistics

    rolling_window = timeseries.rolling(window = window)
    average = rolling_window.mean()
    std = rolling_window.std()

    # Plot Rolling Statistics

    plt.figure(figsize = (10, 6))
    plt.plot(timeseries, color = 'blue', label = 'Original')
    plt.plot(average, color = 'red', label = 'Average')
    plt.plot(std, color = 'green', label = 'STD')
    plt.legend(loc = 'best')
    plt.title('Average and STD with Data')
    plt.show()

    # Perform Dickey-Fuller Test

    print("\n")
    print("Results of Dickey-Fuller Test: \n")
    # Differenced / detrended inputs may still carry NaN entries; drop them so the
    # test is run on actual observations only (NaN-free input is unaffected).
    dftest = adfuller(timeseries.dropna(), autolag = 'AIC')

    dfoutput = pd.Series(dftest[0:4], index = ['Test Statistics', 'p-Value', '#Lags Used', 'No. of Obeservations Used'])

    # Entry 4 is a mapping whose keys are used to label each critical value.
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value

    return dfoutput

In [None]:
# pass original data or log of data or square/cube of data
# data either in Series or DataFrame
test_stationarity(data_LogScale_Minus_movingAverage)

In [None]:
test_stationarity(data_logScale)

In [None]:
test_stationarity(df_new)

In [None]:
test_stationarity(df_new)

In [None]:
#Write reasons which data you are using

# As seen from the results above, the p-value increases after subtracting the moving average, so the original data or data_logScale was sufficient to proceed further, since it had the lower p-value
# We are using data_logScale in this case

### Calculate Weighted Average of Time Series

#### Reason: To see the Trends present in Time Series

In [None]:
exponentialDecayWeightedAverage = data_logScale.ewm(halflife = 12, min_periods = 0, adjust = True).mean()

plt.figure(figsize = (10, 6))
plt.plot(data_logScale, color = 'blue')
plt.plot(exponentialDecayWeightedAverage, color = 'red')
plt.show()

In [None]:
# The red line in plot above shows data has slightly increased trend

### Calculate diff b/w logScale and exponentialDecayWeightedAverage

In [None]:
data_logScale_Minus_exponentialDecayWeightedAverage = data_logScale - exponentialDecayWeightedAverage

### Test Stationarity

In [None]:
test_stationarity(data_logScale_Minus_exponentialDecayWeightedAverage)

### Forecasting

In [None]:
# Now we know that our data is stationary

In [None]:
# Now we will shift the values into time series so that we can use it in the forecasting

In [None]:
# First-order differencing: subtract the previous value from each actual value

In [None]:
# First-order difference of the log series: each value minus the one before it (lag of 1)
data_shifted = data_logScale.diff()

plt.figure(figsize = (10, 6))
plt.plot(data_shifted)
plt.show()

In [None]:
# ARIMA MODEL

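# It combines 3 (three) components
# 1) AR = Auto-Regressive model
# 2) MA = Moving-Average model
# 3) I = Integrated (differencing of the series)

# It takes 3 (three) parameters, passed as order = (p, d, q)
# p = number of autoregressive lags, d = degree of differencing, q = number of moving-average lags
# d = 1 (since we differenced once, i.e. shifted by 1)
# p and q are estimated in later steps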

In [None]:
data_shifted.values

In [None]:
# Drop Any NaN values
data_shifted.dropna(inplace = True)

In [None]:
test_stationarity(data_shifted)

In [None]:
# Output in above plot is quite flat
# The null hypothesis of the augmented 'Dickey-Fuller Test' is that the series is non-stationary (has a unit root)
# Here the null hypothesis is rejected, hence we can say that the time series is stationary now

In [None]:
# Now lets see the componets of time series

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
time_series = data_logScale
decomposition = seasonal_decompose(time_series, model = 'additive')

observed = decomposition.observed # Original
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# Plot 

# plt.figure(figsize = (10, 6))
# plt.subplot(411)
# plt.plot(observed, color = 'blue', label = 'Observed')
# plt.legend(loc = 1)


# plt.subplot(412)
# plt.plot(trend, color = 'black', label = 'Trend')
# plt.legend(loc = 1)


# plt.subplot(413)
# plt.plot(seasonal, color = 'green', label = 'Seasonal')
# plt.legend(loc = 1)


# plt.subplot(414)
# plt.plot(residual, color = 'yellow', label = 'Residual')
# plt.legend(loc = 1)

# plt.tight_layout()
# plt.show() 

In [None]:
# Draw one interactive line chart per decomposition component
lst_time_series_decompostion = [observed, trend, seasonal, residual]
str_lst = ['Observed', 'Trend', 'Seasonal', 'Residual']
mycolor = ['Black', 'Purple', 'Green', 'Red', 'Blue', 'Orange', 'Grey', 'Violet']

for i, component in enumerate(lst_time_series_decompostion):

    # Same layout for every chart; only the title changes with the component name
    layout = go.Layout(
        title = "Time Series-Decomposed: " + str_lst[i],
        xaxis = {'title': 'Month-Year'},
        showlegend = True,
        width = 700,
        height = 300,
    )

    component.iplot(kind = 'line', layout = layout, color = mycolor[i])
    

In [None]:
# Check noise if its stationary or not

In [None]:
decomposedLogData = residual
decomposedLogData.dropna(inplace = True)
test_stationarity(decomposedLogData)

### Plot ACF and PACF Graphs

In [None]:
#### ACF will give Q
#### PACF will give P

In [None]:
# from statsmodels.tsa.stattools import acf, pacf

# lag_acf = acf(data_shifted, nlags = 11) #nlag must be less than the half of dataset
# lag_pacf = pacf(data_shifted, nlags = 11, method ='ols')

# # --Plot ACF and PACF Graphs--

# fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 5))
# fig.suptitle('')

# # Plot ACF Graph

# ax1.plot(lag_acf)
# ax1.axhline(y = 0, linestyle = '--', color = 'grey')
# ax1.axhline(y = -1.96/np.sqrt(len(data_shifted)), linestyle = '--', color = 'grey')
# ax1.axhline(y = 1.96/np.sqrt(len(data_shifted)), linestyle = '--', color = 'grey')
# ax1.set_title("Auto Correlation")

# # Plot PACF Graph

# ax2.plot(lag_pacf)
# ax2.axhline(y = 0, linestyle = '--', color = 'grey')
# ax2.axhline(y = -1.96/np.sqrt(len(data_shifted)), linestyle = '--', color = 'grey')
# ax2.axhline(y = 1.96/np.sqrt(len(data_shifted)), linestyle = '--', color = 'grey')
# ax2.set_title("Partial Auto Correlation")

# plt.tight_layout()
# plt.show()

In [None]:
from statsmodels.tsa.stattools import acf, pacf

# nlags must be less than half of the dataset
lag_acf = pd.DataFrame(acf(data_shifted, nlags = 11))
lag_pacf = pd.DataFrame(pacf(data_shifted, nlags = 11, method ='ols'))

# One interactive line chart per correlation function

lag_acf_pacf = [lag_acf, lag_pacf]
lst_lag_acf_pacf = ['ACF', 'PACF']
mycolor = ['Black', 'Purple', 'Green', 'Red', 'Blue', 'Orange', 'Grey', 'Violet']

for i, (lag_frame, chart_title) in enumerate(zip(lag_acf_pacf, lst_lag_acf_pacf)):

    layout = go.Layout(
        title = chart_title,
        showlegend = True,
        width = 700,
        height = 500,
        hovermode = 'closest',
    )

    lag_frame.iplot(kind = 'line', layout = layout, color = mycolor[i])

In [None]:
# Now look for the x-axis value where the curve reaches y = 0
# In both graphs that x-axis value is 1, therefore
# P, Q = 1 (almost)

### ARIMA Model

In [None]:
from statsmodels.tsa.arima_model import ARIMA

In [None]:
## MA(1) model on the once-differenced series: ARIMA(0, 1, 1)

In [None]:
# Fit ARIMA with order (p, d, q) = (0, 1, 1): no autoregressive lag, one difference, one moving-average lag.
# NOTE(review): despite its name, `results_AR` therefore holds a moving-average fit (p = 0, q = 1).
# NOTE(review): statsmodels.tsa.arima_model.ARIMA appears to be deprecated/removed in recent
# statsmodels releases - confirm the installed version still provides it.
model = ARIMA(data_logScale, order = (0, 1, 1)) #p = 0, d = 1, q = 1
results_AR = model.fit(disp = -1)

# Overlay the fitted values (red) on the differenced log series
plt.figure(figsize = (10, 5))
plt.plot(data_shifted)
plt.plot(results_AR.fittedvalues, color = 'red')
# The title reports the sum of squared differences between fitted values and data_shifted
plt.title('Residual Sum of Squares: %.4f'% sum((results_AR.fittedvalues - data_shifted)**2))
plt.tight_layout()
plt.show()

In [None]:
# ideal Rss = 1.0292
# RSS value in this case is quite high 

In [None]:
## AR(1) model on the once-differenced series: ARIMA(1, 1, 0)
# NOTE(review): despite its name, `results_MA` holds an autoregressive fit (p = 1, q = 0).

model = ARIMA(data_logScale, order = (1, 1, 0)) #p = 1, d = 1, q = 0
results_MA = model.fit(disp = -1)

# Overlay the fitted values (red) on the differenced log series
plt.figure(figsize = (10, 5))
plt.plot(data_shifted)
plt.plot(results_MA.fittedvalues, color = 'red')
# The title reports the sum of squared differences between fitted values and data_shifted
plt.title('Residual Sum of Squares: %.4f'% sum((results_MA.fittedvalues - data_shifted)**2))
plt.tight_layout()
plt.show()

In [None]:
# ARIMA Model combining autoregressive and moving-average terms
# NOTE(review): the order actually fitted is (2, 1, 1); an earlier comment in this notebook
# concluded p = q = 1 - confirm which value of p is intended.
model = ARIMA(data_logScale, order = (2, 1, 1)) #p = 2, d = 1, q = 1
results_ARIMA = model.fit(disp = -1)

# Overlay the fitted values (red) on the differenced log series
plt.figure(figsize = (10, 5))
plt.plot(data_shifted)
plt.plot(results_ARIMA.fittedvalues, color = 'red')
# The title reports the sum of squared differences between fitted values and data_shifted
plt.title('Residual Sum of Squares: %.4f'% sum((results_ARIMA.fittedvalues - data_shifted)**2))
plt.tight_layout()
plt.show()

In [None]:
# Combined RSS must be less
# in this case RSS too high in any case
# which is trouble

In [None]:
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy = True)
predictions_ARIMA_diff.head()

In [None]:
# Convert to Cummulative Sum
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_diff_cumsum.head()

In [None]:
data_logScale.index

In [None]:
type(data_logScale)

In [None]:
# Base level for the reconstruction: the first observed log value repeated over the whole
# index. The cumulative sum of the fitted differences is added on top of this base in the
# next cell; seeding with the full observed series instead would add the predicted changes
# to the actual values rather than rebuilding the series from its starting point.
predictions_ARIMA = pd.Series(data_logScale.iloc[0], index = data_logScale.index)

In [None]:
predictions_ARIMA = predictions_ARIMA.add(predictions_ARIMA_diff_cumsum, fill_value = 0)
predictions_ARIMA.head()

In [None]:
# Since we took the log earlier, therefore convert it back
predictions_ARIMA = np.exp(predictions_ARIMA)

In [None]:
plt.figure(figsize = (10, 5))
plt.plot(df_new, color = 'blue')
plt.plot(predictions_ARIMA, color = 'orange')
plt.tight_layout()
plt.show()

In [None]:
len(data_logScale)

In [None]:
data_logScale

In [None]:
# Create the axes explicitly and hand it to plot_predict. Without `ax`, plot_predict
# draws on a figure of its own and the one opened by plt.figure() stays empty.
fig, ax = plt.subplots(figsize = (10, 5))
results_ARIMA.plot_predict(start = 1, end = 25, ax = ax)
plt.tight_layout()
plt.show()
#results_ARIMA.forecast(steps = 100)


# start = 24 # which is the index position in time series
#end = 24 + 3 # 3 is no. of data points for future