## Import the dataset and preview

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import matplotlib.ticker as ticker
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw sales records; the trailing expression renders the first five rows.
sales_csv_path = '../input/sample-sales-data/sales_data_sample.csv'
df = pd.read_csv(sales_csv_path, engine='python')
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

## Preprocessing
* Check for null values and handle them. Since there are a lot of null values in ADDRESSLINE2, STATE, POSTALCODE, and TERRITORY, I will drop those columns; COUNTRY and CITY will represent the geographical information of each order.
* Sorting data by 'ORDERDATE' column
* Convert 'ORDERDATE' column to pandas datetime format
* Setting the index to be 'ORDERDATE' column

In [None]:
# Count the missing entries of every column (isna is the alias of isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Columns singled out in the preprocessing notes as mostly empty; remove them from the frame.
to_drop = ['ADDRESSLINE2', 'STATE', 'POSTALCODE', 'TERRITORY']
df = df.drop(columns=to_drop)

In [None]:
# Parse the order dates so they can be sorted and later used as a time index.
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'])

# Independent snapshot for the time-series section, which still needs ORDERDATE
# as a column after `df` has been re-indexed. An explicit copy is used because
# wrapping a frame in pd.DataFrame(...) does not copy its data by default, so
# the snapshot would otherwise depend on how later in-place edits of `df` are
# implemented.
dft = df.copy()

In [None]:
# Put the rows in chronological order, then promote ORDERDATE from column to index.
df = df.sort_values(by='ORDERDATE').set_index('ORDERDATE')

# Exploratory Data Analysis and Visualization

**Find out 20 Most Valuable Customers**

The Most Valuable Customers are the customers who are the most profitable for a company (they account for the largest sales). These customers buy more, or buy higher-value products, than other customers.

In [None]:
# Revenue per customer, largest first, limited to the top 20 and rounded to 3 decimals.
# SALES is selected *before* summing so that no other column of the frame is
# aggregated; this keeps the cell fast and independent of how a given pandas
# version treats non-numeric columns in groupby().sum().
top_customer = (
    df.groupby('CUSTOMERNAME')['SALES']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .round(3)
    .reset_index()  # back to a frame with columns CUSTOMERNAME, SALES and a 0..n-1 index
)

In [None]:
# Bar chart of the top customers, each bar annotated with its revenue.
plt.figure(figsize=(15, 5))
plt.title('20 Most Valueable Customer (2003 - 2005)', fontsize=18)
plt.bar(
    top_customer['CUSTOMERNAME'],
    top_customer['SALES'],
    color='#37C6AB',
    edgecolor='black',
    linewidth=1,
)
plt.xlabel('Customer Name', fontsize=15)
plt.ylabel('Revenue', fontsize=15)
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=12)

# Labels of bars above 600000 are shifted down by 270000; all other labels are
# placed 50000 above the bar top.
for bar_pos, revenue in top_customer['SALES'].items():
    label_y = revenue - 270000 if revenue > 600000 else revenue + 50000
    plt.text(bar_pos, label_y, '$' + str(revenue), fontsize=12, rotation=90,
             color='black', ha='center')

**Find out 20 Highest Revenue by Country**

Here are the top 20 countries that generated the highest revenue.

In [None]:
# Revenue per country, largest first, limited to the top 20 and rounded to 3 decimals.
# SALES is selected *before* summing so that no other column of the frame is
# aggregated; this keeps the cell fast and independent of how a given pandas
# version treats non-numeric columns in groupby().sum().
top_country = (
    df.groupby('COUNTRY')['SALES']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .round(3)
    .reset_index()  # back to a frame with columns COUNTRY, SALES and a 0..n-1 index
)

In [None]:
# Bar chart of the top countries, each bar annotated with its revenue.
plt.figure(figsize=(15, 5))
plt.title('20 Highest Revenue by Country (2003 - 2005)', fontsize=18)
plt.bar(
    top_country['COUNTRY'],
    top_country['SALES'],
    color='#37C6AB',
    edgecolor='black',
    linewidth=1,
)
plt.xlabel('Country', fontsize=15)
plt.ylabel('Revenue', fontsize=15)
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=12)

# Labels of bars above 3000000 are shifted down by 1200000; all other labels
# are placed 100000 above the bar top.
for bar_pos, revenue in top_country['SALES'].items():
    label_y = revenue - 1200000 if revenue > 3000000 else revenue + 100000
    plt.text(bar_pos, label_y, '$' + str(revenue), fontsize=12, rotation=90,
             color='black', ha='center')

**Find out 20 Highest Revenue by City**

Next, revenue is visualized by city. Here are the top 20 cities that generated the highest revenue.

In [None]:
# Revenue per city, largest first, limited to the top 20 and rounded to 3 decimals.
# SALES is selected *before* summing so that no other column of the frame is
# aggregated; this keeps the cell fast and independent of how a given pandas
# version treats non-numeric columns in groupby().sum().
top_city = (
    df.groupby('CITY')['SALES']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .round(3)
    .reset_index()  # back to a frame with columns CITY, SALES and a 0..n-1 index
)

In [None]:
# Bar chart of the top cities, each bar annotated with its revenue.
plt.figure(figsize=(15, 5))
plt.title('20 Highest Revenue by City (2003 - 2005)', fontsize=18)
plt.bar(
    top_city['CITY'],
    top_city['SALES'],
    color='#37C6AB',
    edgecolor='black',
    linewidth=1,
)
plt.xlabel('City', fontsize=15)
plt.ylabel('Revenue', fontsize=15)
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=12)

# Labels of bars above 800000 are shifted down by 350000; all other labels are
# placed 35000 above the bar top.
for bar_pos, revenue in top_city['SALES'].items():
    label_y = revenue - 350000 if revenue > 800000 else revenue + 35000
    plt.text(bar_pos, label_y, '$' + str(revenue), fontsize=12, rotation=90,
             color='black', ha='center')

Then visualized which products give the highest revenue

In [None]:
# Revenue per product line, largest first.
# SALES is selected *before* summing so that no other column of the frame is
# aggregated; this keeps the cell fast and independent of how a given pandas
# version treats non-numeric columns in groupby().sum().
top_product = (
    df.groupby('PRODUCTLINE')['SALES']
    .sum()
    .sort_values(ascending=False)
    .reset_index()  # frame with columns PRODUCTLINE, SALES and a 0..n-1 index
)

# Grand total as a display string: truncated to whole dollars and prefixed with '$'.
total_revenue_product = '$' + str(int(top_product['SALES'].sum()))

In [None]:
# Matplotlib defaults (figure size, font size, font weight) applied from this cell on.
plt.rcParams.update({
    'figure.figsize': (13, 7),
    'font.size': 12.0,
    'font.weight': 6,
})
def autopct_format(values):
    """Build a callback that turns a pie-wedge percentage into a dollar label.

    Parameters
    ----------
    values : iterable of numbers
        The values the pie is drawn from; their sum is the 100 % reference.

    Returns
    -------
    callable
        ``my_format(pct)`` -> ``' $<amount>'``, where ``<amount>`` is ``pct``
        percent of the total, rounded to the nearest integer.
    """
    # The total is identical for every wedge, so it is computed once here
    # instead of on every callback invocation. This also keeps the result
    # correct when `values` is a one-shot iterable such as a generator.
    total = sum(values)

    def my_format(pct):
        val = int(round(pct * total / 100.0))
        return ' ${v:d}'.format(v=val)

    return my_format
# Donut chart of revenue per product line with the grand total in the centre.
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#55B4B0', '#E15D44', '#009B77']
# One offset per wedge. The length is derived from the data because the pie
# call needs exactly one explode entry per value; a fixed 7-tuple only works
# while there are exactly seven product lines.
explode = (0.05,) * len(top_product)

fig1, ax1 = plt.subplots()
pie1 = ax1.pie(
    top_product['SALES'],
    colors=colors,
    labels=top_product['PRODUCTLINE'],
    autopct=autopct_format(top_product['SALES']),
    startangle=90,
    explode=explode,
)

# With autopct set, the third element returned by pie() holds the value labels.
fraction_text_list = pie1[2]
for text in fraction_text_list:
    text.set_rotation(315)

# A white disc over the centre turns the pie into a donut.
center_circle = plt.Circle((0, 0), 0.80, fc='white')
fig = plt.gcf()
fig.gca().add_artist(center_circle)
ax1.axis('equal')

label = ax1.annotate('Total Revenue \n' + str(total_revenue_product), color='red',
                     xy=(0, 0), fontsize=12, ha='center')
plt.tight_layout()
plt.show()

**Correlation Test**

Plotting correlation matrix to see the overview of how the features are related to one another

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 10))

# Correlation is only defined for numeric data. Selecting those columns
# explicitly works on every pandas version: older releases dropped non-numeric
# columns silently inside corr(), while pandas >= 2.0 raises on them.
numeric_columns = df.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_columns.corr()
sns.heatmap(corr_matrix, annot=True)

**Observations**

* There is a high correlation between ORDERNUMBER and YEAR_ID, and between QTR_ID and MONTH_ID
* SALES, QUANTITYORDERED, PRICEEACH and MSRP are positively correlated with one another
* YEAR_ID is negatively correlated with QTR_ID and MONTH_ID

# Time Series Analysis

In [None]:
# Summary statistics (count, min, max, quantiles, ...) of the order dates.
print('Order Date Description\n')
order_dates = dft['ORDERDATE']
try:
    # pandas 1.1 - 1.5: opt in to numeric-style statistics for datetimes.
    order_date_summary = order_dates.describe(datetime_is_numeric=True)
except TypeError:
    # pandas >= 2.0 removed the flag; datetimes are always summarised this way.
    order_date_summary = order_dates.describe()
print(order_date_summary)

In [None]:
# Chronological order with ORDERDATE as index, then keep SALES as a one-column frame.
dft = dft.sort_values(by='ORDERDATE', ascending=True).set_index('ORDERDATE')
new_data = dft['SALES'].to_frame()
new_data.head()

In [None]:
# Line plot of the SALES column against the ORDERDATE index.
new_data.plot()

A series is said to be stationary when its mean and variance do not change over time. From the above distribution of the sales it is not clear whether the sales distribution is stationary or not. Let us perform some stationarity tests to check whether the time series is stationary or not.

## Checking for Stationary

In [None]:
# Average the sales of each calendar day, then fill the gaps by linear interpolation.
daily_mean_sales = new_data['SALES'].resample('D').mean()
new_data = daily_mean_sales.to_frame().interpolate(method='linear')

### Method 1
To check for stationarity by comparing the change in mean and variance over time, let us split the data into train, test, and validation sets.

In [None]:
# Split the daily series into three consecutive periods: the first 60 %, the
# next 20 % and the last 20 % of the observations. The split is chronological
# (no shuffling) because the purpose is to compare mean and variance *over
# time*; a random shuffle would give every part the same mix of dates and hide
# any drift. It also makes the cell deterministic on re-run.
sales_series = new_data['SALES']
n_obs = len(sales_series)
train_end, test_end = int(.6 * n_obs), int(.8 * n_obs)

train = sales_series.iloc[:train_end]
test = sales_series.iloc[train_end:test_end]
validation = sales_series.iloc[test_end:]

In [None]:
# Report the mean and variance of each part. These two statistics are what the
# stationarity comparison is based on, so they are printed instead of the raw values.
for split_name, split_values in (('Train Dataset', train),
                                 ('Test Dataset', test),
                                 ('Validation Dataset', validation)):
    print(split_name)
    print('Mean = {:.3f}, Variance = {:.3f}'.format(split_values.mean(), split_values.var()))

**From the above values of mean and variance, it can be inferred that there is not much difference between the three values of mean and variance, indicating that the series is stationary. However, to verify our observations, let us perform a standard stationarity test, called the Augmented Dickey-Fuller test.**

**Augmented Dickey-Fuller Test**

* The Augmented Dickey-Fuller test is a type of statistical test also called a unit root test. The idea behind a unit root test is that it helps determine how strongly a time series is defined by a trend.
* The null hypothesis of the test is that the time series can be represented by a unit root, that it is not stationary. The alternate hypothesis (rejecting the null hypothesis) is that the time series is stationary.
    - Null Hypothesis(H0): Time series is not stationary
    - Alternate Hypothesis (H1): Time series is stationary
* This result is interpreted using the p-value from the test.
    - p-value > 0.05: Fail to reject the null hypothesis (H0), the data has a unit root and is non-stationary.
    - p-value <= 0.05: Reject the null hypothesis (H0), the data does not have a unit root and is stationary.

### Method 2 - Augmented Dickey-Fuller Test

In [None]:
from statsmodels.tsa.stattools import adfuller

# Run the Augmented Dickey-Fuller test (adfuller from statsmodels) on the raw
# values of the first column of new_data.
data1 = new_data.iloc[:, 0].values
adf = adfuller(data1)

# As used below: adf[0] is the test statistic, adf[1] the p-value and adf[4]
# a dict of critical values keyed by significance level.
print(adf)
print('\nADF = ', str(adf[0]))
print('\np-value = ', str(adf[1]))
print('\nCritical Values: ')

# A statistic below the critical value rejects the unit-root null hypothesis at
# that level. Otherwise the null is *not rejected* - a test never "accepts" it.
for key, val in adf[4].items():
    print(key, ':', val)
    if adf[0] < val:
        print('Null Hypothesis Rejected. Time Series is Stationary')
    else:
        print('Failed to Reject Null Hypothesis. Time Series is not Stationary')

In [None]:
from pylab import rcParams
import statsmodels.api as sm

# Larger default figure so the stacked decomposition panels stay readable.
rcParams['figure.figsize'] = (20, 10)

# Additive decomposition of the daily series; plot() draws its components.
decomposition = sm.tsa.seasonal_decompose(new_data, model='additive')
fig = decomposition.plot()
plt.show()

# Sales Forecasting using ARIMA

Now that we know our time series data is stationary, let us begin with model training for forecasting the sales. We have chosen the SARIMA model to forecast the sales.

Seasonal Autoregressive Integrated Moving Average, SARIMA or Seasonal ARIMA, is an extension of ARIMA that supports univariate time series data with a seasonal component.

In [None]:
import itertools

# Each of p, d and q ranges over {0, 1}.
p = d = q = range(2)

# Every (p, d, q) triple, and the same triples extended with a period of 12.
pdq = [combo for combo in itertools.product(p, d, q)]
seasonal_pdq_comb = [(*combo, 12) for combo in pdq]

print('Examples of parameter combinations for Seasonal ARIMA:')
for order_idx, seasonal_idx in ((1, 1), (1, 2), (2, 3), (2, 4)):
    print('SARIMA: {} x {}'.format(pdq[order_idx], seasonal_pdq_comb[seasonal_idx]))

In [None]:
# Grid search: fit one SARIMA model per (order, seasonal order) pair and print
# its AIC so the candidates can be compared.
for parameters in pdq:
    for seasonal_param in seasonal_pdq_comb:
        try:
            # The seasonal part must be passed as `seasonal_order`; under any
            # other keyword name it never reaches the model.
            mod = sm.tsa.statespace.SARIMAX(new_data,
                                            order=parameters,
                                            seasonal_order=seasonal_param,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
        except Exception as exc:
            # Best-effort search: report the combination that could not be
            # fitted and move on. `Exception` (not a bare except) keeps the
            # cell interruptible with Ctrl-C / kernel interrupt.
            print('SARIMA{}x{}12 - skipped ({})'.format(parameters, seasonal_param, exc))
            continue
        print('SARIMA{}x{}12 - AIC:{}'.format(parameters, seasonal_param, results.aic))

In [None]:
# Fit the chosen specification and print the second table of the fit summary.
sarima_spec = dict(
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
mod = sm.tsa.statespace.SARIMAX(new_data, **sarima_spec)
results = mod.fit()

summary_table = results.summary().tables[1]
print(summary_table)

In [None]:
# Diagnostic plots of the fitted model (the call was missing its closing parenthesis).
results.plot_diagnostics(figsize=(16, 8))
plt.show()

In [None]:
# Non-dynamic predictions from 2003-01-06 onward, drawn over the observed
# series together with the interval returned by conf_int().
pred = results.get_prediction(start=pd.to_datetime('2003-01-06'), dynamic=False)
pred_val = pred.conf_int()
band_first_col = pred_val.iloc[:, 0]
band_second_col = pred_val.iloc[:, 1]

ax = new_data['2002':].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))
ax.fill_between(pred_val.index, band_first_col, band_second_col, color='k', alpha=.2)
ax.set_xlabel('ORDERDATE')
ax.set_ylabel('SALES')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Score the predictions against the observed sales.
y_forecasted = pred.predicted_mean
# Compare only on the dates that were actually predicted, so both series have
# the same length even if the prediction does not start at the first observation.
y_truth = new_data['SALES'].loc[y_forecasted.index]

# scikit-learn's signature is (y_true, y_pred).
mse = mean_squared_error(y_truth, y_forecasted)
rmse = sqrt(mse)
# The printed value is the square root of the MSE, so it is labelled as RMSE.
print('The Root Mean Squared Error of the forecasts is {}'.format(round(rmse, 2)))

Out of sample forecast:

To forecast sales values after some time period of the given data. In our case, we have to forecast sales with time period of 7 days.

## Sales Forecast for Next 7 Days

In [None]:
# Forecast 7 steps beyond the end of the fitted series and print it as floats.
forecast = results.forecast(steps=7)
forecast_as_float = forecast.astype('float')
print(forecast_as_float)

In [None]:
# Turn the forecast into a two-column table and save it as prediction.csv.
forecast = forecast.astype('float')
forecast_df = forecast.to_frame().reset_index(level=0)
forecast_df.columns = ['Prediction Date', 'Predicted Sales']
forecast_df.to_csv('prediction.csv', index=False)

# to_csv() returns None when given a path, so `prediction` is bound to the
# exported table itself rather than to that None.
prediction = forecast_df

## Visualize Prediction Result

In [None]:
# Reload the exported forecast and plot predicted sales against their dates.
# NOTE: this rebinds `df`, which held the sales records earlier in the notebook.
# The dates are parsed on load and used as the x-axis; without that the CSV
# dates stay plain strings and the line is drawn against the 0..6 row index.
df = pd.read_csv('./prediction.csv', parse_dates=['Prediction Date'])
df.plot(x='Prediction Date', y='Predicted Sales', title='Sales Forecast for the Next 7 Days')