# Rossmann Sales Data

In [None]:
import pandas as pd
import numpy as np

%matplotlib inline

### Import the data

In [None]:
import pandas as pd
import numpy as np
# Load the bz2-compressed Rossmann sales CSV with an explicit dtype per column
# so pandas does not have to infer them. 'Date' is read as plain text here and
# parsed into timestamps in a later cell.
# The builtin `str` is used instead of `np.str`: `np.str` was only an alias of
# the builtin and is no longer available in newer NumPy releases.
data = pd.read_csv(
    'rossmann.txt.bz2',
    skipinitialspace=True,
    compression='bz2',
    dtype={
        'Date': str,
        'Store': np.int64,
        'DayOfWeek': np.int64,
        'Sales': np.float64,
        'Customers': np.int64,
        'Open': np.int64,
        'Promo': np.int64,
        'StateHoliday': str,
        'SchoolHoliday': np.int64,
    },
)

### Set the dates to be the index and plot

In [None]:
# Turn the textual 'Date' column into timestamps and make it the row index
# (both steps modify `data` in place).
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)

# Rows that belong to Store 1
store1_data = data.loc[data['Store'] == 1]

# Of those, only the days on which the store was open
store1_open_data = store1_data.loc[store1_data['Open'] == 1]

# Sales on open days, drawn against the date index
store1_open_data.loc[:, ['Sales']].plot()

### Plot the rolling mean of the sales data

In [None]:
# Rolling mean over a window of 3 observations, computed on `store1_data`
# (filtered by store only, not restricted to open days).
store1_data['Sales'].rolling(window=3).mean().plot()

### Compute autocorrelation between the Store 1 sales data at lag 1 and lag 2

Hint: Use the autocorr() function 

### Make an autocorrelation plot

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# `autocorrelation_plot` lives in `pandas.plotting` in current pandas; the old
# `pandas.tools.plotting` location is kept only as a fallback for older
# installations that predate the move.
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:
    from pandas.tools.plotting import autocorrelation_plot

# Autocorrelation of Store 1 sales across all available lags
autocorrelation_plot(store1_data.Sales)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# Autocorrelation of Store 1 sales with `lags=10`.
# print is written as a function call with a single argument so the cell is
# valid on Python 3 and prints the same thing on Python 2.
print(plot_acf(store1_data.Sales, lags=10))

In [None]:
# Same autocorrelation plot with `lags=25`.
# Function-call form of print: valid on Python 3, unchanged output on Python 2.
print(plot_acf(store1_data.Sales, lags=25))

### AR-1 Model

In [None]:
from statsmodels.tsa.arima_model import ARMA

# One-column float DataFrame holding the sales of the open days only
store1_sales_data = store1_open_data.loc[:, ['Sales']].astype(float)

# AR(1): order (p=1, q=0), i.e. one autoregressive lag and no moving-average term
model = ARMA(store1_sales_data, order=(1, 0)).fit()
model.summary()

### AR-2 Model

In [None]:
# AR(2): order (p=2, q=0), i.e. two autoregressive lags and no moving-average term.
# Rebinds `model`, so later cells see this fit.
model = ARMA(store1_sales_data, order=(2, 0)).fit()
model.summary()

### Plot the residuals of the model

In [None]:
# Plot the residuals of whatever `model` currently refers to
# (the AR-2 fit when the cells are run in order).
model.resid.plot()

### Plot the autocorrelation of the residuals

In [None]:
# Autocorrelation of the current model's residuals with `lags=50`.
# Function-call form of print: valid on Python 3, unchanged output on Python 2.
print(plot_acf(model.resid, lags=50))

### ARMA Model that Includes an Autoregressive Component and a Moving Average Component

In [None]:
# ARMA(1, 1): one autoregressive lag plus one moving-average term.
# Rebinds `model`, so later cells see this fit.
model = ARMA(store1_sales_data, order=(1, 1)).fit()
model.summary()

### Fit an ARIMA model equivalent to the ARMA model we just ran

In [None]:
# ARIMA is otherwise first imported in the cell after this one, so calling
# help() here would raise NameError when the notebook is run top to bottom.
from statsmodels.tsa.arima_model import ARIMA

help(ARIMA)

In [None]:
from statsmodels.tsa.arima_model import ARIMA

# ARIMA with order (p=1, d=0, q=1): no differencing, so it mirrors the
# ARMA(1, 1) specification.
model = ARIMA(store1_sales_data, order=(1, 0, 1)).fit()
model.summary()

### Fit an ARIMA Model that has an Integrated Component (Difference Series)

In [None]:
# ARIMA with order (p=1, d=1, q=1): the series is differenced once before the
# AR and MA terms are fitted.
model = ARIMA(store1_sales_data, order=(1, 1, 1)).fit()
model.summary()

### ARIMA model without the Moving Average Component

In [None]:
# ARIMA with order (p=1, d=1, q=0): first differences with a single
# autoregressive lag and no moving-average term.
model = ARIMA(store1_sales_data, order=(1, 1, 0)).fit()
model.summary()

In [None]:
### Autocorrelation of the Differenced Sales data at a Lag of 1

In [None]:
# Lag-1 autocorrelation of the first-differenced Sales series
store1_sales_data.Sales.diff(1).autocorr(1)

In [None]:
# Non-dynamic predictions on the original (undifferenced) scale between two
# date labels, overlaid on the observed sales for the same label range.
# NOTE(review): the start label is later than the end label; presumably the
# index is in descending date order - confirm.
predictions = model.predict(
    start='2015-07-30',
    end='2015-06-02',
    typ='levels',
    dynamic=False,
)
predictions.plot()
store1_sales_data['Sales'].loc['2015-07-30':'2015-06-02'].plot()

In [None]:
# Non-dynamic predictions on the original scale for the rows at positions
# 1 through 50 of the sales index, overlaid on the observed values.
# The index labels are taken directly from the frame: differencing does not
# change the index, so the extra `.diff(1)` was unnecessary.
predictions = model.predict(
    start=str(store1_sales_data.index[1]),
    end=str(store1_sales_data.index[50]),
    dynamic=False,
    typ='levels',
)
predictions.plot()
# iloc's stop is exclusive, so 1:51 covers positions 1..50 and matches the
# predicted range (the previous 1:50 stopped one observation short).
store1_sales_data.Sales.iloc[1:51].plot()

In [None]:
 import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax = store1_sales_data['2014'].plot(ax=ax)

fig = model.plot_predict(1, 200, ax=ax, plot_insample=False)


In [None]:
# ARIMA with order (p=7, d=1, q=2): seven autoregressive lags, one round of
# differencing and two moving-average terms.
model = ARIMA(store1_sales_data, (7, 1, 2)).fit()
# print is written as a function call with a single argument so the cell is
# valid on Python 3 and prints the same thing on Python 2.
print(model.summary())

# Autocorrelation of this model's residuals with `lags=50`
print(plot_acf(model.resid, lags=50))

# Walmart Sales Data

For the independent practice, we will analyze the weekly sales data from Walmart over a two-year period from 2010 to 2012.

The data is again separated by store and by department, but we will focus on analyzing one store for simplicity.

The data includes:

- Store - the store number
- Dept - the department number
- Date - the week
- Weekly_Sales -  sales for the given department in the given store
- IsHoliday - whether the week is a special holiday week


#### Loading the train.csv data and setting the DateTimeIndex

#### Filter the dataframe to Store 1 sales and aggregate over departments to compute the total sales per store.

In [None]:
# Keep only the Store 1 rows and the Weekly_Sales column, then total the
# sales within each weekly ('W') bin. This is a sum, not an average.
# NOTE(review): assumes `data` has been reloaded from the Walmart train.csv
# with a DatetimeIndex; that loading step is not present in the cells above.
store1_sales = data.loc[data.Store == 1, ['Weekly_Sales']].resample('W').sum()
store1_sales.head()

#### Plot the rolling_mean for `Weekly_Sales` using a window of 3. What general trends do you observe?

#### Compute the 1, 2, 52 autocorrelations for `Weekly_Sales` and/or create an autocorrelation plot.

Create the autocorrelation plot below with a lag up to 60

In [None]:


# Components 1 and 2 seem particularly useful for autoregression, perhaps up to 4
# In the plot above, notice the spike at around lag 52 - implying a yearly pattern as well
# No random spikes, probably not much use for a moving average model

#### Split the weekly sales data in a training and test set - using 75% of the data for training

In [None]:
# Positional 75/25 split of the weekly sales series: the first 75% of the
# rows become the training set and the remaining rows are held out.
n = len(store1_sales['Weekly_Sales'])

train = store1_sales['Weekly_Sales'].iloc[:int(.75 * n)]
test = store1_sales['Weekly_Sales'].iloc[int(.75 * n):]

In [None]:
# Display the held-out portion of the weekly sales series
test

#### Create an AR(1) model on the training data and compute the mean absolute error of the predictions.

In [None]:
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error

In [None]:


# Report the mean absolute error of `predictions` against the held-out `test`
# series, then show the model summary.
# NOTE(review): neither `predictions` nor `model` is assigned in this cell as
# written; the fit/predict code for the exercise has to be added above,
# otherwise both names refer to whatever earlier cells left behind.
print("Mean absolute error: ", mean_absolute_error(test, predictions))
model.summary()

#### Plot the residuals - where are there significant errors?

#### Compute an AR(2) model and an ARMA(2, 2) model - does this improve your mean absolute error on the held-out set?

In [None]:

# Report the mean absolute error of `predictions` against the held-out `test`
# series, then show the model summary.
# NOTE(review): neither `predictions` nor `model` is assigned in this cell as
# written; the fit/predict code for the exercise has to be added above,
# otherwise both names refer to whatever earlier cells left behind.
print("Mean absolute error: ", mean_absolute_error(test, predictions))
model.summary()

In [None]:


# Report the mean absolute error of `predictions` against the held-out `test`
# series, then show the model summary.
# NOTE(review): neither `predictions` nor `model` is assigned in this cell as
# written; the fit/predict code for the exercise has to be added above,
# otherwise both names refer to whatever earlier cells left behind.
print("Mean absolute error: ", mean_absolute_error(test, predictions))
model.summary()

#### Finally, compute an ARIMA model to improve your prediction error - iterate on the p, d, and q parameters, comparing the model's performance.