# ARIMA Exploration
Explore the use of the ARIMA method in a notebook before a structured implementation in the pipeline.

## Data import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Relative directory that holds the raw CSV files.
base_path = "../datasets/raw/"

# Raw input files: the categories table, the full time series export and
# the subset for category 51 (headphones) that is derived from it.
raw_categories_data_path = f"{base_path}solr_categories_2021_11_29.csv"
raw_data_path = f"{base_path}market_insights_overview_all.csv"
raw_data_51_path = f"{base_path}market_insights_51_headphones.csv"

### Import and filter
Import all time series data

Filter for the headphones data (category id 51)

Save the data to a new file for later import

In [None]:
# Load the complete time series export into a DataFrame
all_data = pd.read_csv(filepath_or_buffer=raw_data_path)

In [None]:
# Keep only the rows whose cat_id matches the category of interest
cat_id = 51
belongs_to_category = all_data["cat_id"] == cat_id
filtered_data = all_data.loc[belongs_to_category]

In [None]:
# Save the filtered subset so it can be imported directly later on.
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# read back as an extra "Unnamed: 0" column, which the later
# groupby(...).sum() would add up along with the real columns.
filtered_data.to_csv(raw_data_51_path, index=False)

### Import filtered data

In [None]:
# Load the pre-filtered time series and the categories table
data = pd.read_csv(filepath_or_buffer=raw_data_51_path)
categories = pd.read_csv(filepath_or_buffer=raw_categories_data_path)

In [None]:
# Select the categories row(s) with id 51 and show them as the cell output
matches_id = categories["id"] == 51
category_data = categories.loc[matches_id]
category_data

In [None]:
# Display info of loaded raw time series data.
# Only the last expression of a cell is rendered automatically, so the
# preview is printed explicitly; data.info() prints its own report.
print(data.head())
data.info()

## Group filtered data as one class

In [None]:
# Collapse the rows to one record per (cat_id, date) by summing the other columns
grouped = data.groupby(by=["cat_id", "date"], as_index=False)
category_data = grouped.sum()
# Keep only rows whose date is greater than 2018-12-01, as the earlier data
# seems to contain errors.
# NOTE(review): "date" is presumably an ISO-formatted string column, which
# would make this a lexicographic comparison — confirm.
is_recent = category_data["date"] > "2018-12-01"
category_data = category_data.loc[is_recent]

In [None]:
# Display info of the grouped time series data.
# Only the last expression of a cell is rendered automatically, so the
# preview is printed explicitly; category_data.info() prints its own report.
print(category_data.head())
category_data.info()

## Visualize data
Graph to visualize the data

In [None]:
# Line plot of hits and clicks against the date column.
# NOTE(review): a stray "lineplot_51_headphones" string used to trail this
# call, turning the cell value into a tuple. It was presumably a leftover
# output file name — confirm whether the figure should be saved under it.
category_data.plot(
    x="date",
    y=["hits", "clicks"],
    title="category: Headphones",
    figsize=(30, 10),
)

In [None]:
# Box plot of the hits and clicks columns
category_data.boxplot(
    column=["hits", "clicks"],
)

In [None]:
from pandas.plotting import autocorrelation_plot

# Autocorrelation plot of the hits column
hits_series = category_data["hits"]
autocorrelation_plot(hits_series)

# ARIMA

## Initial fit on entire time series

In [None]:
# `from pandas import datetime` raises ImportError on pandas >= 2.0; the
# standard-library class is imported instead so the name stays available.
from datetime import datetime

from matplotlib import pyplot
from pandas import DataFrame
from pandas import read_csv
from statsmodels.tsa.arima.model import ARIMA

# Reduce the data to the first 100 data points to validate that the model runs
hits_data_series = category_data["hits"][:100]
clicks_data_series = category_data["clicks"][:100]

In [None]:
# Build an ARIMA model on the hits series with order (5, 1, 0)
arima_order = (5, 1, 0)
model = ARIMA(endog=hits_data_series, order=arima_order)
# Fit the model to the series
model_fit = model.fit()

In [None]:
# Show the summary of the fitted model as the cell output
fit_summary = model_fit.summary()
fit_summary

In [None]:
# Wrap the fitted model's residuals in a DataFrame and draw them as a line plot
residuals = DataFrame(data=model_fit.resid)
residuals.plot.line()
pyplot.show()

In [None]:
# Kernel density estimate plot of the residuals
residuals.plot.kde()
pyplot.show()

In [None]:
# Descriptive statistics of the residuals, shown as the cell output
residual_stats = residuals.describe()
residual_stats