In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import standard libraries
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition CSV files.
# index_col=0 turns the first CSV column into the frame's index;
# parse_dates=[n] parses CSV column n as datetimes.
#   oil, holidays : first column is parsed as dates AND used as the index
#   stores        : first column is the index (no date parsing)
#   transactions  : first column is parsed as dates and stays a regular column
#   train, test   : first column is the index, second column is parsed as dates
df_oil = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/oil.csv", index_col=0, parse_dates=[0])
df_holidays = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv", index_col=0, parse_dates=[0])
df_stores = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/stores.csv", index_col=0)
df_transactions = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/transactions.csv", parse_dates=[0])
df_train = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv", index_col=0, parse_dates=[1])
df_test = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv", index_col=0, parse_dates=[1])

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Show only the first and the last row of the test set.
df_test.iloc[[0,-1],]

# Goal
Predict the sales for the second half of August 2017.

# Process
1. go through each dataset, explore the content, determine its usefulness, fill missing values and transform as seems fit
2. join the datasets together to create a complete df_train and df_test
3. create a new notebook for model building - try different models (XGB, LGB, LSTM, RandomForest?)

# Oilprices

After importing the datasets, let's go through them one by one.

The first question to ask, though, is: what are we going to do?
Given that we want to predict sales based on several predictors, we have ourselves a simple regression problem. What we need to figure out now is what kind of information we have, how useful it might be, what kind of cleaning it requires and how we might approach missing data (dropping or filling).

First up we look at the oil prices.
At first glance we find a couple dozen null values. Obviously we cannot assume that oil didn't exist on these days or somehow didn't have a value.
While we could research the oil prices for these days, we have over 2000 entries and thus only a very small amount of missing data, so I will just fill each missing value with the midpoint between the previous and the next non-null entry.
I do this by first taking the indices of all missing values, then applying both the forward-fill and the backward-fill method and averaging the two results for the missing values.
After that, formatting the datetime correctly and plotting the oil prices shows a decent graph with nothing seeming out of place.

However, apart from the massive drop in oil prices from 2014 to 2015, it's also worth noting that this data is actually a false friend. For a future prediction, the oil price cannot be known in advance. So if the oil price is included, this limits the model's ability to predict future data; instead the model would merely serve to create a modelled dataset that can be compared to real data to identify inconsistencies - for example over-/underperformance, which might spark further investigation into the difference.


## Conclusion

* include oil prices in the dataset
* consider evaluating whether oil prices actually prove themselves a **significant predictor**

In [None]:
# Count the missing oil prices, then preview the table.
n_missing_oil = df_oil["dcoilwtico"].isna().sum()
print(n_missing_oil)
df_oil.head()

In [None]:
# Fill every missing oil price with the mean of the nearest earlier and the
# nearest later observation.
na_index = df_oil.index[df_oil["dcoilwtico"].isna()]  # positions that get imputed

oil_price = df_oil["dcoilwtico"]
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
prev_known = oil_price.ffill()
next_known = oil_price.bfill()
# A gap at the very start (or end) of the series has only one known neighbour;
# fall back to that neighbour so no NaN is left behind (this also covers the
# missing first value without a special case).
gap_fill = (prev_known.fillna(next_known) + next_known.fillna(prev_known)) / 2
# One assignment through the frame: avoids the chained, positional write
# `df_oil.dcoilwtico[0] = ...`, which is not guaranteed to reach df_oil.
df_oil["dcoilwtico"] = oil_price.fillna(gap_fill)

**TODO:** check whether `interpolate()` could replace the combination of two `fillna()` passes.

In [None]:
# Oil price: raw observations (points + thin line) and a centred rolling mean
# over 7 rows (red).
fig, ax = plt.subplots(figsize=(20,7))

oil_smoothed = df_oil.rolling(window=7, center=True, min_periods=3).mean()
ax.plot(oil_smoothed, linewidth=2, color="red")

raw_series = dict(data=df_oil, x="date", y="dcoilwtico", alpha=0.5, ax=ax)
sns.scatterplot(color="0.5", **raw_series)
sns.lineplot(linewidth=0.5, **raw_series)

ax.set_title("Oil Prices", fontsize=18)

Most notable about the oil price is the massive drop halfway through 2014. Ignoring most of the fluctuation, we could almost think of the chart as two halves: a decently stable price of around 100 before the drop and of around 45 from the beginning of 2015.

In [None]:
# First and last oil-price row on or after 2017-08-15.
df_oil.loc[df_oil.index >= "2017-08-15"].iloc[[0, -1]]

# Holidays

Next up let's look at the holidays.
* holidays that got moved have a True in `transferred` -> those were normal workdays and can be dropped
* days leading up to a holiday, or following one, have a "+" or "-" in their description

After running into a bunch of errors: the holidays are VERY inconsistent. There is no data for the 25th of December of any year, yet there is for other holidays. Also, after plotting the holidays onto a sales chart, there is little consistency in their influence. Some holidays coincide with high sales, others are just located on some slope, and one - the first of January - actually comes with 0 sales.

This level of inconsistency hardly offers any data for the model to learn and generalize from.

Just to be absolutely sure, and because we might discard the pre-2015 data, let's take a look at the holidays of 2015 and plot them together with the sales numbers. As we will see, there is no noticeable correlation between the two.

## Conclusion

- Mark Holidays as option for OneHotEncoding

In [None]:
# Keep only "real" holidays:
#   - drop rows whose description contains "+" or "-" (days before/after a holiday)
#   - drop rows flagged as transferred
# Both conditions are combined into ONE boolean mask: chaining two masks forces
# pandas to re-align the second mask against the already filtered frame, which
# is fragile when the index contains repeated dates.
is_offset_day = df_holidays["description"].str.contains(r"[+-]", regex=True)
is_transferred = df_holidays["transferred"].astype(bool)
holidays = df_holidays.loc[~is_offset_day & ~is_transferred]

# df_holidays is loaded with the date as its index, so there is normally no
# "date" column left to set; only set it if the frame was loaded differently.
if "date" in holidays.columns:
    holidays = holidays.set_index("date")

holidays.head()

In [None]:
# Holidays that fall inside calendar year 2015.
# The upper bound is exclusive ("< 2016"), matching the 2015 sales filter used
# for the plot; "<= 2016" would also let 2016-01-01 into the 2015 subset.
holidays_2015 = holidays[(holidays.index >= "2015") & (holidays.index < "2016")]

In [None]:
# Training rows whose date is one of the 2015 holidays (preview only).
on_holiday_2015 = df_train.date.isin(holidays_2015.index)
df_train.loc[on_holiday_2015].head()

In [None]:
# Total daily sales in 2015 (line) with the 2015 holidays marked on top
# (points, coloured by the holiday's locale).
fig, ax = plt.subplots(figsize=(20,7))

# Aggregate only the "sales" column: summing the whole frame would also try to
# add up its non-numeric columns.
in_2015 = (df_train.date >= "2015") & (df_train.date < "2016")
sales_2015 = df_train.loc[in_2015, ["date", "sales"]].groupby("date").sum()
sns.lineplot(data=sales_2015, x="date", y="sales", ax=ax)

on_holiday = df_train.date.isin(holidays_2015.index)
holiday_sales = df_train.loc[on_holiday, ["date", "sales"]].groupby("date").sum()

# df_holidays is loaded with the date as its index; calling set_index("date")
# on it unconditionally raises a KeyError, so only do it when still needed.
holiday_info = df_holidays if df_holidays.index.name == "date" else df_holidays.set_index("date")
holiday_sales = holiday_sales.join(holiday_info[["locale"]], how="left")

sns.scatterplot(x=holiday_sales.index,
                y=holiday_sales.sales,
                hue=holiday_sales.locale,
                s=150,
                ax=ax)  # draw on the same axes explicitly

# Stores

Now we look at the information we have of the stores.
We have 54 different stores, with 5 different types in 17 clusters.
Grouping by the clusters shows that each cluster except for 10 only has one type of stores present.
According to the data-description, the clustering happened on similar stores, so it's not exactly a surprise that there is only one store-type per cluster.
Short notice that type-cluster groupby was also tried but didn't really offer any insights.

If we break it down further to count the different cities in each cluster... well, it's getting complicated. Apparently the same cluster can cover several cities.
This is important as it undermines most of the information we could get from a cluster. If the stores of a cluster were in the same city, it could be assumed they are not considered competitors and thus their sales could be added up to reduce complexity - as we'd assume that people just choose the first store of the cluster they come across, instead of making a conscious decision to go to a specific store. However, given that clusters cover neither similar store types nor regions, it's hard to determine what kind of information we can actually get from the cluster.

Maybe a later look into sales-numbers can offer more significant differences inbetween the cluster. If not we might need to do some more research on how the cluster came to be assigned.
However matter of fact is, at the current level of insight, the clusters don't seem to contain any distinct information that would make it a good predictor for any model. So let's keep it in mind as a column that might be safe to drop.
Speaking of dropping: taking a quick look at cities and states, we see that each city belongs to exactly one state, so there is no trouble with duplicated city names. However, we have some states with several cities, meaning that dropping either column is currently not feasible.

In the end, there isn't much to do right now. The table contains a lot of distinct entries which may or may not provide useful information for a later regression. We just need to keep in mind that it's categorical data and must be encoded properly.
For the encoding we need to determine whether or not there is any significant relationship between sales and cities or states, as this could be used to justify ordinal encoding with its implied order. However, given the number of distinct values for both cities and states, One-Hot-Encoding might introduce a lot of additional dimensions for the regression and as such should be avoided if no significant relationship can be found between either of them and the target.

## Judgement
Given that most cities have only one shop to begin with, the model has nothing to gain here. Even if we `.groupby("state")` we get too little variation. We'd need significantly more entries to have a chance of gaining some information out of it that isn't just noise. 10 would still be too few for that, let alone 1 shop, which means it's identical to the shop ID as far as distinct information goes.

For the cluster I can't find any relation or information that would imply it has any significance. Thus I could try including it just for fun, but I can't argue it's important.

Now, I haven't looked into the type, but given it only has 5 distinct values we can easily one-hot encode it without bloating the model all too much. And it is reasonable to assume that the type of store has an influence on its sales numbers in general. Plus we'd want to at least give the model a chance to find some generalizing information, and I just decided to discard all other information that would allow any kind of grouping of shops.

## Conclusion
- test with type and cluster
- test with type
- note that without an inherent order, OHE is necessary, and this will heavily widen the dataframe (one extra column per category)

In [None]:
# Size of the stores table (rows, columns), followed by a preview of its first rows.
print(df_stores.shape)
df_stores.head()

In [None]:
# Non-null entries per column for every state.
df_stores.groupby("state").count()

In [None]:
# How many stores there are of each type.
df_stores.type.value_counts()

In [None]:
# How many stores there are in each cluster.
df_stores.cluster.value_counts()

In [None]:
# Entries per (cluster, type, city) combination; "state" is the only column left to count.
df_stores[["type","cluster", "city", "state"]].groupby(["cluster", "type", "city"]).count()

In [None]:
# Number of stores per (state, city) pair.
# size() counts the rows of each group; count() has nothing to count here
# because both selected columns are consumed as grouping keys.
df_stores[["city","state"]].groupby(["state", "city"]).size().to_frame("stores")

In [None]:
# Stores that belong to cluster 10.
# df_stores is loaded with its first column as the index, so the store number
# is read from the index rather than from a `store_nbr` column (attribute
# access on a missing column raises AttributeError).
df_stores[df_stores.cluster == 10].index.value_counts()

In [None]:
# Distribution of the transaction counts, one box per store number.
fig, ax = plt.subplots(figsize=(25,10))
box_options = dict(
    x="store_nbr",
    y="transactions",
    data=df_transactions,
    ax=ax,
    dodge=False,
)
sns.boxplot(**box_options)

# transactions

Second to last is the transactions table. There isn't much here, just the number of transactions in any particular store on any given day.

Unlike the oil price, transactions are not given for the test data. Thus they cannot be used for the model.

I don't know if this is an oversight from the test-creation or if there is indeed a use-case where this kind of data would be present.

In [None]:
# Last rows of the transactions table.
df_transactions.tail()

# Train data

Now let's take a look at the final dataset, which is the training data.

It's having about 3 million entries within 6 columns.

There are about 56k entries per shop, stating per day and per product family how many sales happened and whether this family had a promotion at the time. We cannot say what kind of promotion was happening - whether it means actual time-limited sales or more time-fluid things like advertisements. Ads would be expected to have an effect on sales even after the promotion has ended.

We also have to keep in mind that this is the primary data to work with, meaning the information from the other tables has to be joined onto this one, both for the training and the testing data.

The "onpromotion" column is a tricky thing though. It is merely a counter of how many items in the specific category are on sale on the specific date. That in itself is very bare information, as promotions obviously affect different items differently. For example, if a medium-priced product is reduced to compete with a cheap product, people might grab the reduced one instead of the generally cheap one, not generating any additional sales. Imagine toilet paper: just because it is cheaper doesn't mean people will buy more of it - unless they buy it in bulk. The point is, people follow different strategies to react to and utilize sales of different products, and as such the mere information of how many products are on promotion at a given time is a good generalization (as we are in no position to predict every single sale) BUT it also means this feature will contain a lot of noise and, unless the noise cancels itself out, will probably not be the best predictor.

Long ramble aside, it's also important to format the date correctly and infer the weekday. We gotta OneHotEncode that so the model can identify specific behavior for each day of the week.

# Conclusion

+ Join oil-price, store-type and store-cluster
+ [edit: add columns for time-series related EDA]

In [None]:
# Date and sales of every training row from 2015-07 onwards.
since_2015_07 = df_train.date >= "2015-07"
df_train.loc[since_2015_07, ['date', 'sales']]

In [None]:
def _plot_rolling_mean(ax, daily_sales, window, min_periods, **line_kwargs):
    """Draw the centred rolling mean of `daily_sales` on `ax`.

    `daily_sales` is a date-indexed Series; `window` and `min_periods` are
    passed on to `Series.rolling`, `line_kwargs` to `ax.plot`.
    """
    smoothed = daily_sales.rolling(window=window, center=True, min_periods=min_periods).mean()
    ax.plot(smoothed, aa=True, **line_kwargs)


fig, ax = plt.subplots(figsize=(20,7))

# Total sales per day: all data, from 2015-06 on, and before 2015-06.
data = df_train.loc[:, ["date", "sales"]].groupby("date").sum()
data_lim = data[data.index >= "2015-06"]
data_pre = data[data.index < "2015-06"]

# Each line carries its own label. A positional ax.legend([...]) with two
# entries would label the first two lines drawn (both 30-day curves) and so
# call the second one a "365 day rolling window".
_plot_rolling_mean(ax, data_lim["sales"], 30, 15, color="red", alpha=0.4,
                   label="30 day rolling window (from 2015-06)")
_plot_rolling_mean(ax, data_pre["sales"], 30, 15, color="blue", alpha=0.4,
                   label="30 day rolling window (before 2015-06)")
_plot_rolling_mean(ax, data["sales"], 365, 182, color="black",
                   label="365 day rolling window (all data)")
_plot_rolling_mean(ax, data_lim["sales"], 365, 182, color="red",
                   label="365 day rolling window (from 2015-06)")
_plot_rolling_mean(ax, data_pre["sales"], 365, 182, color="blue",
                   label="365 day rolling window (before 2015-06)")

ax.legend()
ax.set_title("Sales Trend - month + year", fontsize=20)

1. red = post 2015-06
2. blue = pre 2015-06

We can see a couple of notable things.
* over the years we see an upwards trend of the sales.
* over the month-wide average, there is a clear spike at the end of the years, which gets more prominent in later years
* 2013 is almost flat apart from christmas
* 2014 is all over the place with large spikes and valleys, followed by another spike in mid 2015 that seems to establish a new baseline
* after about half of 2015, the chart seems to flatten out, showing only 3 major spikes
    * two end-of-year spikes as usual
    * one spike shortly after the end of the first quarter of 2016, probably related to the April 16, 2016 earthquake and the subsequent donations
* the monthly line is ragged, showing some periodic behavior

In [None]:
# Centred 7-day moving average of the total sales per day.
fig, ax = plt.subplots(figsize=(20,7))
data = df_train[["date", "sales"]].groupby("date").sum()
weekly_average = data.rolling(window=7, center=True).mean()
ax.plot(weekly_average, color="blue")
ax.set_title("Moving average sales per day over a weekly window", fontsize=20)

This time we see it more clearly. We don't know what happened in 2014, but we do know that it might very well severely limit the predictive power of the model.

Given that we want to predict future data, the massive shift we see between the periods before and after mid-2015 will confuse the model. As it will be trained to predict ALL datapoints, date itself as a continuous predictor will be very confusing. In order to minimize the loss, the model will shift its predictions towards the average of both regimes. Let's just grab some info to tell how much of a difference we are facing.

This goes along with the oil prices, which behaved massively differently after 2015. However, the generally more erratic behavior of the pre-2015 sales doesn't go along with a similar behavior in the oil prices. Those are arguably more erratic after 2016, yet the sales seem to fall into a steady pattern of average sales outside of Christmas and the earthquake.

On top of that, if we actually consider disregarding data, we might also consider cleaning the data around the earthquake. We can safely say that this period is noise and nothing the model should learn to generalize from.

# Conclusion
With the three options to alter the training data, let's look at our resulting datasets:
1. full dataset
2. full dataset with imputed data for earthquake
3. post2015_06 data with imputed data for the earthquake

The non-altered post2015 set will not be created, due to the increased influence of the unusual data in the smaller set. In fact, it is to be expected that including the earthquake data would only result in worse predictions.

In [None]:
# Calendar features used by the seasonality plots.
MONTH_LABELS = ['Jan','Feb','Mar','Apr','May','June','July','Aug','Sept','Oct','Nov','Dec']

# NOTE: year / month / day are written onto df_train itself - this is what the
# previous `df = df_train` alias did implicitly, so every frame derived from
# df_train afterwards carries these columns too.
# NOTE(review): switch to a copy if df_train is meant to stay untouched.
train_dates = pd.DatetimeIndex(df_train["date"])
df_train["year"] = train_dates.year
# Month as an ordered categorical (Jan < Feb < ... < Dec), built directly from
# the month number instead of an in-place replace() on a column selection.
df_train["month"] = pd.Categorical.from_codes(train_dates.month - 1,
                                              categories=MONTH_LABELS,
                                              ordered=True)
df_train["day"] = train_dates.day

# Date-indexed working frame for the EDA plots.
df = df_train.set_index("date")
df["dayofyear"] = df.index.dayofyear
df["dayofweek"] = df.index.dayofweek
# DatetimeIndex.week no longer exists in current pandas; isocalendar().week is
# its replacement. Converted to a plain int array so it is assigned by
# position (the date index contains repeated labels).
df["week"] = df.index.isocalendar().week.astype("int64").to_numpy()

df.head()

In [None]:
# Three stacked panels of mean sales: by day of year (one line per year),
# by day of month (one line per month) and by day of week (one line per week).
# Each panel also gets a thick black line with the overall mean.
fig, axs = plt.subplots(3, 1, figsize=(16, 15))
ax_yearly, ax_monthly, ax_weekly = axs

# Panel 1: one line per year.
for yr, shade in zip(df.year.unique(), sns.color_palette("RdPu_r")):
    per_dayofyear = df.loc[df.year == yr].groupby('dayofyear')['sales'].mean()
    sns.lineplot(data=per_dayofyear, color=shade, ax=ax_yearly, linewidth=1.5, label=str(yr))
overall_by_dayofyear = df.groupby('dayofyear')['sales'].mean()
sns.lineplot(data=overall_by_dayofyear, color='black', ax=ax_yearly, linewidth=6, label='mean')
ax_yearly.set_title("Yearly Sales", fontsize=18)

# Panel 2: one line per month.
for mon, shade in zip(df.month.unique(), sns.color_palette("winter", n_colors=12)):
    per_day = df.loc[df.month == mon].groupby('day')['sales'].mean()
    sns.lineplot(data=per_day, color=shade, ax=ax_monthly, linewidth=1.5, label=mon)
overall_by_day = df.groupby('day')['sales'].mean()
sns.lineplot(data=overall_by_day, color='black', ax=ax_monthly, linewidth=6, label='mean')
ax_monthly.set_title("Monthly Sales", fontsize=18)

# Panel 3: one (unlabelled) line per week number.
for wk, shade in zip(df.week.unique(), sns.color_palette('summer', n_colors=53)):
    per_weekday = df.loc[df.week == wk].groupby('dayofweek')['sales'].mean()
    sns.lineplot(data=per_weekday, color=shade, ax=ax_weekly, linewidth=1.5)
overall_by_weekday = df.groupby('dayofweek')['sales'].mean()
sns.lineplot(data=overall_by_weekday, color='black', ax=ax_weekly, linewidth=6, label='mean')
ax_weekly.set_title("Weekly Sales", fontsize=18)

plt.tight_layout()

# Create Datasets

- join data
- create copy with no-earthquake
- copy post 2015-06 into own set

In [None]:
# Preview the stores table before joining it onto the training data.
df_stores.head()

In [None]:
# Training rows indexed by date, enriched with the oil price of that date and
# the type / cluster of the row's store.
# NOTE(review): the oil join is a left join on the date - any training date
# without a row in df_oil gets a NaN oilprice; confirm and fill before modelling.
store_info = df_stores[["type", "cluster"]]
df_full = (
    df_train.set_index("date")
    .join(df_oil, how="left")
    .join(store_info, on="store_nbr", how="left")
    .rename(columns={'dcoilwtico': 'oilprice'})
)

# Weekday derived from df_full's own index, so this cell no longer depends on
# the separate EDA frame `df` existing and having the same row order.
df_full['dayofweek'] = df_full.index.dayofweek

df_full.head()

In [None]:
# Independent copy of df_full; later cells modify its sales while df_full stays unchanged.
df_neq = df_full.copy()

In [None]:
# 7-day rolling mean of the total daily sales; the stretch after 2016-04 up to
# 2016-05 is drawn again in red.
plt.subplots(figsize=(20,7))
data = df_neq[["sales"]].groupby("date").sum()
plt.plot(data.rolling(window=7).mean())

in_highlight = (data.index > "2016-04") & (data.index <= "2016-05")
dat = data.loc[in_highlight].groupby("date").sum()
plt.plot(dat.rolling(window=7).mean(), color="red")

In [None]:
# Overwrite the sales of the 2016-04 .. 2016-06 slice in df_neq with the sales
# of the 2016-06 .. 2016-08-30 slice of df_full, scaled by 1.08.
# NOTE(review): the values are assigned by position, so this only works while
# both slices hold the same number of rows, and the weekdays of source and
# target rows are not guaranteed to match - confirm this is intended.
replace = df_full.loc["2016-06":"2016-08-30", "sales"]
scaled_donor = (replace * 1.08).to_numpy()
df_neq.loc["2016-04":"2016-06", "sales"] = scaled_donor

# Subset from 2015-06 onwards (already containing the replaced sales).
df_post2015 = df_neq.loc["2015-06":].copy()

# Compare the original (red) with the altered post-2015-06 series (blue),
# both as 7-day rolling means of the total daily sales.
plt.subplots(figsize=(20,7))
data = df_full[["sales"]].groupby("date").sum()
plt.plot(data.rolling(window=7).mean(), color="red")

dat = df_post2015[["sales"]].groupby("date").sum()
plt.plot(dat.rolling(window=7).mean(), color="blue")

In [None]:
# df_full.to_csv("df_full.csv")
# df_neq.to_csv("df_neq.csv")
# df_post2015.to_csv("df_post2015.csv")

# Modelling and predictions

I'll start with a default-prediction as a jumping off point to evaluate later models. The idea is to have a prediction that involves as little work while providing some at least somewhat reasonable result.

For this exercise I will predict the data for 2017 by using the data from 2016 of the same timeframe.

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the numeric columns of df_full.
# - drop(columns=...) removes the target column; a bare drop("sales") looks
#   for a ROW labelled "sales" and raises a KeyError.
# - LinearRegression needs numeric, NaN-free input, so non-numeric columns are
#   left out and incomplete rows are excluded.
y_all = df_full["sales"]
X_all = df_full.drop(columns="sales").select_dtypes(include="number")

complete_rows = X_all.notna().all(axis=1)
X = X_all[complete_rows]
y = y_all[complete_rows]

model = LinearRegression().fit(X, y)

# In-sample predictions (same rows the model was fitted on), as a one-column
# frame aligned with y.
y_pred = pd.DataFrame({"sales": model.predict(X)}, index=y.index)

In [None]:
# First and last row of the test set.
df_test.iloc[[0,-1]]

In [None]:
# Preview the joined training frame.
df_full.head()

In [None]:
# Totals per date and product family.
# Only the additive columns are summed; an unrestricted sum() would also be
# applied to identifier and non-numeric columns, where it is meaningless
# or fails.
df_full.groupby(["date","family"])[["sales", "onpromotion"]].sum()

In [None]:
# Distinct values of the "family" column.
unique = df_full.family.unique()

In [None]:
# Preview the first rows of the test set.
df_test.head()

# Next Step
1. Go through the TimeSeries course to actually understand the options for dealing with this kind of data xD
2. Choose the post_2015_noeq dataset (see note)
3. Use the newly learned tools to gain additional insights for modelling

Note: given the further exploration takes additional time, it seems less reasonable to try doing the same process for the other datasets.
Giving the model data to predict something 