# Import Relevant Libraries

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline 

In [None]:
# Load the auxiliary competition tables (holidays, oil price, store metadata,
# daily transactions). read_csv already returns a brand-new DataFrame, so an
# extra .copy() would only duplicate the data in memory.
hols = pd.read_csv('../input/store-sales-time-series-forecasting/holidays_events.csv')
oil = pd.read_csv('../input/store-sales-time-series-forecasting/oil.csv')
stores = pd.read_csv('../input/store-sales-time-series-forecasting/stores.csv')
transactions = pd.read_csv('../input/store-sales-time-series-forecasting/transactions.csv')

# Analyse Dataset
Additional info that may or may not be useful <br>
Wages in the public sector are paid every two weeks on the 15th and on the last day of the month. <br>
Supermarket sales could be affected by this. <br> 
A magnitude 7.8 earthquake struck Ecuador on April 16, 2016. <br> 
People rallied in relief efforts, donating water and other basic-need products, which greatly affected supermarket sales for several weeks after the earthquake.


In [None]:
# Summarise the transactions table: column dtypes, non-null counts, memory use.
transactions.info()

In [None]:
# Peek at the first three rows to see the column layout.
transactions.head(3)

In [None]:
import datetime as dt 

In [None]:
transactions['date'] = pd.to_datetime(transactions['date']) # parse the date column into pandas datetimes (needed for date plots and date-keyed merges)

In [None]:
transactions.isnull().sum() # number of missing values per column

In [None]:
# Scatter the transaction counts against their date on a wide canvas.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=transactions, x='date', y='transactions', ax=ax)
fig.autofmt_xdate()  # slant the date tick labels so they do not overlap

We notice sharp spikes in sales transactions towards the end of each year. 
<br> Our scatterplot shows that there is no linear relationship between transactions and time. 
<br> Take note of the seasonality factor that comes into play: it will result in missing values, and we'll need to deseasonalise the dataset. 

It seems that a lag variable does have a linear correlation with the transactions made. We'll further explore other variables that increase our prediction accuracy. 

In [None]:
hols.info() # a lot smaller than the transactions and oil tables - expected

In [None]:
# Peek at the first three holiday/event rows.
hols.head(3)

`description` and `locale_name` are additional info that we will not use, so we drop them. 

In [None]:
# Keep only the holiday columns used later; the free-text description and the
# locale name are discarded.
unused_hol_cols = ['description', 'locale_name']
hols = hols.drop(columns=unused_hol_cols)
hols.head()

**We will keep the categorical columns, null values included, and later label-encode them or create dummy variables**

In [None]:
# Distinct values of the holiday locale column.
hols['locale'].unique()

In [None]:
# Distinct holiday/event types.
hols['type'].unique()

In [None]:
# Parse the holiday dates so they can be matched against the other tables' datetime column.
hols['date'] = pd.to_datetime(hols['date'])

In [None]:
# Re-check dtypes after dropping columns and converting the date.
hols.info()

In [None]:
# Summarise the oil price table: dtypes and non-null counts.
oil.info()

In [None]:
# Parse the oil dates into pandas datetimes.
oil['date'] = pd.to_datetime(oil['date'])

In [None]:
# Confirm that the date column is now a datetime dtype.
oil.info()

In [None]:
# Count missing oil prices per column.
oil.isnull().sum()

Interpolate all null values since the plots form a general trend 

In [None]:
# Linear interpolation between neighbouring known prices. With the default
# (forward) direction, a NaN at the very start of the series is left unfilled.
oil['dcoilwtico'] = oil['dcoilwtico'].interpolate() 

In [None]:
# Check how many missing prices remain after interpolation.
oil.isnull().sum()

In [None]:
# Line plot of the (interpolated) oil price against time.
x, y = oil['date'], oil['dcoilwtico']

fig, ax = plt.subplots()
ax.plot(x, y)
fig.autofmt_xdate()  # rotate date labels for readability
plt.show()

Comparing transactions to oil prices: <br> 
Sales do not seem to be affected by oil prices, as seen from the seasonal increase every year despite the crash in oil prices.

In [None]:
# Summarise the store metadata table: dtypes and non-null counts.
stores.info()

In [None]:
stores['city'].unique() # many distinct categories; a label encoder suits this better than dummies

In [None]:
# Distinct states the stores are located in.
stores['state'].unique()

In [None]:
# Distinct store types (note: the holidays table also has a column named `type`).
stores['type'].unique()

In [None]:
# Count missing values per column in the store metadata.
stores.isnull().sum()

**Examine the test and train sets and combine them together, referencing their index**

In [None]:
# Load the test set (read_csv already returns a fresh frame, so no .copy() is needed)
# and show its smallest id for comparison with the largest train id.
test = pd.read_csv('../input/store-sales-time-series-forecasting/test.csv')
test['id'].min()

In [None]:
# Load the training set. No .copy(): read_csv returns a new DataFrame, and
# duplicating the largest table would only raise peak memory.
train = pd.read_csv('../input/store-sales-time-series-forecasting/train.csv')

In [None]:
# Largest id in train, to compare with the smallest id in test.
train['id'].max()

In [None]:
train['family'].unique() # distinct product families; to be label-encoded later

In [None]:
# Parse the train dates into pandas datetimes, then summarise dtypes and non-null counts.
train['date'] = pd.to_datetime(train['date'])
train.info()

In [None]:
# Parse the test dates the same way as train so both share one datetime dtype.
test['date'] = pd.to_datetime(test['date'])

In [None]:
# Stack train on top of test. Each frame keeps its own row labels, so index
# labels repeat until a new index is set.
# NOTE(review): presumably test.csv has no `sales` column, so its rows get NaN sales — confirm.
data = pd.concat([train, test])
# NOTE(review): 3000887 looks like the position of the last train row, so this
# slice would show the train/test seam — confirm against len(train).
data.iloc[3000887:3000900]

**Let id be the new index**

In [None]:
# Use the `id` column as the row index of the combined frame.
data = data.set_index('id')
data.head()

# Merge DataFrames 

In [None]:
# Attach the transaction count of each (date, store) to the train/test rows.
# A left join keeps exactly the rows of `data`; an outer join would also add
# rows for (date, store) pairs that exist only in `transactions`, which carry
# no family/sales information.
# NOTE: merging on columns returns a new default integer index, so the `id`
# index of `data` is not carried over into df1.
df1 = pd.merge(data, transactions, how='left', on=['date', 'store_nbr'])

In [None]:
# Add holiday/event information by date. With a left join, dates that appear
# only in the holiday calendar no longer become rows without a store_nbr.
# If a date has several holiday entries, each data row for that date is still
# repeated once per entry.
df2 = pd.merge(df1, hols, how='left', on='date')

In [None]:
# Add the oil price by date and the store metadata by store number.
# Left joins keep only the train/test rows: oil-only dates or unused stores
# would otherwise show up as rows with no store_nbr or no date.
# Both the holidays and the stores tables have a `type` column, so the merge
# suffixes them: type_x is the holiday type, type_y the store type.
df3 = pd.merge(df2, oil, how='left', on='date')
df4 = pd.merge(df3, stores, how='left', on='store_nbr')
df4.head()

# Deal with Missing and Categorical Values 

Drop the locale column, since store_nbr is tied to a location 

In [None]:
# Remove the holiday locale column from the merged frame.
df4 = df4.drop(columns='locale')

In [None]:
# Inspect the merged frame after dropping `locale`.
df4.head()

In [None]:
# Missing values per column of the merged frame.
df4.isnull().sum()

**Drop rows without store_nbr, family, sales, etc., as they are not useful data**
<br> Drop 'city', as store number and state are indicative enough

In [None]:
# Discard rows that have no store number, remove the city column,
# then recount the missing values per column.
df4 = (
    df4
    .dropna(subset=['store_nbr'])
    .drop(columns='city')
)
df4.isnull().sum()

**Checking and Dealing with Missing Values**
<br> create a seaborn heatmap to visualise missing values (optional but fun way to visualise) 

In [None]:
# Optional (disabled): heatmap of missing values, one heatmap row per column of the frame.
# NOTE(review): the snippet below inspects df3 rather than the current df4, and
# the sns.heatmap(...) call lacks its closing parenthesis — fix both before uncommenting.
# plt.figure(figsize=(18,6))
# sns.heatmap(df3.isna().transpose(),
#            cmap="YlGnBu",
#            cbar_kws={'label': 'Missing Data'}

**Drop duplicates in dataframe**
<br> Improves the quality of our dataset

In [None]:
# (rows, columns) before removing duplicates.
df4.shape

In [None]:
# Remove rows that are identical in every column, then show the new shape.
df4 = df4.drop_duplicates()
df4.shape

**Columns with missing values: sales, transactions, type_x, transferred, dcoilwtico** <br>
Interpolate dcoilwtico on the new dataset linearly, in the backward direction, since first entry is NaN type.

In [None]:
# Fill gaps in the oil price along the calendar, not along row order.
# After the merges each date is repeated on many rows (one per store/family),
# so interpolating the raw column would ramp the price across rows that share
# a date and between whatever rows happen to be adjacent. Instead: collapse to
# one price per date, interpolate that daily series in time, and map it back.
oil_by_date = (
    df4.groupby('date')['dcoilwtico']
    .first()  # first non-null price of each date; groups come back sorted by date
    .interpolate(method='time', limit_direction='backward')  # backward also fills a leading NaN
)
df4['dcoilwtico'] = df4['date'].map(oil_by_date)

Are there 0 transactions or sales recorded? Time to check 

In [None]:
# Rows whose recorded transaction count is exactly zero.
df4.loc[df4['transactions'] == 0]

In [None]:
# Rows whose recorded sales value is exactly zero.
df4.loc[df4['sales'] == 0]

Replace all NaN values with numbers or strings. <br>
Since a NaN in transactions means that no transactions were recorded for that store and day, we replace NaN with zero. <br>
For sales, we will fill the missing values with the mean

In [None]:
# Per-column fill values: missing transaction counts become 0, rows without a
# holiday entry get the label 'No Holiday' (type_x is the holiday type from the
# merge), and a missing transferred flag becomes False.
values = {'transactions': 0, 'type_x': 'No Holiday', 'transferred': False}
df4 = df4.fillna(value = values)

In [None]:
# Recount missing values per column after filling the defaults.
df4.isnull().sum()

**Let's analyse the store sales trend to determine how we should fill in the missing values, since it's numeric data**

In [None]:
# Line plot of the sales column against the date column, in row order.
x, y = df4['date'], df4['sales']

fig, ax = plt.subplots()
ax.plot(x, y)
fig.autofmt_xdate()  # rotate date labels for readability
plt.show()

Since there is no linear correlation between time and sales (seasonality), we will fill missing values with mean sales over the years for now. 

In [None]:
# Mean of the non-missing sales values (NaNs are skipped by default).
df4['sales'].mean()

In [None]:
# Fill missing sales with the column mean. The mean is computed from the data
# instead of being typed in by hand, so it stays correct if the inputs change.
# Assigning the result back is required: calling fillna(inplace=True) on the
# selected column acts on a temporary object and, under pandas Copy-on-Write,
# leaves df4 unchanged.
# NOTE(review): the rows with missing sales presumably come from test.csv, where
# sales is the value to predict — confirm that mean-filling them is intended.
df4['sales'] = df4['sales'].fillna(df4['sales'].mean())
df4.isnull().sum()

Nice! All missing values are finally dealt with :) <br> 
**Let's not forget to convert our categorical data for machine learning**

In [None]:
from sklearn import preprocessing 

# Columns to turn into integer codes: holiday type, store type, state, product family.
category = ['type_x', 'type_y', 'state', 'family']

# One fitted encoder per column. Refitting a single shared encoder would
# overwrite its classes_ on every iteration, so only the last column's mapping
# could ever be inverted or reapplied to new data.
encoders = {}
for cat in category: 
    le = preprocessing.LabelEncoder()
    df4[cat] = le.fit_transform(df4[cat])
    encoders[cat] = le  # use encoders[cat].inverse_transform(...) to recover the labels

In [None]:
# Check the dtypes after label encoding.
df4.info()

In [None]:
# Distinct values of the transferred flag before turning it into an indicator column.
df4['transferred'].unique()

In [None]:
# Replace `transferred` with integer indicator columns; drop_first removes the
# indicator of the first category so the remaining column is not redundant.
df4 = pd.get_dummies(df4, columns = ['transferred'], drop_first=True, dtype=np.int64)

In [None]:
# Check the columns and dtypes after the dummy encoding.
df4.info()

Rename certain columns to avoid confusion

In [None]:
# Give the merge-suffixed and dummy-generated columns descriptive names.
rename = {
    'type_x': 'event',
    'type_y': 'store_type',
    'transferred_True': 'transferred',
}
df4 = df4.rename(columns=rename)

In [None]:
# Verify the renamed columns on the first three rows.
df4.head(3)

# Feature engineering on datetime object

In [None]:
# Derive calendar features from the timestamp column. The `.dt` accessor is
# part of the pandas Series itself, so the standard-library `datetime` module
# does not need to be re-imported in this cell.
df4['year'] = df4['date'].dt.year
df4['month'] = df4['date'].dt.month
df4['day'] = df4['date'].dt.day
df4.head()

In [None]:
# Write the prepared frame to disk; index=False leaves the row index out of the file.
df4.to_csv('timeseries.csv', index=False)