In [None]:
import datetime 

import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import IterativeImputer

# Dataset Overview

The dataset is composed of three csv files.

Features contains several features valid for each (store, date) tuple from 2010 to 2013

Sales contains the weekly sales value (our prediction target) for each (store, department, date) tuple

Stores contains information about each store in the dataset.

In [None]:
# Load the three CSV files of the retail dataset: per-(store, date) features,
# per-(store, department, date) weekly sales, and per-store metadata.
features_df, sales_df, stores_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/retaildataset/Features data set.csv',
        '../input/retaildataset/sales data-set.csv',
        '../input/retaildataset/stores data-set.csv',
    )
)

In [None]:
features_df.head()

In [None]:
sales_df.head()

In [None]:
stores_df.head()

`Type` is a categorical feature with only three possible values, so it is one-hot encoded to make it easier to model.

In [None]:
# One-hot encode the categorical store `Type` into one indicator column per
# category, then drop the original column.
# dtype=int is requested explicitly: recent pandas versions return boolean
# dummies by default, and mixing bool with float columns would turn the
# feature matrix produced later via `.values` into an object-dtype array.
one_hot_encoding = pd.get_dummies(stores_df['Type'], dtype=int)
stores_df = stores_df.join(one_hot_encoding).drop(columns=['Type'])
stores_df.head()

All three dataframes are merged into a single one.

In [None]:
# Total weekly sales per (Date, Store, Dept), flattened back to plain columns
# so the result can be merged on Date/Store.
week_sales_df = (
    sales_df
    .groupby(['Date', 'Store', 'Dept'])
    .agg({'Weekly_Sales': 'sum'})
    .sort_index()
    .reset_index()
)

# Left-join sales onto the features (keeps feature rows that have no sales;
# their Dept/Weekly_Sales become NaN), then attach the per-store metadata.
training_df = (
    features_df
    .merge(week_sales_df, how='left', on=["Date", "Store"])
    .merge(stores_df, how='left', on=["Store"])
)

In [None]:
training_df.head()

The dataframe contains several NaN values that must be removed or replaced.

In [None]:
# Count the missing (NaN) entries in each column of the merged frame.
training_df.isna().sum()

To interpolate NaN values in chronological order, the dataframe is first indexed and sorted by a DateTime object.

In [None]:
def gen_datetime(date_str):
    """Parse a 'dd/mm/YYYY' date string into a ``datetime.datetime``.

    Raises ``ValueError`` (from ``strptime``) if the string does not match
    the expected format.
    """
    date_format = '%d/%m/%Y'
    parsed = datetime.datetime.strptime(date_str, date_format)
    return parsed

# Parse every 'Date' string into a datetime and store it in a new column.
training_df['DateTime'] = training_df['Date'].apply(gen_datetime)

In [None]:
# Index the rows by (DateTime, Store, Dept) and order them by that key.
index_columns = ['DateTime', 'Store', 'Dept']
training_df = training_df.set_index(index_columns)
training_df = training_df.sort_index()

In [None]:
training_df.head()

In [None]:
# Fill gaps in CPI and Unemployment by linear interpolation, one store at a
# time. The frame is ordered by (DateTime, Store, Dept), so interpolating over
# the whole column would fill a store's gap from the neighbouring rows of
# *other* stores; grouping by the 'Store' index level keeps each store's
# series separate while preserving the original row order and index.
for column in ('CPI', 'Unemployment'):
    training_df[column] = (
        training_df
        .groupby(level='Store')[column]
        .transform(lambda series: series.interpolate(method='linear'))
    )

In [None]:
# Re-count missing values per column after the CPI/Unemployment interpolation.
training_df.isna().sum()

In [None]:
# Impute the missing values of each MarkDown column independently, using a
# fresh IterativeImputer (fixed seed) fitted on that single column.
# NOTE(review): with only one input column the imputer has no other feature
# to regress on — presumably this reduces to its initial fill strategy;
# confirm whether a joint imputation over all MarkDown columns was intended.
for markdown_column in ('MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5'):
    column_as_2d = training_df[markdown_column].values.reshape(-1, 1)
    training_df[markdown_column] = IterativeImputer(random_state=0).fit_transform(column_as_2d)

In [None]:
# Move the (DateTime, Store, Dept) index levels back into regular columns.
training_df = training_df.reset_index(drop=False)

In [None]:
# Keep only rows that have a department (feature rows with no matching sales
# record got Dept = NaN from the left merge).
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning or act on a view.
training_df = training_df[training_df['Dept'].notna()].copy()

In [None]:
# Count remaining missing values per column after dropping rows without Dept.
training_df.isna().sum()

In [None]:
training_df.head()

Date should be decomposed into Year and a cyclical feature for Day of Year

In [None]:
# The year is the last four characters of the 'dd/mm/YYYY' date string.
training_df['Year'] = training_df['Date'].str[-4:].astype('int64')

In [None]:
training_df.head()

In [None]:
def day_of_year(date_str):
    """Return the 1-based day of the year for a 'dd/mm/YYYY' date string.

    1 January maps to 1; 31 December maps to 365, or 366 in a leap year.
    Raises ``ValueError`` (from ``strptime``) on a malformed string.
    """
    parsed = datetime.datetime.strptime(date_str, '%d/%m/%Y')
    # '%j' is the zero-padded day-of-year directive.
    return int(parsed.strftime('%j'))

# Encode the day of the year as a point on the unit circle so that the end of
# December and the start of January are neighbours.
# The day number must first be converted to an angle covering one full turn
# per year; feeding the raw day number to cos/sin treats it as radians and
# yields a period of about 6.28 days instead of one year.
DAYS_PER_YEAR = 365.25  # average year length, accounts for leap years

training_df['DayOfYear'] = training_df['Date'].map(day_of_year)
year_angle = 2 * np.pi * training_df['DayOfYear'] / DAYS_PER_YEAR
training_df['DayOfYearCos'] = np.cos(year_angle)
training_df['DayOfYearSin'] = np.sin(year_angle)

In [None]:
training_df.head()

In [None]:
# The raw date string is no longer needed once Year/DayOfYear are derived.
training_df = training_df.drop(['Date'], axis='columns')

In [None]:
training_df.head()

Normalizing values between 0 and 1

In [None]:
# Min-max scale every numeric column (features and the Weekly_Sales target).
#
# Each scaler is fitted only on rows dated before 2012 — the period later
# used for training — and then applied to all rows. Fitting on the full frame
# would leak the min/max of the 2012+ hold-out period into the training data.
# Consequence: training-period values lie in [0, 1]; hold-out values may fall
# slightly outside that range.
HOLDOUT_START = pd.Timestamp('2012-01-01')
SCALED_COLUMNS = [
    'Store', 'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment', 'Dept', 'Weekly_Sales', 'Size',
]

fit_rows = training_df['DateTime'] < HOLDOUT_START
for column in SCALED_COLUMNS:
    scaler = MinMaxScaler().fit(training_df.loc[fit_rows, [column]].values)
    # ravel(): transform returns an (n, 1) array; store it as a flat column.
    training_df[column] = scaler.transform(training_df[[column]].values).ravel()

# IsHoliday is a boolean flag; store it as 0/1 instead of scaling it.
training_df['IsHoliday'] = training_df['IsHoliday'].astype(int)

In [None]:
training_df.head()

In [None]:
training_df.tail()

Split the data into a training set (2010 to 2011) and a prediction set (2012 to 2013)

In [None]:
# Index rows by DateTime so they can be sliced by date strings.
training_df = training_df.set_index(keys='DateTime')

In [None]:
# Training period: everything up to and including 2011-12-31.
# Label-based slices include both endpoints, so ending this slice at
# '2012-01-01' would place any row dated 2012-01-01 in both the training set
# and the hold-out set (which starts at '2012-01-01').
training_set_df = training_df.loc[:'2011-12-31']

In [None]:
training_set_df.head()

In [None]:
training_set_df.tail()

In [None]:
# Hold-out period: all rows dated 2012-01-01 or later.
test_set_df = training_df.loc['2012-01-01':]

In [None]:
test_set_df.head()

In [None]:
test_set_df.tail()

In [None]:
# Build NumPy feature matrices and target vectors for both periods.
# Year and the raw DayOfYear are excluded from the features, as is the
# Weekly_Sales target itself.
non_feature_columns = ['Weekly_Sales', 'Year', 'DayOfYear']
target_column = 'Weekly_Sales'

X_training = training_set_df.drop(columns=non_feature_columns).to_numpy()
y_training = training_set_df[target_column].to_numpy()

X_prediction = test_set_df.drop(columns=non_feature_columns).to_numpy()
y_prediction = test_set_df[target_column].to_numpy()

In [None]:
# Shapes of the arrays built from training_set_df and test_set_df.
X_training.shape, y_training.shape, X_prediction.shape, y_prediction.shape

Split training set into train and test set

In [None]:
from sklearn.model_selection import train_test_split

# Randomly hold out 33% of the training-period rows for validation;
# the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_training,
    y_training,
    test_size=0.33,
    random_state=0,
)

In [None]:
# Shapes of the random train/validation split of the training-period arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

Decision Tree is able to predict the value of unseen years with a score of 0.85

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-capped tree with large leaves (at least 200 samples each); fixed seed.
dtr_params = dict(max_depth=50, min_samples_leaf=200, random_state=0)
dtr_regressor = DecisionTreeRegressor(**dtr_params)

In [None]:
# Fit on the training split (fit() returns the estimator itself, so no
# re-assignment is needed), then score on the validation split.
dtr_regressor.fit(X_train, y_train)
dtr_regressor.score(X_test, y_test)

In [None]:
# Predict the hold-out period and score the tree on it.
future_pred = dtr_regressor.predict(X=X_prediction)
dtr_regressor.score(X=X_prediction, y=y_prediction)

Random Forest is able to predict the value of unseen years with a score of 0.86

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 20 trees, each with leaves of at least 150 samples; fixed seed.
rfr_params = dict(n_estimators=20, max_depth=100, min_samples_leaf=150, random_state=0)
rfr_regressor = RandomForestRegressor(**rfr_params)

In [None]:
# Fit on the training split (fit() returns the estimator itself), then score
# on the validation split.
rfr_regressor.fit(X_train, y_train)
rfr_regressor.score(X_test, y_test)

In [None]:
# Predict the hold-out period and score the forest on it.
future_pred = rfr_regressor.predict(X=X_prediction)
rfr_regressor.score(X=X_prediction, y=y_prediction)

Gradient Boosting Regressor is able to predict the value of unseen years with a score of 0.87

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# 20 boosting stages, each tree with leaves of at least 150 samples; fixed seed.
gbr_params = dict(n_estimators=20, max_depth=150, min_samples_leaf=150, random_state=0)
gbr_regressor = GradientBoostingRegressor(**gbr_params)

In [None]:
# Fit on the training split (fit() returns the estimator itself), then score
# on the validation split.
gbr_regressor.fit(X_train, y_train)
gbr_regressor.score(X_test, y_test)

In [None]:
# Predict the hold-out period and score the boosted model on it.
future_pred = gbr_regressor.predict(X=X_prediction)
gbr_regressor.score(X=X_prediction, y=y_prediction)