In this competition, you will predict sales for the thousands of product families sold at Favorita stores located in Ecuador. The training data includes dates, store and product information, whether that item was being promoted, as well as the sales numbers. Additional files include supplementary information that may be useful in building your models.

Import

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
import matplotlib.style
import matplotlib as mpl
mpl.style.use('classic')
from mpl_toolkits.mplot3d import Axes3D

List input files

In [None]:
import os

# Walk the Kaggle input directory tree and print the full path of every file
# found, so the available dataset files are visible before loading them.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

Read files

In [None]:
# Load the training set from the competition input directory.
train = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv")
# Bare expression: shown as the cell output.
train

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Load the test set (the rows to predict) from the competition input directory.
test = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv")
# Bare expression: shown as the cell output.
test

In [None]:
# Load the oil file; later cells read its 'date' and 'dcoilwtico' columns.
oil = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/oil.csv")
# Bare expression: shown as the cell output.
oil

Holidays (build holiday2)

In [None]:
# Load the holidays/events file; later cells read its 'date' and 'locale' columns.
holiday = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv")
# Bare expression: shown as the cell output.
holiday

In [None]:
# Count how many holiday rows fall under each distinct 'locale' value.
# NOTE: this variable shares its name with the 'type_hols' column created
# later, but it holds the per-locale counts, not the encoded column.
type_hols = holiday['locale'].value_counts()
type_hols

Map type_hols

In [None]:
# Integer code assigned to each holiday locale.
dict_hols = {
    "National": 1,
    "Regional": 2,
    "Local": 3,
}

# Add the encoded locale as a new 'type_hols' column; a locale that is not a
# key of dict_hols maps to NaN.
holiday = holiday.assign(type_hols=holiday['locale'].map(dict_hols))
holiday

In [None]:
# Print the column names, dtypes and non-null counts of the holiday frame
# (now including the 'type_hols' column).
holiday.info()

In [None]:
# Start a per-date holiday table: one row per date present in the oil frame,
# with the holiday type initialised to 0 for every row.
holiday2 = pd.DataFrame({'date': oil['date'], 'type_hols': 0})
holiday2

In [None]:
# Build, for every date in holiday2, the holiday type code taken from the
# first row of `holiday` carrying the same date; dates with no holiday row
# get 0.
#
# A dictionary keyed by date replaces the previous nested scan of both frames
# (one full pass over `holiday` per holiday2 row), so each date is resolved
# with a single lookup. setdefault keeps only the first occurrence of a date,
# which matches the old "break on first match" behaviour when a date appears
# on several holiday rows. An empty `holiday` frame now simply yields all
# zeros instead of leaving the per-row value undefined.
first_type_by_date = {}
for holiday_date, holiday_code in zip(holiday['date'], holiday['type_hols']):
    first_type_by_date.setdefault(holiday_date, holiday_code)

type_holiday = [first_type_by_date.get(day, 0) for day in holiday2['date']]

# Show the resulting codes and confirm there is one entry per holiday2 row.
print(type_holiday)
print(len(type_holiday))

In [None]:
# Overwrite the placeholder zeros with the per-date holiday codes computed
# above (type_holiday has one entry per holiday2 row).
holiday2['type_hols'] = type_holiday
holiday2

In [None]:
# Load the stores file from the competition input directory.
stores = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/stores.csv")
# Bare expression: shown as the cell output.
stores

In [None]:
# Load the transactions file from the competition input directory.
transactions = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/transactions.csv")
# Bare expression: shown as the cell output.
transactions

In [None]:
# Load the sample submission; its 'sales' column is overwritten with the
# model's predictions at the end of the notebook.
submission = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv")
# Bare expression: shown as the cell output.
submission

Analyse target

In [None]:
# Plot the distribution of the target as a histogram with a density curve.
# sns.distplot is deprecated in seaborn and scheduled for removal; histplot
# with kde=True and stat="density" is its documented replacement.
sns.histplot(train['sales'], kde=True, stat="density")

Combine train and test

In [None]:
# Keep the target aside before it is dropped from the feature frame.
target = train['sales']

# Plain alias of train (not a copy); drop() below returns a new frame, so
# train itself keeps its 'sales' column.
train_copy = train

# Stack the train features on top of the test rows so both go through the
# same preprocessing. DataFrame.append was removed in pandas 2.0; pd.concat
# is the supported equivalent and, like append, keeps the original row
# labels of both frames.
combi = pd.concat([train_copy.drop(['sales'], axis=1), test])
combi

Map oil to combi

In [None]:
# Oil price indexed by date, used as a lookup table.
oil_price_by_date = oil.set_index('date')['dcoilwtico']

# Attach the oil price to every row via its date; dates that have no entry in
# the oil table come out as NaN.
combi = combi.assign(dcoilwtico=combi['date'].map(oil_price_by_date))
combi

Map holiday to combi

In [None]:
# Holiday type code indexed by date, used as a lookup table.
holiday_type_by_date = holiday2.set_index('date')['type_hols']

# Attach the holiday type to every row via its date; dates that have no entry
# in holiday2 come out as NaN.
combi = combi.assign(type_hols=combi['date'].map(holiday_type_by_date))
combi

Inspect combi after mapping oil and holiday

In [None]:
# Print the column names, dtypes and non-null counts of the combined frame.
combi.info()

Create new column

In [None]:
#combi.insert(6,'type_hols', 0)
#combi

In [None]:
#type_holiday = []

#i = 0
#j = 0

#for i in range(len(combi)):
#    for j in range(len(holiday)):
#        if combi['date'].iloc[i] == holiday['date'].iloc[j]:
#            num_holiday = holiday['type_hols'].iloc[j]
#            j = j + 1
#            break
#        else:
#           num_holiday = 0
#           j = j + 1
#   type_holiday.append(num_holiday)
#   i = i + 1
#print(type_holiday)
#print(len(type_holiday))

Search for null values

In [None]:
# Number of missing values in each column of the combined frame.
combi.isnull().sum()

Replace nan with 0

In [None]:
# Replace missing oil prices and missing holiday codes with 0; all other
# columns are left untouched.
# NOTE(review): 0 is not a plausible oil price, so filled rows look like a
# price collapse to the model - confirm this is intended rather than e.g. a
# forward fill.
combi = combi.fillna({'dcoilwtico': 0, 'type_hols': 0})
combi

Ordinal encode categories

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

# Replace every object-dtype column with its ordinal codes. The encoder is
# refit for each column, so after the loop `enc` only holds the categories of
# the last column it encoded.
object_columns = [name for name in combi if combi[name].dtype == "object"]
for col in object_columns:
    column_as_2d = combi[[col]].to_numpy()
    combi[col] = enc.fit_transform(column_as_2d).ravel()
combi

Heatmap

In [None]:
# Pairwise correlation between all columns of the combined frame.
corrmat = combi.corr()
# Create a 12x9 inch figure; the heatmap below draws on its axes.
f, ax = plt.subplots(figsize=(12, 9))
# Colour scale is capped at 0.8 and cells are drawn square; the trailing
# semicolon suppresses the returned object in the cell output.
sns.heatmap(corrmat, vmax=.8, square=True);

Normalise

In [None]:
# Min-max scale every column: subtract the column minimum and divide by the
# column range (max - min).
col_min = combi.min()
col_range = combi.max() - col_min
combi = (combi - col_min) / col_range
combi

Standardise

In [None]:
#combi = (combi - combi.mean()) / np.std(combi)
#combi

Define X, y and X_test

In [None]:
# Columns fed to the model.
# NOTE(review): 'type_hols' is built earlier in the notebook but is not listed
# here - confirm whether leaving it out is intentional.
features = ['date', 'store_nbr', 'family', 'onpromotion', 'dcoilwtico']

# combi holds the train rows first and the test rows after them, so split it
# back by position at the train length.
n_train_rows = len(train)
X = combi.iloc[:n_train_rows][features]
X_test = combi.iloc[n_train_rows:][features]
y = target

Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation; the fixed random_state
# makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
# Show the shapes of every split plus the test features as the cell output.
tuple(part.shape for part in (X_train, X_val, y_train, y_val, X_test))

Select model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Print the model's score on the data it was fitted on.
train_score = model.score(X_train, y_train)
print(train_score)

Predict on validation set

In [None]:
# Predict the held-out validation rows; y_pred is reused by the metric and
# plotting cells.
y_pred = model.predict(X_val)

# Print the model's score on the validation split.
val_score = model.score(X_val, y_val)
print(val_score)

In [None]:
from sklearn import metrics

# Error metrics on the validation split; the RMSE is the square root of the
# MSE, so the MSE is computed once and reused.
mae = metrics.mean_absolute_error(y_val, y_pred)
mse = metrics.mean_squared_error(y_val, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Scatter of measured vs. predicted sales on the validation split.
fig, ax = plt.subplots()
ax.scatter(y_val, y_pred, edgecolors=(0, 0, 0))

# Dashed diagonal spanning the full target range: points on this line are
# perfect predictions.
target_span = [y.min(), y.max()]
ax.plot(target_span, target_span, 'k--', lw=4)

ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

Predict on test set

In [None]:
# Predict the test rows and replace every negative prediction with 0 before
# writing the values into the submission frame.
raw_prediction = model.predict(X_test)
prediction = np.where(raw_prediction < 0, 0, raw_prediction)
submission['sales'] = prediction

In [None]:
# Write the submission file to the working directory without the index column.
submission.to_csv('submission.csv', index=False)
# Read the file back and display it to check what was actually written.
submission = pd.read_csv("submission.csv")
submission