In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import lightgbm as lgb
import optuna

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import RFE
from plotly.offline import iplot, init_notebook_mode
from sklearn.metrics import mean_squared_log_error
from plotly.subplots import make_subplots
init_notebook_mode()

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1.FILE

In [None]:
# Folder holding the competition CSV files; also used later to read the sample submission.
rootPath = "/kaggle/input/store-sales-time-series-forecasting"


def _read_input(file_name):
    """Read one CSV file from the competition folder into a DataFrame."""
    return pd.read_csv(rootPath + "/" + file_name)


train = _read_input("train.csv")
test = _read_input("test.csv")
holidays = _read_input("holidays_events.csv")
stores = _read_input("stores.csv")
transactions = _read_input("transactions.csv")
oil = _read_input("oil.csv")

## Train

In [None]:
def describe(df):
    '''
    Build a per-column summary of a DataFrame.

    Prints the shape of ``df`` and returns a new DataFrame with one row per
    column of ``df`` holding: the column name ('Feature'), its dtype
    ('DataType'), the number of nulls, the number of distinct values, and the
    values found in the first three rows.

    The sample rows are taken by position (``iloc``), so the function works
    for any index (filtered, shuffled, non-integer). When ``df`` has fewer
    than three rows the missing sample columns are filled with NaN.
    '''
    print(f'Shape : {df.shape}')
    summary = pd.DataFrame(df.dtypes, columns=['DataType']).reset_index()
    summary = summary.rename(columns={'index': 'Feature'})
    summary['null count'] = df.isnull().sum().values
    summary['unique count'] = df.nunique().values

    sample_labels = ('First value', 'Second value', 'Third value')
    for position, label in enumerate(sample_labels):
        if position < len(df):
            # positional lookup: label-based df.loc[0] fails when 0 is not an index label
            summary[label] = df.iloc[position].values
        else:
            summary[label] = np.nan

    return summary
# Summarise the training frame: the shape is printed and the per-column table is displayed.
display(describe(train))

In [None]:
# 2x2 grid: distribution and boxplot of `sales` (top row) and `onpromotion` (bottom row).
fig, axes = plt.subplots(2, 2, figsize=(20, 10), facecolor='yellow')
fig.suptitle('Check the numeric distribution', color='blue', fontsize=25)

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# is its replacement. bins=50 mirrors the cap distplot applied to the bin count.
sns.histplot(train['sales'], kde=True, stat='density', bins=50, ax=axes[0, 0])
axes[0, 0].set_title('sales displot', fontsize=25)

sns.boxplot(x='sales', data=train, ax=axes[0, 1])
axes[0, 1].set_title('sales boxplot', fontsize=25)

sns.histplot(x='onpromotion', data=train, bins=20, ax=axes[1, 0])
axes[1, 0].set_title('onpromotion hist', fontsize=25)

sns.boxplot(x='onpromotion', data=train, ax=axes[1, 1])
axes[1, 1].set_title('onpromotion boxplot', fontsize=22)

fig.tight_layout()
plt.show()

We can see that there are outliers in the `sales` column.

In [None]:
# Pie chart of how many training rows belong to each product family.
family_counts = train['family'].value_counts()

fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'domain'}]])
fig.add_trace(
    go.Pie(labels=family_counts.index, values=family_counts),
    row=1,
    col=1,
)

Every product family accounts for the same share of the rows.

## Transactions

In [None]:
# Column summary of the transactions table, followed by the distribution of the transaction counts.
display(describe(transactions))
sns.displot(x='transactions', data=transactions)


## Holidays

In [None]:
# Column summary of the holidays table, then the distinct locale names it contains.
display(describe(holidays))
holidays['locale_name'].unique()


In [None]:
# Number of holiday rows per locale name (displayed as the cell output).
locale_name = holidays['locale_name'].value_counts()
locale_name

In [None]:
# Pie chart of holiday rows per locale name.
# The counts come from the holidays table, so the chart is built from a small
# frame made of those counts instead of passing the unrelated `stores` frame.
# rename_axis/reset_index(name=...) pin the column names explicitly.
locale_counts = locale_name.rename_axis('locale_name').reset_index(name='count')
fig = px.pie(locale_counts, values='count', names='locale_name')

fig.update_layout(
    title_font_color="#fff",
    paper_bgcolor="#283747",
    title_font_size=20,
    title_x=.5,
    font_color="#bbb",
    plot_bgcolor="#D6EAF8",
)

fig.show()

The holiday locales are imbalanced: Ecuador accounts for nearly half of the entries.

In [None]:
# Encode the `transferred` column as 1 (truthy) / 0 (falsy).
holidays['transferred'] = holidays['transferred'].map(lambda flag: 1 if flag else 0)

# Row counts per holiday type and per transferred flag.
type_holiday = holidays['type'].value_counts()
transferred = holidays['transferred'].value_counts()

# Two side-by-side pie charts, one per count series.
specs = [[{'type': 'domain'}, {'type': 'domain'}]]
fig = make_subplots(rows=1, cols=2, specs=specs, subplot_titles=['type_holiday', 'transferred'])
for position, counts in enumerate((type_holiday, transferred), start=1):
    fig.add_trace(go.Pie(labels=counts.index, values=counts), row=1, col=position)

fig.update_layout(
    title_font_color="#fff",
    paper_bgcolor="#283747",
    title_font_size=20,
    title_x=.5,
    font_color="#bbb",
    plot_bgcolor="#D6EAF8",
)
fig = go.Figure(fig)
fig.show()

## Stores

In [None]:
# Column summary of the stores table.
display(describe(stores))

In [None]:
# Drop the `state` column. errors='ignore' keeps the cell re-runnable: on a
# second execution the column is already gone and a plain drop would raise KeyError.
stores = stores.drop(columns='state', errors='ignore')

# Number of stores per city and per cluster.
city = stores['city'].value_counts()
cluster = stores['cluster'].value_counts()

# Two side-by-side pie charts, one per count series.
specs = [[{'type': 'domain'}, {'type': 'domain'}]]
fig = make_subplots(rows=1, cols=2, specs=specs, subplot_titles=['city', 'cluster'])

fig.add_trace(go.Pie(labels=city.index, values=city), row=1, col=1)
fig.add_trace(go.Pie(labels=cluster.index, values=cluster), row=1, col=2)

fig.update_layout(
    title_font_color="#fff",
    paper_bgcolor="#283747",
    title_font_size=20,
    title_x=.5,
    font_color="#bbb",
    plot_bgcolor="#D6EAF8",
)
# make_subplots already returns a go.Figure, so no re-wrapping is needed before showing it
fig.show()

In [None]:
# Number of stores per store type.
sns.countplot(x='type', data=stores)

## Oil

In [None]:
# Column summary of the oil table, followed by the distribution of the dcoilwtico values.
display(describe(oil))
sns.displot(x='dcoilwtico', data=oil)

`dcoilwtico` has null values; I will replace them with 0.

In [None]:
# Replace missing dcoilwtico values with 0.
# NOTE(review): if dcoilwtico is a price series, 0 is not a realistic value;
# interpolating or forward-filling may be a better choice — confirm intent.
oil['dcoilwtico'] = oil['dcoilwtico'].fillna(0)

## Merge file

In [None]:
# The holidays table can hold more than one row for the same date (one per
# locale/event). A left join on `date` would then repeat every train row of
# that date, duplicating its sales. Only the presence of a holiday is used
# later on, so one row per date is enough.
holidays_by_date = holidays.drop_duplicates(subset='date', keep='first')

# Left joins keep every train row; `type` exists in both holidays and stores,
# so after the stores merge they appear as `type_x` (holiday) and `type_y` (store).
merge_data = train.merge(oil, on='date', how='left')
merge_data = merge_data.merge(holidays_by_date, on='date', how='left')
merge_data = merge_data.merge(stores, on='store_nbr', how='left')
merge_data = merge_data.merge(transactions, on=['date', 'store_nbr'], how='left')

# Guard against any lookup table silently multiplying the training rows.
assert len(merge_data) == len(train), (
    f'merge changed the row count: {len(train)} -> {len(merge_data)}'
)

In [None]:
merge_copy = merge_data.copy()

# Parse the date strings once and derive every calendar column from that one
# result instead of re-parsing the whole column for each feature.
parsed_dates = pd.to_datetime(merge_copy['date'])
merge_copy['date'] = parsed_dates.dt.date    # plain datetime.date objects, as before
merge_copy['year'] = parsed_dates.dt.year
merge_copy['month'] = parsed_dates.dt.month
merge_copy['day'] = parsed_dates.dt.day

In [None]:
# Column summary of the merged frame (shown as the cell output).
describe(merge_copy)

In [None]:
# Rows without a matching transactions / oil record get 0 in those columns.
for column_name in ('transactions', 'dcoilwtico'):
    merge_copy[column_name] = merge_copy[column_name].fillna(0)

display(describe(merge_copy))

There are several holiday events per year, so I encode a flag: 1 if the date has a holiday event, otherwise 0.

In [None]:
# `type_x` is the holiday `type` column after the merges; it is null when no
# holiday row matched the date. The vectorised notnull().astype(int) yields the
# same 0/1 values as a per-row Python loop, without iterating over every row.
merge_copy['holiday_flag'] = merge_copy['type_x'].notnull().astype(int)

# The remaining holiday detail columns are not used once the flag exists.
merge_copy = merge_copy.drop(['type_x', 'locale_name', 'transferred'], axis=1)
# `type_y` is the store `type` column after the merges.
merge_copy = merge_copy.rename(columns={'type_y': 'stores_type'})
display(describe(merge_copy))

In [None]:
# Mean sales per calendar day. groupby returns the dates in sorted order, so no
# explicit copy + sort of the full frame is needed. The aggregation is named by
# string ('mean'): passing np.mean to agg is deprecated in recent pandas.
df_g = merge_copy.groupby('date').agg(daily_mean=('sales', 'mean'))

# 30-day rolling mean of the daily means (needs at least 3 days to produce a value)
df_g['moving_avg'] = df_g['daily_mean'].rolling(30, min_periods=3).mean()

plt.figure(figsize=(20, 5))
plt.plot(df_g['moving_avg'])
plt.title("Average sales per month")
plt.show()

In [None]:
# Mean sales for every (year, month) pair, drawn as bars grouped by month and coloured by year.
# 'mean' is passed by name: handing np.mean to agg is deprecated in recent pandas.
monthly_sales = merge_copy.groupby(['year', 'month'], as_index=False).agg(sales_mean=('sales', 'mean'))

plt.figure(figsize=(20, 5))
plt.title('Mean sales each year-month', fontsize=20)
sns.barplot(x='month', y='sales_mean', data=monthly_sales, hue='year')
plt.show()

**Correlation**

In [None]:
plt.figure(figsize=(10, 10))
# Restrict to numeric columns explicitly: the frame still contains string
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 (older
# versions dropped them silently, so the resulting matrix is the same).
corr = merge_copy.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, cmap="YlGnBu")

Key correlations: id–year and sales–onpromotion.

In [None]:
# Release the intermediate merged frame; the following cells work on merge_copy.
del merge_data

# 2.Model

## 2.1 Prepare data

In [None]:
# Modelling frame: drop identifier/text columns, then one-hot encode the
# remaining categorical columns. drop() already returns a new frame, so no
# extra copy of merge_copy is needed.
data = merge_copy.drop(['id', 'date', 'locale', 'description', 'day', 'dcoilwtico'], axis=1)
data = pd.get_dummies(data, drop_first=True)
X = data.drop('sales', axis=1)
y = data['sales']

# Split first, then fit the scaler on the training rows only. Fitting it on
# all rows would leak the hold-out set's mean/variance into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept because this cell has always exposed a scaled version of the full matrix.
X_scaled = scaler.transform(X)

In [None]:
# Column summary of the one-hot encoded modelling frame.
display(describe(data))

## 2.2 Linear Regression

In [None]:
# Baseline: ordinary least squares on the scaled features.
model = LinearRegression()
model.fit(X_train, y_train)

# Negative predictions are floored at 0 so the log-based error is defined.
y_pred = np.clip(model.predict(X_test), 0, None)
rmsle = np.sqrt(mean_squared_log_error(y_true=y_test, y_pred=y_pred))
print(rmsle)

## 2.3 PCA + Linear Regression

In [None]:

# Fit PCA + linear regression for several component counts and report the hold-out RMSLE of each.
n = [2, 3, 10, 15, 20, 25, 30, 35, 40]
for i in n:
    pipe = Pipeline(steps=[
        ('pca', PCA(n_components=i)),
        ('lr', LinearRegression()),
    ])
    pipe.fit(X_train, y_train)
    # floor negative predictions at 0 so the log-based error is defined
    y_pred = np.clip(pipe.predict(X_test), 0, None)
    rmsle = np.sqrt(mean_squared_log_error(y_true=y_test, y_pred=y_pred))
    print(f"PCA: {i} ------- RMSLE: {rmsle}")



## 2.4 RandomForest

In [None]:
# Random forest with shallow trees (at most 20 leaves each), trained on all cores.
RF = RandomForestRegressor(
    random_state=12,
    n_jobs=-1,
    max_depth=30,
    max_features='log2',
    max_leaf_nodes=20,
    verbose=2,
)
RF.fit(X_train, y_train)

# Floor negative predictions at 0, then show the hold-out RMSLE as the cell output.
y_pred = np.clip(RF.predict(X_test), 0, None)
np.sqrt(mean_squared_log_error(y_true=y_test, y_pred=y_pred))

In [None]:
# Bar chart of the 16 features with the highest impurity-based importance in the random forest.
importances = np.array(RF.feature_importances_)
forest_importances = (
    pd.Series(importances, index=X.columns)
    .sort_values(ascending=False)
    .head(16)
)

fig, ax = plt.subplots(1, 1, figsize=(20, 10))
forest_importances.plot.bar(ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

## 2.4 LightGBM

In [None]:
# Hand-picked feature subset (unscaled columns of X) used by the LightGBM and MLP sections.
important_cols = [
    'onpromotion',
    'cluster',
    'transactions',
    'year',
    'store_nbr',
    'family_BEVERAGES',
    'family_CLEANING',
    'family_DAIRY',
    'family_GROCERY I',
    'family_PRODUCE',
]
X_importance = X.loc[:, important_cols]
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_importance, y, test_size=0.2, random_state=123
)


In [None]:
lgb_train = lgb.Dataset(X_train2, y_train2)
# reference= makes the validation set reuse the training set's feature binning,
# as LightGBM expects for validation data.
lgb_test = lgb.Dataset(X_test2, y_test2, reference=lgb_train)

# 'rmsle' is not one of LightGBM's built-in metric names, so no validation
# metric was actually being evaluated. 'rmse' is a valid metric for the default
# regression objective; the RMSLE itself is computed with scikit-learn on the
# predictions.
params = {'metric': 'rmse', 'seed': 123, 'verbosity': -1}

# train data
gbm = lgb.train(params, lgb_train, num_boost_round=500, valid_sets=[lgb_test])

In [None]:
# Hold-out RMSLE of the LightGBM model; negative predictions are floored at 0 first.
raw_pred = gbm.predict(X_test2, num_iteration=gbm.best_iteration)
y_pred = np.clip(raw_pred, 0, None)
rmsle = np.sqrt(mean_squared_log_error(y_true=y_test2, y_pred=y_pred))
print(rmsle)

In [None]:
# Feature importances reported by the LightGBM booster, one row per training column.
importance = pd.DataFrame(
    {'importance': gbm.feature_importance()},
    index=X_train2.columns,
)

# Bars ordered from most to least important.
importance.sort_values(by='importance', ascending=False).plot.bar(figsize=(20, 8))

## 2.5 MLP

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler
from keras import optimizers

from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten
from sklearn.metrics import mean_squared_error
from keras import backend as K
from keras import optimizers
import tensorflow as tf 
from tensorflow import keras

In [None]:
# Hyperparameters for the neural-network section.
epochs = 40  # number of training epochs
batch = 2046  # mini-batch size
lr = 0.0003  # learning rate handed to the Adam optimizer below
adam = tf.keras.optimizers.Adam(lr)
def root_mean_squared_log_error(y_true, y_pred):
    """Root mean squared logarithmic error built from Keras backend ops.

    Computes sqrt(mean((log(y_pred + 1) - log(1 + y_true)) ** 2)).
    No clipping is applied, so predictions at or below -1 produce
    non-finite values.
    """
    log_difference = K.log(y_pred + 1) - K.log(1 + y_true)
    return K.sqrt(K.mean(K.square(log_difference)))

In [None]:
# Single-hidden-layer perceptron: 100 ReLU units followed by one linear output, trained with MSE.
model_mlp = Sequential([
    Dense(100, activation='relu', input_dim=X_train2.shape[1]),
    Dense(1),
])
model_mlp.compile(loss='mse', optimizer=adam)
model_mlp.summary()

In [None]:
# Train the MLP on the unscaled feature subset and track the loss on the hold-out split.
# batch_size=batch applies the mini-batch size configured with the other
# hyperparameters; without it Keras uses its default batch size and the
# `batch` setting has no effect.
mlp_history = model_mlp.fit(
    X_train2.values,
    y_train2,
    validation_data=(X_test2.values, y_test2),
    epochs=epochs,
    batch_size=batch,
    verbose=5,
)

In [None]:
# Training vs. validation loss per epoch for the MLP.
loss_curves = (('loss', 'Train loss'), ('val_loss', 'Validation loss'))
for history_key, curve_label in loss_curves:
    plt.plot(mlp_history.history[history_key], label=curve_label)

plt.title('MLP')
plt.xlabel("Epochs")
plt.ylabel("MSE")
plt.legend(loc='best')
plt.show()

In [None]:
# Hold-out RMSLE of the MLP; negative predictions are floored at 0 first.
pred = np.clip(model_mlp.predict(X_test2.values), 0, None)
rmsle = np.sqrt(mean_squared_log_error(y_true=y_test2, y_pred=pred))
print(rmsle)

# 3. Submit

In [None]:
# Prepare the test frame with the same steps that were applied to the training frame.
test_copy = test.copy()

# The holidays table can hold more than one row for the same date; a left join
# would then repeat test rows and the predictions would no longer line up with
# the submission file. Only the presence of a holiday is used, so one row per
# date is enough.
holidays_by_date = holidays.drop_duplicates(subset='date', keep='first')

# merge data
test_copy = test_copy.merge(oil, on='date', how='left')
test_copy = test_copy.merge(holidays_by_date, on='date', how='left')
test_copy = test_copy.merge(stores, on='store_nbr', how='left')
test_copy = test_copy.merge(transactions, on=['date', 'store_nbr'], how='left')
assert len(test_copy) == len(test), (
    f'merge changed the row count: {len(test)} -> {len(test_copy)}'
)

# calendar columns, derived from a single parse of the date strings
parsed_dates = pd.to_datetime(test_copy['date'])
test_copy['year'] = parsed_dates.dt.year
test_copy['month'] = parsed_dates.dt.month
test_copy['day'] = parsed_dates.dt.day

# fillna with 0
# NOTE(review): rows without a matching transactions record become 0 here —
# check whether transactions exist at all for the test dates.
test_copy['transactions'] = test_copy['transactions'].fillna(0)
test_copy['dcoilwtico'] = test_copy['dcoilwtico'].fillna(0)

# holiday flag: 1 when the holiday join matched the date (`type_x` is the holiday type), else 0
test_copy['holiday_flag'] = test_copy['type_x'].notnull().astype(int)
test_copy = test_copy.rename(columns={'type_y': 'stores_type'})

# drop identifiers and one-hot encode the remaining categorical columns
test_copy = test_copy.drop(['id', 'date'], axis=1)
test_copy = pd.get_dummies(test_copy, drop_first=True)

In [None]:
# Column summary of the prepared test frame (shown as the cell output).
describe(test_copy)

In [None]:
# Features that the earlier LightGBM model actually used (importance > 0).
col = importance.query('importance>0').index
X2 = data[col]

# Split first, then fit the scaler on the training rows only, so no hold-out
# statistics leak into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=12)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept because this cell has always exposed a scaled version of the full matrix.
X_scaled = scaler.transform(X2)

# best param!
# 'rmsle' is not one of LightGBM's built-in metric names; 'rmse' is a valid
# metric for the default regression objective.
params = {
    'metric': 'rmse',
    'verbosity': -1,
    'seed': 123,
    'boosting_type': 'gbdt'
}

# Preparing dataset for LightGBM (the validation set reuses the training set's binning)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

# train data
# The `verbose_eval` argument was removed in LightGBM 4; a log_evaluation
# callback with period=0 silences per-iteration logging instead.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_test],
    callbacks=[lgb.log_evaluation(period=0)],
)

# choose importance cols
test_copy = test_copy[col]
# Scale the test features with the scaler fitted on the training data.
# Fitting a fresh scaler on the test set would map the same raw value to a
# different scaled value than the model saw during training.
test_scaled = scaler.transform(test_copy)
prediction = gbm.predict(test_scaled, num_iteration=gbm.best_iteration)
# sales cannot be negative
prediction = np.clip(prediction, 0, None)

del X2
del lgb_train
del lgb_test

In [None]:
# Fill the sample submission with the predictions and write it to the working directory.
sample_path = rootPath + '/sample_submission.csv'
submission = pd.read_csv(sample_path).assign(sales=prediction)
submission.to_csv('submission.csv', index=False)

# Read the written file back and show it as the cell output.
submission = pd.read_csv("submission.csv")
submission