# Importing the Packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go


import statsmodels.api as sm
import warnings
from scipy import stats
from itertools import product
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

# <p><font size="10" color="#8214c7">Loading the data

<p><font size="3" color="#8214c7" style="Comic Sans MS;">
    
**Train Data** The training data, comprising time series of features store_nbr, family, and onpromotion as well as the target sales. store_nbr identifies the store at which the products are sold. family identifies the type of product sold. sales gives the total sales for a product family at a particular store at a given date. Fractional values are possible since products can be sold in fractional units (1.5 kg of cheese, for instance, as opposed to 1 bag of chips). onpromotion gives the total number of items in a product family that were being promoted at a store at a given date.

In [None]:
# Locations of the two competition files that carry the sales time series.
RAW_FILES = {
    'train': '../input/store-sales-time-series-forecasting/train.csv',
    'test': '../input/store-sales-time-series-forecasting/test.csv',
}
train = pd.read_csv(RAW_FILES['train'])
test = pd.read_csv(RAW_FILES['test'])

In [None]:
train.head()

<p><font size="3" color="#8214c7" style="Comic Sans MS;">
    
**Stores data** gives some information about stores such as city, state, type, cluster.

**Transaction data** is highly correlated with train's sales column. You can understand the sales patterns of the stores.

***Holidays and events data is metadata***. It is quite valuable for understanding past sales, trend, and seasonality components. However, it needs to be cleaned up first. You will find a comprehensive data-manipulation section for this data; that part is one of the most important chapters in this notebook.
</font></p>


In [None]:
# Loading the other tables: oil, holidays, stores, transactions.
# Every table that has a `date` column is parsed to datetime at load time, so the
# column has the same dtype everywhere and date-based joins do not depend on a
# later conversion cell having been run first.
oil_df = pd.read_csv('../input/store-sales-time-series-forecasting/oil.csv', parse_dates=['date'])
holidays_df = pd.read_csv('../input/store-sales-time-series-forecasting/holidays_events.csv', parse_dates=['date'])
stores_df = pd.read_csv('../input/store-sales-time-series-forecasting/stores.csv')
trans_df = pd.read_csv('../input/store-sales-time-series-forecasting/transactions.csv', parse_dates=['date'])

In [None]:
#shape of train/test data
train.shape,test.shape

In [None]:
print("Column names in the Training Dataset are:\n",train.columns)

In [None]:
train.info()

In [None]:
train.describe().style.set_properties(**{"background-color": "#c69be0","color": "black", "border-color": "black"})

# <p><font size="10" color="#8214c7">Basic Exploration 


# <p><font size="6" color="#8214c7">Oil Data
<p><font size="3" color="#8214c7" style="Comic Sans MS;">
Daily oil price. Includes values during both the train and test data timeframes. (Ecuador is an oil-dependent country and its economic health is highly vulnerable to shocks in oil prices.)

In [None]:
oil_df.head()

In [None]:
# Oil price column exactly as loaded, plotted against its date.
fig = px.line(oil_df, x='date', y="dcoilwtico", title="Oil by Date")
fig.show()

In [None]:
# Put the oil series on a gap-free daily calendar, then fill the holes.
oil_df["date"] = pd.to_datetime(oil_df["date"])

# Resample: min_count=1 leaves a day without any observed value as NaN instead of
# summing it to 0, so no "0 means missing" sentinel has to be patched afterwards
# (and a genuine 0 in the data would not be mistaken for a gap).
oil_df = (
    oil_df.set_index("date")["dcoilwtico"]
    .resample("D")
    .sum(min_count=1)
    .reset_index()
)

# Interpolate: limit_direction="both" also fills missing values at the very start
# of the series, which the default forward-only interpolation leaves as NaN.
oil_df["dcoilwtico_interpolated"] = oil_df["dcoilwtico"].interpolate(limit_direction="both")

In [None]:
# Plot
# Long format: one row per (date, series name) so the raw and the interpolated
# column are drawn as two coloured lines. `date` is the only identifier column;
# the former `oil_df.keys()[5:]` slice was always empty for this three-column frame.
p = oil_df.melt(id_vars=['date'], var_name='Legend')
p_sorted = p.sort_values(["Legend", "date"], ascending=[False, True])
px.line(p_sorted, x='date', y='value', color='Legend', title="Daily Oil Price interpolated")

# Holidays Data


In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
holidays_df.info()
holidays_df.head()

In [None]:
# Convert the holiday date once, then derive weekday name, month and year from it.
holidays_df['date'] = pd.to_datetime(holidays_df['date'])
holiday_stamp = holidays_df['date'].dt
holidays_df = holidays_df.assign(
    day_holiday=holiday_stamp.day_name(),
    month_holiday=holiday_stamp.month,
    year_holiday=holiday_stamp.year,
)

In [None]:
# Number of holiday rows falling on each weekday.
weekday_counts = holidays_df['day_holiday'].value_counts()
weekday_ax = weekday_counts.plot(kind='bar', figsize=(15, 8), cmap='nipy_spectral')
weekday_ax.set_title('Counts of Holidays', fontsize=26)

In [None]:
# Share of each holiday `type` among all holiday rows.
fig = px.pie(
    holidays_df,
    names='type',
    title="Holidays Type",
    color_discrete_sequence=px.colors.sequential.RdBu,
    template="plotly_dark",
)
fig.show()

In [None]:
holidays_df['locale'].value_counts()

In [None]:
# Two views of the `transferred` flag: its share (pie) and its raw counts (bars).
f, ax = plt.subplots(1, 2, figsize=(10, 5))

transferred_counts = holidays_df['transferred'].value_counts()
# explode has one entry per slice, so it assumes the flag takes exactly two values.
transferred_counts.plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True, cmap='tab20c')
ax[0].set_title('Transferred Holidays Pie chart')
ax[0].set_ylabel('')

# The column must be passed as x=...: recent seaborn releases take `data` as the
# only positional argument, so countplot('transferred', data=...) raises TypeError.
sns.countplot(x='transferred', data=holidays_df, ax=ax[1], palette='Set2_r')
ax[1].set_title('Transferred Holidays bar chart')
plt.show()

In [None]:
# Holiday rows per type, split by whether the holiday was transferred.
type_fig, type_ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=holidays_df, x='type', hue='transferred', palette='YlOrBr_r', ax=type_ax)
type_ax.legend(['Not Transferred', 'Transferred'])
plt.setp(type_ax.get_xticklabels(), rotation=45)
type_ax.set_title('Transferred Holidays')
plt.show()

# Word Cloud 

In [None]:
from wordcloud import WordCloud

# Feed every description to the word cloud. str(Series) would only yield the
# truncated console repr (index labels, "..." and the "Name/Length/dtype" footer),
# so most descriptions were missing and repr noise was counted as words.
holiday_text = ' '.join(holidays_df['description'].dropna().astype(str))

wordcloud = WordCloud(
    background_color='black',
    max_font_size=50,
).generate(holiday_text)

plt.figure(figsize=(15, 7))
plt.title('Word Cloud for Holidays description', fontsize=25)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# **Store Data**

In [None]:
stores_df.head()

In [None]:

# Store count per city, coloured by store type.
fig = px.histogram(
    stores_df,
    x="city",
    color='type',
    template="plotly_dark",
    title="Citys by Store Type",
)
fig.show()

# Transactions data 

In [None]:
trans_df.head()

In [None]:
trans_df.info()

In [None]:
# Distribution of the `transactions` column.
fig = px.histogram(
    trans_df,
    x="transactions",
    template="plotly_dark",
    title="Transaction Distribution",
)
fig.show()

In [None]:
# Mean transactions per (year, weekday); weekdays are shifted to 1..7.
copy_df = (
    trans_df
    .assign(
        year=lambda frame: frame.date.dt.year,
        dayofweek=lambda frame: frame.date.dt.dayofweek + 1,
    )
    .groupby(["year", "dayofweek"])["transactions"]
    .mean()
    .reset_index()
)
px.line(copy_df, x="dayofweek", y="transactions", color="year", title="Transactions")

In [None]:
train.head()

In [None]:
#Timeline feature
def time_feature(df):
    """Add calendar columns derived from ``df['date']`` to ``df``.

    The ``date`` column is converted with ``pd.to_datetime`` and the columns
    ``dayofweek``, ``quarter``, ``month``, ``year``, ``dayofyear`` and
    ``dayofmonth`` are written to ``df`` itself (the frame is modified in
    place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenience.
    """
    df['date'] = pd.to_datetime(df['date'])
    stamp = df['date'].dt

    # Insertion order of the dict fixes the order of the new columns.
    calendar_parts = {
        'dayofweek': stamp.dayofweek,
        'quarter': stamp.quarter,
        'month': stamp.month,
        'year': stamp.year,
        'dayofyear': stamp.dayofyear,
        'dayofmonth': stamp.day,
    }
    for column, values in calendar_parts.items():
        df[column] = values
    return df

In [None]:
time_feature(train)
time_feature(test)

In [None]:
# THANKS TO https://www.kaggle.com/shivamb/store-sales-forecasting-exploration
def hbar(col):
    """Build a bar trace of the mean ``sales`` in ``train`` for each value of ``col``.

    Despite the name, the trace is vertical (``orientation="v"``): the values of
    ``col`` go on the x axis and the mean sales on the y axis. The function only
    builds the trace; the caller places it in a figure.

    Parameters
    ----------
    col : str
        Column of the module-level ``train`` frame to group by.

    Returns
    -------
    plotly.graph_objs.Bar
        Trace with one bar per distinct value of ``col``.
    """
    temp = train.groupby(col).agg({"sales": "mean"}).reset_index()
    temp = temp.sort_values(col, ascending=False)
    # The stand-alone figure code that used to follow the return statement could
    # never run and has been removed.
    return go.Bar(
        y=list(temp['sales']),
        x=list(temp[col]),
        orientation="v",
        marker=dict(color="#edb705"),
    )
    
# One average-sales trace per calendar feature, in the order they appear in the grid.
trace1, trace2, trace3, trace4, trace5, trace6 = [
    hbar(feature)
    for feature in ('dayofweek', 'dayofmonth', 'dayofyear', 'month', 'quarter', 'year')
]

titles = ['Day of Week', 'Day of Month', 'Day of Year', 'Month', 'Quarter', 'Year']
titles = ['Avg Sales by ' + label for label in titles]
fig = make_subplots(rows=3, cols=2, subplot_titles=titles)

# Fill the 3x2 grid row by row: position 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1), ...
for position, trace in enumerate([trace1, trace2, trace3, trace4, trace5, trace6]):
    fig.add_trace(trace, row=position // 2 + 1, col=position % 2 + 1)

fig.update_layout(
    height=1200,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
)
fig.show()

In [None]:
#
# data 
def _monthly_mean_sales(year, sales_col):
    """Mean `sales` per `month` of `train` for one `year`; the value column is named `sales_col`."""
    one_year = train.loc[train['year'] == year, ['month', 'sales']]
    return (
        one_year.groupby('month')
        .agg({"sales": "mean"})
        .reset_index()
        .rename(columns={'sales': sales_col})
    )


df_2013 = _monthly_mean_sales(2013, 's13')
df_2014 = _monthly_mean_sales(2014, 's14')
df_2015 = _monthly_mean_sales(2015, 's15')
df_2016 = _monthly_mean_sales(2016, 's16')
df_2017 = _monthly_mean_sales(2017, 'sales')

# Months 9-12 are added to 2017 with zero sales (presumably because train has no
# rows for them - verify against the data); without them the inner merges below
# would drop those months for every year.
df_2017_no = pd.DataFrame({'month': [9, 10, 11, 12], 'sales': [0, 0, 0, 0]})
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
df_2017 = pd.concat([df_2017, df_2017_no]).rename(columns={'sales': 's17'})

df_year = (
    df_2013
    .merge(df_2014, on='month')
    .merge(df_2015, on='month')
    .merge(df_2016, on='month')
    .merge(df_2017, on='month')
)

# top levels
top_labels = ['2013', '2014', '2015', '2016', '2017']

colors = ['#97c20a', '#a9cf30',
          '#badb51', '#d0eb7a',
          '#e3f2b1']

# y axis value (Month): taken from the merged frame so that every label belongs
# to the matching row of x_data ('1M', '2M', ...).
y_data = [str(month) + 'M' for month in df_year['month']]

# X axis value: one row per month, one column per year
df_year = df_year[['s13', 's14', 's15', 's16', 's17']].replace(np.nan, 0)
x_data = df_year.values

# One single-bar trace per (year, month); barmode='stack' piles the years of a
# month next to each other on the same horizontal bar.
fig = go.Figure()
for year_idx in range(x_data.shape[1]):
    for month_values, month_label in zip(x_data, y_data):
        fig.add_trace(go.Bar(
            x=[month_values[year_idx]], y=[month_label],
            orientation='h',
            marker=dict(
                color=colors[year_idx],
                line=dict(color='rgb(248, 248, 249)', width=1)
            )
        ))

fig.update_layout(title='Avg Sales for each Year',
    xaxis=dict(showgrid=False,
               zeroline=False, domain=[0.15, 1]),
    yaxis=dict(showgrid=False, showline=False,
               showticklabels=False, zeroline=False),
    barmode='stack', #barnorm='percent',
    plot_bgcolor='#fff', paper_bgcolor='#fff',
    margin=dict(l=0, r=50, t=100, b=10),
    showlegend=False,
)

label_font = dict(family='Arial', size=14, color='rgb(67, 67, 67)')

# Month labels drawn to the left of the plotting area (the y tick labels are hidden).
annotations = [
    dict(xref='paper', yref='y',
         x=0.14, y=month_label,
         xanchor='right',
         text=str(month_label),
         font=label_font,
         showarrow=False, align='right')
    for month_label in y_data
]

# Year labels above the chart, each centred over that year's segment of the last
# month's bar: left edge of the segment plus half of its width.
if len(x_data):
    left_edge = 0.0
    for year_label, width in zip(top_labels, x_data[-1]):
        annotations.append(dict(xref='x', yref='paper',
                                x=left_edge + width / 2, y=1.1,
                                text=year_label,
                                font=label_font,
                                showarrow=False))
        left_edge += width

fig.update_layout(annotations=annotations)
fig.show()

In [None]:
# Mean sales over all rows of each date.
agg = train.groupby('date')['sales'].mean().reset_index()
fig = px.line(agg, x='date', y=['sales'], title="Average Sales by Date")
fig.show()

In [None]:
train.head()

In [None]:
#oil_df.columns
#Index(['date', 'dcoilwtico', 'dcoilwtico_interpolated'], dtype='object')
#holidays_df.columns
#Index(['date', 'type', 'locale', 'locale_name', 'description', 'transferred','day_holiday', 'month_holiday', 'year_holiday'],dtype='object')
#trans_df.columns
#Index(['date', 'store_nbr', 'transactions'], dtype='object')
#stores_df.columns
#Index(['store_nbr', 'city', 'state', 'type', 'cluster'], dtype='object')

In [None]:
## combine datasets
def _attach_side_tables(sales_df):
    """Left-join oil, holiday, store and transaction data onto a train/test frame.

    The same pipeline is applied to train and test so both end up with the same
    extra columns. Every join is a left join, so no row of ``sales_df`` is dropped.
    """
    merged = (
        sales_df
        .merge(oil_df, on='date', how='left')
        # NOTE(review): if holidays_df holds several rows for one date, this join
        # repeats the matching sales rows once per holiday row - confirm intended.
        .merge(holidays_df, on='date', how='left')
        .merge(stores_df, on='store_nbr', how='left')
        .merge(trans_df, on=['date', 'store_nbr'], how='left')
    )
    # holidays_df and stores_df both have a `type` column; pandas suffixes the
    # clash as type_x (holidays, joined first) and type_y (stores).
    return merged.rename(columns={"type_x": "holiday_type", "type_y": "store_type"})


train1 = _attach_side_tables(train)
test1 = _attach_side_tables(test)

train1.head()

In [None]:
# Function to calculate missing values by column
def missing_data(df):
    """Summarise the missing values of ``df`` per column.

    Prints how many columns ``df`` has and how many of them contain missing
    values, then returns the summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns ``'Missing Values'`` (count)
        and ``'% of Total Values'`` (share of rows in percent). Only columns
        with a non-zero share are kept, sorted by share in descending order
        and rounded to one decimal.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    summary = pd.DataFrame({
        'Missing Values': null_counts,
        '% of Total Values': null_share,
    })

    # Keep only columns that actually miss something, worst first.
    has_missing = summary['% of Total Values'] != 0
    summary = summary.loc[has_missing].sort_values('% of Total Values', ascending=False).round(1)

    print(f"Your selected dataframe has {df.shape[1]} columns.\n"
          f"There are {summary.shape[0]} columns that have missing values.")

    return summary

In [None]:
missing_value = missing_data(train1)
missing_value.head(20)

In [None]:
test_missing = missing_data(test1)
test_missing.head()

In [None]:
# Data Wrangling
#train1.isnull().any()

In [None]:
# Yearly mean sales per store type; each point in a box is one year.
agg = (
    train1.groupby(["year", "store_type"])[["sales", "transactions"]]
    .mean()
    .reset_index()
)
fig = px.box(
    agg, y="sales", facet_col="store_type", color="store_type",
    boxmode="overlay", points='all',
    title="Average Sales Distribution by Store Type",
)
fig.show()

In [None]:
# Yearly mean sales per state; each point in a box is one year.
agg = (
    train1.groupby(["year", "state"])[["sales", "transactions"]]
    .mean()
    .reset_index()
)
fig = px.box(
    agg, y="sales", facet_col="state", color="state",
    boxmode="overlay", points='all',
    title="Average Sales Distribution by State",
)
fig.show()

In [None]:
# Yearly mean sales per holiday weekday; each point in a box is one year.
agg = (
    train1.groupby(["year", "day_holiday"])[["sales", "transactions"]]
    .mean()
    .reset_index()
)
fig = px.box(
    agg, y="sales", facet_col="day_holiday", color="day_holiday",
    boxmode="overlay", points='all',
    title="Average Sales Distribution by Holiday day's",
)
fig.show()

In [None]:
# Yearly mean sales per holiday type; each point in a box is one year.
agg = (
    train1.groupby(["year", "holiday_type"])[["sales", "transactions"]]
    .mean()
    .reset_index()
)
fig = px.box(
    agg, y="sales", facet_col="holiday_type", color="holiday_type",
    boxmode="overlay", points='all',
    title="Average Sales Distribution by Holiday Type",
)
fig.show()

In [None]:
# data
df_st_sa = train1.groupby('store_type').agg({"sales" : "mean"}).reset_index().sort_values(by='sales', ascending=False)
# .copy() so the colour column below is written to an independent frame
df_fa_sa = train1.groupby('family').agg({"sales" : "mean"}).reset_index().sort_values(by='sales', ascending=False).head(10).copy()
df_cl_sa = train1.groupby('cluster').agg({"sales" : "mean"}).reset_index()

# chart color
# The two best-selling families get the strong green, the rest the pale one.
# The colours are assigned by position in a single step: the previous chained
# assignment (df['color'][2:] = ...) writes to a temporary object and is not
# guaranteed to reach the frame.
df_fa_sa['color'] = np.where(np.arange(len(df_fa_sa)) < 2, '#32cf76', '#a5d993')
df_cl_sa['color'] = '#c8d984'

# chart
fig = make_subplots(rows=2, cols=2,
                    specs=[[{"type": "bar"}, {"type": "pie"}],
                           [{"colspan": 2}, None]],
                    column_widths=[0.7, 0.3], vertical_spacing=0, horizontal_spacing=0.02,
                    subplot_titles=("Top 10 Highest Product Sales", "Highest Sales in Stores", "Clusters Vs Sales"))

fig.add_trace(go.Bar(x=df_fa_sa['sales'], y=df_fa_sa['family'], marker=dict(color= df_fa_sa['color']),
                     name='Family', orientation='h'),
                     row=1, col=1)
fig.add_trace(go.Pie(values=df_st_sa['sales'], labels=df_st_sa['store_type'], name='Store type',
                     marker=dict(colors=['#68992f','#80b346','#9ac963','#c5e0a6','#e1edd3']), hole=0.7,
                     hoverinfo='label+percent+value', textinfo='label'),
                    row=1, col=2)
fig.add_trace(go.Bar(x=df_cl_sa['cluster'], y=df_cl_sa['sales'],
                     marker=dict(color= df_cl_sa['color']), name='Cluster'),
                     row=2, col=1)

# styling
fig.update_yaxes(showgrid=False, ticksuffix=' ', categoryorder='total ascending', row=1, col=1)
fig.update_xaxes(visible=False, row=1, col=1)
# Tick labels are taken from the clusters that are actually plotted; the former
# hard-coded range(1, 17) supplies only 16 labels and leaves a tick unlabeled
# whenever the data contains a different number of clusters.
fig.update_xaxes(tickmode = 'array', tickvals=df_cl_sa['cluster'],
                 ticktext=df_cl_sa['cluster'].astype(str), row=2, col=1)
fig.update_yaxes(visible=False, row=2, col=1)
fig.update_layout(height=500, bargap=0.2,
                  margin=dict(b=0,r=20,l=20), xaxis=dict(tickmode='linear'),
                  title_text="Average Sales Analysis",
                  template="plotly_white",
                  title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  hoverlabel=dict(bgcolor="#f2f2f2", font_size=13, font_family="Lato, sans-serif"),
                  showlegend=False)
fig.show()

In [None]:
def vbar(col):
    """Build a bar trace of the 15 values of ``col`` with the highest mean ``sales`` in ``train1``.

    Despite the name, the trace is horizontal (``orientation="h"``): the category
    labels go on the y axis and the mean sales on the x axis. The function only
    builds the trace; the caller places it in a figure.

    Parameters
    ----------
    col : str
        Column of the module-level ``train1`` frame to group by.

    Returns
    -------
    plotly.graph_objs.Bar
        Trace with at most 15 bars, listed from the lowest to the highest of
        the selected means.
    """
    temp = train1.groupby(col).agg({"sales": "mean"}).reset_index()
    # Take the 15 largest means, then reverse them so the list runs lowest -> highest.
    top = temp.sort_values('sales', ascending=False).head(15).iloc[::-1]
    # The stand-alone figure code that used to follow the return statement could
    # never run and has been removed.
    return go.Bar(
        # trailing spaces keep the labels off the axis line
        y=[str(_) + "    " for _ in top[col]],
        x=list(top['sales']),
        orientation="h",
        marker=dict(color="#c69be0"),
    )
    
# One top-15 trace per categorical column, in the order they appear in the grid.
trace1, trace2, trace3, trace4 = [
    vbar(column) for column in ('family', 'store_type', 'state', 'city')
]

titles = ['Store Family', 'Store Type', 'State', 'City']
titles = ['Top ' + label + " by Average Sales" for label in titles]
fig = make_subplots(rows=2, cols=2, subplot_titles=titles)

# Fill the 2x2 grid row by row: position 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1), 3 -> (2, 2).
for position, trace in enumerate([trace1, trace2, trace3, trace4]):
    fig.add_trace(trace, row=position // 2 + 1, col=position % 2 + 1)

fig.update_layout(
    height=800,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
)
fig.show()