In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()

from sklearn import model_selection
from sklearn import metrics, ensemble, linear_model
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore') 

In [None]:
# Extract the zipped competition tables into the working directory.
# `-p` keeps mkdir from failing when the directory already exists and `-o`
# overwrites without prompting, so the cell can be re-run safely.
!mkdir -p /kaggle/working/data
!unzip -o ../input/walmart-recruiting-store-sales-forecasting/features.csv.zip -d /kaggle/working/data
!unzip -o ../input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip -d /kaggle/working/data
!unzip -o ../input/walmart-recruiting-store-sales-forecasting/train.csv.zip -d /kaggle/working/data
!unzip -o ../input/walmart-recruiting-store-sales-forecasting/test.csv.zip -d /kaggle/working/data

# **1. Read data**

In [None]:
# The four unzipped tables live in ./data; each one is loaded into a variable
# named after its file.
features, train, test, sampleSubmission = (
    pd.read_csv('./data/' + table + '.csv')
    for table in ('features', 'train', 'test', 'sampleSubmission')
)
# stores.csv is shipped uncompressed, so it is read straight from the input folder.
stores = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/stores.csv')


Join the features and stores tables, then attach them to the train and test sets

In [None]:
# Store-level attributes are attached to the weekly features first; the result
# is then joined onto the train and test rows.
merge_keys = ['Store', 'Date', 'IsHoliday']
sort_keys = ['Store', 'Dept', 'Date']

feature_store = pd.merge(features, stores, on='Store', how='inner')

train_df = pd.merge(train, feature_store, on=merge_keys, how='inner')
train_df = train_df.sort_values(by=sort_keys).reset_index(drop=True)

test_df = pd.merge(test, feature_store, on=merge_keys, how='inner')
test_df = test_df.sort_values(by=sort_keys).reset_index(drop=True)

# **2.EDA**

In [None]:
# Summary statistics, transposed so that each column of train_df becomes a row.
train_df.describe().transpose()

Convert the "Date" column to datetime and extract day, week, month, and year

In [None]:
# Rebuild the feature/store table with a real datetime "Date" column plus
# calendar parts, then redo the train/test joins on it.
feature_store = features.merge(stores, how='inner', on="Store")

# All three frames must share the same dtype for "Date" because it is a merge key.
feature_store['Date'] = pd.to_datetime(feature_store['Date'])
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

feature_store['Day'] = feature_store['Date'].dt.day
# isocalendar().week is returned as a nullable unsigned integer; cast to plain int.
feature_store['Week'] = feature_store['Date'].dt.isocalendar().week.astype(int)
feature_store['Month'] = feature_store['Date'].dt.month
feature_store['Year'] = feature_store['Date'].dt.year

merge_keys = ['Store', 'Date', 'IsHoliday']
sort_keys = ['Store', 'Dept', 'Date']
train_df = train.merge(feature_store, how='inner', on=merge_keys).sort_values(by=sort_keys).reset_index(drop=True)
test_df = test.merge(feature_store, how='inner', on=merge_keys).sort_values(by=sort_keys).reset_index(drop=True)

# Per-ISO-week totals. numeric_only=True keeps the datetime "Date" and string
# "Type" columns out of the sum, which otherwise raises on recent pandas.
df_weeks = train_df.groupby('Week').sum(numeric_only=True)

# **2.1 Sales**

In [None]:
# Total sales per week of the year (df_weeks is indexed by week number).
px.line(
    df_weeks,
    x=df_weeks.index,
    y='Weekly_Sales',
    title='Sales over weeks',
    labels={'Weekly_Sales': 'Weekly Sales', 'x': 'Weeks'},
)


**Sale consistency over the year**

In [None]:
# One line per markdown column, followed by the weekly sales total, all drawn
# against the week number.
trace_labels = {f'MarkDown{number}': f'MarkDown{number}' for number in range(1, 6)}
trace_labels['Weekly_Sales'] = 'Weekly Sales'

fig = go.Figure()
for column, label in trace_labels.items():
    fig.add_trace(go.Scatter(x=df_weeks.index, y=df_weeks[column], name=label, mode='lines'))
fig.update_layout(title="Sales vs Markdown's", xaxis_title='Weeks')

# **Mean sale over the year**

In [None]:
# Mean and median weekly sales: once per (Year, Week) pair, and once per week
# for each individual year.
sales_stats = {'Weekly_Sales': ['mean', 'median']}

weekly_sales = train_df.groupby(['Year', 'Week'], as_index=False).agg(sales_stats)
weekly_sales2010, weekly_sales2011, weekly_sales2012 = (
    train_df.loc[train_df['Year'] == year].groupby(['Week']).agg(sales_stats)
    for year in (2010, 2011, 2012)
)

In [None]:
# Mean weekly sales, one line per year, with the two year-end holiday peaks labelled.
fig = go.Figure()
for year, yearly_stats in ((2010, weekly_sales2010), (2011, weekly_sales2011), (2012, weekly_sales2012)):
    mean_sales = yearly_stats['Weekly_Sales']['mean']
    fig.add_trace(go.Scatter(x=mean_sales.index, y=mean_sales, name=f'Mean Sales {year}', mode='lines'))
# Annotation positions are hand-picked (week number, sales level) near each peak.
fig.add_annotation(text="Thanksgiving", x=47, y=25000, showarrow=False)
fig.add_annotation(text="Christmas", x=51, y=29000, showarrow=False)
fig.update_layout(title='Sales 2010, 2011, 2012', xaxis_title='Weeks')

**There is a clear seasonal pattern in sales: around Thanksgiving and Christmas (the two main US holidays) sales rise by a huge margin**

# **2.2 Other features**

**Convert temperature to Celsius**

In [None]:
# Fahrenheit -> Celsius, applied exactly once to each frame so that train and
# test stay in the same unit.
# NOTE: this cell overwrites the column, so run it only once per kernel session.
train_df['Temperature'] = (train_df['Temperature'] - 32) / 1.8
test_df['Temperature'] = (test_df['Temperature'] - 32) / 1.8

In [None]:
# Weekly sales against temperature, coloured by the holiday flag.
px.scatter(
    data_frame=train_df,
    x='Temperature',
    y='Weekly_Sales',
    color='IsHoliday',
    title='Temperature and sales by holiday',
)

**There is no clear pattern between temperature and sales**

In [None]:
# Weekly sales against fuel price, coloured by the holiday flag.
px.scatter(
    data_frame=train_df,
    x='Fuel_Price',
    y='Weekly_Sales',
    color='IsHoliday',
    title='Fuel price and sales by holiday',
)

**There is no clear pattern between fuel price and sales**

**But it seems that sales drop when the fuel price rises**

In [None]:
# Weekly sales against the consumer price index, coloured by the holiday flag.
px.scatter(
    data_frame=train_df,
    x='CPI',
    y='Weekly_Sales',
    color='IsHoliday',
    title='CPI and sales by holiday',
)

**There's no clear pattern, but let's take a closer look at each year**

In [None]:
# Full train_df rows for each year.
# NOTE: these names previously held the per-week mean/median tables; from here
# on they refer to the unaggregated yearly subsets.
weekly_sales2010, weekly_sales2011, weekly_sales2012 = (
    train_df[train_df['Year'] == year] for year in (2010, 2011, 2012)
)

In [None]:
# CPI against weekly sales for the 2010 rows only.
px.scatter(
    data_frame=weekly_sales2010,
    x='CPI',
    y='Weekly_Sales',
    color='IsHoliday',
    title='CPI and sales by holiday of 2010',
)

In [None]:
# CPI against weekly sales for the 2011 rows only.
px.scatter(
    data_frame=weekly_sales2011,
    x='CPI',
    y='Weekly_Sales',
    color='IsHoliday',
    title='CPI and sales by holiday of 2011',
)

In [None]:
# CPI against weekly sales for the 2012 rows only.
px.scatter(
    data_frame=weekly_sales2012,
    x='CPI',
    y='Weekly_Sales',
    color='IsHoliday',
    title='CPI and sales by holiday of 2012',
)

**There are 3 groups of CPI values, and all 3 have similar sales records even though their CPI levels differ**

In [None]:
# Weekly sales against the unemployment rate, coloured by the holiday flag.
px.scatter(
    data_frame=train_df,
    x='Unemployment',
    y='Weekly_Sales',
    color='IsHoliday',
    title='Unemployment rate and sales by holiday',
)

**In relation to unemployment, it can be seen that the lower the value, higher the sales, it makes sense.**

In [None]:
# Average of every numeric column per store size. numeric_only=True skips the
# datetime "Date" and string "Type" columns, which cannot be averaged.
sizes = train_df.groupby('Size').mean(numeric_only=True)
px.line(sizes, x=sizes.index, y=sizes.Weekly_Sales,
        title='Store size and sales')

**Size is an important factor when it comes to sales: the bigger the store, the more sales it makes**

In [None]:
# Distribution of store size within each store type.
store_type = stores.loc[:, ['Type', 'Size']].copy()
px.box(
    store_type,
    x='Type',
    y='Size',
    color='Type',
    title='Store size and Store type',
)

**Type A stores are the most common**

In [None]:
# train_df already carries each row's store "Type" (merged in from stores), so
# take both columns from it. Concatenating stores['Type'] with
# train_df['Weekly_Sales'] would pair them by row position, not by store.
store_sale = train_df[['Type', 'Weekly_Sales']]
px.box(store_sale.dropna(), x='Type', y='Weekly_Sales', color='Type',
       title='Store type and sales')

**Although type C stores are the smallest, they have the highest median sales**

In [None]:
# Mean weekly sales per department, largest first.
# `ascending` must be the boolean False: the string 'False' is truthy, and
# recent pandas rejects it outright. numeric_only=True skips the datetime and
# string columns that cannot be averaged.
depts = train_df.groupby('Dept').mean(numeric_only=True).sort_values(by='Weekly_Sales', ascending=False)
bar = px.bar(depts, x=depts.index, y=depts.Weekly_Sales,
             title='Department and sales', color=depts.Weekly_Sales)
bar.update_layout(barmode='group', xaxis={'categoryorder': 'total descending'})

**There is a big difference in how much each department contributes to sales**

# **2.3 | Heatmap and correlation between features**

weekly_sales_corr = train_df.corr().iloc[2,:]
corr_df = pd.DataFrame(data = weekly_sales_corr, index = weekly_sales_corr.index ).sort_values (by = 'Weekly_Sales', ascending = False)
corr_df = corr_df.iloc[1:]
bar = px.bar(corr_df, x = corr_df.index, y = 'Weekly_Sales', color=corr_df.index, labels={'index':'Featues'},
             title='Feature correlation with sales',color_discrete_sequence=palette)
bar.update_traces(showlegend=False)



In [None]:
# Pairwise correlations of the numeric and boolean columns. Selecting them
# explicitly keeps the datetime "Date" and string "Type" columns out of
# corr(), which raises on them in recent pandas.
corr = train_df.select_dtypes(include=['number', 'bool']).corr()

# Blank out the upper triangle (diagonal included) so each pair appears once.
mask = np.triu(np.ones_like(corr, dtype=bool))
df_mask = corr.mask(mask).round(2)

fig = ff.create_annotated_heatmap(
    z=df_mask.to_numpy(),
    x=df_mask.columns.tolist(),
    y=df_mask.columns.tolist(),
    colorscale=px.colors.diverging.RdBu,
    hoverinfo="none",
    showscale=True,
    ygap=1,
    xgap=1,
)

fig.update_xaxes(side="bottom")

fig.update_layout(
    title_text='Heatmap',
    title_x=0.5,
    width=1000,
    height=1000,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    yaxis_autorange='reversed',
    template='plotly_white'
)

# Masked cells are annotated with the text 'nan'; show them as empty instead.
for annotation in fig.layout.annotations:
    if annotation.text == 'nan':
        annotation.text = ""

fig.show()

In [None]:
# Correlation of every numeric/boolean column with Weekly_Sales. The column is
# picked by label rather than by position, and non-numeric columns are
# excluded up front because corr() raises on them in recent pandas.
weekly_sales_corr = train_df.select_dtypes(include=['number', 'bool']).corr()['Weekly_Sales']
# Drop the target's correlation with itself, then order from most positive to
# most negative.
corr_df = weekly_sales_corr.drop('Weekly_Sales').sort_values(ascending=False).to_frame()
bar = px.bar(corr_df, x=corr_df.index, y='Weekly_Sales', color=corr_df.index, labels={'index': 'Features'},
             title='Feature correlation with sales')
bar.update_traces(showlegend=False)

# **3 | Feature engineering**

In [None]:
# Work on copies so the feature engineering below leaves train_df / test_df untouched.
data_train, data_test = train_df.copy(), test_df.copy()

# **3.1 | Holidays**

**The USA has several major holidays; I engineer features for the following ones:**

    - Christmas (December 25)
    
    - Thanksgiving (4th Thursday in November); I will use November 24 as a fixed date
    
    - Independence Day (4th of July)

In [None]:
def _days_until(frame, month_day):
    """Return, per row, the days from `Date` to `month_day` of that row's `Year`.

    `frame` must have `Year` and `Date` columns; `month_day` is a 'MM-DD'
    string. Values are negative for dates after the target day.
    """
    target = pd.to_datetime(frame["Year"].astype(str) + "-" + month_day, format="%Y-%m-%d")
    current = pd.to_datetime(frame["Date"], format="%Y-%m-%d")
    return (target - current).dt.days.astype(int)


# Each feature is computed from the frame it is written to, so train and test
# end up with the same three columns.
for source_df, target_df in ((train_df, data_train), (test_df, data_test)):
    # The misspelled column name is kept as-is.
    target_df['Days_to_Thansksgiving'] = _days_until(source_df, "11-24")
    # NOTE(review): the anchor is December 24 although the text above lists
    # Christmas as December 25 -- confirm which one is intended.
    target_df['Days_to_Christmas'] = _days_until(source_df, "12-24")
    # Independence Day is the 4th of July.
    target_df['Days_to_Independence_Day'] = _days_until(source_df, "07-04")

# **3.2 | Markdowns**

In [None]:
# Total markdown per row. Summing across columns skips missing values, so a
# row with only some markdowns present keeps its partial total; chaining `+`
# would turn the whole sum into NaN. Rows with no markdowns at all sum to 0.
markdown_columns = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
data_train['MarkdownsSum'] = train_df[markdown_columns].sum(axis=1)
data_test['MarkdownsSum'] = test_df[markdown_columns].sum(axis=1)

# **4 | Preprocessing**

# **4.1 | Encoding categorical data**

In [None]:
# Holiday flag -> 1/0, and store type -> 1 (A), 2 (B) or 3 (anything else).
type_codes = {'A': 1, 'B': 2}
for frame in (data_train, data_test):
    frame['IsHoliday'] = frame['IsHoliday'].eq(True).astype(int)
    frame['Type'] = frame['Type'].map(type_codes).fillna(3).astype(int)

# **4.2 | Filling missing values**

In [None]:
# Show the columns that still contain missing values. Only a cell's last
# expression is rendered automatically, so display() is called explicitly.
train_missing = data_train.isna().sum()
test_missing = data_test.isna().sum()
display(train_missing[train_missing > 0].sort_values(ascending=False))
display(test_missing[test_missing > 0].sort_values(ascending=False))

# Missing values in the training frame become 0.
data_train = data_train.fillna(0)

# In the test frame CPI and Unemployment are filled with their column means.
# Plain assignment is used because an in-place fillna on a selected column may
# act on a temporary copy and leave the frame unchanged.
data_test['CPI'] = data_test['CPI'].fillna(data_test['CPI'].mean())
data_test['Unemployment'] = data_test['Unemployment'].fillna(data_test['Unemployment'].mean())

# Everything else still missing in the test frame becomes 0.
data_test = data_test.fillna(0)

# **5 | Feature selection**

In [None]:
# Candidate predictors: every column except the raw date and the target.
# NOTE: this rebinds `features`, which held the features.csv DataFrame until now.
excluded_columns = ('Date', 'Weekly_Sales')
features = [column for column in data_train.columns if column not in excluded_columns]

In [None]:
# Full design matrix and target, copied so later edits cannot touch data_train.
X = data_train.loc[:, features].copy()
y = data_train['Weekly_Sales'].copy()

In [None]:
# A 25% random subsample keeps the feature-importance and model-comparison
# runs fast. The seed is fixed so the same rows, and therefore the same
# scores, come back on every run.
data_sample = data_train.sample(frac=.25, random_state=0)
X_sample = data_sample[features].copy()
y_sample = data_sample.Weekly_Sales.copy()

In [None]:
# Hold out 15% of the subsample for validation (seeded split).
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    X_sample,
    y_sample,
    test_size=0.15,
    random_state=0,
)

In [None]:
# Default XGBoost regressor, fitted only to inspect feature importances.
feat_model = xgb.XGBRegressor(random_state=0)
feat_model.fit(X_train, y_train);

In [None]:
# Importance plot for the fitted model; the trailing semicolon hides the Axes repr.
xgb.plot_importance(booster=feat_model);

# **6 | Modeling**

In [None]:
def WMAE(dataset, real, predicted):
    """Weighted mean absolute error, rounded to 2 decimals.

    Rows whose `IsHoliday` value is truthy get weight 5, all others weight 1.

    Parameters
    ----------
    dataset : object with an `IsHoliday` attribute (e.g. a DataFrame)
        One entry per row being scored, in the same order as `real`.
    real, predicted : array-like
        True and predicted values, matched by position.

    Rows are matched by position, not by pandas index, so passing Series with
    different indexes cannot silently drop or misalign rows.
    """
    weights = np.where(np.asarray(dataset.IsHoliday).astype(bool), 5, 1)
    abs_errors = np.abs(np.asarray(real, dtype=float) - np.asarray(predicted, dtype=float))
    return np.round(np.sum(weights * abs_errors) / np.sum(weights), 2)

In [None]:
# Candidate regressors, each seeded with random_state=0. The keys are padded
# with leading spaces to a common width so the printed score lines align.
models = dict([
    ('    LGBM', lgb.LGBMRegressor(random_state=0)),
    (' XGBoost', xgb.XGBRegressor(random_state=0, objective='reg:squarederror')),
    ('Catboost', cb.CatBoostRegressor(random_state=0, verbose=False)),
    ('    HGBR', HistGradientBoostingRegressor(random_state=0)),
    (' ExtraTr', ensemble.ExtraTreesRegressor(bootstrap=True, random_state=0)),
    (' RandomF', ensemble.RandomForestRegressor(random_state=0)),
])

In [None]:
def model_evaluation (name, model, models, X_train, y_train, X_valid, y_valid):
    """Fit `model` on the training split and return its validation RMSE.

    Parameters
    ----------
    name, models
        Unused; kept so existing calls keep working.
    model
        Estimator exposing `fit(X, y)` and `predict(X)`.
    X_train, y_train
        Training features and target.
    X_valid, y_valid
        Validation features and target.

    The model is fitted exactly once. Refitting the same estimator on the same
    data once per entry of `models` only multiplies the run time and averages
    identical scores.
    """
    model.fit(X_train, y_train)
    y_preds = model.predict(X_valid)

    # Root mean squared error, with rows matched by position.
    residuals = np.asarray(y_valid, dtype=float) - np.asarray(y_preds, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))

In [None]:
# Print one validation score line per candidate model.
for name, model in models.items():
    score = model_evaluation(name, model, models, X_train, y_train, X_valid, y_valid)
    print(f'{name} Valid RMSE {score:.4f}')

In [None]:
# Reduced feature set used for the final models.
baseline_columns = ['Store', 'Dept', 'IsHoliday', 'Size', 'Week', 'Type', 'Year', 'Day']
X_baseline = X.loc[:, baseline_columns].copy()

In [None]:
# Hold out 10% of the full baseline data for validation (seeded split).
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    X_baseline,
    y,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Random forest used for the submission. random_state is fixed because the
# bootstrap sampling and feature selection are random; without a seed the
# forest, and therefore the submission, changes on every run.
RF = ensemble.RandomForestRegressor(
    n_estimators=60,
    max_depth=25,
    min_samples_split=3,
    min_samples_leaf=1,
    random_state=0,
)
RF.fit(X_train, y_train)

In [None]:
# XGBoost regressor trained on the same baseline split as the random forest.
XG = xgb.XGBRegressor(objective='reg:squarederror', random_state=0)
XG.fit(X_train, y_train)

In [None]:
# Feature matrix for the competition test rows, with the same columns as
# X_baseline. It gets its own name so that `test` keeps holding the raw
# test.csv frame and the earlier cells that read it can still be re-run.
X_test = data_test[['Store','Dept','IsHoliday','Size','Week','Type','Year','Day']].copy()
predict_rf = RF.predict(X_test)

In [None]:
# Write the random-forest predictions into the submission template.
# NOTE(review): predictions are assigned by position, which assumes the
# template rows are in the same order as data_test -- confirm.
sampleSubmission['Weekly_Sales'] = predict_rf
sampleSubmission.to_csv(path_or_buf='submission.csv', index=False)