### ASSIGNMENT

<b>Problem Statement:</b>
You are provided with historical sales data for 45 stores of a Retail chain located in different
regions. Each store contains a number of departments, and you are tasked with predicting the
department-wide sales for each store.

The data is provided in 4 different CSVs.



In [None]:
# importing basic packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import datetime # manipulating date formats
# Viz
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns # for prettier plots
%matplotlib inline

# settings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# reading data
# One path per table, in the order the frames are unpacked below.
csv_paths = [
    "../input/walmart-recruiting-store-sales-forecasting/features.csv.zip",
    "../input/walmart-recruiting-store-sales-forecasting/stores.csv",
    "../input/walmart-recruiting-store-sales-forecasting/train.csv.zip",
    "../input/walmart-recruiting-store-sales-forecasting/test.csv.zip",
]
features, stores, train, test = (pd.read_csv(path) for path in csv_paths)

In [None]:
# Names and frames are kept in parallel lists; later cells index into both.
df_names = ['features', 'stores', 'train', 'test']
df_list = [features, stores, train, test]
separator = '--' * 15
for label, frame in zip(df_names, df_list):
    n_rows, n_cols = frame.shape
    print(separator)
    print(f'Dataframe {label} has {n_rows} rows and {n_cols} columns.')
    print(separator)
    display(frame.head(5))

In [None]:
# checking time duration of records
# Index 1 (stores) is skipped -- presumably because that table has no Date column.
for idx in (0, 2, 3):
    label, frame = df_names[idx], df_list[idx]
    first_date, last_date = frame.Date.min(), frame.Date.max()
    print(f'Dataframe {label} contains data from {first_date} to {last_date}.\n')

In [None]:
# checking missing values
for label, frame in zip(df_names, df_list):
    # True as soon as any column holds at least one NaN.
    if frame.isna().sum().any():
        print(f'Dataframe {label} has missing values.\n')
    else:
        print(f'Dataframe {label} does not have missing values.')

In [None]:
import missingno as msno

# missingno bar chart for the features table.
msno.bar(
    features,
    figsize=(15, 5),
    fontsize=15,
    color='orange',
);

In [None]:
print('Percentages of missing values in features dataframe.')
# Share of NaN cells per column, as a percentage of the row count.
missing_pct = 100 * features.isna().sum() / features.shape[0]
missing_pct.sort_values()

All the MarkDown<sub>i</sub> columns have more than 50% missing values. These are anonymized data related to promotional markdowns that the retail chain is running. MarkDown data is only available after Nov 2011, so it is quite difficult to choose the best imputation technique. If these columns are not strongly correlated with the target variable, I will drop them. Let's do EDA first.

### EDA

In [None]:
# pie chart
# Count the stores per type once and reuse the result for labels and wedge sizes.
type_counts = stores.Type.value_counts()
labels = type_counts.index.tolist()
sizes = type_counts.values.tolist()
explode = (0.05, 0.02, 0)
pie_colors = ['#f538cc', '#fa5282', '#facc69']

plt.figure(figsize=(5, 5))
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    startangle=60,
    textprops={'fontsize': 18},
    colors=pie_colors,
)
plt.title('Different types of stores');

In [None]:
# Count of stores per type, drawn as outlined (unfilled) bars.
# The column is passed by keyword: recent seaborn releases no longer accept the
# data vector as the first positional argument.
ax = sns.countplot(x=stores.Type, facecolor=(0, 0, 0, 0), linewidth=10,
                   edgecolor=sns.color_palette("spring", 3))
for p in ax.patches:
    # Bar heights come back as floats; cast so the label reads "22", not "22.0".
    store_count = int(p.get_height())
    ax.annotate(f'Number of\n stores:\n {store_count}',
                (p.get_x() + p.get_width() / 2., p.get_height() - 4),
                ha='center', va='center', xytext=(0, 10),
                textcoords='offset points', fontsize=12);

Almost half of the stores are of type A. Type C stores are least in number.

In [None]:
# Bars are ordered from the smallest to the largest store.
stores_by_size = stores.sort_values('Size')['Store'].tolist()

plt.figure(figsize=(15, 5))
sns.barplot(data=stores, x='Store', y='Size', order=stores_by_size)
plt.title('Sizes of all the stores.', fontsize=15)
plt.tight_layout();

There are broadly 3 types of stores: small-sized, medium-sized and large-sized. This numerical variable can be converted into categorical variable using pd.cut function but first let's check relation between size and type of the stores.

In [None]:
sns.set_style('whitegrid')
# Size distribution of the stores within each store type.
type_size_ax = sns.boxplot(data=stores, x='Type', y='Size', palette='spring')
type_size_ax.set_title('Type vs Size', fontsize=15);

Though we don't have any knowledge about how stores were divided into these types, we can see from the graph that it covers the effect of size column.

In [None]:
# combining train/test and stores and features dataframes for further analysis

def _attach_store_context(sales):
    """Join the weekly features and the store metadata onto a sales frame.

    Rows are matched to ``features`` on Store/Date/IsHoliday, ordered by
    Store, Dept and Date with a fresh index, then matched to ``stores`` on
    Store. The ``Date`` column is converted to datetime as the last step.
    """
    with_features = sales.merge(features, how='inner', on=['Store', 'Date', 'IsHoliday'])
    ordered = with_features.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)
    expanded = ordered.merge(stores, how='inner', on=['Store'])
    # converting dtype of date column
    expanded['Date'] = pd.to_datetime(expanded['Date'])
    return expanded


train_expanded = _attach_store_context(train)
test_expanded = _attach_store_context(test)

In [None]:
# Peek at the first rows of the merged training frame.
train_expanded.head(3)

In [None]:
plt.figure(figsize=(15, 3))
# One point per date: the mean of Weekly_Sales over all rows sharing that date.
mean_sales_by_date = train_expanded.groupby('Date')['Weekly_Sales'].mean()
mean_sales_by_date.plot()
plt.title('Average weekly Sales of the company across all stores in given timeframe', fontsize=18)
plt.xlabel('Date', fontsize=16)
plt.ylabel('Sales', fontsize=16);

In [None]:
# creating 3 new features from date column

for df in [train_expanded, test_expanded]:
    # Series.dt.week is deprecated and removed in pandas 2.0; isocalendar().week
    # is the replacement. It returns UInt32, so cast back to a plain integer.
    df['Week'] = df['Date'].dt.isocalendar().week.astype('int64')
    df['Month'] = df['Date'].dt.month
    df['Year'] = df['Date'].dt.year

plt.figure(figsize=(15, 3))
# One line per year, drawn in the same order as the legend entries below.
for year in (2010, 2011, 2012):
    sales_in_year = train_expanded[train_expanded['Year'] == year]
    # Select the target before aggregating: this avoids averaging every column
    # and the TypeError pandas >= 2.0 raises for the string 'Type' column.
    sales_in_year.groupby('Month')['Weekly_Sales'].mean().plot()
plt.title('Average weekly Sales of the company in each year', fontsize=18)
plt.legend(['2010', '2011', '2012'], loc='best', fontsize=16)
plt.ylabel('Sales', fontsize=16)
plt.xlabel('Months', fontsize=16);

In [None]:
plt.figure(figsize=(15, 3))
# One line per store type, drawn in the same order as the legend entries below.
for store_type in ('A', 'B', 'C'):
    sales_of_type = train_expanded[train_expanded['Type'] == store_type]
    # Select the target before aggregating: this avoids averaging every column
    # and the TypeError pandas >= 2.0 raises for non-numeric columns.
    sales_of_type.groupby('Month')['Weekly_Sales'].mean().plot()
plt.title('Average weekly Sales of the company by type of the store', fontsize=18)
plt.legend(['Type A', 'Type B', 'Type C'], loc='best', fontsize=16)
plt.ylabel('Sales', fontsize=16)
plt.xlabel('Months', fontsize=16);

Sales of the company rise towards the end of the year. A possible reason could be a tradition or festival celebrated at that time of year.

Type A stores seem to have comparatively high sales, but this can be due to the difference in the number of stores of each type. Also, we can notice that sales of type C stores are nearly constant over the year.

In [None]:
def av_sales_plotter(str):
    """Draw a bar chart of the mean ``Weekly_Sales`` per group of ``train_expanded``.

    Parameters
    ----------
    str : column label
        Column of the global ``train_expanded`` to group by (for example
        ``'Store'``). The name shadows the builtin ``str``; it is kept so
        that existing callers passing it by keyword keep working.
    """
    group_col = str
    plt.figure(figsize=(20, 5))
    # Select the target before aggregating: this avoids averaging every column
    # and the TypeError pandas >= 2.0 raises for non-numeric columns.
    mean_sales = train_expanded.groupby(group_col)['Weekly_Sales'].mean().sort_values()
    mean_sales.plot(kind='bar', color='#b7f28a')
    plt.title(f'Average Sales of each {group_col} in given timeframe.', fontsize=18)
    plt.ylabel('Sales', fontsize=16)
    plt.xlabel(group_col, fontsize=16)
    plt.tick_params(axis='x', labelsize=14)
    
# Mean weekly sales per store, sorted ascending.
av_sales_plotter('Store')

In [None]:
# Mean weekly sales per department, sorted ascending.
av_sales_plotter('Dept')

Around 10 departments have lowest sales. The company can further analyse as to what are the possible reasons and how it can be improved.

In [None]:
print('IsHoliday vs Weekly_Sales')
# Left panel: every observation as a point; right panel: the distribution shape.
fig, (strip_ax, violin_ax) = plt.subplots(1, 2, figsize=(15, 5))
sns.stripplot(data=train_expanded, x='IsHoliday', y='Weekly_Sales', ax=strip_ax)
sns.violinplot(data=train_expanded, x='IsHoliday', y='Weekly_Sales', ax=violin_ax);

Highest sales events were recorded in the special holiday week.

In [None]:
print('Type vs Weekly_Sales')
# Left panel: every observation as a point; right panel: letter-value (boxen) plot.
fig, (strip_ax, boxen_ax) = plt.subplots(1, 2, figsize=(15, 5))
sns.stripplot(data=train_expanded, x='Type', y='Weekly_Sales', ax=strip_ax)
sns.boxenplot(data=train_expanded, x='Type', y='Weekly_Sales', ax=boxen_ax);

Highest sales events were recorded in the type B stores.

In [None]:
print('Weekly sales vs size')
# x and y are passed by keyword: recent seaborn releases no longer accept the
# two data vectors positionally.
sns.jointplot(x=train_expanded['Weekly_Sales'], y=train_expanded['Size']);

No specific pattern.

In [None]:
# One subplot per external feature, all sharing Date as the x axis.
macro_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
train_expanded[['Date'] + macro_cols].plot(x='Date', subplots=True, figsize=(20, 15));

As expected, temperature has high seasonality. Week and month column can effectively cover its effect.

Fuel price and CPI show an upward trend and unemployment shows downward trend.

Let's explore the relationship between these features and weekly sales.

In [None]:
sns.set_palette("summer")
# Target plus the four external features; only the lower triangle is drawn.
pairplot_cols = ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
sns.pairplot(
    train_expanded[pairplot_cols],
    corner=True,
    diag_kind="kde",
);

There is no particular relationship between these features and target variable.

Also, distribution of target variable is highly skewed. That's why, I will not go with linear regression.

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 raises on the string
# 'Type' column instead of silently skipping it.
# NOTE: dropna() keeps only the rows in which every column (including all
# MarkDown columns) is present, so this is computed on a subset of the data.
complete_numeric = train_expanded.select_dtypes(include=['number', 'bool']).dropna()
abs_target_corr = complete_numeric.corr()['Weekly_Sales'].abs().sort_values()
# The last entry after sorting is the target's correlation with itself.
abs_target_corr[:-1].plot(kind='bar');

Size and Dept are most correlated with the target variable.

In [None]:
plt.figure(figsize=(15, 10))
# Restrict to numeric/boolean columns: pandas >= 2.0 raises in corr() when the
# frame still holds string or datetime columns.
numeric_columns = train_expanded.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(), annot=True, cmap='summer');

In above heatmap, correlation between different columns can be checked.

## Pre-processing and modelling

In [None]:
# importing relevant libraries

from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

I will label encode the IsHoliday and Type feature. From EDA, I concluded to drop Year, CPI, unemployment, temperature and fuel price. I am also dropping all the 'markdown' columns.

In [None]:
# preprocessing

def preprocessor(df):
    """Encode the categorical columns and drop the features ruled out in the EDA.

    The work is done on a copy, so the caller's frame is left untouched and
    the function can be re-run on the same input. (Encoding in place would
    make a second run map the already-encoded values to NaN.)

    Parameters
    ----------
    df : pandas.DataFrame
        Expanded sales frame. It must contain ``IsHoliday``, ``Type`` and
        every column listed in the ``drop`` call below.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``IsHoliday`` is 0 for True and 1 for False,
        ``Type`` is 2/1/0 for A/B/C, and the listed columns are removed.
    """
    df = df.copy()
    # label-encoding
    # NOTE: True -> 0 / False -> 1 is the mapping the notebook has always used;
    # it is kept unchanged so previously reported scores stay comparable.
    df['IsHoliday'] = df['IsHoliday'].astype('str').map({'True': 0, 'False': 1})
    df['Type'] = df['Type'].map({'A': 2, 'B': 1, 'C': 0})
    # deleting less important features
    return df.drop(['Date', 'Year', 'MarkDown1', 'MarkDown2', 'MarkDown4', 'MarkDown3',
                    'MarkDown5', 'CPI', 'Unemployment', 'Temperature', 'Fuel_Price'],
                   axis=1)

# Apply the same encoding and column pruning to the train and test frames.
train_preprocessed = preprocessor(train_expanded)
test_preprocessed = preprocessor(test_expanded)

In [None]:
# splitting data into 2 parts

target_col = "Weekly_Sales"
X = train_preprocessed.drop(columns=[target_col])
y = train_preprocessed[target_col]
# 70/30 random split with a fixed seed so the hold-out set is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# function for displaying scores

def score_calc(predictions):
    """Score hold-out predictions against the global ``y_test``.

    Returns a DataFrame with a single ``score`` column, indexed by
    ``'MAE'`` and ``'R2 score'``.
    """
    metric_values = {
        'MAE': mean_absolute_error(y_test, predictions),
        'R2 score': r2_score(y_test, predictions),
    }
    return pd.DataFrame(
        list(metric_values.values()),
        columns=['score'],
        index=list(metric_values.keys()),
    )

# function for building model

def run_model(model, name):
    """Fit a default-configured estimator on the global split and score it.

    Parameters
    ----------
    model : estimator class
        The class itself (not an instance); it is instantiated with its
        default arguments.
    name : str
        Label for the model. It is not used by the function and is kept only
        so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The score table produced by ``score_calc`` for the predictions on
        the global ``X_test``.
    """
    estimator = model()
    estimator.fit(X_train, y_train)
    preds = estimator.predict(X_test)

    # Only some estimators expose feature importances. Check for the attribute
    # explicitly rather than swallowing every exception, so genuine plotting
    # errors are no longer hidden.
    feat_imp = getattr(estimator, 'feature_importances_', None)
    if feat_imp is not None:
        plt.bar(X_train.columns, feat_imp, color='green')
        plt.title('Feature Importance graph')
        plt.xticks(rotation=45)
    return score_calc(preds)

In [None]:
# Inspect the encoded and pruned training frame.
train_preprocessed.head()

<b> Approach 1: Taking all columns</b>

In [None]:
# Decision tree on all columns (the second argument is only a label).
run_model(DecisionTreeRegressor,'dtree')

In [None]:
# Random forest on all columns (the second argument is only a label).
run_model(RandomForestRegressor,'rfc')

In [None]:
# XGBoost on all columns (the second argument is only a label).
run_model(XGBRegressor,'xgb')

In [None]:
# k-nearest neighbours on all columns (the second argument is only a label).
run_model(KNeighborsRegressor,'knn')

<b> Approach 2: Standardizing size column</b>

Here, we can see tree based models do not need feature scaling.

In [None]:
# scaling size column
scaler = StandardScaler()
# Work on copies: the split frames may be views of X, and assigning a column
# to them would otherwise trigger pandas' chained-assignment warning.
X_train = X_train.copy()
X_test = X_test.copy()
# Fit on the training rows only, so no statistics from the hold-out rows leak
# into the transformation; the hold-out rows are transformed with the same fit.
X_train['Size'] = scaler.fit_transform(X_train[['Size']]).ravel()
X_test['Size'] = scaler.transform(X_test[['Size']]).ravel()

In [None]:
# Decision tree again; run_model reads the global X_train/X_test, which now hold the scaled Size column.
run_model(DecisionTreeRegressor,'dtree')

In [None]:
# run_model(XGBRegressor,'xgb')

In [None]:
# run_model(RandomForestRegressor,'rfc')

<b> Approach 3: Using only Store, Dept and Size columns</b>

In [None]:
# Keep only the three columns used in this approach, for both splits.
approach3_cols = ['Store', 'Dept', 'Size']
X_train, X_test = X_train[approach3_cols], X_test[approach3_cols]

In [None]:
# Decision tree on the reduced global X_train/X_test (Store, Dept and Size only).
run_model(DecisionTreeRegressor,'dtree')

In [None]:
# I tried one hot encoding week feature but score did not improve so I dropped the idea.

# dummies=pd.get_dummies(train_preprocessed.Week.astype(str),drop_first=True,prefix='week')
# dum_week = pd.concat([train_preprocessed,dummies],axis=1)
# dum_week.drop('Week',axis=1,inplace=True)

### Cross-validation

In [None]:
# option A
from sklearn.model_selection import cross_val_score

# 4-fold cross-validation of a default random forest on the full X / y.
rf_baseline = RandomForestRegressor()
score = cross_val_score(rf_baseline, X, y, cv=4)
mean_cv_score = score.mean().round(4)
print(f"Average 4-Fold CV Score: {mean_cv_score}")

### Hyper parameter tuning

In [None]:
# Candidate values for each random forest hyper-parameter.
random_grid = {
    'n_estimators': [50, 60, 70],
    'max_features': [3, 4],
    'max_depth': [25, 30, 35],
    'min_samples_split': [3, 4],
    'min_samples_leaf': [1, 2],
}

# Randomized search over the grid above with 4-fold cross-validation.
rf_grid = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=random_grid,
    cv=4,
    n_jobs=5,
    verbose=True,
)
rf_grid.fit(X, y)

# Best mean CV score first, then the parameter combination that achieved it.
for search_result in (rf_grid.best_score_, rf_grid.best_params_):
    print(search_result)

So, the CV score increased after hyper-parameter tuning.

In [None]:
# parameters = {'learning_rate': [.03, 0.05, .07],
#               'max_depth': [6,7,8,9],
#               'n_estimators': [500,700]}

# xgb_grid = GridSearchCV(XGBRegressor(),
#                         parameters,
#                         cv = 3,
#                         n_jobs = 5,
#                         verbose=True)

# xgb_grid.fit(X,y)

# print(xgb_grid.best_score_)
# print(xgb_grid.best_params_)

In [None]:
# option B - cross-validation using kfold

predictor_train_scale = train_preprocessed.drop('Weekly_Sales', axis=1)
predictor_test_scale = test_preprocessed
target_train = train_preprocessed.Weekly_Sales

# Single source of truth for the fold count (also used for the averages below).
n_splits = 4
# Seeded so the folds, and therefore the scores, are reproducible between runs.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=101)

preds_3 = []    # test-set predictions, one array per fold
r2_score_ = []  # validation R2 per fold
mae = []        # validation MAE per fold

# Applying model on each fold and calculating mean of score
for i, (train_idx, val_idx) in enumerate(kf.split(predictor_train_scale)):

    # Fold-local names: the global X_train / y_train that run_model() reads
    # must not be overwritten by this loop.
    X_fold, y_fold = predictor_train_scale.iloc[train_idx, :], target_train.iloc[train_idx]
    X_val, y_val = predictor_train_scale.iloc[val_idx, :], target_train.iloc[val_idx]

    print('\nFold: {}\n'.format(i + 1))
    rf = RandomForestRegressor()
    rf.fit(X_fold, y_fold)

    # Predict the validation fold once and reuse it for both metrics.
    val_preds = rf.predict(X_val)
    r2_score_.append(r2_score(y_val, val_preds))
    mae.append(mean_absolute_error(y_val, val_preds))
    preds_3.append(rf.predict(predictor_test_scale))

# Final test prediction: element-wise mean of the per-fold predictions.
y_pred_final_3 = np.mean(preds_3, axis=0)

print('R2 - CV Score: {}'.format(sum(r2_score_) / n_splits), '\n')
print('MAE Score: {}'.format(sum(mae) / n_splits), '\n')
print("Score : ", r2_score_)

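Here the CV score is much higher. I was not able to figure out why at first. A likely explanation is that option B shuffles the rows (`KFold(shuffle=True)`), whereas `cross_val_score(..., cv=4)` in option A uses consecutive, unshuffled folds. Because the training frame is sorted by Store, Dept and Date, option A validates on stores the model has never seen, while option B validates on weeks from store–department pairs that are also present in the training folds, which is a much easier task. If you know of another reason, please tell me in the comments section.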

In [None]:
sns.set_style('darkgrid')
plt.figure(figsize=(15, 5))

# Attach the averaged fold predictions to the test frame so both frames can be
# aggregated by date in the same way.
test_expanded['Weekly_Sales'] = y_pred_final_3
actual_mean = train_expanded.groupby('Date')['Weekly_Sales'].mean()
predicted_mean = test_expanded.groupby('Date')['Weekly_Sales'].mean()

actual_mean.plot()
predicted_mean.plot(color='orange')
plt.legend(['Actual', 'Predicted'], loc='best', fontsize=16)
plt.ylabel('Sales', fontsize=16);

### Making final predictions

In [None]:
# Fill the sample submission with the averaged predictions and write it out.
sample_path = '../input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip'
submission = pd.read_csv(sample_path)
# Predictions are assigned by position, so they must be in the sample file's row order.
submission['Weekly_Sales'] = y_pred_final_3
submission.to_csv('results_rf_cv_tuned.csv', index=False)

PLEASE PROVIDE FEEDBACK SO THAT I CAN IMPROVE!!