## Rossmann Store Data Analysis and Sales Prediction
### Created by : Ezhilarasan 
To analyse the Rossmann Store data and predict future Sales using an XGB Regressor with parameter tuning (Randomized Search).


Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler 

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from xgboost.sklearn import XGBRegressor

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor

In [None]:
import time
import gc
from scipy.stats import uniform
import calendar

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Allow up to 100 columns to be printed before pandas elides wide frames.
pd.set_option('display.max_columns', 100)

Read given datasets (train & store)

In [None]:
train = pd.read_csv("/kaggle/input/rossmann-store-sales/train.csv")
train.head()

In [None]:
store = pd.read_csv("/kaggle/input/rossmann-store-sales/store.csv")
store.head()

Check the data set for any null values and correct (fill with some values or drop them)

In [None]:
train.isnull().sum()

In [None]:
store.isnull().sum()

In [None]:
#store.shape
#store.dropna(inplace=True)
#store.shape

Replace numerical values for the columns which have labels

In [None]:
store['StoreType'].value_counts()
store['Assortment'].value_counts()

In [None]:
# Replace the single-letter store labels with integer codes.
store_type_codes = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
assortment_codes = {'a': 1, 'b': 2, 'c': 3}
store['StoreType'] = store['StoreType'].map(store_type_codes)
store['Assortment'] = store['Assortment'].map(assortment_codes)

In [None]:
store.head()

Merge train and store data

In [None]:
# Left join on 'Store': every train row is kept and gets its store's attributes attached.
data = pd.merge(train, store,on = 'Store', how='left')
data.head()

In [None]:
data.shape

The StateHoliday field has label values. We will convert those values to numeric codes.

In [None]:
data['StateHoliday'].value_counts()

Date column has to be converted to numerical fields

Since the data set is large (almost 10 lakh rows, i.e. about 1 million) and the system hangs when I run the full data set, I am trying to reduce the data set before processing.

In [None]:
# Tried reducing the dataset by keeping only the records with Sales > 0,
# but the model still took too long to process.

#data = data[data['Sales'] > 0]

# Drop every row that has a NaN in any column of the merged frame.
# NOTE(review): rows are removed whenever any store column is missing, so the
# CompetitionDistance fill inside ProcessData has nothing left to fill for this
# frame - confirm that discarding those rows is intended.
data.dropna(inplace = True)

In [None]:
data.shape

We will process the dataset for changing the date field to numerical fields and StateHoliday field as well

In [None]:
# credits to kaggle link on specifying how to handle date and month values
# https://www.kaggle.com/rohinigarg/random-forest-and-xgboost-parameter-tuning

def checkpromomonth(row):
    """Return 1 if the row's month name occurs in its promo interval, else 0.

    Parameters
    ----------
    row : mapping
        Any object indexable by 'MonthName' and 'PromoInterval', for example
        a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    int
        1 when ``row['MonthName']`` is a substring of ``row['PromoInterval']``,
        0 otherwise. A non-string interval (e.g. a missing value) yields 0.
    """
    interval = row['PromoInterval']
    # A missing interval arrives as a float NaN; `in` would raise TypeError on it.
    if not isinstance(interval, str):
        return 0
    # Substring test on purpose: an abbreviation such as 'Sep' also matches a
    # longer spelling such as 'Sept' inside the interval string.
    return int(row['MonthName'] in interval)


In [None]:
def ProcessData(data):
    """Convert the merged sales/store frame to numeric features, in place.

    The frame is modified in place and nothing is returned.

    Reads the columns 'CompetitionDistance', 'StateHoliday', 'Date',
    'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth',
    'Promo2SinceYear', 'Promo2SinceWeek' and 'PromoInterval'.

    Adds 'Year', 'MonthNumber', 'Day', 'WeekNumber', 'CompetitionOpen',
    'Promo2Open' and 'IsPromoMonth'; rewrites 'CompetitionDistance' (missing
    values replaced by the column mean) and 'StateHoliday' (mapped to 0-3);
    drops 'Date', 'PromoInterval' and the four '...Since...' columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Merged frame containing the columns listed above. 'Date' must be
        parseable by ``pd.to_datetime`` and is assumed to have no missing
        values (the week number is cast to a plain integer).
    """
    # Assign the filled column back instead of fillna(inplace=True) on the
    # selection, which may operate on a temporary copy in recent pandas.
    distance = data['CompetitionDistance']
    data['CompetitionDistance'] = distance.fillna(distance.mean())

    # The raw column mixes the string '0' and the integer 0; both mean "no holiday".
    data['StateHoliday'] = data['StateHoliday'].map({'0': 0, 0: 0, 'a': 1, 'b': 2, 'c': 3})

    data['Date'] = pd.to_datetime(data['Date'])
    dates = data['Date'].dt
    data['Year'] = dates.year
    data['MonthNumber'] = dates.month
    month_abbr = list(calendar.month_abbr)
    data['MonthName'] = data['MonthNumber'].map(lambda number: month_abbr[number])
    data['Day'] = dates.day
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is its
    # replacement. Fall back to the old accessor on pandas versions without it.
    try:
        week = dates.isocalendar().week
    except AttributeError:
        week = dates.weekofyear
    # isocalendar() yields a nullable UInt32; use a plain integer for the arithmetic below.
    data['WeekNumber'] = week.astype('int64')

    # Months since the competitor opened; negative or missing values become 0.
    competition_months = (12 * (data['Year'] - data['CompetitionOpenSinceYear'])
                          + (data['MonthNumber'] - data['CompetitionOpenSinceMonth']))
    data['CompetitionOpen'] = competition_months.where(competition_months > 0, 0)

    # Year difference in months plus the week difference divided by 4;
    # negative or missing values become 0.
    promo2_months = (12 * (data['Year'] - data['Promo2SinceYear'])
                     + (data['WeekNumber'] - data['Promo2SinceWeek']) / float(4))
    data['Promo2Open'] = promo2_months.where(promo2_months > 0, 0)

    # Missing intervals become '' so the membership test below is always str-in-str.
    data['PromoInterval'] = data['PromoInterval'].fillna('').astype(str)

    # Same substring test as checkpromomonth, done in one pass over the two
    # columns instead of a row-wise DataFrame.apply (very slow on ~1M rows).
    data['IsPromoMonth'] = [
        int(month in interval)
        for month, interval in zip(data['MonthName'], data['PromoInterval'])
    ]

    data.drop(['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'], axis=1, inplace=True)
    data.drop(['Promo2SinceYear', 'Promo2SinceWeek'], axis=1, inplace=True)
    data.drop(['Date', 'MonthName', 'PromoInterval'], axis=1, inplace=True)

In [None]:
ProcessData(data)

In [None]:
data.head()

In [None]:
data.isnull().sum()
data.shape

We will check minimum and maximum values of the data set and try to reduce the memory size

In [None]:
data.min().min()           
data.max().max() 

In [None]:
# Cast every column to 32-bit integers to shrink the frame.
# NOTE(review): this also discards the fractional part of float columns
# (Promo2Open is computed with a division by 4) - confirm that loss is acceptable.
data = data.astype('int32')

In [None]:
data.info()

Now we use the organized data set for regression problem

Drop both the Sales and Customers columns from the train data set, since the test data set has neither field. Before that, keep Sales from the train data set as the target variable, as we will be predicting Sales for the test data set.

In [None]:
# Keep the target, then remove the two columns that must not be used as features.
non_feature_columns = ['Sales', 'Customers']
y = data['Sales']
data.drop(columns=non_feature_columns, inplace=True)

In [None]:
data.nunique()

In [None]:
# Columns with more than 12 distinct values are treated as numeric, all others as categorical.
distinct_counts = data.nunique()
is_numeric = distinct_counts > 12
num_columns = data.columns[is_numeric]
cat_columns = data.columns[~is_numeric]
num_columns
cat_columns

In [None]:
# Plot the distribution of every numeric column on a grid three plots wide.
# The number of rows follows the number of columns, so more than six numeric
# columns no longer overflow a hard-coded 2x3 grid (plt.subplot raises then).
n_plot_cols = 3
n_plot_rows = max(1, -(-len(num_columns) // n_plot_cols))  # ceiling division
plt.figure(figsize=(15, 5 * n_plot_rows))
# Private seaborn flag kept from the original; presumably it stops distplot
# from using statsmodels for the density estimate - TODO confirm.
sns.distributions._has_statsmodels=False
for position, column in enumerate(num_columns, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, position)
    sns.distplot(data[column])

plt.tight_layout()

Based on the graph, we will use RobustScaler for the numerical column

In [None]:
# Robust-scale the numeric columns and one-hot encode the categorical ones.
ct=ColumnTransformer([
    ('rs',RobustScaler(),num_columns),
    ('ohe',OneHotEncoder(),cat_columns),
    ],
    remainder="passthrough"
    )
# NOTE(review): the transformed output is only displayed, never assigned -
# `data` itself stays untransformed after this cell.
ct.fit_transform(data)

In [None]:
X=data

We will take the scaled data, split it internally into train and test sets, create our model and predict on the held-out test split (before predicting Sales for the actual test data given on Kaggle).

In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split,
# and therefore every score computed from it, reproducible between runs.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.30,random_state=42)

In [None]:
X_train.shape
X_test.shape
y_train.shape
y_test.shape

Pipeline creation for XGB Regressor. 

In [None]:
# Pipeline: standardise -> PCA -> XGBoost regressor.
# `silent` is a deprecated XGBoost parameter; `verbosity` replaces it
# (1 = warnings are printed, matching the non-silent intent of silent=False).
steps_xg = [('sts', StandardScaler() ),
            ('pca', PCA()),
            ('xg',  XGBRegressor(objective='reg:squarederror', verbosity=1, n_jobs=3, reg_lambda=1, gamma=0))
            ]

pipe_xg = Pipeline(steps_xg)

pipe_xg.get_params()

The functions below evaluate the RMSPE (Root Mean Square Percentage Error).

In [None]:
#credit : https://www.kaggle.com/tushartilwankar/sklearn-rf
def ToWeight(y):
    """Return the per-sample RMSPE weights for the targets ``y``.

    Parameters
    ----------
    y : array-like
        Target values (NumPy array, pandas Series, list, ...).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``y`` holding ``1 / y**2`` where
        ``y`` is non-zero and 0 where ``y`` is zero.
    """
    # Work on a float array: squaring in the input's own integer dtype (the
    # notebook casts its data to int32) overflows for values above ~46340,
    # and a plain array avoids any label-based indexing of a pandas Series.
    values = np.asarray(y, dtype=float)
    w = np.zeros(values.shape, dtype=float)
    ind = values != 0
    w[ind] = 1. / (values[ind] ** 2)
    return w

def RMSPE(y, yhat):
    """Return the root mean square percentage error of ``yhat`` against ``y``.

    Every squared error is multiplied by the weight ToWeight(y) assigns to
    its sample, and the mean is taken over all samples before the square root.
    """
    weights = ToWeight(y)
    squared_error = (y - yhat) ** 2
    return np.sqrt(np.mean(weights * squared_error))

In [None]:
#from sklearn.metrics import make_scorer, r2_score, mean_squared_error

We will use Random Search for tuning the pipeline parameters

In [None]:
#Randomized Search
from sklearn.metrics import make_scorer

parameters = {'xg__learning_rate':  uniform(0, 1),
              'xg__n_estimators':   range(50,300),
              'xg__max_depth':      range(3,10),
              'pca__n_components' : range(10,17)}

# Tune against RMSPE, the metric reported for the final model, instead of the
# estimator's default score. RMSPE is an error, so greater_is_better=False
# makes the search minimise it; rs.best_score_ is then the negated RMSPE.
# A bare metric function cannot be passed as `scoring`; it has to be wrapped.
rmspe_scorer = make_scorer(RMSPE, greater_is_better=False)

rs = RandomizedSearchCV(pipe_xg,
                        param_distributions=parameters,
                        scoring=rmspe_scorer,
                        n_iter=15,
                        verbose = 1,
                        n_jobs = 3,
                        random_state=42,   # same candidates are sampled on every run
                        cv = 3
                        )

In [None]:
start = time.time()
rs.fit(X_train, y_train)
end = time.time()
(end - start)/60 

In [None]:
rs.best_estimator_.named_steps["xg"].feature_importances_
rs.best_estimator_.named_steps["xg"].feature_importances_.shape

Creating the Model with the best parameters from RandomSearch Results

In [None]:
# Model with parameters of random search
best_params = rs.best_params_
model_rs = XGBRegressor(objective='reg:squarederror', verbosity=1, n_jobs=3, reg_lambda=1, gamma=0,
                    learning_rate=best_params['xg__learning_rate'],
                    max_depth=best_params['xg__max_depth'],
                    # Previously read from 'xg__max_depth', which capped the model
                    # at 3-9 trees instead of the tuned 50-299.
                    n_estimators=best_params['xg__n_estimators']
                    )

# NOTE(review): the parameters were tuned inside the StandardScaler -> PCA
# pipeline, while this model is fitted on the raw X_train columns - confirm
# that dropping those two steps is intended.
model_rs.fit(X_train, y_train)

Predict our test data in the final model created

In [None]:
y_pred_rs = model_rs.predict(X_test)

In [None]:
RMSPE(y_test,y_pred_rs)

rs.best_score_

In [None]:
import math

In [None]:
# Root mean squared error on the hold-out split. The variable keeps its
# original name, but the value is an error in Sales units (lower is better),
# not an accuracy, so it is reported as RMSE and no longer multiplied by 100.
accuracy_rs = float(np.sqrt(np.mean((np.asarray(y_test, dtype=float) - y_pred_rs) ** 2)))
print("RMSE with Random search XGB model:", accuracy_rs)

We will check the predicted sales values from randomized search with target data as below

In [None]:
# Table of the hold-out features followed by the actual and the predicted sales.
# The old row labels are dropped so the three pieces line up by position
# (no duplicated 'index' columns), and the prediction column gets a real name
# instead of the default label 0.
X_test_df = X_test.reset_index(drop=True)
y_test_df = y_test.reset_index(drop=True).to_frame(name='Sales')
y_pred_df  = pd.DataFrame({'PredictedSales': y_pred_rs})

final = X_test_df
final = final.merge(y_test_df, left_index=True, right_index=True)
final = final.merge(y_pred_df, left_index=True, right_index=True)
final

Based on the above table, last 2 columns (Actual Sales & Predicted Sales) are relatively matching.

We will load the test data given on Kaggle, merge it with the store data, process it and predict the sales with our randomized-search model.

In [None]:
test = pd.read_csv("/kaggle/input/rossmann-store-sales/test.csv")
test.head()

test.isnull().sum()

In [None]:
test.shape

We will fill in 0 where the Open field has a null value.

In [None]:
# Missing 'Open' flags are filled with 0. The filled column is assigned back:
# fillna(inplace=True) on the `test.Open` selection may act on a temporary
# copy in recent pandas (copy-on-write) and leave `test` unchanged.
test['Open'] = test['Open'].fillna(0)

In [None]:
test.isnull().sum()

In [None]:
store.head()

In [None]:
# Left join on 'Store': every test row is kept and gets its store's attributes attached.
# `data` is rebound here, so from this point on it refers to the test frame.
data = pd.merge(test, store,on = 'Store', how='left')
data.head()

data.shape

In [None]:
ProcessData(data)

data.head()

In [None]:
# Set the row ids aside for the submission, then remove them from the features.
submission = data['Id']
data = data.drop(columns=['Id'])

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.min().min()           
data.max().max()

In [None]:
# Cast every column to 32-bit integers, as was done for the training frame.
# NOTE(review): float columns lose their fractional part here, and the cast
# raises if any column still contains NaN - confirm both are acceptable.
data = data.astype('int32')

In [None]:
data.nunique() 

In [None]:
# Columns with more than 12 distinct values are treated as numeric, all others as categorical.
distinct_counts = data.nunique()
is_numeric = distinct_counts > 12
num_columns = data.columns[is_numeric]
cat_columns = data.columns[~is_numeric]

In [None]:
num_columns
cat_columns

In [None]:
# Plot the distribution of every numeric column on a grid three plots wide.
# The number of rows follows the number of columns, so more than six numeric
# columns no longer overflow a hard-coded 2x3 grid (plt.subplot raises then).
n_plot_cols = 3
n_plot_rows = max(1, -(-len(num_columns) // n_plot_cols))  # ceiling division
plt.figure(figsize=(15, 5 * n_plot_rows))
# Private seaborn flag kept from the original; presumably it stops distplot
# from using statsmodels for the density estimate - TODO confirm.
sns.distributions._has_statsmodels=False
for position, column in enumerate(num_columns, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, position)
    graph = sns.distplot(data[column])

plt.tight_layout()



In [None]:
# Robust-scale the numeric columns and one-hot encode the categorical ones.
ct=ColumnTransformer([
    ('rs',RobustScaler(),num_columns),
    ('ohe',OneHotEncoder(),cat_columns),
    ],
    remainder="passthrough"
    )
# NOTE(review): the transformed output is only displayed, never assigned -
# `data` itself stays untransformed after this cell.
ct.fit_transform(data)


In [None]:
# Predict sales for the Kaggle test rows; this rebinds y_pred_rs, which held the hold-out predictions.
# `data` is passed as it is - the ColumnTransformer output computed before is not used.
y_pred_rs = model_rs.predict(data)

In [None]:
y_pred_rs

In [None]:
# Pair every test Id with its predicted sales. The old row labels are dropped
# instead of being kept as a stray 'index' column, and the prediction column
# is named 'Sales' rather than the default label 0.
# NOTE(review): 'Id' and 'Sales' are presumably the column names the Kaggle
# submission expects - confirm against the sample submission.
final = submission.reset_index(drop=True).to_frame(name='Id')
y_pred_df  = pd.DataFrame({'Sales': y_pred_rs})

final = final.merge(y_pred_df, left_index=True, right_index=True)
final

In [None]:
# Write only the Id and the prediction, without the DataFrame's row index,
# so the file has exactly two columns. The rename gives a prediction column
# labelled 0 its proper name and is a no-op when no such column exists.
submission_out = final.rename(columns={0: 'Sales'})[['Id', 'Sales']]
submission_out.to_csv('submission.csv', index=False)

Conclusion : We could predict the Sales using XGB Regressor (upon parameter tuning using Random Search)