Walmart Holiday Sale Prediction

Given data about weekly Walmart sales, let's try to predict whether a given record falls in a holiday week.

We will use logistic regression, support vector machine, and decision tree classifiers to make our predictions.

In [None]:
#Importing Packages
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression

In [None]:
# Load the store-level weekly sales table from the Kaggle input directory.
csv_path = '../input/retail-analysis-with-walmart-data/Walmart_Store_sales.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

Observations:
1. The Date column is not stored as a datetime type
2. All the other columns are numeric

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
data.describe(include = 'all')

In [None]:
# Number of missing entries in each column.
data.isna().sum()

There are no missing values in the dataset

In [None]:
# Parse the Date strings into datetime64 and derive calendar features from them.
# NOTE(review): no explicit format/dayfirst is given, so pandas infers the layout —
# confirm that the CSV's date strings (DD-MM-YYYY vs MM-DD-YYYY) parse as intended.
data['Date'] = pd.to_datetime(data['Date'])
parsed_dates = data['Date'].dt
data['year'] = parsed_dates.year
data['month'] = parsed_dates.month

In [None]:
# Check that the new year and month columns were added.
data.head()

In [None]:
# Which record (store and week) holds the single highest Weekly_Sales value?
# Series.max() is vectorised and skips NaN, whereas the Python builtin max()
# iterates element by element and gives order-dependent results if NaN is present.
data[data['Weekly_Sales'] == data['Weekly_Sales'].max()]

Store 14 is the one with the maximum weekly sales figure. Wow, 3,818,686 is high.

**Analysis**

In [None]:
# Highest single-week sales reached by each store, largest first (top five shown).
peak_sales_by_store = data.groupby('Store')['Weekly_Sales'].max()
peak_sales_by_store.sort_values(ascending=False).head()

Store 14 has maximum weekly sales

In [None]:
#Which Store has maximum variations in weekly sales?

# Population standard deviation (ddof=0, the same definition np.std uses) of
# weekly sales per store. Grouping on the Store column avoids assuming that the
# store ids are exactly 1..N with no gaps.
store_std = data.groupby('Store')['Weekly_Sales'].std(ddof=0)

total_stores = data['Store'].nunique()  # kept: later cells read this name
std_devs = store_std.tolist()           # ordered by ascending store id
max_std_dev = store_std.max()
print("Store with maximum standard deviation: Store", store_std.idxmax())

Store 14 has maximum standard deviation

In [None]:
#Which store(s) had a good quarterly growth rate in Q3’2012?



In [None]:
# Mean weekly sales per store during calendar Q3 2012.
# Q3 runs from 1 July to 30 September; Series.between is inclusive on both ends.
# NOTE: despite the variable name and the printed label, these values are
# average sales, not profits or growth rates.
in_q3_2012 = data['Date'].between('2012-07-01', '2012-09-30')
q3_mean_sales = (
    data[in_q3_2012]
    .groupby('Store')['Weekly_Sales']
    .mean()
    # One slot per store id 1..total_stores (NaN when a store has no Q3 rows),
    # so the list lines up with the x positions used by the bar plot.
    .reindex(range(1, int(total_stores) + 1))
)
qtr_profits = q3_mean_sales.tolist()

print("Store with maximum profit in Q3’2012: Store", q3_mean_sales.idxmax())

In [None]:
# Bar chart of the per-store values in qtr_profits (list position i -> store i + 1).
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=list(range(1, int(total_stores) + 1)), y=qtr_profits, ax=ax)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("Store")
ax.set_ylabel("Mean weekly sales")
ax.set_title("Mean weekly sales per store in the selected 2012 quarter window")
plt.show()

Find the holidays whose mean sales are higher than the mean non-holiday sales across all stores combined

In [None]:
# Baseline: average weekly sales over all non-holiday records.
non_holiday_sales = data.loc[data['Holiday_Flag'] == 0, 'Weekly_Sales'].mean()

# Average sales on each holiday date, across all stores.
holiday_rows = data[data['Holiday_Flag'] == 1]
df_v1 = holiday_rows.groupby('Date')['Weekly_Sales'].mean().to_frame()

# Keep only the holiday dates that beat the non-holiday baseline.
beats_baseline = df_v1['Weekly_Sales'] > non_holiday_sales
print(df_v1[beats_baseline])


In [None]:
# Re-inspect the frame before the yearly and holiday aggregations.
data.head()

In [None]:
#Which year has maximum weekly sales?

# Total sales per calendar year, largest first.
yearly_sales = data.groupby(['year'])['Weekly_Sales'].sum().sort_values(ascending=False)

# Series.plot.bar documents x=/y= as DataFrame-only, so passing label text through
# them does not label the axes; set the labels on the returned Axes instead.
ax = yearly_sales.plot.bar(title="Year vs Weekly Sales")
ax.set_xlabel("Year")
ax.set_ylabel("Weekly Sales")
plt.show()


Total sales are higher in 2011 than in 2010 and 2012

In [None]:
# Total holiday-week sales per store, restricted to 2011; plot the ten largest.
is_holiday_2011 = (data['Holiday_Flag'] == 1) & (data['year'] == 2011)
data_v1 = data[is_holiday_2011]
holiday_totals_2011 = data_v1.groupby('Store')['Weekly_Sales'].sum()
top_ten_stores = holiday_totals_2011.sort_values(ascending=False).head(10)
top_ten_stores.plot.bar(color='violet', title='Store Vs Sales for 2011')

Store 4 has the highest total holiday-week sales in the year 2011

In [None]:
#Correlation plot
# Restrict to numeric columns explicitly: from pandas 2.0 DataFrame.corr() no longer
# drops non-numeric columns by default, so the datetime Date column may raise.
corr = data.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(12, 10))
# Correlation coefficients lie in [-1, 1]; pin the colour scale to exactly that
# range so the colours are not compressed by an out-of-range lower bound.
sns.heatmap(corr, annot=True, vmin=-1.0, vmax=1.0, ax=ax)
ax.set_title("Correlation between numeric columns")
plt.show()

Predicting weekly sales for all the stores

In [None]:
# Replace the raw Date column with a numeric day-of-month feature.
# Guarded so that re-running this cell after Date has already been dropped is a
# no-op instead of a KeyError. `data` is still rebound without Date because the
# classification cells further down rely on that.
if 'Date' in data.columns:
    data = data.assign(Day=data['Date'].dt.day).drop(columns='Date')

# One-hot encode the categorical identifiers; Weekly_Sales is the regression target.
data_v2 = pd.get_dummies(data, columns=['Holiday_Flag', 'Store'])
y = data_v2['Weekly_Sales']
X = data_v2.drop(columns='Weekly_Sales')

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the split,
# and therefore the reported r2 score, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit a linear regression on the training split and report R^2 on the held-out split.
ln_model = LinearRegression().fit(X_train, y_train)
y_pred = ln_model.predict(X_test)
holdout_r2 = r2_score(y_test, y_pred)
print("r2 score:", holdout_r2)

Predicting whether the given sales record belongs to holiday or not

In [None]:
#Splitting independent and dependent variables
y = data['Holiday_Flag']
X = data.drop('Holiday_Flag', axis=1)  # left unscaled; only the split arrays below are scaled

# Split before scaling. stratify=y keeps the holiday/non-holiday ratio the same in
# both parts, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform to the
# test rows, so that test-set minima/maxima do not leak into training.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Search space for the logistic-regression grid search.
penalty = ["l1", "l2"]
c = np.logspace(start=0, stop=8, num=10)  # ten values of C, log-spaced from 1e0 to 1e8
hyperparameters = {
    "C": c,
    "penalty": penalty,
    "solver": ["liblinear"],
}

In [None]:
# Exhaustive 10-fold cross-validated search over the hyperparameter grid.
model = LogisticRegression()
clf = GridSearchCV(estimator=model, param_grid=hyperparameters, cv=10, verbose=0)
best_model = clf.fit(X_train, y_train)
chosen_params = best_model.best_params_
print("Best hyperparameters : ", chosen_params)

In [None]:
# Score the tuned search object on the held-out rows and keep its predictions
# for the confusion matrix.
holdout_score = best_model.score(X_test, y_test)
y_pred = best_model.predict(X_test)
print(holdout_score)

In [None]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted ones. The result is stored under its own name
# so the imported confusion_matrix function is not overwritten, which would make
# this cell fail on a second run.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)