In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the four raw tables (features, stores, train, test) from the Kaggle input directory.
features, stores, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/walmart-datasets/features.csv',
        '../input/walmart-datasets/stores.csv',
        '../input/walmart-datasets/train_walmart.csv',
        '../input/walmart-datasets/test_walmart.csv',
    )
)

# <b>EXPLORATORY DATA ANALYSIS</b>

In [None]:
# Preview the first rows of the features table.
features.head()

In [None]:
# Preview the first rows of the stores table.
stores.head()

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# (rows, columns) of each raw table, in the order features, stores, train, test.
tuple(table.shape for table in (features, stores, train, test))

In [None]:
import missingno as msno
# Visualise per-column completeness of the features table with missingno's bar chart.
msno.bar(features,figsize=(10,4),fontsize=12,color='orange')

In [None]:
# Distinct values of the store Type column.
stores['Type'].unique()

In [None]:
# Interactive pie chart of the share of stores belonging to each Type.
px.pie(data_frame=stores,names='Type',template=None,title='Different types of Stores')

In [None]:
# Number of stores per Type, with the count written above each bar.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(11, 6))
# Draw on the axes created above explicitly rather than relying on pyplot's
# implicit "current axes" state.
sns.countplot(x='Type', data=stores, ax=ax)
ax.set_ylim(0, 25)
ax.set_title('Number of stores per type')
for bar in ax.patches:
    count = bar.get_height()
    # get_height() returns a float; format it without decimals so the label
    # reads "22" instead of "22.0".
    ax.text(bar.get_x() + bar.get_width() / 2., count + 1, f'{count:.0f}', ha="center")

**Almost half the stores are of Type A.**

In [None]:
# One bar per store, ordered from the smallest to the largest Size.
stores_by_size = stores.sort_values('Size')['Store'].tolist()
plt.figure(figsize=(15, 5))
sns.barplot(x='Store', y='Size', data=stores, order=stores_by_size, saturation=1)
plt.ylim(0, 250000)

**There are broadly three groups of stores: small-sized, medium-sized and large-sized.**

In [None]:
# Distribution of store Size within each store Type.
size_fig, size_ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='Type', y='Size', data=stores, ax=size_ax)
size_ax.set_ylim(0, 250000)

In [None]:
def _attach_features_and_store_info(sales):
    """Join a sales table with the features table and the stores table.

    The sales rows are inner-joined to ``features`` on Store/Date/IsHoliday,
    sorted by Store, Dept and Date with a fresh index, and then inner-joined
    to ``stores`` on Store.
    """
    with_features = sales.merge(features, how='inner', on=['Store', 'Date', 'IsHoliday'])
    ordered = with_features.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)
    return ordered.merge(stores, how='inner', on=['Store'])


train_expanded = _attach_features_and_store_info(train)
test_expanded = _attach_features_and_store_info(test)

In [None]:
# Preview the first three rows of the merged training table.
train_expanded.head(3)

In [None]:
# Column dtypes and non-null counts of the merged training table.
train_expanded.info()

In [None]:
# Convert the Date column of both merged tables to pandas datetimes.
train_expanded['Date'], test_expanded['Date'] = (
    pd.to_datetime(frame['Date']) for frame in (train_expanded, test_expanded)
)

In [None]:
# Mean Weekly_Sales per Date (averaged over every store/department row), drawn as a line.
avg_sales = train_expanded.groupby('Date')['Weekly_Sales'].mean().to_frame()
sales_ax = avg_sales.plot(figsize=(17, 5))
sales_ax.set_ylim(10000, 30000)
sales_ax.set_title('Average Weekly Sales of the company across all stores', fontsize=15)
sales_ax.set_xlabel('Date', fontsize=15)
sales_ax.set_ylabel('Sales', fontsize=15)

In [None]:
# Derive calendar features (Week, Month, Year) from Date on both merged tables.
for frame in (train_expanded, test_expanded):
    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
    # dt.isocalendar().week is its replacement. It returns the nullable UInt32
    # dtype, so cast to plain int64 to keep the column an ordinary integer.
    frame['Week'] = frame['Date'].dt.isocalendar().week.astype('int64')
    frame['Month'] = frame['Date'].dt.month
    frame['Year'] = frame['Date'].dt.year

# One line per year: mean Weekly_Sales for each calendar month.
fig, ax = plt.subplots(figsize=(17, 5))
for year in (2010, 2011, 2012):
    # Select Weekly_Sales *before* aggregating: calling .mean() on the whole
    # group would also try to average non-numeric columns such as Type.
    monthly_mean = (
        train_expanded.loc[train_expanded['Year'] == year]
        .groupby('Month')['Weekly_Sales']
        .mean()
    )
    monthly_mean.plot(ax=ax, label=str(year))
ax.legend(loc='best', fontsize=12)
ax.set_title('Average Weekly Sales of the company per annum', fontsize=15)
ax.set_xlabel('Months', fontsize=15)
ax.set_ylabel('Sales', fontsize=15)
ax.set_ylim(14000, 20000)
ax.set_xlim(0, 14)

In [None]:
# One line per store Type: mean Weekly_Sales for each calendar month.
fig, ax = plt.subplots(figsize=(17, 5))
for store_type in ('A', 'B', 'C'):
    # Select Weekly_Sales before aggregating so that .mean() is never applied
    # to non-numeric columns of the merged table.
    monthly_mean = (
        train_expanded.loc[train_expanded['Type'] == store_type]
        .groupby('Month')['Weekly_Sales']
        .mean()
    )
    monthly_mean.plot(ax=ax, label=store_type)
ax.legend(loc='best', fontsize=12)
# The original title ("per annum") was copied from the per-year chart; this
# chart compares store types.
ax.set_title('Average Weekly Sales of the company by store type', fontsize=15)
ax.set_xlabel('Months', fontsize=15)
ax.set_ylabel('Sales', fontsize=15)
ax.set_xlim(0, 14)
ax.set_ylim(5000, 25000)

* **Sales of the company rise towards the end of the year, probably because of a seasonal tradition or festival.**
* **Type A stores have the highest sales compared to the other store types.**
* **Type C stores have nearly constant sales throughout the year.**

In [None]:
def avg_sales_plot(str):
    """Bar chart of the mean ``Weekly_Sales`` per group, sorted ascending.

    Parameters
    ----------
    str : column label
        Column of ``train_expanded`` to group by; it is also interpolated into
        the title and used as the x-axis label. Note that this parameter name
        shadows the builtin ``str`` inside the function.
    """
    plt.figure(figsize=(17, 5))
    group_means = train_expanded.groupby(str)['Weekly_Sales'].mean().sort_values()
    bar_ax = group_means.plot(kind='bar', cmap='summer')
    bar_ax.set_title(f'Average weekly sales of the company in each {str}', fontsize=15)
    bar_ax.set_xlabel(str, fontsize=15)
    bar_ax.set_ylabel('Sales', fontsize=15)
    bar_ax.tick_params(axis='x', labelsize=10)

# Average weekly sales per store, then fix the axis limits of the figure
# that avg_sales_plot just drew.
avg_sales_plot('Store')
plt.xlim(-1,45)
plt.ylim(0,30000)

In [None]:
# Average weekly sales per department.
avg_sales_plot('Dept')

In [None]:
# Weekly_Sales split by the IsHoliday flag: raw points (left) and density (right).
holiday_fig, (strip_ax, violin_ax) = plt.subplots(1, 2, figsize=(15, 5))
plot_1 = sns.stripplot(data=train_expanded, x='IsHoliday', y='Weekly_Sales', ax=strip_ax)
plot_2 = sns.violinplot(data=train_expanded, x='IsHoliday', y='Weekly_Sales', ax=violin_ax)

In [None]:
# Weekly_Sales split by store Type: raw points (left) and density (right).
type_fig, (strip_ax, violin_ax) = plt.subplots(1, 2, figsize=(15, 5))
plot_1 = sns.stripplot(data=train_expanded, x='Type', y='Weekly_Sales', ax=strip_ax)
plot_2 = sns.violinplot(data=train_expanded, x='Type', y='Weekly_Sales', ax=violin_ax)

In [None]:
# One line subplot per indicator (Temperature, Fuel_Price, CPI, Unemployment) against Date.
# NOTE(review): train_expanded was sorted by Store, Dept, Date when it was built, so
# dates repeat along the x-axis; aggregating by Date first (e.g. groupby('Date').mean())
# would presumably give cleaner lines - confirm intent.
train_expanded[['Date', 'Temperature', 'Fuel_Price', 'CPI','Unemployment']].plot(x='Date',subplots=True,figsize=(20,15),kind='line')

* **As we can see, Temperature is highly seasonal.**
* **Fuel Price and CPI show an upward trend, while Unemployment shows a downward trend.**

In [None]:
# Absolute correlation of every numeric column with Weekly_Sales, sorted ascending.
sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
sales_corr = (
    train_expanded.dropna()
    # numeric_only=True skips the string/datetime columns (Type, Date);
    # without it DataFrame.corr raises on pandas >= 2.0.
    .corr(numeric_only=True)['Weekly_Sales']
    .abs()
    .sort_values()
)
# After sorting, the last entry is Weekly_Sales against itself (|r| = 1); drop it.
sales_corr.iloc[:-1].plot(kind='bar')
plt.title('Absolute correlation with Weekly_Sales')
plt.xlim(-1, 16.0)

In [None]:
# Pairwise correlation heatmap of the numeric columns, leaving out the MarkDown columns.
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
plt.figure(figsize=(15, 10))
sns.heatmap(
    # numeric_only=True skips the string/datetime columns (Type, Date);
    # without it DataFrame.corr raises on pandas >= 2.0.
    train_expanded.drop(markdown_cols, axis=1).corr(numeric_only=True),
    annot=True,
    cmap='Blues',
)

In [None]:
# Replace every missing value in both merged tables with 0.
train_expanded, test_expanded = (
    frame.fillna(0) for frame in (train_expanded, test_expanded)
)

In [None]:
# Re-check per-column completeness of the training table after the fill.
msno.bar(train_expanded)

# **DATA PROCESSING AND MODELLING**

**We will use three ML models to predict the sales of the company:**
* **Linear Regression (Linear model)**
* **Decision Tree Regressor (Tree model)**
* **Random Forest Regressor (Ensemble model)**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn import metrics

In [None]:
# Encode the categorical training columns as small integers.
# Note the holiday flag is encoded True -> 0 and False -> 1.
# NOTE(review): this cell is not safe to run twice - values that are already
# encoded are not keys of these dicts, so a second run would map them to NaN.
train_expanded = train_expanded.assign(
    IsHoliday=train_expanded['IsHoliday'].astype('str').map({'True': 0, 'False': 1}),
    Type=train_expanded['Type'].astype('str').map({'A': 0, 'B': 1, 'C': 2}),
    Year=train_expanded['Year'].astype(int).map({2010: 1, 2011: 2, 2012: 3}),
)

In [None]:
# Target is Weekly_Sales; the features are every remaining column except Date.
y = train_expanded['Weekly_Sales']
X = train_expanded.drop(columns=['Date', 'Weekly_Sales'])
# Hold out half of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.50, random_state=101
)

In [None]:
# Shapes of the four pieces produced by the train/hold-out split.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

### **Linear Regression**

In [None]:
# Fit ordinary least squares on the training half (fit() returns the estimator
# itself) and predict the hold-out half.
lm = LinearRegression().fit(X_train, y_train)
predictions = lm.predict(X_test)

In [None]:
# Report the fitted parameters and the hold-out error metrics, separated by blank lines.
mse = metrics.mean_squared_error(y_test, predictions)
report = [
    ('Coefficients:\n', lm.coef_),
    ('Intercept:', lm.intercept_),
    ('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions)),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', np.sqrt(mse)),
    ('R-Squared Score:', r2_score(y_test, predictions)),
]
for position, (label, value) in enumerate(report):
    if position:
        # Same separator the individual print('\n') calls produced.
        print('\n')
    print(label, value)

### **Decision Tree Regressor (Tree model)**

In [None]:
# Fit the decision tree BEFORE reporting metrics. In the original cell order the
# metrics were printed first, so they described the previous (linear regression)
# predictions under the Decision Tree heading.
# random_state mirrors the seed used for train_test_split so reruns are reproducible.
dt = DecisionTreeRegressor(random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)

# Hold-out error metrics for the decision tree.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions))
print('\n')
print('Mean Squared Error:', metrics.mean_squared_error(y_test, predictions))
print('\n')
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('\n')
print('R-Squared Score:', r2_score(y_test, predictions))

### **Random Forest Regressor (Ensemble model)**

In [None]:
# Random forests are stochastic (bootstrap samples, feature sub-sampling), so fix
# the seed - the same one used for the split - to make the reported metrics
# reproducible. n_jobs=-1 trains the trees on all CPU cores; it does not change
# the fitted model, only the wall-clock time.
rfr = RandomForestRegressor(random_state=101, n_jobs=-1)
rfr.fit(X_train, y_train)

In [None]:
# Predict the hold-out half with the random forest (overwrites the earlier `predictions`).
predictions = rfr.predict(X_test)

In [None]:
# Hold-out error metrics for the current `predictions`, separated by blank lines.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
r_squared = r2_score(y_test, predictions)
scores = (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', np.sqrt(mse)),
    ('R-Squared Score:', r_squared),
)
for position, (label, value) in enumerate(scores):
    if position:
        # Same separator the individual print('\n') calls produced.
        print('\n')
    print(label, value)

In [None]:
# Scatter of predicted sales (x) against actual hold-out sales (y).
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=predictions, y=y_test, ax=scatter_ax)
scatter_ax.set_xlabel('Predicted Sales')
scatter_ax.set_ylabel('Weekly Sales')

# **SUBMISSION**

In [None]:
submission = pd.read_csv('../input/walmart-datasets/sampleSubmission.csv')
submission = submission.drop('Weekly_Sales', axis=1)

# The original cell wrote `predictions` - the model's output for X_test, the
# hold-out half of the TRAINING data - into the submission. The submission
# needs predictions for the competition test set, so build its feature matrix
# here with the same columns (and column order) the models were trained on.
test_features = test_expanded[X.columns].copy()
# Apply the same encodings that were applied to train_expanded before fitting.
test_features['IsHoliday'] = test_features['IsHoliday'].astype('str').map({'True': 0, 'False': 1})
test_features['Type'] = test_features['Type'].astype('str').map({'A': 0, 'B': 1, 'C': 2})
# The training map {2010: 1, 2011: 2, 2012: 3} equals Year - 2009; the
# arithmetic form also covers any later year present in the test data.
test_features['Year'] = test_features['Year'].astype(int) - 2009

if test_features.isna().any().any():
    raise ValueError('test features contain values the training encodings do not cover')
if len(test_features) != len(submission):
    raise ValueError(
        f'expected {len(submission)} test rows to match the sample submission, '
        f'got {len(test_features)}'
    )

# Rows are assigned positionally.
# NOTE(review): assumes the sample submission lists rows in the same
# Store/Dept/Date order as test_expanded - confirm against the file.
submission['Weekly_Sales'] = rfr.predict(test_features)

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv('Final Submission.csv',index=False)