In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from sklearn import metrics
import pickle 
from os import path
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Folder that holds the three input CSVs. The default '.' resolves to the
# same files as the bare names used before; point it elsewhere if the data
# lives in another directory instead of editing every path.
DATA_DIR = '.'

# Sales records, store attributes and per-store/date features; they are
# merged on 'Store' (and 'Date') further down.
data = pd.read_csv(path.join(DATA_DIR, 'train.csv'))
stores = pd.read_csv(path.join(DATA_DIR, 'stores.csv'))
features = pd.read_csv(path.join(DATA_DIR, 'features.csv'))

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
data.shape

In [None]:
data.tail()


In [None]:
data.info()

In [None]:
stores.shape

In [None]:
stores.tail()

In [None]:
stores.info()

In [None]:
features.shape

In [None]:
features.tail()

In [None]:
features.info()

# Handling Missing Values of Features Dataset

In [None]:
# Replace missing CPI / Unemployment values with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is chained assignment, which warns on pandas 2.x and
# leaves `features` unchanged when Copy-on-Write is enabled.
for col in ('CPI', 'Unemployment'):
    features[col] = features[col].fillna(features[col].median())

In [None]:
# MarkDown1..MarkDown5: clamp negative amounts to 0, then treat missing
# values as 0. clip() leaves NaN untouched (as the previous per-row lambda
# did), so the fill still has to run afterwards. The result is assigned back
# rather than filled in place on a selected column, which would be chained
# assignment (a no-op under pandas Copy-on-Write).
markdown_cols = ['MarkDown' + str(i) for i in range(1, 6)]
features[markdown_cols] = features[markdown_cols].clip(lower=0).fillna(0)

In [None]:
features.info()

# Merging Train, Stores & Features Dataset

In [None]:
# Attach the store attributes to every sales row (left join keeps all rows of `data`).
data = data.merge(stores, how='left', on='Store')

In [None]:
# Attach the per-store, per-date features. Both frames carry an 'IsHoliday'
# column, so the merge yields 'IsHoliday_x' / 'IsHoliday_y'.
data = data.merge(features, how='left', on=['Store', 'Date'])

# Parse the dates and order the rows chronologically.
data = data.assign(Date=pd.to_datetime(data['Date'])).sort_values(by=['Date'])

# Index by date. The Series is passed (not the label), so 'Date' stays
# available as a regular column as well as being the index.
data = data.set_index(data['Date'])

In [None]:
data.head()

In [None]:
# Row-by-row check that the two holiday flags produced by the merge agree.
# `isin` only tests whether each value occurs anywhere in the other column,
# which is true for any two boolean columns that both contain True and False,
# so it cannot detect mismatching rows.
(data['IsHoliday_x'] == data['IsHoliday_y']).all()

In [None]:
# Keep a single holiday flag: discard the '_x' copy and give the '_y' copy
# its plain name back.
data = (
    data
    .drop(columns='IsHoliday_x')
    .rename(columns={"IsHoliday_y" : "IsHoliday"})
)
data.info()

In [None]:
data.head()

# Splitting Date Column

In [None]:
# Derive calendar components from the datetime 'Date' column.
date_parts = data['Date'].dt
data = data.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
)

In [None]:
data.head()

# Outlier Detection & Abnormalities

In [None]:
# Summary statistics of Weekly_Sales for every (Store, Dept) pair.
# NOTE(review): these are computed from the target over all rows and are later
# used as model inputs, before any train/test split - confirm this is intended.
sales_stats = ['max', 'min', 'mean', 'median', 'std']
agg_data = (
    data
    .groupby(['Store', 'Dept'])['Weekly_Sales']
    .agg(sales_stats)
    .reset_index()
)
agg_data

In [None]:
agg_data.isnull().sum()

In [None]:
# Attach the per-(Store, Dept) statistics to each row and discard every row
# that still has a missing value in any column. The merge returns a fresh
# default index, so the date index is rebuilt in the next cell.
data = (
    data
    .merge(agg_data, how='left', on=['Store', 'Dept'])
    .dropna()
    .copy()
)

In [None]:
# Re-establish chronological order and the date index after the merge above.
data = data.assign(Date=pd.to_datetime(data['Date'])).sort_values(by=['Date'])
# Passing the Series keeps 'Date' as a column in addition to the index.
data = data.set_index(data['Date'])
data.head()

In [None]:
# Collapse the five markdown columns into a single total, then drop the parts.
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
total_markdown = data[markdown_cols[0]]
for part in markdown_cols[1:]:
    # Added left to right, in the same order as the column list.
    total_markdown = total_markdown + data[part]
data['Total_MarkDown'] = total_markdown
data = data.drop(columns=markdown_cols)

In [None]:
# Continuous columns examined by the outlier filter below.
numeric_col = [
    'Weekly_Sales',
    'Size',
    'Temperature',
    'Fuel_Price',
    'CPI',
    'Unemployment',
    'Total_MarkDown',
]
# Independent copy, so later edits to `data` do not affect the snapshot.
data_numeric = data.loc[:, numeric_col].copy()

In [None]:
data.shape

In [None]:
# Keep only the rows whose every numeric column lies within 2.5 standard
# deviations of that column's mean (column-wise z-scores of `data_numeric`).
abs_z = np.abs(stats.zscore(data_numeric))
within_limit = (abs_z < 2.5).all(axis=1)
data = data[within_limit]
data.shape

In [None]:
# Negative Weekly Sales
# Distribution of the rows whose Weekly_Sales is below zero.

is_negative = data['Weekly_Sales'] < 0
y = data.loc[is_negative, 'Weekly_Sales']
sns.displot(y, aspect=2, height=6)
plt.title("Negative Weekly Sales", fontsize=14);

In [None]:
# Discard rows with negative Weekly_Sales. The explicit copy makes `data` an
# independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning on a filtered slice.
data = data[data["Weekly_Sales"] >= 0].copy()

In [None]:
data.shape

In [None]:
# Convert the holiday flag to an integer column. `assign` builds a new frame
# instead of writing into `data`, which at this point is the result of a
# boolean-mask filter (in-place column assignment on such a slice triggers
# SettingWithCopyWarning on pandas < 3).
data = data.assign(IsHoliday=data['IsHoliday'].astype('int'))

In [None]:
data.info()

In [None]:
# Persist the cleaned, merged frame to disk.
# NOTE(review): `data` is indexed by 'Date' and also still has a 'Date'
# column, so the written file contains two columns named 'Date' - confirm
# whether downstream readers rely on that before passing index=False.
data.to_csv('walmart_dataset.csv')

# Data Visualization

Average Monthly Sales

In [None]:
# Mean Weekly_Sales per calendar month (seaborn's default bar estimator is the mean).
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=data, x="Month", y="Weekly_Sales", ax=ax)
ax.set_xlabel("Months", fontsize=14)
ax.set_ylabel("Sales", fontsize=14)
ax.set_title("Average Monthly Sales", fontsize=16)
ax.grid();

In [None]:
# Year x Month table holding the sum of Weekly_Sales in each cell.
data_monthly = pd.crosstab(
    index=data['Year'],
    columns=data['Month'],
    values=data['Weekly_Sales'],
    aggfunc='sum',
)
data_monthly

In [None]:
# One panel per calendar month (3 x 4 grid), each showing that month's total
# sales across the years in `data_monthly`.
fig, axes = plt.subplots(3, 4, figsize=(16, 9))
fig.suptitle('Monthly Sales for each Year', fontsize=18)
# Spacing is a figure-level setting; apply it once instead of once per panel.
fig.subplots_adjust(wspace=0.4, hspace=0.4)

# axes.flat walks the grid row by row, matching months 1..12.
for month, ax in enumerate(axes.flat, start=1):
    sns.lineplot(ax=ax, data=data_monthly[month])
    # Label this panel's own axes: plt.xlabel / plt.ylabel act on the
    # "current" axes only, which left every panel but one with the wrong labels.
    ax.set_ylabel(month, fontsize=12)
    ax.set_xlabel('Years', fontsize=12)

plt.show();

# Average Weekly Sales Store-wise

In [None]:
# Mean Weekly_Sales for each store.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=data, x='Store', y='Weekly_Sales', ax=ax)
ax.grid()
ax.set_title("Average Weekly Sales Store-wise", fontsize=18)
ax.set_xlabel("Store", fontsize=16)
ax.set_ylabel("Sales", fontsize=16)
plt.show();

In [None]:
# Mean Weekly_Sales for each department.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=data, x='Dept', y='Weekly_Sales', ax=ax)
ax.grid()
ax.set_title("Average Sales per Department", fontsize=18)
ax.set_xlabel("Department", fontsize=16)
ax.set_ylabel("Sales", fontsize=16)
plt.show();

# Holiday Distribution

In [None]:
# Share of holiday vs. non-holiday rows.
# value_counts() orders by frequency, so fixed labels would be swapped if
# holiday rows ever outnumbered the others. Sort by flag value and derive
# each label from the flag it belongs to.
holiday_counts = data['IsHoliday'].value_counts().sort_index()
holiday_labels = ['Holiday' if flag else 'No Holiday' for flag in holiday_counts.index]

plt.figure(figsize=(8,8))
plt.pie(holiday_counts, labels=holiday_labels, autopct='%0.2f%%')
plt.title("Pie Chart Distribution of holiday", fontsize=15)
plt.legend()
plt.show();

Encoding

In [None]:
# Categorical columns to one-hot encode, taken as an independent copy.
cat_col = ['Type']
data_cat = data.loc[:, cat_col].copy()

In [None]:
data_cat.tail()

In [None]:
# One-hot encode the categorical columns: each distinct value of 'Type'
# becomes its own 'Type_<value>' indicator column.
data_cat = pd.get_dummies(data_cat, columns=cat_col)

In [None]:
data_cat.head()

In [None]:
data.shape

In [None]:
# Append the indicator columns side by side (axis=1); `data_cat` was derived
# from `data`, so both frames share the same index.
data = pd.concat([data, data_cat], axis=1)

In [None]:
data.shape

In [None]:
# Remove the original categorical columns (now one-hot encoded) together with
# the 'Date' column; the date index of `data` is left as it is.
data = data.drop(columns=cat_col + ['Date'])

In [None]:
data.shape

In [None]:
data.head()

# Data Normalization

In [None]:
# Columns rescaled to [0, 1] below: the target, the continuous features and
# the per-(Store, Dept) sales statistics.
num_col = ['Weekly_Sales','Size','Temperature','Fuel_Price','CPI','Unemployment','Total_MarkDown','max','min','mean','median','std']

In [None]:
# Shared scaler mapping values onto the [0, 1] interval.
minmax_scale = MinMaxScaler(feature_range=(0, 1))

def normalization(df,col):
    """Rescale the given columns of ``df`` to the [0, 1] range, one at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to rescale. It is modified in place; pass a copy to keep the
        original values.
    col : iterable of column labels
        Columns to rescale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns overwritten.

    Notes
    -----
    The module-level ``minmax_scale`` is re-fitted for each column, so after
    the call it only holds the parameters of the last column processed.
    """
    for name in col:
        # The scaler expects a 2-D input: one column, one row per sample.
        column_2d = df[name].to_numpy().reshape(-1, 1)
        df[name] = minmax_scale.fit_transform(column_2d)
    return df

In [None]:
# Rescale the numeric columns; a copy is passed because normalization()
# overwrites the columns of the frame it receives.
# NOTE(review): the scaling is fitted on all rows, before the train/test
# split further down - confirm that is acceptable for the evaluation.
data = normalization(data.copy(),num_col)

In [None]:
data.head()

# Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
corr = data[num_col].corr()
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(corr, annot=True, vmax=1.0, ax=ax)
ax.set_title('Correlation Matrix', fontsize=16)
plt.show()

# Split Dataset in train and test

In [None]:
# Target vector and feature matrix (every remaining column except the target).
# NOTE(review): X still contains 'max', 'min', 'mean', 'median' and 'std',
# which were computed from Weekly_Sales itself - confirm this is intended.
target_col = 'Weekly_Sales'
y = data[target_col]
X = data.drop(columns=target_col)
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

In [None]:
y.head()

In [None]:
X.head()

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the split so
# it is the same on every run.
# NOTE(review): train_test_split picks rows at random although the frame is
# ordered by date - confirm a random split (rather than a chronological one)
# is what the evaluation calls for.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

# Machine Learning Model Training

In [None]:
# Standardise the features using statistics learned from the training rows
# only, then apply the same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Ordinary least-squares baseline fitted on the standardised training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# LinearRegression.score returns the coefficient of determination (R^2), not
# a classification accuracy, so the printed label says what the number is.
# The variable name is kept for any later cell that reads it.
lr_acc = lr.score(X_test,y_test)*100
print("Linear Regressor R2 score (%) - ",lr_acc)

In [None]:
# Predictions for the held-out rows (on the same [0, 1]-scaled target as y_test).
y_lr_pred = lr.predict(X_test)

In [None]:
# Error metrics on the held-out rows. The MSE is computed once and reused
# for the RMSE.
mse = metrics.mean_squared_error(y_test, y_lr_pred)
print("MAE" , metrics.mean_absolute_error(y_test, y_lr_pred))
print("MSE" , mse)
print("RMSE" , np.sqrt(mse))
# R2 must come from r2_score: explained_variance_score ignores any constant
# offset in the residuals and therefore only equals R2 when their mean is 0.
print("R2" , metrics.r2_score(y_test, y_lr_pred))

In [None]:
# Side-by-side view of true vs. predicted values; the frame takes its index
# from the y_test Series.
lr_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_lr_pred})
lr_df

In [None]:
# Visual comparison of predictions and true values for the first 200 test rows.
n_shown = 200
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(lr.predict(X_test[:n_shown]), color='blue', linewidth=2.0, label="prediction")
ax.plot(y_test[:n_shown].values, color='lightcoral', linewidth=2.0, label="real_values")
ax.legend(loc="best")
plt.show()