# Machine Learning Project

## Walmart Sales Forecasting System

In this project, I build a machine learning model that forecasts weekly Walmart sales for different stores. This is my first independent project; the dataset was downloaded from Kaggle.


The models are built with scikit-learn.

### Implementation
I started by importing the modules required for this project and loading in the data.

In [None]:
#import the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

%matplotlib inline

In [None]:
# Read the three raw CSV tables (paths are relative to the working directory).
train, stores, features = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "stores.csv", "features.csv")
)

In [None]:
# Preview the first rows of the raw training table.
train.head()

In [None]:
# Join the feature rows onto the training rows by Date, IsHoliday and Store,
# then add the store table by Store. Inner joins keep only matching rows.
new_data = features.merge(train, how="inner", on=["Date", "IsHoliday", "Store"])
train_data = stores.merge(new_data, how="inner", on=["Store"])

# Print column dtypes and non-null counts of the merged table.
train_data.info()

In [None]:
# Plot the null mask of the merged table to see which columns contain missing values.
sns.heatmap(train_data.isna(), cmap="viridis", cbar=False, yticklabels=False)

In [None]:
# Replace every missing value in the merged table with 0.
train_data = train_data.fillna(0)

# NOTE(review): the rationale for mapping a Temperature of 27.56 to 43.89 is
# not documented here — confirm before relying on it.
train_data["Temperature"] = train_data["Temperature"].replace({27.56: 43.89})

In [None]:
# Preview the merged table after the null values have been filled.
train_data.head()

In [None]:
# Compare the distribution of store sizes across store types.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally, while the keyword form works on older releases too.
sns.boxplot(x="Type", y="Size", data=train_data)

In [None]:
# One-hot encode the store type, then remove the original Type column
# together with MarkDown2 and MarkDown4.
store_type = pd.get_dummies(train_data["Type"])
train_data = train_data.drop(columns=["Type", "MarkDown2", "MarkDown4"])
train_data.head()

In [None]:
# Append the one-hot store-type columns to the table.
train_data = pd.concat([train_data, store_type], axis="columns")

# Swap the raw Date column for a numeric Month feature: pop() removes Date
# from the frame and hands it to the datetime parser in one step.
train_data["Month"] = pd.to_datetime(train_data.pop("Date")).dt.month

train_data.head()

In [None]:
# Plot the pairwise correlation matrix, annotating each cell with its coefficient.

correlations = train_data.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(correlations, annot=True)

In [None]:
# Scale the model inputs with MinMaxScaler (default range [0, 1]) and set up
# the candidate regressors.

# Everything except the Weekly_Sales target is a model input.
feature_frame = train_data.drop(columns=["Weekly_Sales"])

scaler = MinMaxScaler()
# Take the column labels from the frame itself: a hand-written list can drift
# out of sync with the real column order, and a nested list would create
# MultiIndex columns instead of plain string labels.
scaled_features = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=feature_frame.index,
)

lin_model = LinearRegression()
rfc = RandomForestRegressor()
gbr = GradientBoostingRegressor()
dtree = DecisionTreeRegressor()

# Candidates compared on the validation split.
models = [lin_model, rfc, gbr, dtree]

In [None]:
# X holds the scaled inputs; y is the Weekly_Sales target column.
X, y = scaled_features, train_data["Weekly_Sales"]
X.head()

In [None]:
# Hold out 30% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Fit every candidate on the training split and report its R^2 on the validation split.

for model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    # Label each score with the estimator class so the results can be
    # attributed to a model when they are compared.
    print(f"{type(model).__name__}: {r2_score(y_val, predictions)}")

The Random Forest Regressor achieved the highest R² score on the validation set, so I will use it to make predictions on the test data.

In [None]:
# Load the test data and rebuild the same feature columns that were used for training.

test = pd.read_csv("test.csv")

data = pd.merge(features, test, how="inner", on=["Date", "IsHoliday", "Store"])
test_data = pd.merge(stores, data, on=["Store"], how="inner")

# One-hot encode the store type and drop the columns excluded from training.
store_type = pd.get_dummies(test_data["Type"])
test_data = pd.concat([test_data, store_type], axis=1)
test_data.drop(["Type", "MarkDown2", "MarkDown4"], axis=1, inplace=True)

test_data["Date"] = pd.to_datetime(test_data["Date"])
test_data["Month"] = test_data["Date"].dt.month
test_date = test_data["Date"]  # kept so the dates can be restored for the submission ids
test_data.drop(["Date"], axis=1, inplace=True)

# Fill in null values.
test_data.fillna(0, inplace=True)

# Scale the test data with the scaler that was fitted on the training
# features. Fitting a fresh scaler here would map the test columns with
# different min/max values than the ones the models were trained on.
# The column labels are taken from X_train so that the frame passed to
# predict() carries the same feature labels as the frame used for fitting.
scaled_features = pd.DataFrame(
    scaler.transform(test_data),
    columns=X_train.columns,
    index=test_data.index,
)



In [None]:
# Predict Weekly_Sales for the scaled test features with the fitted random forest.
pred_i = rfc.predict(scaled_features)

In [None]:
# Build the submission table (one id per Store/Dept/Date) and write it to disk.
# NOTE(review): confirm that the "id" / "weeklySales" headers match the
# column names expected by sampleSubmission.csv.

test_data["weeklySales"] = pred_i
test_data["Date"] = test_date

# id has the form "<Store>_<Dept>_<Date>".
id_parts = [test_data["Dept"].astype(str), test_data["Date"].astype(str)]
test_data["id"] = test_data["Store"].astype(str).str.cat(id_parts, sep="_")

output = test_data[["id", "weeklySales"]].copy()
output.to_csv("my_output.csv", index=False)