<a href="https://colab.research.google.com/github/stp511/sales_prediction1/blob/main/Project1_Part5_Sales_Predictions_Steven_Phillips.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 1 - Part 5 of Sales Predictions using Machine Learning
- **Steven Phillips**
- 10/13/2022

## Import all libraries and load the Sales Predictions data set

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, \
OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
set_config(display='diagram')

In [26]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the CSV read later in this notebook comes from
# /content/sales_predictions.csv, not from the mounted drive -- confirm
# whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
# Read the Sales Predictions CSV into a DataFrame and preview its first five rows
filename = '/content/sales_predictions.csv'
df = pd.read_csv(filepath_or_buffer=filename)
df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


We will work on a copy of the original DataFrame so that the raw data is never modified.

In [28]:
# df_ml is an independent (deep) copy of df; all cleaning below is applied to df_ml
df_ml = df.copy(deep=True)

In [29]:
# Summarize each column's dtype and non-null count to spot missing data
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [30]:
# Descriptive statistics for the numeric columns, rounded to two decimals
df_ml.describe().round(2)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.86,0.07,140.99,1997.83,2181.29
std,4.64,0.05,62.28,8.37,1706.5
min,4.56,0.0,31.29,1985.0,33.29
25%,8.77,0.03,93.83,1987.0,834.25
50%,12.6,0.05,143.01,1999.0,1794.33
75%,16.85,0.09,185.64,2004.0,3101.3
max,21.35,0.33,266.89,2009.0,13086.96


In [31]:
# Count the missing values in each column
df_ml.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [32]:
# Count fully duplicated rows (a result of 0 means there are none)
df_ml.duplicated().sum()

0

Check for inconsistencies within the categorical data

In [33]:
# Check 'Item_Identifier' for inconsistent labels via its value counts (none found)
df_ml['Item_Identifier'].value_counts()

FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: Item_Identifier, Length: 1559, dtype: int64

In [34]:
# Check 'Item_Fat_Content' for inconsistent labels via its value counts
df_ml['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [35]:
# Standardize the Item_Fat_Content labels: 'LF' and 'low fat' become 'Low Fat',
# and 'reg' becomes 'Regular'.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df_ml['Item_Fat_Content']: that chained in-place
# form mutates a temporary selection, emits a FutureWarning on pandas 2.x, and
# leaves df_ml unchanged when Copy-on-Write is enabled.
fat_content_fixes = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
df_ml['Item_Fat_Content'] = df_ml['Item_Fat_Content'].replace(fat_content_fixes)

# Confirm that only 'Low Fat' and 'Regular' remain
df_ml['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [36]:
# Check 'Item_Type' for inconsistent labels via its value counts (none found)
df_ml['Item_Type'].value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [37]:
# Check 'Outlet_Identifier' for inconsistent labels via its value counts (none found)
df_ml['Outlet_Identifier'].value_counts()

OUT027    935
OUT013    932
OUT049    930
OUT046    930
OUT035    930
OUT045    929
OUT018    928
OUT017    926
OUT010    555
OUT019    528
Name: Outlet_Identifier, dtype: int64

In [38]:
# Check 'Outlet_Size' for inconsistent labels via its value counts (none found).
# value_counts() skips missing values, so the NaNs in this column are not listed.
df_ml['Outlet_Size'].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [39]:
# Check 'Outlet_Location_Type' for inconsistent labels via its value counts (none found)
df_ml['Outlet_Location_Type'].value_counts()

Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64

In [40]:
# Check 'Outlet_Type' for inconsistent labels via its value counts (none found)
df_ml['Outlet_Type'].value_counts()

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

Outlet_Size and Outlet_Location_Type are ordinal categorical features.

## Split the data (X, y and train, test) (Validation Split)

In [41]:
# Separate the target from the features: y is Item_Outlet_Sales (the value
# being predicted) and X is every remaining column of df_ml.
y = df_ml['Item_Outlet_Sales']
X = df_ml.drop(columns=['Item_Outlet_Sales'])

In [42]:
# Split X and y into training and test sets.
# random_state=42 fixes the shuffle so the split is reproducible; no test_size
# is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Instantiate Transformers

In [43]:
# Instantiate the StandardScaler, the OneHotEncoder and the two imputers
# (the OrdinalEncoder is created in its own cell).
scaler = StandardScaler()

# The encoder must return a dense array. scikit-learn 1.2 renamed the `sparse`
# argument to `sparse_output` and 1.4 removed the old name, so try the new
# keyword first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Mean imputation for numeric columns, most-frequent imputation for categorical ones
mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')

In [44]:
# Category order for each ordinal feature, listed from lowest to highest rank
outlet_size_labels = ['Small', 'Medium', 'High']
outlet_location_type_labels = ['Tier 1', 'Tier 2', 'Tier 3']

# One list per ordinal column, in the same order the columns are handed to the
# encoder (Outlet_Size first, then Outlet_Location_Type)
ordered_labels = [outlet_size_labels, outlet_location_type_labels]

ordinal = OrdinalEncoder(categories=ordered_labels)

## Create a Pipeline for Numeric Data and a Pipeline for Categorical Data

In [45]:
# Set up one preprocessing pipeline per kind of column:
#   num_pipeline - mean-impute, then standardize
#   ord_pipeline - most-frequent-impute, then ordinal-encode
#   nom_pipeline - most-frequent-impute, then one-hot-encode
# NOTE(review): ord_pipeline and nom_pipeline are both given the same
# freq_imputer object -- confirm this is safe if either pipeline is ever fit
# directly rather than through the column transformer.

num_pipeline = make_pipeline(mean_imputer, scaler)
ord_pipeline = make_pipeline(freq_imputer, ordinal)
nom_pipeline = make_pipeline(freq_imputer, ohe)

## Create Tuples to Pair Pipelines with Columns

In [46]:
# Explicit column lists for the two kinds of categorical features
ordinal_cols = ['Outlet_Size', 'Outlet_Location_Type']
nominal_cols = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Type',
]

# Numeric columns are picked up by dtype instead of by name
num_selector = make_column_selector(dtype_include='number')

# Pair every pipeline with the columns (or selector) it should be applied to
ordinal_tuple = (ord_pipeline, ordinal_cols)
nominal_tuple = (nom_pipeline, nominal_cols)
numeric_tuple = (num_pipeline, num_selector)

## Create the Column Transformer to Apply Different Preprocessing to Different Columns

In [47]:
# Build the column transformer. Output columns follow the order of the tuples:
# ordinal features first, then numeric, then one-hot encoded nominal features.
# Any column not matched by one of the tuples is dropped.
preprocessor = make_column_transformer(
    ordinal_tuple,
    numeric_tuple,
    nominal_tuple,
    remainder='drop',
)

## Fit the Column Transformer on the Training Data

In [48]:
# Fit the column transformer on X_train only, so the imputation values,
# scaling statistics and encoder categories are learned from the training data.

preprocessor.fit(X_train)

## Transform Both the Training and Testing Data

In [49]:
# Transform X_train and X_test with the already-fitted preprocessor
# (it was fit on X_train; X_test is only transformed, never fit).

X_train_transformed = preprocessor.transform(X_train)

X_test_transformed = preprocessor.transform(X_test)

## Display the Results

In [50]:
# Display the transformed training matrix (NumPy abbreviates the printed array)
X_train_transformed

array([[ 1.        ,  2.        ,  0.81724868, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  2.        ,  0.5563395 , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  0.        , -0.13151196, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.        ,  1.        ,  1.11373638, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.76600931, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.81724868, ...,  1.        ,
         0.        ,  0.        ]])