<a href="https://colab.research.google.com/github/shellymduncan/Sales-Predictions/blob/main/Sales_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Shelly-Ann Duncan

## 9/16/22

## Project 1, Part 5

# Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display='diagram')

# Load data 

In [5]:
# Path of the raw sales CSV on Google Drive.
# NOTE(review): this path only resolves once Drive is mounted under /content/drive - confirm the mount step runs first.
filename = '/content/drive/MyDrive/02 - Week 2/sales_predictions (1).csv'

# Read the CSV with pandas' default parsing options, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer = filename)
df.head(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Make a copy of the original DataFrame so the raw data is not modified

In [6]:
# Work on an independent (deep) copy so later cleaning steps leave `df` untouched.
ml_df = df.copy(deep = True)

# Check the datatypes 

In [7]:
# Print each column's non-null count and dtype for the working copy.
ml_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


# Check for missing data 
* Identify how much data is missing.

In [8]:
# Count the missing values in each column (isnull is pandas' alias for isna).
ml_df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [9]:
# Total number of missing cells: per-column counts first, then their sum.
missing_per_column = ml_df.isna().sum()
missing_per_column.sum()

3873

# Check for duplicates
* We may not want duplicate data in our data set

In [10]:
# Flag rows that exactly repeat an earlier row, then count the flags.
duplicate_rows = ml_df.duplicated()
duplicate_rows.sum()

0

# Checking for and fixing any inconsistencies in the categorical data

In [14]:
# Item_Fat_Content: list every distinct label with its frequency to spot inconsistent spellings.
fat_content = ml_df['Item_Fat_Content']
fat_content.value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [16]:
# Collapse the inconsistent labels into two categories: Low Fat and Regular.
# The result is assigned back to the column rather than calling
# replace(inplace=True) on the selected Series: that chained in-place form
# acts on an intermediate object and does not update ml_df when pandas
# Copy-on-Write is active.
ml_df['Item_Fat_Content'] = ml_df['Item_Fat_Content'].replace(
    {'reg': 'Regular', 'LF': 'Low Fat', 'low fat': 'Low Fat'}
)
ml_df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [17]:
# Outlet_Size: frequency of each size label (missing values are not counted by value_counts).
outlet_size = ml_df['Outlet_Size']
outlet_size.value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [18]:
# Rename the category High to Large so the sizes read as an ordered scale
# (Small < Medium < Large).
# The result is assigned back to the column instead of using
# replace(inplace=True) on the selected Series, which does not update ml_df
# when pandas Copy-on-Write is active.
ml_df['Outlet_Size'] = ml_df['Outlet_Size'].replace({'High': 'Large'})
ml_df['Outlet_Size'].value_counts()

Medium    2793
Small     2388
Large      932
Name: Outlet_Size, dtype: int64

* All inconsistencies in the categorical data are fixed.

# Summary statistics for the numerical columns

In [19]:
# Count, mean, standard deviation, min, quartiles and max for each numeric column.
ml_df.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


# Keeping the relevant features (X) columns and target (y): Item_Outlet_Sales.

In [34]:
# Keep only the feature columns used for modelling plus the target (Item_Outlet_Sales).
relevant_columns = [
    'Item_Fat_Content',
    'Item_Type',
    'Item_MRP',
    'Outlet_Establishment_Year',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Outlet_Sales',
]
# .copy() makes the subset an independent DataFrame, so the column edits in the
# following cells modify it directly and do not raise SettingWithCopyWarning.
ml_df = ml_df[relevant_columns].copy()
ml_df.head()


Unnamed: 0,Item_Fat_Content,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,Low Fat,Dairy,249.8092,1999,1.0,Tier 1,Supermarket Type1,3735.138
1,Regular,Soft Drinks,48.2692,2009,1.0,Tier 3,Supermarket Type2,443.4228
2,Low Fat,Meat,141.618,1999,1.0,Tier 1,Supermarket Type1,2097.27
3,Regular,Fruits and Vegetables,182.095,1998,,Tier 3,Grocery Store,732.38
4,Low Fat,Household,53.8614,1987,2.0,Tier 3,Supermarket Type1,994.7052


# Ordinal encoding Outlet_Size and Outlet_Location_Type

In [32]:
# Ordinal-encode Outlet_Size (Small < Medium < Large) before preprocessing.
# Missing values are not keys of the mapping, so they stay NaN.
replace_dict = {'Small': 0, 'Medium': 1, 'Large': 2}
# Assign the result back instead of calling replace(inplace=True) on the
# selected Series: the chained in-place form does not update ml_df when pandas
# Copy-on-Write is active.
ml_df['Outlet_Size'] = ml_df['Outlet_Size'].replace(replace_dict)
ml_df['Outlet_Size']

0       1.0
1       1.0
2       1.0
3       NaN
4       2.0
       ... 
8518    2.0
8519    NaN
8520    0.0
8521    1.0
8522    0.0
Name: Outlet_Size, Length: 8523, dtype: float64

In [35]:
# Outlet_Location_Type: check which tier labels exist before ordinal encoding them.
location_type = ml_df['Outlet_Location_Type']
location_type.value_counts()

Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64

In [36]:
# Ordinal-encode Outlet_Location_Type: Tier 1 -> 0, Tier 2 -> 1, Tier 3 -> 2.
replace_dict1 = {'Tier 1': 0, 'Tier 2': 1, 'Tier 3': 2}
# Assign the result back instead of calling replace(inplace=True) on the
# selected Series: the chained in-place form does not update ml_df when pandas
# Copy-on-Write is active.
ml_df['Outlet_Location_Type'] = ml_df['Outlet_Location_Type'].replace(replace_dict1)
ml_df['Outlet_Location_Type']

0       0
1       2
2       0
3       2
4       2
       ..
8518    2
8519    1
8520    1
8521    2
8522    0
Name: Outlet_Location_Type, Length: 8523, dtype: int64

In [37]:
# Target vector y is the sales column; feature matrix X is every other column.
target_column = 'Item_Outlet_Sales'
y = ml_df[target_column]
X = ml_df.drop(columns = [target_column])

In [27]:
# Split into train and test sets; random_state is fixed at 42 so the split is reproducible.
# No test_size is passed, so scikit-learn's default proportion applies.
train_test_parts = train_test_split(X, y, random_state = 42)
X_train, X_test, y_train, y_test = train_test_parts

# Preprocessing the dataset for Machine Learning

In [43]:
# Column selectors pick columns by dtype when the ColumnTransformer is fitted.
# Outlet_Size and Outlet_Location_Type were ordinal-encoded to numeric dtypes
# earlier, so they are picked up by the numeric selector.
num_selector = make_column_selector(dtype_include = 'number')
cat_selector = make_column_selector(dtype_include = 'object')

# Instantiate Transformers 

In [38]:
# Imputers: most frequent value for categorical columns, mean for numeric columns.
freq_imputer = SimpleImputer(strategy = 'most_frequent')
mean_imputer = SimpleImputer(strategy = 'mean')

# Scaler: standardizes the numeric columns.
scaler = StandardScaler()

# One hot encoder: categories unseen during fit are ignored at transform time,
# and the output is a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the new keyword first and fall back for older versions.
try:
    ohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
except TypeError:
    # scikit-learn < 1.2 only accepts the original keyword.
    ohe = OneHotEncoder(handle_unknown = 'ignore', sparse = False)

# Instantiate Pipelines

In [39]:
# Numeric pipeline: mean-impute missing values, then standardize.
num_steps = (mean_imputer, scaler)
num_pipe = make_pipeline(*num_steps)
num_pipe

In [40]:
# Categorical pipeline: fill missing values with the most frequent category, then one-hot encode.
cat_steps = (freq_imputer, ohe)
cat_pipe = make_pipeline(*cat_steps)
cat_pipe

# Instantiate ColumnTransformer

In [44]:
# Pair each pipeline with the selector that chooses its columns.
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)

# Build the ColumnTransformer; columns matched by neither selector are passed through unchanged.
transformer_specs = [num_tuple, cat_tuple]
processed = make_column_transformer(*transformer_specs, remainder = 'passthrough')
processed

# Transform data

In [45]:
# Fit the ColumnTransformer (processed) on the training data only, so the
# imputation values, scaling statistics and encoder categories are learned
# without looking at the test set.
processed.fit(X_train)

In [46]:
# Apply the already-fitted ColumnTransformer (processed) to both splits.
X_train_processed, X_test_processed = (
    processed.transform(data_split) for data_split in (X_train, X_test)
)

# Inspect results

In [47]:
# Confirm no NaNs remain in either processed split (train first, then test),
# then display the processed training array to check scaling and one-hot encoding.
for processed_split in (X_train_processed, X_test_processed):
    nan_total = np.isnan(processed_split).sum().sum()
    print(nan_total, 'missing values \n')
X_train_processed

0 missing values 

0 missing values 



array([[ 1.82810922,  1.32784893,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.60336888,  1.32784893,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.24454056,  0.13618724,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.52302674,  0.49368575,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.38377708,  1.0895166 ,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.73836105, -0.10214509,  1.        , ...,  1.        ,
         0.        ,  0.        ]])