<a href="https://colab.research.google.com/github/ryonce/Sales-Predictions/blob/main/Project_1_Part_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Imports
import pandas as pd
import numpy as np

# Modeling & Preprocessing import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer,make_column_transformer,make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load in the data

df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vTOdd9ucw6tCewOdXmi_zGCnXLdAtUKs1-k5KgSD6TDSkPx6z4ptifobdRcUE-JYhX6IIBziuMvQoVZ/pub?output=csv')

df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
# Checking for Duplicates

df.duplicated().sum()

0

In [4]:
# Checking Dtype and missing counts

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [5]:
# Checking value of Item Fat Content

df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [6]:
# Replacing value and making two total values

df['Item_Fat_Content'] = df['Item_Fat_Content'].replace('LF', 'Low Fat')
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace('low fat', 'Low Fat')
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace('reg', 'Regular')
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [7]:
# Identify the features (X) and target (y): 
# Assign the "Item_Outlet_Sales" column as your target and the rest of the relevant variables as your features matrix.

X = df.drop(columns = ['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Establishment_Year'])

y = df['Item_Outlet_Sales']

In [8]:
# Perform a train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [9]:
## Create a preprocessing object to prepare the dataset for Machine Learning

# Selectors to for numeric and categorical data types

num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [10]:
# SimpleImputers with most_frequent and median strategies

freq_imputer = SimpleImputer(strategy='most_frequent')
median_imputer = SimpleImputer(strategy='median')

In [11]:
# Create tuples of (imputer, selector) for each datatype

num_tuple = (median_imputer, num_selector)
cat_tuple = (freq_imputer, cat_selector)

# ColumnTransformer

col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')
col_transformer

In [12]:
# Create a StandardScaler for scaling numeric columns.
# Create a OneHotEncoder for one-hot encoding the categorical columns.

scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore', sparse = False)

In [13]:
# Make pipeline
# Imputation of missing values occurs after the train test split using SimpleImputer. 

preprocessing_pipeline = make_pipeline(col_transformer, ohe, scaler)
preprocessing_pipeline

In [14]:
# Fit pipeline on training data

preprocessing_pipeline.fit(X_train)



In [15]:
# Transform both the training and testing data. 

# Check the results

X_train_imputed = preprocessing_pipeline.transform(X_train)
X_test_imputed = preprocessing_pipeline.transform(X_test)

display(X_train_imputed)

array([[-0.02502347, -0.0125088 , -0.02502347, ..., -1.36803451,
         2.84245413, -0.35712144],
       [-0.02502347, -0.0125088 , -0.02502347, ..., -1.36803451,
         2.84245413, -0.35712144],
       [-0.02502347, -0.0125088 , -0.02502347, ...,  0.73097571,
        -0.35180867, -0.35712144],
       ...,
       [-0.02502347, -0.0125088 , -0.02502347, ...,  0.73097571,
        -0.35180867, -0.35712144],
       [-0.02502347, -0.0125088 , -0.02502347, ...,  0.73097571,
        -0.35180867, -0.35712144],
       [-0.02502347, -0.0125088 , -0.02502347, ...,  0.73097571,
        -0.35180867, -0.35712144]])