<a href="https://colab.research.google.com/github/ryonce/Sales-Model/blob/main/Sales_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sales Data and Cleaning

In [None]:
# Imports
import pandas as pd
import numpy as np

# Modeling & Preprocessing import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer,make_column_transformer,make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [None]:
# Load the sales data from the published Google Sheets CSV export
DATA_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTOdd9ucw6tCewOdXmi_zGCnXLdAtUKs1-k5KgSD6TDSkPx6z4ptifobdRcUE-JYhX6IIBziuMvQoVZ/pub?output=csv'
df = pd.read_csv(DATA_URL)

# Preview the first rows to confirm the load worked
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Count the rows flagged by DataFrame.duplicated() (called with its default arguments)

df.duplicated().sum()

0

In [None]:
# Inspect each column's dtype and non-null count to spot columns with missing values

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [None]:
# List the distinct labels in Item_Fat_Content and how often each occurs,
# to spot inconsistent spellings of the same category

df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [None]:
# Collapse the inconsistent spellings so only 'Low Fat' and 'Regular' remain
fat_content_fixes = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_fixes)

# Confirm that exactly two labels are left
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [None]:
# Target (y): the "Item_Outlet_Sales" column
y = df['Item_Outlet_Sales']

# Features (X): every column except the target and the three columns
# excluded from modeling
dropped_columns = ['Item_Outlet_Sales', 'Item_Identifier',
                   'Outlet_Establishment_Year', 'Outlet_Identifier']
X = df.drop(columns=dropped_columns)

# Create Preprocessor for Machine Learning

In [None]:
# Split features and target into train and test sets.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state = 42 fixes the shuffle so the split is the same on every run.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [None]:
## Build the preprocessing object that prepares the dataset for Machine Learning

# Numeric columns: fill missing values with the median, then standardize
num_selector = make_column_selector(dtype_include='number')
median_imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
num_pipe = make_pipeline(median_imputer, scaler)
num_tuple = (num_pipe, num_selector)

# Categorical (object) columns: fill missing values with the most frequent
# label, then one-hot encode into a dense array; categories not seen during
# fit are ignored at transform time
cat_selector = make_column_selector(dtype_include='object')
freq_imputer = SimpleImputer(strategy='most_frequent')
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_pipe = make_pipeline(freq_imputer, ohe)
cat_tuple = (cat_pipe, cat_selector)

# Combine the two (pipeline, selector) pairs, categorical branch first;
# columns matched by neither selector are dropped
preprocessor = make_column_transformer(cat_tuple, num_tuple, remainder='drop')
preprocessor

# Create Linear Regression Model 

In [None]:
# Instantiate a linear regression model with its default settings

linreg = LinearRegression()

# Chain the preprocessing ColumnTransformer and the linear regression model in a
# single Pipeline, so fitting the pipeline fits the preprocessing steps as well

linreg_pipe = make_pipeline(preprocessor, linreg)
linreg_pipe

In [None]:
# Fit the preprocessing steps and the linear regression on the training split only
linreg_pipe.fit(X_train, y_train)

In [None]:
def model_metrics(pipe, x_train, y_train, x_test, y_test,
                  model_name='Regression Model'):
    """Print MAE, MSE, RMSE, and R2 for a fitted model on the train and test splits.

    Args:
        pipe: Fitted estimator or pipeline; only its ``predict`` method is used.
        x_train: Training features passed to ``pipe.predict``.
        y_train: True target values for ``x_train``.
        x_test: Test features passed to ``pipe.predict``.
        y_test: True target values for ``x_test``.
        model_name: Label printed in front of each block of scores.

    Returns:
        None. The scores are printed, train block first, then test block.
    """
    splits = (('Train', x_train, y_train), ('Test', x_test, y_test))
    for split_name, features, y_true in splits:
        # Predict once per split and reuse the result for every metric
        y_pred = pipe.predict(features)

        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)  # RMSE is derived from the MSE already computed
        r2 = r2_score(y_true, y_pred)

        # Display the metrics for this split
        print(f'{model_name} {split_name} Scores')
        print(f'MAE: {mae:,.4f} \nMSE: {mse:,.4f} \nRMSE: {rmse:,.4f} \nR2: {r2:.4f}\n')

In [None]:
# Report train and test metrics for the fitted linear regression pipeline
model_metrics(linreg_pipe, X_train, y_train, X_test, y_test,
              model_name='Linear Regression Model')

Linear Regression Model Train Scores
MAE: 847.8170 
MSE: 1,302,767.4566 
RMSE: 1,141.3884 
R2: 0.5598

Linear Regression Model Test Scores
MAE: 805.7376 
MSE: 1,197,625.8163 
RMSE: 1,094.3609 
R2: 0.5659



- The Linear Regression model's error scores look high (R^2 is only about 0.56 on both train and test), so we need to compare them against another model to see if we can improve.

- On the test set, MAE is about 806 dollars and RMSE is about 1,094 dollars.

# Create Decision Tree Model

In [77]:
# Create an instance of the model.
# random_state is fixed so the fitted tree (and the scores and depth reported
# for it) are the same on every Restart & Run All, matching the seed already
# used for train_test_split and for the max_depth search.

dec_tree = DecisionTreeRegressor(random_state=42)

# Make pipeline: shared preprocessor followed by the tree

dec_tree_pipe = make_pipeline(preprocessor, dec_tree)

# Fit the data (training split only)

dec_tree_pipe.fit(X_train, y_train)

In [78]:
# Report train and test metrics for the unconstrained decision tree pipeline
model_metrics(dec_tree_pipe, X_train, y_train, X_test, y_test,
              model_name='Decision Tree Model')

Decision Tree Model Train Scores
MAE: 0.0000 
MSE: 0.0000 
RMSE: 0.0000 
R2: 1.0000

Decision Tree Model Test Scores
MAE: 1,072.8509 
MSE: 2,428,098.4820 
RMSE: 1,558.2357 
R2: 0.1199



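- Very interesting numbers: the train scores are perfect (R^2 = 1.0) while the test R^2 is very low (0.12), which means the unconstrained tree is overfitting the training data. The test MAE and RMSE are also somewhat higher than the Linear Regression model's. Tuning the tree (for example, limiting `max_depth`) may give us a better model.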

In [79]:
# What was the depth of our Decision Tree Model?
# `dec_tree` is the regressor inside `dec_tree_pipe`, fitted in the cell above.
# NOTE(review): the max_depth search further down rebinds `dec_tree`, so
# re-running this cell after that loop reports the depth of the last tree
# built by the search instead.

dec_tree.get_depth()

42

In [92]:
# List of values to try for max_depth: every value between 2 and 42
# (42 is the depth reported for the unconstrained tree).
depths = list(range(2, 43))

# NOTE(review): each depth is scored on the test split and the best depth is
# later picked from those scores, so the chosen model's test score is
# optimistic. Consider cross-validation on the training data instead.

# Collect one row of scores per depth and build the frame once at the end, so
# the columns are float64 rather than object dtype.
score_rows = []
for depth in depths:
    dec_tree = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dec_tree_pipe = make_pipeline(preprocessor, dec_tree)
    dec_tree_pipe.fit(X_train, y_train)
    train_score = dec_tree_pipe.score(X_train, y_train)
    test_score = dec_tree_pipe.score(X_test, y_test)
    score_rows.append({'Test Score': test_score, 'Train Score': train_score})

# Data frame of scores, indexed by max_depth
scores = pd.DataFrame(score_rows, index=depths,
                      columns=['Test Score', 'Train Score'])


In [94]:
# Rank the candidate depths by test score, best first, and show the top rows
# (`scores` is already a DataFrame indexed by max_depth)

sorted_scores = scores.sort_values(by='Test Score', ascending=False)
sorted_scores.head()

Unnamed: 0,Test Score,Train Score
5,0.594709,0.603933
4,0.584005,0.582625
6,0.582337,0.615063
7,0.578544,0.626452
8,0.564088,0.642731


In [95]:
# Best model: max_depth = 5 had the highest test score in the depth search.
# Use the same random_state as the search loop so this refit reproduces the
# depth-5 row of the scores table instead of a differently seeded tree.

dt_best_model = DecisionTreeRegressor(max_depth=5, random_state=42)

dt_best_model_pipe = make_pipeline(preprocessor, dt_best_model)

dt_best_model_pipe.fit(X_train, y_train)

In [96]:
# Display the tuned Decision Tree metrics.
# Labelled separately from the untuned tree so the two printed score blocks
# can be told apart.
model_metrics(dt_best_model_pipe, x_train=X_train, y_train=y_train,
              x_test=X_test, y_test=y_test,
              model_name='Tuned Decision Tree Model')

Decision Tree Model Train Scores
MAE: 762.6399 
MSE: 1,172,142.0438 
RMSE: 1,082.6551 
R2: 0.6039

Decision Tree Model Test Scores
MAE: 738.3556 
MSE: 1,118,187.9463 
RMSE: 1,057.4441 
R2: 0.5947



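- We were able to tune this model and make it better: with `max_depth` set to 5, its test R^2 is 0.59 and its test RMSE is about 1,057 dollars. It is now better than the Linear Regression model (test R^2 0.57, test RMSE about 1,094 dollars), so we will use this one. Note that the depth was chosen by comparing test scores, so these test metrics are somewhat optimistic.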