<a href="https://colab.research.google.com/github/spockthompson/Portfolio-Project/blob/main/Sales_Prediction_Project_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sales Predictions
- Soulution by: Scotty Thomason

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, \
OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
set_config(display='diagram')

path = '/content/drive/MyDrive/sales_predictions.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


##Looking at the Data Set

In [2]:
df_ml = df.copy()

In [3]:
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [4]:
df_ml.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
df_ml.duplicated().sum()

0

In [6]:
df_ml.nunique()

Item_Identifier              1559
Item_Weight                   415
Item_Fat_Content                5
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

#Before splitting our data, We can drop duplicates and fix inconsistencies in categorical data.

In [7]:
# Fixing the description of items "Fat Content"
df_ml['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [8]:
df_ml['Item_Fat_Content'] = df_ml['Item_Fat_Content'].replace('LF', 'Low Fat')
df_ml['Item_Fat_Content'] = df_ml['Item_Fat_Content'].replace('reg','Regular')
df_ml['Item_Fat_Content'] = df_ml['Item_Fat_Content'].replace('low fat', 'Low Fat')

df_ml['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [9]:
# Ordinal Encoding 'Outlet_Size' , 'Outlet_Location_Type' and 'Outlet_Type'
replacement_dictionary = {'Small':0, 'Medium':1, 'High':2}
df_ml['Outlet_Size'].replace(replacement_dictionary, inplace=True)
df_ml['Outlet_Size']

0       1.0
1       1.0
2       1.0
3       NaN
4       2.0
       ... 
8518    2.0
8519    NaN
8520    0.0
8521    1.0
8522    0.0
Name: Outlet_Size, Length: 8523, dtype: float64

In [10]:
replacement_dictionary = {'Tier 1':0, 'Tier 2':1, 'Tier 3':2}
df_ml['Outlet_Location_Type'].replace(replacement_dictionary, inplace=True)
df_ml['Outlet_Location_Type']

0       0
1       2
2       0
3       2
4       2
       ..
8518    2
8519    1
8520    1
8521    2
8522    0
Name: Outlet_Location_Type, Length: 8523, dtype: int64

In [11]:
replacement_dictionary = {'Supermarket Type1':0, 'Supermarket Type2':1, 'Supermarket Type3':2, 'Grocery Store':3}
df_ml['Outlet_Type'].replace(replacement_dictionary, inplace=True)
df_ml['Outlet_Type']

0       0
1       1
2       0
3       3
4       0
       ..
8518    0
8519    0
8520    0
8521    1
8522    0
Name: Outlet_Type, Length: 8523, dtype: int64

#Identify the features (X) and target (y): Assign the "Item_Outlet_Sales" column as our target and the rest of the relevant variables as our features matrix and preform train test split.

In [12]:
# split X and y, you are predicting price
X = df_ml.drop(columns=['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Identifier'])
y = df_ml['Item_Outlet_Sales']

# split training and test
# set random_state to 42 for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,16.35,Low Fat,0.029565,Household,256.4646,2009,1.0,2,1
7510,15.25,Regular,0.0,Snack Foods,179.766,2009,1.0,2,1
5828,12.35,Regular,0.158716,Meat,157.2946,1999,1.0,0,0
5327,7.975,Low Fat,0.014628,Baking Goods,82.325,2004,0.0,1,0
4810,19.35,Low Fat,0.016645,Frozen Foods,120.9098,2002,,1,0


#Create a preprocessing object to prepare the dataset for Machine Learning

In [13]:
# Selectors
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')


In [14]:
# Imputers
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')
# Scaler
scaler = StandardScaler()
# One-hot encoder
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)


In [15]:
# Numeric pipeline
numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe

# Categorical pipeline
categorical_pipe = make_pipeline(freq_imputer, ohe)
categorical_pipe


In [16]:
# Tuples for Column Transformer
number_tuple = (numeric_pipe, num_selector)
category_tuple = (categorical_pipe, cat_selector)
# ColumnTransformer
preprocessor = make_column_transformer(number_tuple, category_tuple, remainder='drop')
preprocessor



In [17]:
# fit on train
preprocessor.fit(X_train)

In [18]:
# transform train and test
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# view results
# Check for missing values and that data is scaled and one-hot encoded
print(np.isnan(X_train_processed).sum().sum(), 'missing values in training data')
print(np.isnan(X_test_processed).sum().sum(), 'missing values in testing data')
print('\n')
print('All data in X_train_processed are', X_train_processed.dtype)
print('All data in X_test_processed are', X_test_processed.dtype)
print('\n')
print('shape of data is', X_train_processed.shape)
print('\n')



0 missing values in training data
0 missing values in testing data


All data in X_train_processed are float64
All data in X_test_processed are float64


shape of data is (6392, 25)




## Creating a Linear Regression Model

In [19]:
# Create Model Pipeline
reg = LinearRegression()
reg_pipe = make_pipeline(preprocessor, reg)

In [20]:
# Model is learning the relationship between X and y
reg_pipe.fit(X_train,y_train)

train_pred = reg_pipe.predict(X_train)
test_pred = reg_pipe.predict(X_test)


In [21]:
# finding MAE, MSE, RMSE and R2 on the model for both the train and test data
def eval_regression(true, pred):
  mae = mean_absolute_error(true, pred)
  mse = mean_squared_error(true, pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(true, pred)

  print(f'MAE {mae},\n MSE {mse},\n RMSE: {rmse},\n R^2: {r2} ')

eval_regression(y_train, train_pred)

MAE 960.8028167891905,
 MSE 1674600.7758017518,
 RMSE: 1294.0636675997637,
 R^2: 0.434152444572693 


In [22]:
eval_regression(y_test, test_pred)

MAE 941.0491428975531,
 MSE 1595942.1489181314,
 RMSE: 1263.306039294569,
 R^2: 0.42154574596484584 


## Creating a Regression Tree Model

In [23]:
dec_tree = DecisionTreeRegressor(max_depth=None)
# put the model in a pipeline with the preprocessor
tree_pipe = make_pipeline(preprocessor, dec_tree)
# fit the model on the training data
tree_pipe.fit(X_train, y_train)
# evaluate the model on both the training and the testing data.
print('Train Data')
eval_regression(y_train, tree_pipe.predict(X_train))
print('\nTest Data')
eval_regression(y_test, tree_pipe.predict(X_test))

Train Data
MAE 1.0671480386885109e-16,
 MSE 2.4264137179864312e-29,
 RMSE: 4.925864104892086e-15,
 R^2: 1.0 

Test Data
MAE 1035.5211457531673,
 MSE 2229925.6258294587,
 RMSE: 1493.2935497849908,
 R^2: 0.19175643971965572 


In [24]:
tree_pipe

In [25]:
dec_tree.get_depth()

40

In [26]:
# create a range of max_depth values to loop over
depths = range(1, 41)
# create a dataframe to store train and test scores.
scores = pd.DataFrame(columns=['Train','Test'], index=depths)
# loop over the values in the depths range 
for depth in depths:
  # fit a new model with the depth
  dec = DecisionTreeRegressor(max_depth=depth)
  # put model in a pipeline
  dec_pipe = make_pipeline(preprocessor, dec)
  # fit the model
  dec_pipe.fit(X_train, y_train)
  # create prediction arrays
  train_pred = dec_pipe.predict(X_train)
  test_pred = dec_pipe.predict(X_test)

  # evaluate the model using R2 score
  train_r2 = r2_score(y_train, train_pred)
  test_r2 = r2_score(y_test, test_pred)
  # store the scores in a dataframe
  scores.loc[depth, 'Train'] = train_r2
  scores.loc[depth, 'Test'] = test_r2

In [27]:
scores.sort_values(by='Test', ascending=False)

Unnamed: 0,Train,Test
5,0.60394,0.59471
4,0.582625,0.584005
6,0.615072,0.582639
7,0.626454,0.576838
8,0.642714,0.558973
9,0.663832,0.544385
3,0.524218,0.524222
10,0.684032,0.522842
11,0.709827,0.49647
12,0.734407,0.475438


In [28]:
# Let's run the model with our optimized value for max_depth
opt_dec = DecisionTreeRegressor(max_depth = 5, random_state = 42)
opt_dec.fit(X_train_processed, y_train)

train_5_score = opt_dec.score(X_train_processed, y_train)
test_5_score = opt_dec.score(X_test_processed, y_test)
print(train_5_score)
print(test_5_score)

0.6039397477322958
0.5947099753159973


In [29]:
eval_regression(y_train, train_pred)

MAE 5.3357401934425544e-17,
 MSE 6.066034294966078e-30,
 RMSE: 2.462932052446043e-15,
 R^2: 1.0 


In [30]:
eval_regression(y_test, test_pred)

MAE 1050.6361492257156,
 MSE 2275450.1218106435,
 RMSE: 1508.4595194471224,
 R^2: 0.17525594289339286 


- In this case I don't see any improvent over the Linear regression Model and I would use that instead