<a href="https://colab.research.google.com/github/ruelanthonyb/sales-predictions/blob/main/Sales_Prediction_Project1_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the dataset path under
# /content/drive/MyDrive (used when loading the CSV) resolves in this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Location of the sales CSV on the mounted Google Drive
path = '/content/drive/MyDrive/Coding Dojo PH/DS_Bootcamp/ML Fundamentals/Wk6/assignments/datasets/sales_predictions_2023.csv'

# Read the file into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Column dtypes and non-null counts.
# df.info() writes its summary to stdout itself and returns None, so it is called
# directly rather than wrapped in print() (which would emit a stray "None").
df.info()
print()  # blank line separating the two reports

# Number of missing values per column
print(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB
None 

Item_Identifier                 0
Item

In [None]:
# Confirm the (rows, columns) dimensions of the loaded dataset.
df.shape

(8523, 12)

Identify the features (X) and target (y): Assign the "Item_Outlet_Sales" column as your target and the rest of the relevant variables as your features matrix

In [None]:
# Features matrix: every column except the target and the product identifier.
# Item_Identifier holds product ID strings (e.g. 'FDA15'); if it stays in X it is
# picked up with the other object columns and one-hot encoded into one dummy
# column per distinct ID, which adds width and noise rather than generalizable signal.
X = df.drop(columns=['Item_Outlet_Sales', 'Item_Identifier'])

# Target: the sales value the models are trained to predict
y = df['Item_Outlet_Sales']

Perform a train test split

In [None]:
# Hold out a test split; the fixed seed keeps the partition reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

Create a preprocessing pipeline to prepare the dataset for Machine Learning

In [None]:
# Dtype-based column selectors: one for numeric columns, one for object (string) columns
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [None]:
# Numeric branch: fill missing values with the column mean, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: label missing values as 'missing', then one-hot encode
# (handle_unknown='ignore' tolerates categories not seen during fitting)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

In [None]:
# Route each column group through its matching transformer
column_routes = [
    ('num', numeric_transformer, num_selector),
    ('cat', categorical_transformer, cat_selector),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Display the assembled preprocessor
preprocessor

# 1. Build a linear regression model to predict sales

Build a linear regression model

In [None]:
# Linear regression model: preprocessing followed by the regressor
linear_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
linear_pipeline = Pipeline(steps=linear_steps)

# Fit on the training split only
linear_pipeline.fit(X_train, y_train)

Evaluate the performance of your model based on r^2

In [None]:
# Predict on the held-out test split and score with R^2
y_pred_linear = linear_pipeline.predict(X_test)
r2_linear = r2_score(y_true=y_test, y_pred=y_pred_linear)

print(f'Linear Regression R^2: {r2_linear}')

Linear Regression R^2: 0.3830534897083434


Evaluate the performance of your model based on rmse

In [None]:
# Evaluation: RMSE is the square root of the mean squared error on the test split.
# The root is taken explicitly because mean_squared_error's `squared=False` keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))

print(f'Linear Regression RMSE: {rmse_linear}')

Linear Regression RMSE: 1304.6614157267989


# 2. Build a regression tree model to predict sales

Build a simple regression tree model

In [None]:
# Regression tree model: preprocessing followed by a seeded decision tree
tree_steps = [
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42)),
]
tree_pipeline = Pipeline(steps=tree_steps)

# Fit on the training split only
tree_pipeline.fit(X_train, y_train)

Evaluate the performance of your model based on r^2

In [None]:
# Predict on the held-out test split and score with R^2
y_pred_tree = tree_pipeline.predict(X_test)
r2_tree = r2_score(y_true=y_test, y_pred=y_pred_tree)

print(f'Regression Tree R^2: {r2_tree}')

Regression Tree R^2: 0.2253710899600514


Evaluate the performance of your model based on rmse

In [None]:
# Evaluation: RMSE is the square root of the mean squared error on the test split.
# The root is taken explicitly because mean_squared_error's `squared=False` keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_tree = np.sqrt(mean_squared_error(y_test, y_pred_tree))

print(f'Regression Tree RMSE: {rmse_tree}')

Regression Tree RMSE: 1461.9109306926473


# 3. Determine which model to implement

Overall, which model do you recommend?

In [None]:
# A model is recommended only when it is better on BOTH test metrics:
# strictly higher R^2 and strictly lower RMSE.
linear_wins = r2_linear > r2_tree and rmse_linear < rmse_tree
tree_wins = r2_tree > r2_linear and rmse_tree < rmse_linear

if linear_wins:
    print("I recommend the Linear Regression model.")
elif tree_wins:
    print("I recommend the Regression Tree model.")
else:
    # Metrics disagree or tie, so neither model dominates
    print("Both models have their strengths. Consider ensembling them or further tuning.")

I recommend the Linear Regression model.


**Justification:**

The recommendation is based on which model achieves a higher R^2 and a lower RMSE on the test data: R^2 measures the proportion of variance in sales that the model explains, and RMSE measures the typical size of the prediction error in the same units as sales.