# Module 01
**Demand Forecasting and Inventory Optimization**
**Objectives:**

Reduce overstocking by 20% and the stockout rate by 15% within 6 months.

- Target Variable for Inventory Optimization: **Stock_Quantity**

- Target Variable for Demand Forecasting: **Sales_Volume**

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.io as pio

from sklearn.preprocessing import OrdinalEncoder

from smart_supply_chain_ai.data_processing import get_data

import warnings
warnings.filterwarnings('ignore')

# Define plotly template and figure dimensions
pio.templates.default = "plotly_white"
px.defaults.width = 800
px.defaults.height = 600

# Pandas show all columns
pd.set_option('display.max_columns', None)

### Get Data

In [2]:
# Paths
raw_data_path = os.path.join('../data', 'raw')
processed_data_path = os.path.join('../data', 'processed')

In [3]:
# Kaggle dataset identifier, written as "<user>/<dataset-name>"
module_one = "salahuddinahmedshuvo/grocery-inventory-and-sales-dataset"
# Download the dataset into the raw-data folder with the project helper
# (its log output reports the download, unzip and cleanup steps)
get_data.download_kaggle_dataset(module_one, raw_data_path)

Starting the download of dataset 'salahuddinahmedshuvo/grocery-inventory-and-sales-dataset' from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/salahuddinahmedshuvo/grocery-inventory-and-sales-dataset
Download, unzipping, and cleanup complete! The dataset was saved to: ../data/raw


In [4]:
# Load the raw CSV; the file path is built with os.path.join, the same way the
# data folders are built, instead of concatenating strings with a hard-coded '/'
df_raw = pd.read_csv(os.path.join(raw_data_path, 'Grocery_Inventory_and_Sales_Dataset.csv'))

## EDA - Exploratory Data Analysis

In [5]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Product_ID               990 non-null    object
 1   Product_Name             990 non-null    object
 2   Catagory                 989 non-null    object
 3   Supplier_ID              990 non-null    object
 4   Supplier_Name            990 non-null    object
 5   Stock_Quantity           990 non-null    int64 
 6   Reorder_Level            990 non-null    int64 
 7   Reorder_Quantity         990 non-null    int64 
 8   Unit_Price               990 non-null    object
 9   Date_Received            990 non-null    object
 10  Last_Order_Date          990 non-null    object
 11  Expiration_Date          990 non-null    object
 12  Warehouse_Location       990 non-null    object
 13  Sales_Volume             990 non-null    int64 
 14  Inventory_Turnover_Rate  990 non-null    i

In [6]:
# Data dictionary: maps each column name to a short description of its meaning.
# New entries are added to it as derived columns are created later on.
column_inventory = {'Product_ID': 'Unique identifier for each product.',
'Product_Name': 'Name of the product.',
'Category': 'The product category (e.g., Grains & Pulses, Beverages, Fruits & Vegetables).',
'Supplier_ID': 'Unique identifier for the product supplier.',
'Supplier_Name': 'Name of the supplier.',
'Stock_Quantity': 'The current stock level of the product in the warehouse.',
'Reorder_Level': 'The stock level at which new stock should be ordered.',
'Reorder_Quantity': 'The quantity of product to order when the stock reaches the reorder level.',
'Unit_Price': 'Price per unit of the product.',
'Date_Received': 'The date the product was received into the warehouse.',
'Last_Order_Date': 'The last date the product was ordered.',
'Expiration_Date': 'The expiration date of the product, if applicable.',
'Warehouse_Location': 'The warehouse address where the product is stored.',
'Sales_Volume': 'The total number of units sold.',
'Inventory_Turnover_Rate': 'The rate at which the product sells and is replenished.',
'Status': 'Current status of the product (e.g., Active, Discontinued, Backordered).'}

In [7]:
# make a copy
df = df_raw.copy()

In [8]:
# Fix the misspelled column name that comes with the raw file; reassigning
# instead of using inplace=True keeps the step explicit
df = df.rename(columns={"Catagory": "Category"})

In [9]:
df[df['Category'].isna()]

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Last_Order_Date,Expiration_Date,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status
685,10-378-9729,Cabbage,,83-941-9620,Rooxo,69,21,68,$66.55,12/23/2024,11/26/2024,9/21/2024,2 Butterfield Pass,36,35,Discontinued


In [10]:
df.Category.unique()

array(['Grains & Pulses', 'Beverages', 'Fruits & Vegetables',
       'Oils & Fats', 'Dairy', 'Bakery', 'Seafood', nan], dtype=object)

### Missing Values

In [11]:
# I decided to categorize cabbage under 'Fruits & Vegetables' after researching online.
# Only the Category column is filled: a frame-wide fillna would also write the
# text 'Fruits & Vegetables' into any other column that happened to hold a NaN.
df['Category'] = df['Category'].fillna('Fruits & Vegetables')

### Types

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Product_ID               990 non-null    object
 1   Product_Name             990 non-null    object
 2   Category                 990 non-null    object
 3   Supplier_ID              990 non-null    object
 4   Supplier_Name            990 non-null    object
 5   Stock_Quantity           990 non-null    int64 
 6   Reorder_Level            990 non-null    int64 
 7   Reorder_Quantity         990 non-null    int64 
 8   Unit_Price               990 non-null    object
 9   Date_Received            990 non-null    object
 10  Last_Order_Date          990 non-null    object
 11  Expiration_Date          990 non-null    object
 12  Warehouse_Location       990 non-null    object
 13  Sales_Volume             990 non-null    int64 
 14  Inventory_Turnover_Rate  990 non-null    i

In [13]:
df.head()

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Last_Order_Date,Expiration_Date,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status
0,29-205-1132,Sushi Rice,Grains & Pulses,38-037-1699,Jaxnation,22,72,70,$4.50,8/16/2024,6/29/2024,9/19/2024,48 Del Sol Trail,32,19,Discontinued
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,45,77,2,$20.00,11/1/2024,5/29/2024,5/8/2024,36 3rd Place,85,1,Discontinued
2,06-955-3428,Black Rice,Grains & Pulses,54-031-2945,Vinder,30,38,83,$6.00,8/3/2024,6/10/2024,9/22/2024,3296 Walton Court,31,34,Backordered
3,71-594-6552,Long Grain Rice,Grains & Pulses,63-492-7603,Brightbean,12,59,62,$1.50,12/8/2024,2/19/2025,4/17/2024,3 Westerfield Crossing,95,99,Active
4,57-437-1828,Plum,Fruits & Vegetables,54-226-4308,Topicstorm,37,30,74,$4.00,7/3/2024,10/11/2024,10/5/2024,15068 Scoville Court,62,25,Backordered


In [14]:
# Store the low-cardinality text columns with the pandas 'category' dtype
cat_columns = ['Category', 'Status']
for cat_col in cat_columns:
    df[cat_col] = df[cat_col].astype('category')

In [15]:
# Parse the date columns; values that cannot be parsed become NaT (errors='coerce')
date_columns = ['Date_Received', 'Last_Order_Date', 'Expiration_Date']
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Product_ID               990 non-null    object        
 1   Product_Name             990 non-null    object        
 2   Category                 990 non-null    category      
 3   Supplier_ID              990 non-null    object        
 4   Supplier_Name            990 non-null    object        
 5   Stock_Quantity           990 non-null    int64         
 6   Reorder_Level            990 non-null    int64         
 7   Reorder_Quantity         990 non-null    int64         
 8   Unit_Price               990 non-null    object        
 9   Date_Received            990 non-null    datetime64[ns]
 10  Last_Order_Date          990 non-null    datetime64[ns]
 11  Expiration_Date          990 non-null    datetime64[ns]
 12  Warehouse_Location       990 non-nul

In [17]:
# Strip the currency symbol and convert the price to float.
# regex=False makes '$' an explicit literal match, independent of the default
# of the installed pandas version ('$' is also a regular-expression anchor).
# astype(str) keeps the cell re-runnable once the column is already float.
df['Unit_Price'] = (
    df['Unit_Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .astype('float')
)

In [18]:
# Numeric columns statistics
df.describe(exclude=['datetime', 'object', 'category'])

Unnamed: 0,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Sales_Volume,Inventory_Turnover_Rate
count,990.0,990.0,990.0,990.0,990.0,990.0
mean,55.609091,51.215152,51.913131,5.924192,58.925253,50.150505
std,26.300775,29.095241,29.521059,6.49128,23.002318,28.798954
min,10.0,1.0,1.0,0.2,20.0,1.0
25%,33.0,25.25,25.0,2.5,39.0,25.0
50%,56.0,53.0,54.0,4.225,58.0,50.0
75%,79.0,77.0,77.0,7.0,78.0,74.75
max,100.0,100.0,100.0,98.43,100.0,100.0


In [19]:
# Categorical columns statistics
df.describe(include=['category'])

Unnamed: 0,Category,Status
count,990,990
unique,7,3
top,Fruits & Vegetables,Discontinued
freq,332,333


In [20]:
# String columns statistics
df.describe(include=['object'])

Unnamed: 0,Product_ID,Product_Name,Supplier_ID,Supplier_Name,Warehouse_Location
count,990,990,990,990,990
unique,990,121,990,350,990
top,29-205-1132,Bread Flour,38-037-1699,Katz,48 Del Sol Trail
freq,1,19,1,12,1


In [21]:
# Date range of every date column.
# The selections are computed outside the f-strings: reusing the f-string's own
# quote character inside a replacement field is a SyntaxError before Python 3.12.
date_min = df[date_columns].min()
date_max = df[date_columns].max()
print(f'\tDate Min value\n\n{date_min}')
print(30 * '-')
print('')
print(f'\tDate Max value\n\n{date_max}')

	Date Min value

Date_Received     2024-02-25
Last_Order_Date   2024-02-25
Expiration_Date   2024-02-25
dtype: datetime64[ns]
------------------------------

	Date Max value

Date_Received     2025-02-24
Last_Order_Date   2025-02-24
Expiration_Date   2025-02-24
dtype: datetime64[ns]


In [22]:
df.head()

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Last_Order_Date,Expiration_Date,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status
0,29-205-1132,Sushi Rice,Grains & Pulses,38-037-1699,Jaxnation,22,72,70,4.5,2024-08-16,2024-06-29,2024-09-19,48 Del Sol Trail,32,19,Discontinued
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,45,77,2,20.0,2024-11-01,2024-05-29,2024-05-08,36 3rd Place,85,1,Discontinued
2,06-955-3428,Black Rice,Grains & Pulses,54-031-2945,Vinder,30,38,83,6.0,2024-08-03,2024-06-10,2024-09-22,3296 Walton Court,31,34,Backordered
3,71-594-6552,Long Grain Rice,Grains & Pulses,63-492-7603,Brightbean,12,59,62,1.5,2024-12-08,2025-02-19,2024-04-17,3 Westerfield Crossing,95,99,Active
4,57-437-1828,Plum,Fruits & Vegetables,54-226-4308,Topicstorm,37,30,74,4.0,2024-07-03,2024-10-11,2024-10-05,15068 Scoville Court,62,25,Backordered


In [23]:
df['Product_ID'].duplicated().sum()

np.int64(0)

In [24]:
df['Supplier_ID'].duplicated().sum()

np.int64(0)

In [25]:
df

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Last_Order_Date,Expiration_Date,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status
0,29-205-1132,Sushi Rice,Grains & Pulses,38-037-1699,Jaxnation,22,72,70,4.5,2024-08-16,2024-06-29,2024-09-19,48 Del Sol Trail,32,19,Discontinued
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,45,77,2,20.0,2024-11-01,2024-05-29,2024-05-08,36 3rd Place,85,1,Discontinued
2,06-955-3428,Black Rice,Grains & Pulses,54-031-2945,Vinder,30,38,83,6.0,2024-08-03,2024-06-10,2024-09-22,3296 Walton Court,31,34,Backordered
3,71-594-6552,Long Grain Rice,Grains & Pulses,63-492-7603,Brightbean,12,59,62,1.5,2024-12-08,2025-02-19,2024-04-17,3 Westerfield Crossing,95,99,Active
4,57-437-1828,Plum,Fruits & Vegetables,54-226-4308,Topicstorm,37,30,74,4.0,2024-07-03,2024-10-11,2024-10-05,15068 Scoville Court,62,25,Backordered
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,82-977-7752,Spinach,Fruits & Vegetables,57-473-8672,Shuffledrive,88,78,17,2.5,2024-09-06,2024-12-28,2024-11-04,58 Corscot Terrace,58,21,Active
986,62-393-9939,Cheddar Cheese,Dairy,93-877-9384,Gabcube,60,9,89,9.0,2024-06-01,2024-06-02,2024-10-05,5 Oxford Pass,95,63,Active
987,31-745-6850,Cabbage,Fruits & Vegetables,96-215-2767,Lajo,94,90,12,0.9,2024-10-03,2024-10-24,2024-11-01,081 Jana Lane,98,71,Active
988,86-692-2312,Avocado Oil,Oils & Fats,77-783-4107,Dazzlesphere,30,48,52,10.0,2024-06-11,2024-12-07,2024-04-30,00616 Manitowish Parkway,22,78,Active


### Analysis 

#### Univariate analysis

> Categoric

In [26]:
df.select_dtypes('category').head()

Unnamed: 0,Category,Status
0,Grains & Pulses,Discontinued
1,Beverages,Discontinued
2,Grains & Pulses,Backordered
3,Grains & Pulses,Active
4,Fruits & Vegetables,Backordered


In [27]:
fig = px.bar(df.sort_values(by='Category'), 'Category')
fig.show()

In [28]:
fig = px.bar(df, 'Status')
fig.show()

In [29]:
# Bar chart over Category for the rows whose Status is not "Discontinued"
# (title typo "Produts" corrected)
fig = px.bar(
    df.query('Status != "Discontinued"').sort_values(by='Category'),
    'Category',
    title='Products Status Active or Backordered',
)
fig.show()

In [30]:
# Bar chart over Category for the rows whose Status is "Discontinued"
# (title typo "Produts" corrected)
fig = px.bar(
    df.query('Status == "Discontinued"').sort_values(by='Category'),
    'Category',
    title='Products Status Discontinued',
)
fig.show()

> Numeric

In [31]:
# Split the products in two frames by status: discontinued rows go to
# `inactive_`, every other row to `active_`; both get a fresh 0..n-1 index
is_discontinued = df['Status'] == "Discontinued"
active_ = df[~is_discontinued].reset_index(drop=True)
inactive_ = df[is_discontinued].reset_index(drop=True)

In [32]:
df.select_dtypes(['int', 'float', 'object']).head()

Unnamed: 0,Product_ID,Product_Name,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate
0,29-205-1132,Sushi Rice,38-037-1699,Jaxnation,22,72,70,4.5,48 Del Sol Trail,32,19
1,40-681-9981,Arabica Coffee,54-470-2479,Feedmix,45,77,2,20.0,36 3rd Place,85,1
2,06-955-3428,Black Rice,54-031-2945,Vinder,30,38,83,6.0,3296 Walton Court,31,34
3,71-594-6552,Long Grain Rice,63-492-7603,Brightbean,12,59,62,1.5,3 Westerfield Crossing,95,99
4,57-437-1828,Plum,54-226-4308,Topicstorm,37,30,74,4.0,15068 Scoville Court,62,25


In [33]:
px.box(active_.sort_values(by='Category'), y='Sales_Volume', x='Category', title="Active Products")

In [34]:
px.box(inactive_.sort_values(by='Category'), y='Sales_Volume', x='Category', title="Inactive Products")

In [35]:
px.box(active_.sort_values(by='Category'), y='Stock_Quantity', x='Category', title="Active Products in Stock")

In [36]:
px.box(inactive_.sort_values(by='Category'), y='Stock_Quantity', x='Category', title="Inactive Products in Stock")

In [37]:
# Inactive Products in stock
inactive_.groupby('Product_Name', as_index=False)['Stock_Quantity'].sum().sort_values('Stock_Quantity', ascending=False)

Unnamed: 0,Product_Name,Stock_Quantity
16,Bread Flour,448
2,Anchovies,439
11,Bell Pepper,424
17,Broccoli,358
39,Egg (Goose),354
...,...,...
67,Mushrooms,34
42,Eggplant,31
27,Cherry,29
82,Potato,26


In [38]:
px.histogram(active_, 'Inventory_Turnover_Rate', title='Active Products: Turnover Rate in 1 year')

In [39]:
fig = px.histogram(inactive_, 'Inventory_Turnover_Rate', title='Inactive Products: Turnover Rate in 1 year')
fig.update_traces(marker=dict(color='green'))
fig.show()

##### Outliers

In [40]:
px.box(df, y=df.select_dtypes(['int', 'float']).columns, title='Box Plot')

In [41]:
def outliers(dataset_name: pd.DataFrame, column_name: str, factor: float = 1.5) -> pd.DataFrame:
    """
    Return the rows of a DataFrame whose value in one column is an IQR outlier.

    A value is an outlier when it lies above ``Q3 + factor * IQR`` or below
    ``Q1 - factor * IQR``, where Q1 and Q3 are the 25th and 75th percentiles
    of the column and ``IQR = Q3 - Q1``.

    Args:
        dataset_name (pd.DataFrame): The dataset to inspect.
        column_name (str): The name of the numeric column to test.
        factor (float): Multiplier applied to the IQR to place the fences.
            Defaults to 1.5.

    Returns:
        pd.DataFrame: The rows of ``dataset_name`` that fall outside the
        fences, keeping their original index; empty when there are none.
    """
    values = dataset_name[column_name]

    # Interquartile range of the selected column
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    # Fences beyond which a value is treated as an outlier
    upper_limit = Q3 + factor * IQR
    under_limit = Q1 - factor * IQR

    return dataset_name[(values > upper_limit) | (values < under_limit)]

In [42]:
# Unit_Price Outliers
outliers_price = outliers(df, 'Unit_Price')
outliers_price

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Last_Order_Date,Expiration_Date,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,45,77,2,20.0,2024-11-01,2024-05-29,2024-05-08,36 3rd Place,85,1,Discontinued
18,48-414-6162,Arabica Coffee,Beverages,10-060-8515,Skivee,55,93,84,20.0,2024-05-29,2024-11-08,2024-05-05,17780 Lindbergh Junction,46,84,Active
33,79-136-9840,Herbal Tea,Beverages,07-809-0926,Pixonyx,77,45,75,30.0,2024-11-05,2024-08-19,2024-06-23,26175 Oxford Way,25,37,Discontinued
42,21-816-1004,Black Coffee,Beverages,68-482-4786,Realpoint,84,13,88,15.0,2025-01-26,2024-12-04,2024-07-22,564 Hazelcrest Crossing,25,34,Backordered
65,14-844-4138,Black Coffee,Beverages,90-534-2165,Flipbug,61,43,29,15.0,2025-02-03,2024-10-25,2024-03-03,40643 Sullivan Lane,67,6,Discontinued
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870,59-725-4038,Arabica Coffee,Beverages,39-055-1703,Zoozzy,78,34,39,20.5,2024-11-13,2024-08-26,2024-08-23,697 Hoepker Center,62,2,Active
900,27-387-0428,Halibut,Seafood,62-312-2748,Camido,88,42,68,20.0,2024-06-23,2024-11-29,2024-04-09,1715 Dottie Pass,92,29,Active
905,19-323-0506,Halibut,Seafood,28-977-3789,Skidoo,68,43,96,20.0,2024-11-17,2024-04-28,2024-10-15,5 Ohio Lane,75,35,Backordered
955,31-403-6234,Tuna,Seafood,00-258-2525,Thoughtbeat,87,24,66,18.0,2024-05-20,2024-11-24,2024-05-04,35 Waxwing Center,22,34,Active


In [43]:
fig_facet = px.box(
    outliers_price,
    x='Product_Name',
    y='Sales_Volume',
    # facet_col='Category',
    title='Outlier Analysis: Sales Volume Distribution for Product Name (Based on Unit Price Outliers)',
    # facet_col_wrap=2
)
fig_facet.update_traces(marker={"color": "green"})
fig_facet.show()

Certain product categories, including beverages, seafood, and fresh produce, are exhibiting price outliers, possibly influenced by seasonal trends and inherent product attributes.

#### Bivariate analysis

In [44]:
fig = px.scatter(df, 'Stock_Quantity', 'Sales_Volume', title="Sales Volume x Stock Quantity")
fig.update_traces(marker=dict(color='blue'))
fig.show()

In [45]:
# Sales volume against inventory turnover rate (title typo "Invetory" corrected)
fig = px.scatter(df, 'Inventory_Turnover_Rate', 'Sales_Volume', title="Sales Volume x Inventory Turnover Rate")
fig.update_traces(marker=dict(color='green'))
fig.show()

In [46]:
# Reorder level against inventory turnover rate (title typo "Invetory" corrected)
fig = px.scatter(df, 'Inventory_Turnover_Rate', 'Reorder_Level', title="Reorder Level x Inventory Turnover Rate")
fig.update_traces(marker=dict(color='orange'))
fig.show()

As expected, the data is scattered; we can use clustering techniques or an ABC curve for the analysis.

In [47]:
df.head()

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Last_Order_Date,Expiration_Date,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status
0,29-205-1132,Sushi Rice,Grains & Pulses,38-037-1699,Jaxnation,22,72,70,4.5,2024-08-16,2024-06-29,2024-09-19,48 Del Sol Trail,32,19,Discontinued
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,45,77,2,20.0,2024-11-01,2024-05-29,2024-05-08,36 3rd Place,85,1,Discontinued
2,06-955-3428,Black Rice,Grains & Pulses,54-031-2945,Vinder,30,38,83,6.0,2024-08-03,2024-06-10,2024-09-22,3296 Walton Court,31,34,Backordered
3,71-594-6552,Long Grain Rice,Grains & Pulses,63-492-7603,Brightbean,12,59,62,1.5,2024-12-08,2025-02-19,2024-04-17,3 Westerfield Crossing,95,99,Active
4,57-437-1828,Plum,Fruits & Vegetables,54-226-4308,Topicstorm,37,30,74,4.0,2024-07-03,2024-10-11,2024-10-05,15068 Scoville Court,62,25,Backordered


## Feature Engineering

In [48]:
# Stock cover: 365 divided by the turnover rate, rounded down to whole days
column_inventory['Stock_Coverage'] = 'How many days your current inventory can last.'
days_per_turn = 365 / df['Inventory_Turnover_Rate']
df['Stock_Coverage'] = np.floor(days_per_turn).astype('int')

In [49]:
fig = px.scatter(df, 'Stock_Quantity', 'Stock_Coverage', title='Stock Cover in days')
fig.show()

In [50]:
# Share of each row in the total sales volume, as a percentage
column_inventory['Sales_Volume(%)'] = 'The percentage of total sales volume represented by this product.'
total_sales_volume = df['Sales_Volume'].sum()
df['Sales_Volume(%)'] = df['Sales_Volume'].div(total_sales_volume).mul(100)

The percentage value of the inventory gives a sense of the product's impact on the stock.

In [51]:
# Monetary value of the current stock and of one reorder (quantity * unit price)
column_inventory['Stock_Value'] = 'The total monetary value of the current stock (Stock Quantity * Unit Price).'
column_inventory['Reorder_Value'] = 'The total monetary value of the reorder quantity (Reorder Quantity * Unit Price).'
unit_price = df['Unit_Price']
df['Stock_Value'] = df['Stock_Quantity'].mul(unit_price)
df['Reorder_Value'] = df['Reorder_Quantity'].mul(unit_price)

In [52]:
fig = px.histogram(df.query('Status != "Discontinued"'), x='Category', y=['Stock_Value', 'Reorder_Value'], barmode='group' , title='Stock vs. Reorder Value')
fig.show()

The graph above shows a mismatch between purchase (reorder) value and inventory value; with the discontinued products omitted, the relationship between the two has deteriorated.
A higher inventory value than the purchase value may indicate low sales, excess purchases, or obsolete products.

In [53]:
# Data-dictionary entries for the derived columns created in the next cells.
# The descriptions state what the code actually computes: Purchase_Order is an
# Active/Inactive flag (not an order identifier), the day counts are measured
# from Date_Received, and Expiration_Status takes the values Expired/Nearing/Safe.
column_inventory.update({
    'Stock_Value(%)': 'The percentage of the total inventory value represented by this product.',
    'Reorder_Value(%)': 'The percentage of the total inventory reorder value represented by this product.',
    'LeadTime(days)': 'Days between the last order date and the date received (Date_Received - Last_Order_Date); negative when the last order is more recent than the date received.',
    'Purchase_Order': "Purchase order flag: 'Active' when LeadTime(days) is negative, otherwise 'Inactive'.",
    'Days_For_Expiration': 'Days between the date received and the expiration date (Expiration_Date - Date_Received); negative when the expiration date is before the date received.',
    'Expiration_Status': "Expiration bucket based on Days_For_Expiration: 'Expired' (below 0), 'Nearing' (below 30) or 'Safe'.",
})

In [54]:
# Share of each row in the total stock value, as a percentage
total_stock_value = df['Stock_Value'].sum()
df['Stock_Value(%)'] = df['Stock_Value'].div(total_stock_value).mul(100)

# Share of each row in the total reorder value, as a percentage
total_reorder_value = df['Reorder_Value'].sum()
df['Reorder_Value(%)'] = df['Reorder_Value'].div(total_reorder_value).mul(100)

# Lead time: whole days from the last order date to the date received
received_minus_ordered = df['Date_Received'] - df['Last_Order_Date']
df['LeadTime(days)'] = received_minus_ordered.dt.days

In [55]:
# Purchase order flag derived from the lead time: rows with a negative lead time
# (last order date later than the date received) are 'Active', every other row
# is 'Inactive'.
# NOTE(review): presumably a negative lead time means an order is still open — confirm.
df['Purchase_Order'] = np.where(df['LeadTime(days)'] < 0, 'Active', 'Inactive')

This gives a more direct and clear view of the number of days between the date of receipt and the product's expiration date.
How many days can we keep the product in stock?

In [56]:
# Whole days from the date received to the expiration date
# (negative when the expiration date comes first)
receipt_to_expiration = df['Expiration_Date'] - df['Date_Received']
df['Days_For_Expiration'] = receipt_to_expiration.dt.days

Categories were created for product validity, based on attention to products with less than 30 days until expiration. The value can be adjusted according to business needs. These products deserve special attention, potentially generating a marketing campaign or promotion to avoid waste. Expired products need to be removed and an analysis conducted. For example: poorly sized quantities, market changes, seasonality, etc.

In [57]:
# Bucket each row by its days to expiration:
# below 0 -> 'Expired', below 30 -> 'Nearing', anything else -> 'Safe'
days_left = df['Days_For_Expiration']
not_expired_label = np.where(days_left < 30, 'Nearing', 'Safe')
df['Expiration_Status'] = np.where(days_left < 0, 'Expired', not_expired_label)

In [58]:
# Store the two derived label columns with the 'category' dtype
for label_col in ('Expiration_Status', 'Purchase_Order'):
    df[label_col] = df[label_col].astype('category')

In [59]:
fig = px.bar(df, 'Expiration_Status')
fig.show()

Expired products in the supply chain lead to direct financial losses, increased operational costs for disposal, and wasted storage space.  
They also severely damage brand reputation and expose the company to health risks, highlighting failures in supply chain management.

In [60]:
df.head()

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Last_Order_Date,Expiration_Date,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status,Stock_Coverage,Sales_Volume(%),Stock_Value,Reorder_Value,Stock_Value(%),Reorder_Value(%),LeadTime(days),Purchase_Order,Days_For_Expiration,Expiration_Status
0,29-205-1132,Sushi Rice,Grains & Pulses,38-037-1699,Jaxnation,22,72,70,4.5,2024-08-16,2024-06-29,2024-09-19,48 Del Sol Trail,32,19,Discontinued,19,0.054855,99.0,315.0,0.029761,0.103115,48,Inactive,34,Safe
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,45,77,2,20.0,2024-11-01,2024-05-29,2024-05-08,36 3rd Place,85,1,Discontinued,365,0.145708,900.0,40.0,0.270551,0.013094,156,Inactive,-177,Expired
2,06-955-3428,Black Rice,Grains & Pulses,54-031-2945,Vinder,30,38,83,6.0,2024-08-03,2024-06-10,2024-09-22,3296 Walton Court,31,34,Backordered,10,0.05314,180.0,498.0,0.05411,0.163019,54,Inactive,50,Safe
3,71-594-6552,Long Grain Rice,Grains & Pulses,63-492-7603,Brightbean,12,59,62,1.5,2024-12-08,2025-02-19,2024-04-17,3 Westerfield Crossing,95,99,Active,3,0.16285,18.0,93.0,0.005411,0.030443,-73,Active,-235,Expired
4,57-437-1828,Plum,Fruits & Vegetables,54-226-4308,Topicstorm,37,30,74,4.0,2024-07-03,2024-10-11,2024-10-05,15068 Scoville Court,62,25,Backordered,14,0.106281,148.0,296.0,0.044491,0.096895,-100,Active,94,Safe


In [61]:
df.select_dtypes(['category', 'object'])

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Warehouse_Location,Status,Purchase_Order,Expiration_Status
0,29-205-1132,Sushi Rice,Grains & Pulses,38-037-1699,Jaxnation,48 Del Sol Trail,Discontinued,Inactive,Safe
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,36 3rd Place,Discontinued,Inactive,Expired
2,06-955-3428,Black Rice,Grains & Pulses,54-031-2945,Vinder,3296 Walton Court,Backordered,Inactive,Safe
3,71-594-6552,Long Grain Rice,Grains & Pulses,63-492-7603,Brightbean,3 Westerfield Crossing,Active,Active,Expired
4,57-437-1828,Plum,Fruits & Vegetables,54-226-4308,Topicstorm,15068 Scoville Court,Backordered,Active,Safe
...,...,...,...,...,...,...,...,...,...
985,82-977-7752,Spinach,Fruits & Vegetables,57-473-8672,Shuffledrive,58 Corscot Terrace,Active,Active,Safe
986,62-393-9939,Cheddar Cheese,Dairy,93-877-9384,Gabcube,5 Oxford Pass,Active,Active,Safe
987,31-745-6850,Cabbage,Fruits & Vegetables,96-215-2767,Lajo,081 Jana Lane,Active,Active,Nearing
988,86-692-2312,Avocado Oil,Oils & Fats,77-783-4107,Dazzlesphere,00616 Manitowish Parkway,Active,Active,Expired


In [62]:
# Encode non-numeric variables as ordinal codes.
# NOTE(review): the codes carry no real ranking for nominal columns such as
# product or supplier names — read rank correlations on them with care.
encoder = OrdinalEncoder()

In [63]:
# Columns Encoding
col_enc = ['Product_Name', 'Category', 'Supplier_Name', 'Status', 'Purchase_Order', 'Expiration_Status']

In [64]:
# Encode data
data_encoded = encoder.fit_transform(df[col_enc])

In [65]:
# Name of every encoded column: the original column name plus '_encoded'
suffix = '_encoded'
new_columns = [f'{col}{suffix}' for col in col_enc]

In [66]:
# Include encoded data
df[new_columns] = data_encoded

In [67]:
df

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Last_Order_Date,Expiration_Date,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status,Stock_Coverage,Sales_Volume(%),Stock_Value,Reorder_Value,Stock_Value(%),Reorder_Value(%),LeadTime(days),Purchase_Order,Days_For_Expiration,Expiration_Status,Product_Name_encoded,Category_encoded,Supplier_Name_encoded,Status_encoded,Purchase_Order_encoded,Expiration_Status_encoded
0,29-205-1132,Sushi Rice,Grains & Pulses,38-037-1699,Jaxnation,22,72,70,4.5,2024-08-16,2024-06-29,2024-09-19,48 Del Sol Trail,32,19,Discontinued,19,0.054855,99.0,315.0,0.029761,0.103115,48,Inactive,34,Safe,101.0,4.0,127.0,2.0,1.0,2.0
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,45,77,2,20.0,2024-11-01,2024-05-29,2024-05-08,36 3rd Place,85,1,Discontinued,365,0.145708,900.0,40.0,0.270551,0.013094,156,Inactive,-177,Expired,5.0,1.0,93.0,2.0,1.0,0.0
2,06-955-3428,Black Rice,Grains & Pulses,54-031-2945,Vinder,30,38,83,6.0,2024-08-03,2024-06-10,2024-09-22,3296 Walton Court,31,34,Backordered,10,0.053140,180.0,498.0,0.054110,0.163019,54,Inactive,50,Safe,13.0,4.0,298.0,1.0,1.0,2.0
3,71-594-6552,Long Grain Rice,Grains & Pulses,63-492-7603,Brightbean,12,59,62,1.5,2024-12-08,2025-02-19,2024-04-17,3 Westerfield Crossing,95,99,Active,3,0.162850,18.0,93.0,0.005411,0.030443,-73,Active,-235,Expired,64.0,4.0,27.0,0.0,0.0,0.0
4,57-437-1828,Plum,Fruits & Vegetables,54-226-4308,Topicstorm,37,30,74,4.0,2024-07-03,2024-10-11,2024-10-05,15068 Scoville Court,62,25,Backordered,14,0.106281,148.0,296.0,0.044491,0.096895,-100,Active,94,Safe,83.0,3.0,278.0,1.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,82-977-7752,Spinach,Fruits & Vegetables,57-473-8672,Shuffledrive,88,78,17,2.5,2024-09-06,2024-12-28,2024-11-04,58 Corscot Terrace,58,21,Active,17,0.099424,220.0,42.5,0.066135,0.013912,-113,Active,59,Safe,98.0,3.0,230.0,0.0,0.0,2.0
986,62-393-9939,Cheddar Cheese,Dairy,93-877-9384,Gabcube,60,9,89,9.0,2024-06-01,2024-06-02,2024-10-05,5 Oxford Pass,95,63,Active,5,0.162850,540.0,801.0,0.162330,0.262206,-1,Active,126,Safe,26.0,2.0,105.0,0.0,0.0,2.0
987,31-745-6850,Cabbage,Fruits & Vegetables,96-215-2767,Lajo,94,90,12,0.9,2024-10-03,2024-10-24,2024-11-01,081 Jana Lane,98,71,Active,5,0.167992,84.6,10.8,0.025432,0.003535,-21,Active,29,Nearing,22.0,3.0,148.0,0.0,0.0,1.0
988,86-692-2312,Avocado Oil,Oils & Fats,77-783-4107,Dazzlesphere,30,48,52,10.0,2024-06-11,2024-12-07,2024-04-30,00616 Manitowish Parkway,22,78,Active,4,0.037713,300.0,520.0,0.090184,0.170221,-179,Active,-42,Expired,8.0,5.0,57.0,0.0,0.0,0.0


In [79]:
col_matrix = ['Stock_Quantity', 'Reorder_Level', 'Reorder_Quantity', 'Unit_Price',
       'Sales_Volume', 'Inventory_Turnover_Rate', 'Stock_Coverage',
       'Stock_Value', 'Reorder_Value', 'LeadTime(days)', 'Days_For_Expiration',
       'Product_Name_encoded', 'Category_encoded', 'Supplier_Name_encoded',
       'Status_encoded', 'Purchase_Order_encoded',
       'Expiration_Status_encoded']

In [81]:
correlation_matrix = df[col_matrix].corr(method='spearman')

In [82]:
# Heatmap of the correlation matrix
imshow_options = dict(
    text_auto=True,                   # print each coefficient inside its cell
    aspect="auto",                    # adjust the aspect ratio of the plot
    color_continuous_scale='RdBu_r',  # color scale used for the coefficients
    title='Correlation Matrix',
)
fig = px.imshow(correlation_matrix, **imshow_options)

# Label both axes and use a larger figure than the notebook-wide default size
fig.update_layout(xaxis_title='Variables', yaxis_title='Variables', width=1600, height=1600)

fig.show()

# Continue from here
We have low correlation with the target variables (Sales_Volume and Stock_Quantity); use random forest and derived models.

### 1. Analysis Sales and Stock

In [None]:
# df_ano_atual.groupby("etapa", as_index=False).agg(
#     total_acum=("valor_total_previsto", "sum")
# )

In [83]:
# Total sales volume, stock quantity and stock value for every Category/Status
# pair, ordered from the highest to the lowest sales volume
sales_metrics = ['Sales_Volume', 'Stock_Quantity', 'Stock_Value']
df_sale_category = (
    df.groupby(by=['Category', 'Status'], as_index=False)[sales_metrics]
    .sum()
    .sort_values(by='Sales_Volume', ascending=False)
    .reset_index(drop=True)
)
df_sale_category

Unnamed: 0,Category,Status,Sales_Volume,Stock_Quantity,Stock_Value
0,Fruits & Vegetables,Backordered,6668,6436,26823.91
1,Fruits & Vegetables,Discontinued,6466,6198,36038.04
2,Fruits & Vegetables,Active,6263,5924,27763.16
3,Dairy,Discontinued,3728,3815,15443.7
4,Dairy,Backordered,3560,3304,16372.7
5,Dairy,Active,3484,3415,18785.55
6,Grains & Pulses,Active,3320,2776,11128.1
7,Grains & Pulses,Discontinued,2877,2661,9514.6
8,Grains & Pulses,Backordered,2689,2803,11326.5
9,Seafood,Active,2226,2238,25505.7


In [84]:
# Sales for category and status
fig = px.histogram(df, x='Category' , y='Sales_Volume', color='Status', barmode='group')
fig.show()

In [85]:
# Stock quantity by category, with one bar group per status
fig = px.histogram(df, x='Category' , y='Stock_Quantity', color='Status', barmode='group')
fig.show()

In [86]:
# Stock Value for category
fig = px.histogram(df, x='Category' , y='Stock_Value', color='Status', barmode='group')
fig.show()

In [87]:
# Sales volume across unit-price bins (this chart does not involve suppliers)
fig = px.histogram(df, x='Unit_Price', y='Sales_Volume')
fig.show()

In [None]:
df.columns

In [None]:
# Bubble chart of stock value against stock quantity (log-scaled x axis) for the
# rows that are not discontinued; bubble size follows Stock_Value and color
# follows the supplier name
not_discontinued = df.query("Status!='Discontinued'")
fig = px.scatter(
    not_discontinued,
    x="Stock_Quantity",
    y="Stock_Value",
    size="Stock_Value",
    color="Supplier_Name",
    log_x=True,
    size_max=60,
)
fig.show()

In [None]:
df.groupby(['Category'])['Product_ID'].count()

In [None]:
df.head()

In [None]:
df.groupby(['Category', 'Expiration_Status', 'Purchase_Order'])[['Sales_Volume', 'Stock_Value']].sum()

In [None]:
df.groupby(['Supplier_Name'])[['LeadTime(days)']].sum().sort_values(by='LeadTime(days)', ascending=False)

In [88]:
df[df['Supplier_Name'] == 'Gigashots']

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Last_Order_Date,Expiration_Date,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status,Stock_Coverage,Sales_Volume(%),Stock_Value,Reorder_Value,Stock_Value(%),Reorder_Value(%),LeadTime(days),Purchase_Order,Days_For_Expiration,Expiration_Status,Product_Name_encoded,Category_encoded,Supplier_Name_encoded,Status_encoded,Purchase_Order_encoded,Expiration_Status_encoded
258,19-214-5762,Pear,Fruits & Vegetables,88-108-3774,Gigashots,41,52,75,4.5,2024-04-06,2024-04-24,2024-10-13,7 Bonner Terrace,57,62,Backordered,5,0.09771,184.5,337.5,0.055463,0.11048,-18,Active,190,Safe,80.0,3.0,112.0,1.0,0.0,2.0
332,65-145-9672,Long Grain Rice,Grains & Pulses,65-068-1200,Gigashots,71,10,49,1.5,2024-08-11,2024-03-16,2024-05-26,8 Dovetail Junction,66,43,Active,8,0.113138,106.5,73.5,0.032015,0.02406,148,Inactive,-77,Expired,64.0,4.0,112.0,0.0,1.0,0.0
539,93-015-0811,Pineapple,Fruits & Vegetables,07-055-5188,Gigashots,18,7,58,3.5,2024-11-26,2024-08-05,2024-06-26,1 Pleasure Hill,88,96,Discontinued,3,0.15085,63.0,203.0,0.018939,0.066452,113,Inactive,-153,Expired,82.0,3.0,112.0,2.0,1.0,0.0
752,85-835-3445,Egg (Goose),Dairy,13-433-4930,Gigashots,50,93,37,2.45,2025-01-11,2024-03-02,2024-05-12,288 Fair Oaks Place,78,79,Backordered,4,0.133708,122.5,90.65,0.036825,0.029674,315,Inactive,-244,Expired,41.0,2.0,112.0,1.0,1.0,0.0
814,10-555-5971,Sourdough Bread,Bakery,30-410-3509,Gigashots,94,56,80,4.0,2025-01-04,2024-12-02,2024-09-27,8425 New Castle Parkway,47,96,Backordered,3,0.080568,376.0,320.0,0.11303,0.104751,33,Inactive,-99,Expired,97.0,0.0,112.0,1.0,1.0,0.0
829,40-126-0515,Carrot,Fruits & Vegetables,78-379-0369,Gigashots,74,37,57,1.5,2025-02-14,2024-03-04,2024-12-18,28351 Cascade Plaza,91,18,Active,20,0.155993,111.0,85.5,0.033368,0.027988,347,Inactive,-58,Expired,24.0,3.0,112.0,0.0,1.0,0.0


In [None]:
df[~(df['Status'] == 'Discontinued')]