## Data Generation for Grocery Supply Chain
### Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import json

from smart_supply_chain_ai.utils import create_data_functions, combine_df_functions, weather_conditions

import warnings
warnings.filterwarnings('ignore')

### Paths

In [2]:
# Define data paths
raw_data_path = os.path.join('../data', 'raw/')

external_data_path = os.path.join('../data', 'external/')

json_path = os.path.join('../src','smart_supply_chain_ai' , 'utils/')

In [3]:
# List of JSON filenames (without extension) to be loaded
arch_json = ['products','products_categories', 'suppliers']

# Dictionary to store the loaded JSON content
store_catalog = {}

# Loop through each filename, build the full path, and load the JSON data
for name in arch_json:
    file_path = os.path.join(json_path, f"{name}.json")  # Construct full file path
    with open(file_path, "r", encoding="utf-8") as f:     # Open the JSON file
        store_catalog[name] = json.load(f)                        # Load and store the data under its name

# Product catalog information

In [4]:
# Create a DataFrame of products with product names as a column
products = pd.DataFrame.from_dict(store_catalog['products']).T.reset_index().rename(columns={'index': 'product'})


In [5]:
# Replace product with new IDs
products['product_id'] = create_data_functions.create_IDs(products.shape[0], suffix='P')

In [6]:
# View length and width of data
products.shape

(189, 9)

In [7]:
# Show the first rows of the DataFrame
products.head()

Unnamed: 0,product,category,sub_category,shelf_life_days,maximum_days_on_sale,seasonality,storage_recommendation,unit_of_measurement,product_id
0,All-Purpose Flour,Pantry,Baking Supplies,365,60,[],Room Temperature,lb,1580721|P
1,Almond Flour,Pantry,Baking Supplies,180,45,[],Refrigerated,lb,1318697|P
2,Almond Milk,Dairy & Alternatives,Plant-Based Milk,7,3,[],Refrigerated,unit,1916339|P
3,Anchovies,Pantry,Canned Fish,1095,90,[],Room Temperature,unit,1781249|P
4,Apple,Fresh Foods,Fruits,21,7,"[September, October, November]",Refrigerated,lb,1004205|P


# Supplier catalog and distribution details

In [8]:
# Create a DataFrame of suppliers with supplier names as a column
suppliers = pd.DataFrame.from_dict(store_catalog['suppliers']).T.reset_index().rename(columns={'index': 'supplier'})

In [9]:
# Insert supplier IDs as the second column
suppliers.insert(1, 'supplier_id', create_data_functions.create_IDs(suppliers.shape[0], suffix='S'))

In [10]:
# Split each supplier's product list into separate rows and reset the index
suppliers = suppliers.explode('products').reset_index(drop=True)


In [11]:
# Merge product and supplier data on matching product names, then drop duplicate 'products' column from suppliers
products_suppliers_df = pd.merge(products, suppliers, left_on='product', right_on='products').drop(columns='products')

In [12]:
# Initialize a random number generator with a fixed seed for reproducibility.
rng = np.random.default_rng(seed=43)

# Assign random supplier ratings between 1 and 4 to all suppliers.
products_suppliers_df['supplier_rating'] = rng.integers(1, 5, size=products_suppliers_df.shape[0])

# Randomly select 15 unique suppliers to be considered "top suppliers".
suppliers_top = rng.choice(products_suppliers_df['supplier'].unique(), 15, replace=False)

# Update ratings: if the supplier is in the top list, set rating to 5; otherwise keep the original rating.
products_suppliers_df['supplier_rating'] = np.where(products_suppliers_df['supplier'].isin(suppliers_top), 5, products_suppliers_df['supplier_rating'])


In [13]:
# Show the first rows of the DataFrame
products_suppliers_df.head()

Unnamed: 0,product,category,sub_category,shelf_life_days,maximum_days_on_sale,seasonality,storage_recommendation,unit_of_measurement,product_id,supplier,supplier_id,distance_km,moq,supplier_rating
0,All-Purpose Flour,Pantry,Baking Supplies,365,60,[],Room Temperature,lb,1580721|P,BakeWell Supplies,1803930|S,90,50,3
1,Almond Flour,Pantry,Baking Supplies,180,45,[],Refrigerated,lb,1318697|P,BakeWell Supplies,1803930|S,90,50,3
2,Almond Milk,Dairy & Alternatives,Plant-Based Milk,7,3,[],Refrigerated,unit,1916339|P,Plant-Based Alternatives,1185876|S,105,60,2
3,Apple,Fresh Foods,Fruits,21,7,"[September, October, November]",Refrigerated,lb,1004205|P,OrchardBest Fruits,1677419|S,200,120,1
4,Apple,Fresh Foods,Fruits,21,7,"[September, October, November]",Refrigerated,lb,1004205|P,Emergency Supplier,1454719|S,15,10,3


## Meteorological Data for Supply Chain Management

In [14]:
# Set the path to the weather CSV file
# archive_csv = external_data_path + 'dados_83967_D_2015-01-01_2025-09-18.csv'
archive_csv = external_data_path + 'dados_B807_D_2022-12-07_2025-09-22.csv'

# Read the CSV file into a DataFrame
weather_df = pd.read_csv(archive_csv, sep=";", decimal=",", skiprows=9, engine="python")

# Show the first rows of the DataFrame
weather_df.head()

Unnamed: 0,Data Medicao,"PRECIPITACAO TOTAL, DIARIO (AUT)(mm)","TEMPERATURA MAXIMA, DIARIA (AUT)(°C)","TEMPERATURA MINIMA, DIARIA (AUT)(°C)","VENTO, VELOCIDADE MEDIA DIARIA (AUT)(m/s)",Unnamed: 5
0,2022-12-07,,,,,
1,2022-12-08,,,,,
2,2022-12-09,,32.3,17.4,2.4,
3,2022-12-10,0.0,30.1,18.3,2.2,
4,2022-12-11,0.0,31.8,22.7,2.8,


In [15]:
# Remove columns that contain only missing values
weather_df.dropna(axis=1, how='all', inplace=True)


In [16]:
# Rename columns to clear and descriptive English names
weather_df.columns = [
    "measurement_date",
    "daily_total_precipitation_mm",
    "daily_maximum_temperature_c",
    "daily_minimum_temperature_c",
    "daily_average_wind_speed_mps"
]


In [17]:
# Set 'measurement_date' as index and remove rows with all missing values
weather_df = weather_df.set_index('measurement_date').dropna(how='all')

# Show the first rows of the DataFrame
weather_df.head()


Unnamed: 0_level_0,daily_total_precipitation_mm,daily_maximum_temperature_c,daily_minimum_temperature_c,daily_average_wind_speed_mps
measurement_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-12-09,,32.3,17.4,2.4
2022-12-10,0.0,30.1,18.3,2.2
2022-12-11,0.0,31.8,22.7,2.8
2022-12-12,0.0,37.3,19.9,2.9
2022-12-13,6.8,25.3,17.6,3.7


In [18]:
# This line calculates the total number of missing (NaN) values in each column of the weather_df DataFrame.
weather_df.isna().sum()

daily_total_precipitation_mm    5
daily_maximum_temperature_c     2
daily_minimum_temperature_c     0
daily_average_wind_speed_mps    6
dtype: int64

In [19]:
# Fill missing values in 'daily_total_precipitation_mm' using backward fill (next valid value).
weather_df['daily_total_precipitation_mm'].fillna(method='bfill', inplace=True)

# First, fill missing values in 'daily_maximum_temperature_c' using backward fill.
weather_df['daily_maximum_temperature_c'].fillna(method='bfill', inplace=True)

# Then, fill any remaining missing values in 'daily_maximum_temperature_c' using forward fill.
weather_df['daily_maximum_temperature_c'].fillna(method='ffill', inplace=True)

# Fill missing values in 'daily_average_wind_speed_mps' using forward fill (previous valid value).
weather_df['daily_average_wind_speed_mps'].fillna(method='ffill', inplace=True)


In [20]:
# Reset the DataFrame index to a default integer index and drop the old index column.
weather_df = weather_df.reset_index()

# Convert 'measurement_date' column to datetime format
weather_df['measurement_date'] = pd.to_datetime(weather_df['measurement_date'])

In [21]:
# Display summary information about the DataFrame
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   measurement_date              992 non-null    datetime64[ns]
 1   daily_total_precipitation_mm  992 non-null    float64       
 2   daily_maximum_temperature_c   992 non-null    float64       
 3   daily_minimum_temperature_c   992 non-null    float64       
 4   daily_average_wind_speed_mps  992 non-null    float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 38.9 KB


In [22]:
# Dropped all missing values
weather_df.dropna(inplace=True)

In [23]:
# Apply the weather classification function to the cleaned DataFrame to generate severity and category labels
weather_analyser = weather_conditions.WeatherConditions(weather_df)
weather_severity_df = weather_analyser.classify_weather()

In [24]:
# Show 10 samples rows of the DataFrame
weather_severity_df.sample(10)

Unnamed: 0,measurement_date,daily_total_precipitation_mm,daily_maximum_temperature_c,daily_minimum_temperature_c,daily_average_wind_speed_mps,daily_average_temperature_c,temperature_classification,precipitation_classification,wind_classification,weather_severity
199,2023-06-26,1.4,28.2,17.1,1.3,22.65,Mild to Temperate,Light Rain,Calm / Light Breeze,Moderate
608,2024-08-08,6.0,16.4,13.0,1.7,14.7,Cool,Moderate Rain,Gentle to Fresh Breeze,Moderate
645,2024-09-14,32.0,20.5,17.4,3.2,18.95,Mild to Temperate,Heavy Rain,Gentle to Fresh Breeze,Severe
98,2023-03-17,0.0,32.1,21.5,1.8,26.8,Warm,No precipitation,Gentle to Fresh Breeze,Moderate
539,2024-05-31,0.4,18.7,7.0,1.4,12.85,Cool,Light Rain,Calm / Light Breeze,Moderate
989,2025-09-20,0.0,23.5,17.3,0.9,20.4,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal
148,2023-05-06,83.6,23.8,19.8,1.4,21.8,Mild to Temperate,Violent Rainfall,Calm / Light Breeze,Severe
59,2023-02-06,0.2,31.3,20.6,1.7,25.95,Warm,Light Rain,Gentle to Fresh Breeze,Moderate
322,2023-10-27,0.0,27.5,10.2,2.1,18.85,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate
524,2024-05-16,0.0,14.1,5.6,0.9,9.85,Cold,No precipitation,Calm / Light Breeze,Moderate


In [25]:
# Generate and transpose summary statistics for all numeric columns in the classified weather DataFrame
weather_severity_df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
measurement_date,992.0,2024-04-20 02:35:19.354838528,2022-12-09 00:00:00,2023-08-13 18:00:00,2024-04-17 12:00:00,2024-12-21 06:00:00,2025-09-22 00:00:00,
daily_total_precipitation_mm,992.0,4.645565,0.0,0.0,0.0,1.65,132.0,12.602568
daily_maximum_temperature_c,992.0,25.034073,10.4,21.175,25.5,29.125,39.2,5.654455
daily_minimum_temperature_c,992.0,15.84496,1.7,12.3,16.6,19.6,25.2,4.928274
daily_average_wind_speed_mps,992.0,2.366331,0.7,1.6,2.3,2.9,7.7,0.96995
daily_average_temperature_c,992.0,20.439516,6.65,16.9375,20.85,24.25,30.9,4.911554


# Realistic supply chain modeling based on weather and product data

In [26]:
# Create a copy of the climate data DataFrame to work with weather-specific analysis
df_weather_conditions = weather_severity_df.copy()

# Create a copy of the products_suppliers data DataFrame to work with product-related operations
df_products = products_suppliers_df.copy()

In [27]:
# Display the first row of the weather DataFrame to preview its structure
df_weather_conditions.head(1)


Unnamed: 0,measurement_date,daily_total_precipitation_mm,daily_maximum_temperature_c,daily_minimum_temperature_c,daily_average_wind_speed_mps,daily_average_temperature_c,temperature_classification,precipitation_classification,wind_classification,weather_severity
0,2022-12-09,0.0,32.3,17.4,2.4,24.85,Warm,No precipitation,Gentle to Fresh Breeze,Moderate


In [28]:
# Preview the first row of the products DataFrame to check column names and initial data
df_products.head(1)

Unnamed: 0,product,category,sub_category,shelf_life_days,maximum_days_on_sale,seasonality,storage_recommendation,unit_of_measurement,product_id,supplier,supplier_id,distance_km,moq,supplier_rating
0,All-Purpose Flour,Pantry,Baking Supplies,365,60,[],Room Temperature,lb,1580721|P,BakeWell Supplies,1803930|S,90,50,3


In [29]:
# Rename the column 'measurement_date' to 'date' for easier reference.
df_weather_conditions.rename(columns={'measurement_date': 'received_date'}, inplace=True)


In [30]:
# Filters df_weather_conditions to retain only relevant weather-related columns for analysis or merging
df_weather_conditions = df_weather_conditions[['received_date', 'temperature_classification', 
                                               'precipitation_classification', 'wind_classification', 
                                               'weather_severity']]

In [31]:
# Display the first few rows of the weather DataFrame
df_weather_conditions.head()

Unnamed: 0,received_date,temperature_classification,precipitation_classification,wind_classification,weather_severity
0,2022-12-09,Warm,No precipitation,Gentle to Fresh Breeze,Moderate
1,2022-12-10,Warm,No precipitation,Gentle to Fresh Breeze,Moderate
2,2022-12-11,Warm,No precipitation,Gentle to Fresh Breeze,Moderate
3,2022-12-12,Warm,No precipitation,Gentle to Fresh Breeze,Moderate
4,2022-12-13,Mild to Temperate,Moderate Rain,Gentle to Fresh Breeze,Moderate


In [32]:
# Display the first few rows of the products DataFrame
df_products.head()

Unnamed: 0,product,category,sub_category,shelf_life_days,maximum_days_on_sale,seasonality,storage_recommendation,unit_of_measurement,product_id,supplier,supplier_id,distance_km,moq,supplier_rating
0,All-Purpose Flour,Pantry,Baking Supplies,365,60,[],Room Temperature,lb,1580721|P,BakeWell Supplies,1803930|S,90,50,3
1,Almond Flour,Pantry,Baking Supplies,180,45,[],Refrigerated,lb,1318697|P,BakeWell Supplies,1803930|S,90,50,3
2,Almond Milk,Dairy & Alternatives,Plant-Based Milk,7,3,[],Refrigerated,unit,1916339|P,Plant-Based Alternatives,1185876|S,105,60,2
3,Apple,Fresh Foods,Fruits,21,7,"[September, October, November]",Refrigerated,lb,1004205|P,OrchardBest Fruits,1677419|S,200,120,1
4,Apple,Fresh Foods,Fruits,21,7,"[September, October, November]",Refrigerated,lb,1004205|P,Emergency Supplier,1454719|S,15,10,3


In [33]:
# Determine the number of samples based on the length of the supply DataFrame
n_samples = len(df_weather_conditions)

# Randomly select 'n_samples' rows from the df_products DataFrame
# Sampling is done with replacement (same row can be chosen more than once)
# The index is reset to avoid keeping the original row indices
random_samples1 = df_products.sample(n=n_samples, replace=True).reset_index(drop=True)
random_samples2 = df_products.sample(n=n_samples, replace=True).reset_index(drop=True)
random_samples3 = df_products.sample(n=n_samples, replace=True).reset_index(drop=True)
random_samples4 = df_products.sample(n=n_samples, replace=True).reset_index(drop=True)
random_samples5 = df_products.sample(n=n_samples, replace=True).reset_index(drop=True)


In [34]:
# Merge the supply_df and random_samples DataFrames using their index values
# This aligns rows from both DataFrames based on their position (index)
df_merged1 = df_weather_conditions.merge(random_samples1, left_index=True, right_index=True)
df_merged2 = df_weather_conditions.merge(random_samples2, left_index=True, right_index=True)
df_merged3 = df_weather_conditions.merge(random_samples3, left_index=True, right_index=True)
df_merged4 = df_weather_conditions.merge(random_samples4, left_index=True, right_index=True)
df_merged5 = df_weather_conditions.merge(random_samples5, left_index=True, right_index=True)


In [35]:
# Combine all merged DataFrames into one by stacking them row-wise and resetting the index
df_merged_full = pd.concat([df_merged1, df_merged2, df_merged3, df_merged4, df_merged5], axis=0, ignore_index=True)


In [36]:
# Display DataFrame information
df_merged_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   received_date                 4960 non-null   datetime64[ns]
 1   temperature_classification    4960 non-null   object        
 2   precipitation_classification  4960 non-null   object        
 3   wind_classification           4960 non-null   object        
 4   weather_severity              4960 non-null   object        
 5   product                       4960 non-null   object        
 6   category                      4960 non-null   object        
 7   sub_category                  4960 non-null   object        
 8   shelf_life_days               4960 non-null   object        
 9   maximum_days_on_sale          4960 non-null   object        
 10  seasonality                   4960 non-null   object        
 11  storage_recommendation        

In [37]:
# Randomly selects 3,000 rows from df_merged_full and resets the index to avoid retaining the original indices
df_raw = df_merged_full.sample(3000).reset_index(drop=True)

In [38]:
# Defines the desired column order for organizing the dataset, prioritizing product, inventory, and weather-related attributes
reorder_columns = ['received_date', 'seasonality', 'product', 'product_id', 'category', 'sub_category', 
                   'shelf_life_days', 'maximum_days_on_sale', 'unit_of_measurement', 'supplier_rating',
                   'supplier', 'supplier_id', 'distance_km', 'moq', 'storage_recommendation', 
                   'temperature_classification', 'precipitation_classification', 'wind_classification', 'weather_severity']

# Reorders columns according to reorder_columns
df_raw = df_raw[reorder_columns]

In [39]:
# Converts selected inventory-related columns to integer type for numerical operations and consistency
cols_int = ['shelf_life_days', 'maximum_days_on_sale', 'moq']
df_raw[cols_int] = df_raw[cols_int].astype(int)

# Converts selected categorical columns to 'category' dtype to optimize memory and improve model performance
cols_cat = ['category', 'sub_category', 'unit_of_measurement', 'supplier_rating', 
            'temperature_classification', 'precipitation_classification', 
            'wind_classification', 'weather_severity']
df_raw[cols_cat] = df_raw[cols_cat].astype('category')


In [40]:
# Summary 
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   received_date                 3000 non-null   datetime64[ns]
 1   seasonality                   3000 non-null   object        
 2   product                       3000 non-null   object        
 3   product_id                    3000 non-null   object        
 4   category                      3000 non-null   category      
 5   sub_category                  3000 non-null   category      
 6   shelf_life_days               3000 non-null   int64         
 7   maximum_days_on_sale          3000 non-null   int64         
 8   unit_of_measurement           3000 non-null   category      
 9   supplier_rating               3000 non-null   category      
 10  supplier                      3000 non-null   object        
 11  supplier_id                   

## Generate data about holidays, weekdays, and seasons of the year.

In [41]:
# Creates a full copy of df_raw and assigns it to df_date for independent manipulation
df_date = df_raw.copy()

In [42]:
# Apply the 'day_classification' function from 'create_data_functions' to each date in the 'date' column,
# assigning the result to a new column called 'day_classification'.
df_date['day_classification'] = create_data_functions.day_classification(dates = df_date['received_date'], country='BR')

# Create a boolean column indicating whether the day is classified as a holiday.
df_date['is_holiday'] = np.where(df_date['day_classification'] == 'Holiday', True, False)

# Create a boolean column indicating whether the day falls on a weekend (Saturday or Sunday).
df_date['is_weekend'] = np.where(df_date['received_date'].dt.dayofweek > 4, True, False)

In [43]:
def check_seasonality(row):
    """
    Checks whether the received month of a product aligns with its seasonal availability.
    """
    received_month = row['month_name']
    seasonality_list = row['seasonality']
    
    return received_month in seasonality_list


In [44]:
# Extracts the full month name from 'received_date' to support seasonality checks
df_date['month_name'] = df_date['received_date'].dt.month_name()

# Inserts a new column 'in_season' at position 2, indicating whether the product's received month aligns with its seasonal availability
df_date.insert(2, 'in_season', df_date.apply(check_seasonality, axis=1))

# Removes the temporary 'month_name' column after seasonality classification is complete
df_date.drop(columns=['month_name', 'seasonality'], inplace=True)

In [45]:
# Show the first rows of the DataFrame
df_date.head()

Unnamed: 0,received_date,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,...,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend
0,2025-02-20,False,Canned Beans,1913772|P,Pantry,Canned Goods,1095,90,unit,2,...,95,130,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False
1,2025-05-08,False,Rice Flour,1787175|P,Pantry,Baking Supplies,365,90,lb,4,...,90,50,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False
2,2023-11-22,False,Maple Syrup,1027236|P,Pantry,Sweeteners,365,90,unit,5,...,85,70,Room Temperature,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False
3,2024-01-30,False,Spinach,1300280|P,Fresh Foods,Vegetables,5,2,lb,3,...,127,150,Refrigerated,Mild to Temperate,Light Rain,Gentle to Fresh Breeze,Moderate,Weekdays,False,False
4,2024-10-20,False,Rye Bread,1261252|P,Bakery,Bread,5,2,unit,1,...,45,75,Room Temperature,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Sunday,False,True


## Generate data for stock quantities and sales volumes.

In [46]:
# Creates a full copy
df_stock = df_date.copy()

In [47]:
# Adds a 'sales_demand' column to df_stock by classifying each date using Brazilian holiday calendar and demand heuristics
df_stock['sales_demand'] = create_data_functions.classify_grocery_demand(
    dates=df_stock['received_date'],
    country='BR'
)

In [48]:
# Replace the 'sales_volume' column with simulated values using a custom function
df_stock['sales_volume'] = create_data_functions.simulate_sales_volume(df_stock, random_state=42)

In [49]:
# Apply the estimate_delivery_days function to each row in df_stock to calculate lead time,
# storing the result in a new 'lead_time' column. This estimates delivery duration based on row-specific data.
df_stock['lead_time'] = df_stock.apply(create_data_functions.estimate_delivery_days, axis=1)


In [50]:
# Calculate minimum stock as average sales × average lead time per product and supplier
df_stock['min_stock'] = df_stock.groupby(['product', 'supplier'])['sales_volume'].transform('mean').astype(int) * df_stock.groupby(['product', 'supplier'])['lead_time'].transform('mean').astype(int)

In [51]:
def max_stock(row):
    # Compute average sales volume per product-supplier pair
    avg_sales = df_stock.groupby(['product', 'supplier'])['sales_volume'].transform('mean')

    # Add average sales to minimum stock to estimate max stock
    max_stock_value = row['min_stock'] + avg_sales.loc[row.name].astype(int)

    # Ensure max stock is not below the minimum order quantity
    return row['moq'] if max_stock_value < row['moq'] else max_stock_value

In [52]:
# Calculate max stock for each row using custom function
df_stock['max_stock'] = df_stock.apply(max_stock, axis=1)

In [53]:
# Inserts a new column 'stock_quantity' at position 7 with simulated stock quantities generated from min and max stock values
df_stock['stock_quantity'] = create_data_functions.create_stock_distribution_vectorized(df_stock['min_stock'], df_stock['max_stock'])

In [54]:
# Insert a new column 'lpo' at position 1 with initial value 0
df_stock.insert(1, 'lpo', 0)

In [55]:
# Generate simulated purchase order dates based on product attributes and logistics
df_stock['lpo'] = create_data_functions.simulate_purchase_order_columns(df_stock)

In [56]:
# Save the updated DataFrame to CSV, excluding the index column
df_stock.to_csv(raw_data_path + 'synthetic_data_grocery_stock.csv', index=False)