# Environment Setup

In [1]:
import os
import random
from datetime import datetime

import numpy as np
import pandas as pd
from faker import Faker

# Initializing Vendor Names & Ingredients

In [2]:
# Pool of Canadian retail chain names; the generators sample vendor names from it.
vendor_names = [
    'Loblaws',
    'Metro',
    'Sobeys',
    'No Frills',
    'Real Canadian Superstore',
    'Bulk Barn',
    "Longo's",
    'Fortinos',
    'Food Basics',
    'FreshCo',
    'Save-On-Foods',
    'T&T Supermarket',
    'Safeway Canada',
    'IGA',
    'Thrifty Foods',
    'Super C',
    'Co-op',
    'Giant Tiger',
    'Rexall',
    'Shoppers Drug Mart',
]

# List of diverse ingredients with categories.
# Each entry is an (ingredient name, category) tuple. The generator functions
# pick entries with random.choice, so the order of this list affects which
# rows a seeded run produces.
# NOTE(review): 'Eggs' is tagged 'Dairy' and 'Pepper' is tagged 'Vegetable' --
# confirm these categories are intended.
ingredients = [
    ('Tomatoes', 'Vegetable'),
    ('Pasta', 'Grain'),
    ('Cheese', 'Dairy'),
    ('Basil', 'Herb'),
    ('Garlic', 'Vegetable'),
    ('Olive Oil', 'Oil'),
    ('Chicken', 'Meat'),
    ('Beef', 'Meat'),
    ('Fish', 'Seafood'),
    ('Rice', 'Grain'),
    ('Pepper', 'Vegetable'),
    ('Salt', 'Seasoning'),
    ('Milk', 'Dairy'),
    ('Butter', 'Dairy'),
    ('Eggs', 'Dairy'),
    ('Flour', 'Grain'),
    ('Sugar', 'Sweetener'),
    ('Onions', 'Vegetable'),
    ('Mushrooms', 'Vegetable'),
    ('Zucchini', 'Vegetable'),
    ('Spinach', 'Vegetable'),
    ('Broccoli', 'Vegetable'),
    ('Carrots', 'Vegetable'),
    ('Potatoes', 'Vegetable'),
    ('Cilantro', 'Herb'),
    ('Thyme', 'Herb'),
    ('Rosemary', 'Herb'),
    ('Parsley', 'Herb'),
    ('Lemon', 'Fruit'),
    ('Lime', 'Fruit'),
    ('Oranges', 'Fruit'),
    ('Apples', 'Fruit'),
    ('Bananas', 'Fruit'),
    ('Grapes', 'Fruit'),
    ('Strawberries', 'Fruit'),
    ('Blueberries', 'Fruit'),
    ('Yogurt', 'Dairy'),
    ('Cream', 'Dairy'),
    ('Almonds', 'Nut'),
    ('Walnuts', 'Nut'),
    ('Cashews', 'Nut'),
    ('Peanut Butter', 'Nut'),
    ('Honey', 'Sweetener'),
    ('Maple Syrup', 'Sweetener'),
    ('Kale', 'Vegetable'),
    ('Lettuce', 'Vegetable'),
    ('Cucumber', 'Vegetable'),
    ('Bell Peppers', 'Vegetable'),
    ('Eggplant', 'Vegetable'),
    ('Squash', 'Vegetable'),
    ('Corn', 'Vegetable'),
    ('Peas', 'Vegetable'),
    ('Lentils', 'Legume'),
    ('Chickpeas', 'Legume'),
    ('Black Beans', 'Legume'),
    ('Kidney Beans', 'Legume')
]

# Sanity check: number of vendor names and number of ingredient entries.
len(vendor_names), len(ingredients)

(20, 56)

# Synthetic Data Generation

## Initialize Fields

In [3]:
# Seed every random source this notebook draws from (the stdlib `random`
# module, NumPy's global generator -- which pandas `.sample()` also uses by
# default -- and Faker) so that Restart & Run All reproduces the same data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Initialize Faker to generate realistic Canadian data
fake = Faker('en_CA')

## Generate Vendor Data

In [4]:
# Generate vendors data
def generate_vendors_data(num_vendors):
    """Build a DataFrame of randomly generated vendor records.

    Every record combines a name drawn from ``vendor_names``, a city produced
    by the module-level ``fake`` generator, and an (ingredient, category) pair
    drawn from ``ingredients``. Records are drawn independently of each other,
    so vendor names and ingredients can repeat across rows.

    Parameters
    ----------
    num_vendors : int
        Number of records (rows) to generate.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``vendor_name``, ``location``,
        ``ingredient_supplied`` and ``category``.
    """
    def _make_record():
        # Draw order (name, city, ingredient) is fixed so the random streams
        # are consumed in the same sequence on every call.
        name = random.choice(vendor_names)
        city = fake.city()
        supplied, supplied_category = random.choice(ingredients)
        return {
            'vendor_name': name,
            'location': city,
            'ingredient_supplied': supplied,
            'category': supplied_category,
        }

    return pd.DataFrame([_make_record() for _ in range(num_vendors)])

# Generate realistic vendors data.
# Rows are sampled independently from the 20 vendor names, so with 100 rows the
# same vendor name appears more than once (with different locations/ingredients).
vendors = generate_vendors_data(num_vendors=100)


# Preview the first rows together with the (rows, columns) shape.
vendors.head(), vendors.shape

(  vendor_name            location ingredient_supplied   category
 0     Longo's         Grahamville             Almonds        Nut
 1     FreshCo          North Lori            Rosemary       Herb
 2      Rexall         North Kevin              Onions  Vegetable
 3   No Frills           Greentown              Yogurt      Dairy
 4     Longo's  South Michaelmouth           Olive Oil        Oil,
 (100, 4))

## Generate Purchase History Data

In [5]:
import datetime

# Generate purchase history data with ingredient category
def generate_purchase_history(num_records):
    """Build a DataFrame of randomly generated purchase records.

    Every record holds a vendor name drawn from ``vendor_names``, an
    (ingredient, category) pair drawn from ``ingredients``, an integer
    quantity between 1 and 50 inclusive, and a purchase timestamp produced by
    the module-level ``fake`` generator for the window 2018-01-01 to
    2022-12-31.

    Parameters
    ----------
    num_records : int
        Number of purchase records (rows) to generate.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``vendor_name``, ``ingredient``,
        ``category``, ``quantity`` and ``purchase_date``.
    """
    window_start = datetime.datetime(2018, 1, 1)
    window_end = datetime.datetime(2022, 12, 31)

    def _draw_purchase():
        # Draw order (vendor, ingredient, quantity, date) is fixed so the
        # random streams are consumed in the same sequence on every call.
        supplier = random.choice(vendor_names)
        item, item_category = random.choice(ingredients)
        units = random.randint(1, 50)
        bought_at = fake.date_time_between_dates(window_start, window_end)
        return {
            'vendor_name': supplier,
            'ingredient': item,
            'category': item_category,
            'quantity': units,
            'purchase_date': bought_at,
        }

    return pd.DataFrame([_draw_purchase() for _ in range(num_records)])

# Stage 1: draw the clean synthetic purchase records.
generated_purchases = generate_purchase_history(num_records=50000)

# Stage 2: append 100 randomly sampled existing rows so the set contains duplicates.
duplicated_rows = generated_purchases.sample(100)
purchase_history = pd.concat([generated_purchases, duplicated_rows], ignore_index=True)

# Stage 3: blank out 'quantity' on 100 distinct rows to simulate missing values.
rows_to_blank = np.random.choice(purchase_history.index, size=100, replace=False)
purchase_history.loc[rows_to_blank, 'quantity'] = np.nan

# Preview the first rows together with the (rows, columns) shape.
purchase_history.head(), purchase_history.shape


(                vendor_name ingredient   category  quantity  \
 0                       IGA      Flour      Grain      20.0   
 1  Real Canadian Superstore   Tomatoes  Vegetable      11.0   
 2                     Co-op     Garlic  Vegetable      18.0   
 3           T&T Supermarket   Cucumber  Vegetable      18.0   
 4               Giant Tiger  Olive Oil        Oil       1.0   
 
                purchase_date  
 0 2022-01-24 16:30:42.811353  
 1 2019-10-22 12:24:44.940169  
 2 2020-11-09 00:17:37.230027  
 3 2022-05-06 07:41:38.109536  
 4 2021-02-10 01:29:29.674654  ,
 (50100, 5))

## Generate Product Demand Data

In [6]:
def generate_product_demand_data(start_date, end_date, products, num_records=1000):
    """Generate a synthetic product-demand table with daily timestamps.

    The table holds the same whole number of records for every calendar day in
    ``[start_date, end_date]``. Each record is drawn independently, so the
    values on a row belong to that row's timestamp. (Previously the timestamps
    were repeated per day while the feature columns were tiled, which paired
    each feature row with several unrelated dates.)

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Inclusive bounds of the daily date range. Anything accepted by
        ``pandas.date_range`` works, including ``'YYYY-MM-DD'`` strings.
    products : array-like
        Pool of product identifiers to sample ``product_id`` from.
    num_records : int, default 1000
        Target number of rows. It is rounded down to a whole multiple of the
        number of days, with a minimum of one record per day, so the result
        has ``len(days) * max(1, num_records // len(days))`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns: ``timestamp``, ``product_id``, ``demand`` (int in [10, 100)),
        ``price`` (float in [1, 100)), ``promotion`` (0 or 1), ``temperature``
        (float in [-20, 40)), ``economic_indicator`` (float in [0, 100)),
        ``social_media_sentiment`` (float in [-1, 1)) and ``previous_demand``
        (running total of independent random integers in [10, 100); it is not
        derived from the ``demand`` column).

    Raises
    ------
    ValueError
        If the date range contains no days (e.g. ``end_date`` precedes
        ``start_date``).
    """
    # pd.date_range parses date strings itself, so no dependency on how the
    # `datetime` name happens to be bound in the notebook namespace.
    dates = pd.date_range(start=start_date, end=end_date, freq='D')
    if len(dates) == 0:
        raise ValueError('start_date/end_date describe an empty date range')

    # At least one record per day: a small num_records must not silently
    # produce an empty table.
    records_per_day = max(1, num_records // len(dates))
    n_rows = len(dates) * records_per_day

    # Every column is drawn at full length so each row is consistent with the
    # (repeated) timestamp it sits next to.
    demand_forecast_data = pd.DataFrame({
        'timestamp': dates.repeat(records_per_day),
        'product_id': np.random.choice(products, size=n_rows),
        'demand': np.random.randint(10, 100, size=n_rows),
        'price': np.random.uniform(1, 100, size=n_rows),
        'promotion': np.random.choice([0, 1], size=n_rows),
        'temperature': np.random.uniform(-20, 40, size=n_rows),
        'economic_indicator': np.random.uniform(0, 100, size=n_rows),
        'social_media_sentiment': np.random.uniform(-1, 1, size=n_rows),
        'previous_demand': np.random.randint(10, 100, size=n_rows).cumsum()
    })

    return demand_forecast_data

# Daily date range covered by the demand data (both bounds inclusive).
start_date = '2023-01-01'
end_date = '2023-12-31'
# Product pool: the distinct ingredients that appear in the generated vendors table.
products = vendors['ingredient_supplied'].unique()  # Use ingredients from vendors data as products

# num_records is left at its default of 1000.
product_demand_data = generate_product_demand_data(start_date, end_date, products=products)


# Preview the first rows together with the (rows, columns) shape.
product_demand_data.head(), product_demand_data.shape


(   timestamp    product_id  demand      price  promotion  temperature  \
 0 2023-01-01  Bell Peppers      94  60.891186          0    38.630601   
 1 2023-01-01        Cheese      98  22.108710          0    23.877024   
 2 2023-01-02          Salt      33   2.609250          0    -4.800778   
 3 2023-01-02         Basil      78  30.681360          1   -13.291511   
 4 2023-01-03  Bell Peppers      32  26.702867          0   -15.476022   
 
    economic_indicator  social_media_sentiment  previous_demand  
 0           51.370497               -0.157013               16  
 1           59.302178               -0.927554               97  
 2            1.240155               -0.965119              182  
 3           18.838625                0.044827              240  
 4           69.579086               -0.436603              280  ,
 (730, 9))

# Export Generated Data To Pickle

In [7]:
# Save datasets to Pickle

generated_data_filepath = '../../data/generated/demand-forecast/'

# Create the output directory (and any missing parents) first: to_pickle does
# not create directories, so the export would otherwise fail when the folder
# does not exist yet.
os.makedirs(generated_data_filepath, exist_ok=True)

vendors.to_pickle(f'{generated_data_filepath}vendors.pkl')
purchase_history.to_pickle(f'{generated_data_filepath}purchase_history.pkl')
product_demand_data.to_pickle(f'{generated_data_filepath}product_demand.pkl')