# Generate Data

This notebook demonstrates the process of generating synthetic data for a fictional pizza chain, "Panucci's Pizza." It includes the creation of random store locations, customer profiles, and transactional data. The synthetic data is generated using statistical distributions and randomization techniques to mimic real-world scenarios. Because the locations are randomized, you might see odd things such as stores within a neighborhood.

For the purposes of this exercise, I'll generate `10` stores around the area of `Frisco` and `McKinney`, `Texas`

The data is then structured and stored in Snowflake for further analysis and visualization. 



#### Base Classes and Imports

In [44]:
import numpy as np
import pandas as pd

In [45]:
class BaseEntity:
    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

    def __repr__(self):
        attrs = ', '.join(f"{key}={value}" for key, value in self.__dict__.items())
        return f"{self.__class__.__name__}({attrs})"
    
    def showDataFrame(self):
        return pd.DataFrame([self.__dict__])


class Store(BaseEntity):
    all_stores = []  # Class-level list to store all instances of Store

    def __init__(self, store_id, latitude, longitude, store_name, opendt):
        super().__init__(STORE_ID=store_id, 
                         LATITUDE=latitude, 
                         LONGITUDE=longitude, 
                         STORE_NAME=store_name, 
                         OPENDT=opendt)
        Store.all_stores.append(self)  # Add each instance to the class-level list

    def __repr__(self):
        return f"Store(store_id={self.STORE_ID}, lat={self.LATITUDE}, lon={self.LONGITUDE}, name={self.STORE_NAME}, opendt={self.OPENDT})"

    @classmethod
    def display_all_stores(cls):
        """
        Display a DataFrame of all store instances.
        """
        return pd.DataFrame([store.__dict__ for store in cls.all_stores])
    

class Customer(BaseEntity):
    all_customers = []  # Class-level list to store all instances of Customers
    def __init__(self, customer_id, latitude, longitude, firstname, lastname, homestore_ID):
        super().__init__(CUSTOMER_ID=customer_id, 
                         LATITUDE=latitude, 
                         LONGITUDE=longitude, 
                         FIRSTNAME=firstname, 
                         LASTNAME=lastname, 
                         HOMESTORE_ID=homestore_ID)
        Customer.all_customers.append(self)  # Add each instance to the class-level list

    def __repr__(self):
        return f"Customer(customer_id={self.CUSTOMER_ID}, lat={self.LATITUDE}, lon={self.LONGITUDE}, name={self.FIRSTNAME} {self.LASTNAME}, homestoreID={self.HOMESTORE_ID})"
    
    @classmethod
    def display_all_customers(cls):
        """
        Display a DataFrame of all customer instances.
        """
        return pd.DataFrame([cust.__dict__ for cust in cls.all_customers])
    
class Order(BaseEntity):
    all_orders = []  # Class-level list to store all instances of Orders
    def __init__(self, order_id, customer_id, store_id, order_date, subtotal,tax, total):        
        super().__init__(ORDER_ID=order_id, 
                         CUSTOMER_ID=customer_id, 
                         STORE_ID=store_id, 
                         ORDER_DATE=order_date, 
                         SUBTOTAL=subtotal,
                         TAX=tax,
                         TOTAL=total)

    @classmethod
    def display_all_orders(cls):
        """
        Display a DataFrame of all order instances.
        """
        return pd.DataFrame([order.__dict__ for order in cls.all_orders])

#### Generate random store locations

In [46]:
# Bounding box coordinates for the area of interest
min_lat, max_lat = 33.0633, 33.2182
min_lon, max_lon = -96.9162, -96.6718

num_stores = 10

import random
random.seed(1033)  # For reproducibility

In [47]:
# Generate random store locations within the bounding box

Store.all_stores = []  # Reset the class-level list to avoid duplicates
name_prefix = "Panucci's Pizza - "
store_ids = [random.randint(0, 9999) for _ in range(num_stores)]
store_locations = [
    Store(
        store_id=store_id,
        latitude=np.round(random.uniform(min_lat, max_lat),5),
        longitude=np.round(random.uniform(min_lon, max_lon),5),
        store_name=f"{name_prefix}{store_id}",
        opendt=f"{random.randint(2010, 2020)}-{random.randint(1, 12):02d}-{random.randint(1, 28):02d}"
    ).__dict__ for store_id in store_ids
]

Store.display_all_stores()

Unnamed: 0,STORE_ID,LATITUDE,LONGITUDE,STORE_NAME,OPENDT
0,3431,33.18667,-96.8761,Panucci's Pizza - 3431,2012-11-13
1,4464,33.07242,-96.79917,Panucci's Pizza - 4464,2011-11-16
2,7496,33.15334,-96.73341,Panucci's Pizza - 7496,2016-11-12
3,1119,33.11117,-96.83614,Panucci's Pizza - 1119,2014-09-06
4,1249,33.13475,-96.84477,Panucci's Pizza - 1249,2010-02-02
5,3789,33.106,-96.74276,Panucci's Pizza - 3789,2018-06-23
6,151,33.11653,-96.82622,Panucci's Pizza - 151,2013-01-28
7,6202,33.16028,-96.90773,Panucci's Pizza - 6202,2020-02-12
8,6144,33.20155,-96.75194,Panucci's Pizza - 6144,2014-12-10
9,2510,33.14326,-96.69223,Panucci's Pizza - 2510,2012-06-19


#### Generate customer locations for each store

In [48]:
num_customers_per_store = (20,50)

radius_range = (0.5, 2.0)  # in km

In [49]:
from scipy.stats import gamma
import numpy as np

# Predefined list of first names
first_names = ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Hannah", "Ivy", "Jack", "Kathy", "Liam", "Mona", "Nathan", "Olivia", "Paul", "Quincy", "Rachel", "Steve", "Tina"]
# Predefined list of Pokémon names
pokemon_last_names = ["Pikachu", "Charmander", "Bulbasaur", "Squirtle", "Jigglypuff", "Meowth", "Psyduck", "Snorlax", "Eevee", "Mewtwo"]


def generate_customers(store_location, search_radius, num_customers, store_id=None):
    """
    Generate customers around a store location using a exponential distribution.

    :param store_location: Tuple of (latitude, longitude) for the store location.
    :param search_radius: Maximum search radius in kilometers.
    :param num_customers: Number of customers to generate.
    :param store_id: ID of the store to associate customers with.
    :return: List of Customer objects.
    """
    customers = []
    for i in range(num_customers):
        # Generate a random distance using the exponential distribution
        distance = np.random.exponential(scale=search_radius / 2)
        # Generate a random bearing (angle in radians)
        bearing = np.random.uniform(0, 2 * np.pi)
        # Calculate the new latitude and longitude
        delta_lat = distance * np.cos(bearing) / 111  # Approx. conversion of km to degrees latitude
        delta_lon = distance * np.sin(bearing) / (111 * np.cos(np.radians(store_location[0])))  # Adjust for longitude
        customer_lat = store_location[0] + delta_lat
        customer_lon = store_location[1] + delta_lon
        # Create a Customer object
        customer_id = f"{random.randint(0, 9999)}"
        first_name = random.choice(first_names)
        last_name = random.choice(pokemon_last_names)
        customers.append(Customer(customer_id, np.round(customer_lat,5), np.round(customer_lon,5), first_name, last_name, store_id))
    return customers

In [50]:
# Generate customers for each store
Customer.all_customers = []  # Reset the class-level list to avoid duplicates

for store in Store.all_stores:
    store_location = (store.LATITUDE, store.LONGITUDE)
    num_customers = random.randint(*num_customers_per_store)  # Randomly choose the number of customers within the range
    search_radius_km = random.uniform(*radius_range)  # Randomly choose the search radius within the range

    generate_customers(store_location, search_radius_km, num_customers, store_id=store.STORE_ID)


In [51]:
Customer.display_all_customers()

Unnamed: 0,CUSTOMER_ID,LATITUDE,LONGITUDE,FIRSTNAME,LASTNAME,HOMESTORE_ID
0,915,33.18502,-96.87365,Tina,Pikachu,3431
1,4218,33.18697,-96.88154,Ivy,Bulbasaur,3431
2,8825,33.20890,-96.87432,Liam,Jigglypuff,3431
3,3041,33.18817,-96.88249,Liam,Mewtwo,3431
4,6745,33.18971,-96.87358,Quincy,Squirtle,3431
...,...,...,...,...,...,...
351,1294,33.13427,-96.69123,Ivy,Eevee,2510
352,8297,33.14403,-96.69201,Kathy,Psyduck,2510
353,8165,33.16027,-96.72421,Kathy,Eevee,2510
354,3830,33.13651,-96.68974,Olivia,Snorlax,2510


#### Transactional data
Generate random transactional data for each store

In [52]:
def generate_gaussian_value(mean, std, floor):
    """
    Generate a random value based on a Gaussian distribution.

    :param mean: Mean of the Gaussian distribution.
    :param std: Standard deviation of the Gaussian distribution.
    :param floor: Minimum value (floor) for the generated value.
    :return: A random value from the Gaussian distribution, floored at the specified minimum value.
    """
    value = np.random.normal(loc=mean, scale=std)
    return round(max(value, floor), 2)  # Round to 2 decimal places and ensure it's not below the floor value

In [53]:
num_orders_per_customer = (3, 25)

Order.all_orders = []  # Reset the class-level list to avoid duplicates

for customer in Customer.all_customers:
    num_orders = random.randint(*num_orders_per_customer)  # Randomly choose the number of orders for each customer
    for _ in range(num_orders):
        order_id = f"{random.randint(0, 99999)}"
        order_date = f"{random.randint(2021, 2024)}-{random.randint(1, 12):02d}-{random.randint(1, 28):02d}"
        order_subtotal = generate_gaussian_value(mean=30, std=10, floor=5)  # Generate a random order total
        order_tax = np.round(order_subtotal * 0.0825,2)  # Assuming a tax rate of 8.25%
        order_total = np.round(order_subtotal + order_tax,2)
        Order.all_orders.append(Order(order_id, customer.CUSTOMER_ID, customer.HOMESTORE_ID, order_date, order_subtotal, order_tax, order_total))

In [54]:
Order.display_all_orders()

Unnamed: 0,ORDER_ID,CUSTOMER_ID,STORE_ID,ORDER_DATE,SUBTOTAL,TAX,TOTAL
0,29570,915,3431,2021-08-19,20.46,1.69,22.15
1,58260,915,3431,2023-02-15,23.33,1.92,25.25
2,41196,915,3431,2024-06-26,11.24,0.93,12.17
3,59530,915,3431,2024-02-16,33.19,2.74,35.93
4,35746,915,3431,2022-06-21,16.67,1.38,18.05
...,...,...,...,...,...,...,...
5043,85661,9931,2510,2022-12-04,51.47,4.25,55.72
5044,7550,9931,2510,2023-03-27,29.09,2.40,31.49
5045,31965,9931,2510,2022-08-05,33.50,2.76,36.26
5046,6121,9931,2510,2022-01-12,28.57,2.36,30.93


### Push Data to SnowFlake

In [55]:
from dotenv import load_dotenv
import os

load_dotenv()
SF_USER = os.getenv("SF_USER")
SF_PASSWORD = os.getenv("SF_PASSWORD")
SF_ACCOUNT = os.getenv("SF_ACCOUNT")

In [56]:
from snowflake.snowpark import Session
from snowflake.snowpark import functions as SF

connection_parameters = {
  "account": SF_ACCOUNT,
  "user": SF_USER,
  "password": SF_PASSWORD,
}
new_session = Session.builder.configs(connection_parameters).create()

In [57]:
new_db = "PANUCCIS_PIZZA"
new_schema = "POS_MAIN"
new_WH = "PANUCCIS_PIZZA_WH"
table_stores = "STORES"
table_customers = "CUSTOMERS"
table_orders = "ORDERS_HEADER"

In [58]:
# create warehouse
new_session.sql(f"""
                CREATE WAREHOUSE if not exists {new_WH}
                WITH
                WAREHOUSE_SIZE = XSMALL
                AUTO_SUSPEND = 30
                INITIALLY_SUSPENDED = TRUE
                COMMENT = 'WH TO DO TEST RUNS ON PANUCCIS PIZZA DB'
                """).collect()

new_session.use_warehouse(f"{new_WH}")

# create the Panuccis Database
new_session.sql(f'CREATE OR REPLACE DATABASE {new_db};').collect()

# create the Panuccis PoS Main Schema
new_session.sql(f'CREATE OR REPLACE SCHEMA {new_db}.{new_schema};').collect()

[Row(status='Schema POS_MAIN successfully created.')]

In [59]:
# Convert customer data to a pandas DataFrame
customer_df = Customer.display_all_customers()
orders_df = Order.display_all_orders().sort_values(by='ORDER_DATE', ascending=False)
store_df = Store.display_all_stores()

# Add the first order date for each customer
first_order_dates = orders_df.groupby(['CUSTOMER_ID'])[['ORDER_DATE']].min().rename(columns={'ORDER_DATE':'FIRST_ORDER_DATE'})
customer_df = customer_df.join(first_order_dates, on='CUSTOMER_ID', how='left')


customer_df = customer_df.sort_values(by='FIRST_ORDER_DATE', ascending=False)

In [60]:
from snowflake.snowpark.types import StructType, StructField, StringType, FloatType, IntegerType, DateType

# Create a Snowpark DataFrame from the pandas DataFrame
schema = StructType([
    StructField("CUSTOMER_ID", StringType()),
    StructField("LATITUDE", FloatType()),
    StructField("LONGITUDE", FloatType()),
    StructField("FIRSTNAME", StringType()),
    StructField("LASTNAME", StringType()),
    StructField("HOMESTORE_ID", IntegerType()),
    StructField("FIRST_ORDER_DATE", DateType())
])
# Write the data to the Snowflake table
snowpark_customer_df = new_session.create_dataframe(customer_df.to_records(index=False).tolist(), 
                                                    schema=schema)
snowpark_customer_df.write.mode("overwrite").save_as_table(f"{new_db}.{new_schema}.{table_customers}")




# Define the schema for the store data
schema = StructType([
    StructField("STORE_ID", IntegerType()),
    StructField("LATITUDE", FloatType()),
    StructField("LONGITUDE", FloatType()),
    StructField("STORE_NAME", StringType()),
    StructField("OPENDT", DateType())
])
# Write the data to the Snowflake table
snowpark_store_df = new_session.create_dataframe(store_df.to_records(index=False).tolist(), 
                                                 schema=schema)
snowpark_store_df.write.mode("overwrite").save_as_table(f"{new_db}.{new_schema}.{table_stores}")




# Define the schema for the orders data
schema = StructType([
    StructField("ORDER_ID", StringType()),
    StructField("CUSTOMER_ID", StringType()),
    StructField("STORE_ID", IntegerType()),
    StructField("ORDER_DATE", DateType()),
    StructField("SUBTOTAL", FloatType()),
    StructField("TAX", FloatType()),
    StructField("TOTAL", FloatType()),
])

# Write the data to the Snowflake table
snowpark_orders_df = new_session.create_dataframe(orders_df.to_records(index=False).tolist(), 
                                                  schema=schema)
snowpark_orders_df.write.mode("overwrite").save_as_table(f"{new_db}.{new_schema}.{table_orders}")

#### Snowflake CleanUp
When done with the exercise use the following code to clean up your snowflake enviornment

In [61]:
# new_session.sql(f"DROP TABLE IF EXISTS {new_db}.{new_schema}.{table_stores};").collect()
# new_session.sql(f"DROP TABLE IF EXISTS {new_db}.{new_schema}.{table_customers};").collect()
# new_session.sql(f"DROP TABLE IF EXISTS {new_db}.{new_schema}.{table_orders};").collect()
# new_session.sql(f"DROP SCHEMA IF EXISTS {new_db}.{new_schema};").collect()
# new_session.sql(f"DROP DATABASE IF EXISTS {new_db};").collect()
# new_session.sql(f"DROP WAREHOUSE IF EXISTS {new_WH};").collect()

In [62]:
new_session.close()