Synthetic Starbucks Data Generator
This code simulates a week of Starbucks orders, creating realistic-looking data that you can use for testing your ETL pipeline. Here's what it does. It doesn't do it well, and I wonder if an LLM would generate better data via a prompt?!

7 days of orders from 5 AM to 10 PM
Realistic traffic patterns: More orders during morning rush (7-9 AM), lunch (11-2 PM), and evening (5-7 PM)
8 drink types with varying prep times and prices
Random toppings that add $0.50 each
Mobile vs in-store orders with different wait times

Each order includes:

Order timestamp
Drink type (Coffee, Latte, Frappuccino, etc.)
Toppings list
Total price
Prep time & wait time
Whether it was a mobile order

In [1]:
import random
from datetime import datetime, timedelta

# this function will 'simulate' a starbucks for a week, you can mod it if you'd like. It's extremely basic, but should hopefully generate some fun data to play with.
def simulate_starbucks(drink_types=None):
    if drink_types is None:
        drink_types = {
            'Coffee': {'prep_time': (1, 3), 'price': (2.5, 4.0)},
            'Latte': {'prep_time': (3, 5), 'price': (3.5, 5.0)},
            'Americano': {'prep_time': (2, 4), 'price': (3.0, 4.5)},
            'Cappuccino': {'prep_time': (3, 5), 'price': (3.5, 5.0)},
            'Mocha': {'prep_time': (4, 6), 'price': (4.0, 5.5)},
            'Espresso': {'prep_time': (1, 2), 'price': (2.0, 3.0)},
            'Tea': {'prep_time': (2, 4), 'price': (2.5, 4.0)},
            'Frappuccino': {'prep_time': (5, 8), 'price': (4.5, 6.5)},
        }

    toppings = ['Whipped Cream', 'Caramel Drizzle', 'Chocolate Syrup', 'Cinnamon', 'Vanilla Syrup']

    start_date = datetime(2024, 4, 1, 5, 0)  # Monday at 5:00 AM

    opening_time = 5  # 5:00 AM
    closing_time = 22  # 10:00 PM

    # we have peak order times, right?!
    morning_peak = (7, 9)
    lunch_peak = (11, 14)
    evening_peak = (17, 19)

    orders = []

    for day in range(7):  # you could pass in an argument for days if you'd like
        current_date = start_date + timedelta(days=day)
        current_time = current_date

        while current_time.hour < closing_time:
            # Determine the likelihood of an order based on the time of day
            if morning_peak[0] <= current_time.hour < morning_peak[1]:
                order_chance = 0.8
            elif lunch_peak[0] <= current_time.hour < lunch_peak[1]:
                order_chance = 0.6
            elif evening_peak[0] <= current_time.hour < evening_peak[1]:
                order_chance = 0.5
            else:
                order_chance = 0.3

            # Generate an order based on the calculated chance
            if random.random() < order_chance:
                # select it at random
                drink = random.choice(list(drink_types.keys()))

                # upack the min/max price for each drink (there's a fancy way to do this too, an LLM would use that)
                min_price, max_price = drink_types[drink]['price']
                base_price = random.uniform(min_price, max_price)

                # Add random toppings (0 to 2)
                num_toppings = random.randint(0, 2)
                selected_toppings = random.sample(toppings, num_toppings)
                price = base_price + (len(selected_toppings) * 0.5)  # add some $$$ for each topping

                is_mobile = random.choice([True, False]) # there's a better algo for this

                # same thing, we have prep time ranges
                min_prep_time, max_prep_time = drink_types[drink]['prep_time']
                prep_time = random.randint(min_prep_time, max_prep_time)

                # Mobile orders typically have shorter wait times
                if is_mobile:
                    wait_time = prep_time + random.randint(0, 2)
                else:
                    wait_time = prep_time + random.randint(1, 5)

                # construct le order
                order = {
                    'order_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
                    'drink_type': drink,
                    'toppings': selected_toppings,
                    'price': round(price, 2),
                    'prep_time': prep_time,
                    'wait_time': wait_time,
                    'is_mobile': is_mobile
                }
                orders.append(order)

            # just moving the time forward
            current_time += timedelta(minutes=random.randint(1, 5))

    return orders

# calling the function to get some data
simulated_data = simulate_starbucks()

# Print the first 5 orders
for order in simulated_data[:5]:
    print(order)

# in our example, we can make custom drinks too
fru_fru_drinks = {
    'Special Coffee': {'prep_time': (2, 4), 'price': (3.0, 5.0)},
    'Super Mocha Frappuccino': {'prep_time': (6, 10), 'price': (5.5, 7.5)},
    'Herbal Tea Infusion': {'prep_time': (3, 5), 'price': (3.5, 4.5)},
}

# Just pass them in
custom_simulated_data = simulate_starbucks(fru_fru_drinks)

# Print the first few orders of the custom simulation
print("\nCustom Drink Simulation:")
for order in custom_simulated_data[:5]:
    print(order)

{'order_time': '2024-04-01 05:20:00', 'drink_type': 'Mocha', 'toppings': [], 'price': 4.9, 'prep_time': 4, 'wait_time': 6, 'is_mobile': True}
{'order_time': '2024-04-01 05:25:00', 'drink_type': 'Espresso', 'toppings': [], 'price': 2.31, 'prep_time': 1, 'wait_time': 2, 'is_mobile': True}
{'order_time': '2024-04-01 05:35:00', 'drink_type': 'Tea', 'toppings': [], 'price': 2.85, 'prep_time': 3, 'wait_time': 5, 'is_mobile': True}
{'order_time': '2024-04-01 05:41:00', 'drink_type': 'Cappuccino', 'toppings': [], 'price': 4.7, 'prep_time': 5, 'wait_time': 7, 'is_mobile': True}
{'order_time': '2024-04-01 05:48:00', 'drink_type': 'Coffee', 'toppings': [], 'price': 3.87, 'prep_time': 2, 'wait_time': 5, 'is_mobile': False}

Custom Drink Simulation:
{'order_time': '2024-04-01 05:12:00', 'drink_type': 'Super Mocha Frappuccino', 'toppings': [], 'price': 5.72, 'prep_time': 8, 'wait_time': 10, 'is_mobile': True}
{'order_time': '2024-04-01 05:25:00', 'drink_type': 'Super Mocha Frappuccino', 'toppings': 

In [14]:
import pandas as pd
# okay, your job:
## Create a dataframe
starbucks_df = pd.DataFrame(simulated_data)

starbucks_df.head(10)
## Show basic information (info, summary stats)
starbucks_df.info()
starbucks_df.describe()
## Answer some questions:

## what is the average wait time?
avg_wait_time = starbucks_df[['wait_time']].mean(axis=0) # .groupby('wait_time').mean()
print()
print("Average Wait Time:")
print(avg_wait_time)

## NOW YOU DO THE BELOW OK!!!
## what is the average price?
avg_price = starbucks_df['price'].mean()
print()
print("Average Price:")
print(avg_price)
## what are our most popular drink types?
popular_drinks = starbucks_df['drink_type'].value_counts()
print()
print("Most Popular Drink:")
print(popular_drinks.head(1))
## what is the average wait time per drink?
avg_wait_per_drink = starbucks_df.groupby('drink_type')['wait_time'].mean()
print()
print("Average Wait Time Per Drink:")
print(avg_wait_per_drink)
## average wait time for mobile vs in-store
avg_wait_mobile = starbucks_df[starbucks_df['is_mobile']==True]['wait_time'].mean()
avg_wait_store = starbucks_df[starbucks_df['is_mobile']==False]['wait_time'].mean()
print()
print("Average Wait Time: Mobile vs In-Store:")
print(f"Average Mobile Wait Time: {avg_wait_mobile}")
print(f"Average Mobile Wait Time: {avg_wait_store}")
## when are we the busiest?

print()
## any others?

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1066 entries, 0 to 1065
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   order_time  1066 non-null   object 
 1   drink_type  1066 non-null   object 
 2   toppings    1066 non-null   object 
 3   price       1066 non-null   float64
 4   prep_time   1066 non-null   int64  
 5   wait_time   1066 non-null   int64  
 6   is_mobile   1066 non-null   bool   
dtypes: bool(1), float64(1), int64(2), object(3)
memory usage: 51.1+ KB

Average Wait Time:
wait_time    5.693246
dtype: float64

Average Price:
4.438180112570356

Most Popular Drink:
drink_type
Americano    147
Name: count, dtype: int64

Average Wait Time Per Drink:
drink_type
Americano      5.061224
Cappuccino     5.910448
Coffee         4.323741
Espresso       3.562500
Frappuccino    8.455882
Latte          6.162602
Mocha          7.256198
Tea            5.188525
Name: wait_time, dtype: float64

Average Wait Time: Mo