In [11]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def generate_realistic_hourly_orders(target_rows=1008):
    """
    Generate realistic hourly-aggregated order data:
    - Controlled zeros
    - Peak hour boosts
    - Weekend boost
    - Small spikes and rare outliers
    """
    dishes = [
        "Burger", "Pizza", "Coke", "Fries", "Salad",
        "Pasta", "IceCream", "Sushi", "Sandwich", "Soup"
    ]
    base_popularity = [3, 4, 5, 3, 2, 2, 1, 1, 2, 1]
    
    # Hourly timestamps for past 6 weeks (~1008 hours)
    end_time = datetime.now().replace(minute=0, second=0, microsecond=0)
    start_time = end_time - timedelta(days=42)
    timestamps = pd.date_range(start=start_time, end=end_time, freq='H')
    
    data = []
    
    for ts in timestamps:
        row = []
        hour = ts.hour
        weekday = ts.weekday()
        weekend_factor = 1.2 if weekday >= 5 else 1.0  # 20% boost weekends
        
        for idx, dish in enumerate(dishes):
            popularity = base_popularity[idx]
            
            # Peak hour boost
            if 11 <= hour <= 14 or 18 <= hour <= 21:
                popularity *= 1.5
            
            # Off-peak reduction
            if hour < 8 or hour > 22:
                popularity *= 0.5  # reduce lambda, less orders
            
            popularity *= weekend_factor
            
            # Sample from Poisson (ensures mostly realistic counts)
            qty = np.random.poisson(lam=popularity)
            
            # Small spike: 10% chance
            if random.random() < 0.1:
                qty += random.randint(1, 3)
            
            # Rare additive outlier: 2% chance
            if random.random() < 0.02:
                qty += random.randint(5, 10)
            
            # Cap maximum per dish
            qty = min(qty, 20)
            
            row.append(qty)
        data.append(row)
    
    df = pd.DataFrame(data, columns=dishes)
    df.insert(0, "order_placed_at", timestamps)
    
    return df

# Example usage
df_realistic = generate_realistic_hourly_orders()
print(df_realistic.head(20))
print(f"Total rows: {len(df_realistic)}")


       order_placed_at  Burger  Pizza  Coke  Fries  Salad  Pasta  IceCream  \
0  2025-09-16 18:00:00       4      7     4      9      3      5         1   
1  2025-09-16 19:00:00       7      5     7      6      2      3         1   
2  2025-09-16 20:00:00       3      5     5      2      2      1         5   
3  2025-09-16 21:00:00       7      2    12      1      4      1         0   
4  2025-09-16 22:00:00       7      7     3      3      1      2         0   
5  2025-09-16 23:00:00       1      2     2      1      2      1         0   
6  2025-09-17 00:00:00       2      3     4      1      3      2         2   
7  2025-09-17 01:00:00       0      0     0      4      1      2         0   
8  2025-09-17 02:00:00       1      2     5      2      1      4         0   
9  2025-09-17 03:00:00       2      1     3      5      0      1         1   
10 2025-09-17 04:00:00      11      0     3      9      1      2         0   
11 2025-09-17 05:00:00       2      2     1     11      0      2

  timestamps = pd.date_range(start=start_time, end=end_time, freq='H')


In [12]:
df_realistic.head(30)

Unnamed: 0,order_placed_at,Burger,Pizza,Coke,Fries,Salad,Pasta,IceCream,Sushi,Sandwich,Soup
0,2025-09-16 18:00:00,4,7,4,9,3,5,1,3,4,5
1,2025-09-16 19:00:00,7,5,7,6,2,3,1,1,2,2
2,2025-09-16 20:00:00,3,5,5,2,2,1,5,7,2,1
3,2025-09-16 21:00:00,7,2,12,1,4,1,0,1,3,2
4,2025-09-16 22:00:00,7,7,3,3,1,2,0,2,4,1
5,2025-09-16 23:00:00,1,2,2,1,2,1,0,0,0,1
6,2025-09-17 00:00:00,2,3,4,1,3,2,2,0,1,0
7,2025-09-17 01:00:00,0,0,0,4,1,2,0,1,1,1
8,2025-09-17 02:00:00,1,2,5,2,1,4,0,0,1,1
9,2025-09-17 03:00:00,2,1,3,5,0,1,1,2,1,0


In [13]:
df_realistic.to_csv("dummy_orders.csv", index=False)