## PART 1: Generate Fake Orders

#### Import all necessary libraries

In [1]:
import pandas as pd
import random
import re

from datetime import datetime, timedelta
from typing import List, Dict

from faker import Faker

#### Define a function to generate line items for an order

In [2]:

def generate_line_items_for_order(
    order_number: str,
    order_date: datetime,
    billing_name: str,
    payment_method: str,
    payment_reference: str,
    products: List[str],
    customer_lookup: Dict[str, dict],
    product_lookup: Dict[str, dict],
) -> List[Dict]:

    records = []
    
    payment_date = order_date + timedelta(days=random.uniform(0, 1))
    fulfillment_date = order_date + timedelta(days=random.uniform(1, 2))

    for _ in range(random.randint(1, 3)):
        product_name = random.choice(products)

        record = {
            "order_number": order_number,
            "order_date": order_date,
            "billing_name": billing_name,
            "lineitem_name": product_name,
            "lineitem_qty": random.randint(1, 3),
            "payment_method": payment_method,
            "payment_reference": payment_reference,
            "payment_date": payment_date,
            "fulfillment_date": fulfillment_date,
        }

        # Merge customer and product details
        record.update(customer_lookup.get(billing_name, {}))
        record.update(product_lookup.get(product_name, {}))

        records.append(record)

    return records

#### Define helper functions

In [3]:
def dataframe_to_lookup(
    df: pd.DataFrame,
    key_col: str,
    exclude: list[str] | None = None,
):
    exclude = set(exclude or []) | {key_col}

    return {
        row[key_col]: {k: v for k, v in row.items() if k not in exclude}
        for _, row in df.iterrows()
    }


def snake_case_formatting(s: str) -> str:
    s = re.sub(r"[^\w\s]", "", s)  # remove punctuation
    s = re.sub(r"\s+", "_", s)     # replace spaces with underscore
    return s.lower()

#### Define a function to generate orders

In [4]:
def generate_orders(
    customers_df: pd.DataFrame,
    products_df: pd.DataFrame,
    num_orders: int,
    number_of_days: int,
    payment_methods: list[str] | None = None,
) -> pd.DataFrame:

    if payment_methods is None:
        payment_methods = [
            "PayPal",
            "Digital Wallet",
            "Cash on Delivery",
            "Bank Transfer",
        ]

    customers = customers_df["Full Name"].tolist()
    products = products_df["Title"].tolist()

    # Reference numbers generator
    random_number = lambda: f"#{random.randint(100_000_000_000, 999_999_999_999)}"

    # Fast lookups
    customer_lookup = dataframe_to_lookup(customers_df, "Full Name")
    product_lookup = dataframe_to_lookup(products_df, "Title")

    data = []

    for _ in range(num_orders):
        order_number = random_number()
        order_date =  datetime.now() - timedelta(days=random.randint(0, number_of_days))
        billing_name = random.choice(customers)
        payment_method = random.choice(payment_methods)
        payment_reference = random_number()

        data.extend(
            generate_line_items_for_order(
                order_number,
                order_date,
                billing_name,
                payment_method,
                payment_reference,
                products,
                customer_lookup,
                product_lookup,
            )
        )

    df = pd.DataFrame(data)
    df.columns = [snake_case_formatting(col) for col in df.columns]

    return df

In [5]:
# Read customers csv to pandas dataframe
customers_df = pd.read_csv("dataset/fake_customers.csv")
customers_df.head()

Unnamed: 0,First Name,Last Name,Full Name,Email,Address Company,Address City,Address Province,Address Zip,Phone
0,Paul,Wagner,Paul Wagner,wagner.paul@gmail.com,"2305 Caballero Street, Dao Cove 4",Tangub,Misamis Occidental,7214,+63 9845139651
1,Brandon,Mcdonald,Brandon Mcdonald,mcdonald.brandon@gmail.com,"8713 White Street, Banaba Subdivision Phase 4",Tangub,Misamis Occidental,7214,+63 9266894048
2,Elizabeth,Sawyer,Elizabeth Sawyer,sawyer.elizabeth@gmail.com,"8280 Kanlaon Drive, Richards Village Phase 5",Ozamiz,Misamis Occidental,7207,+63 9353351153
3,Brittany,Smith,Brittany Smith,smith.brittany@gmail.com,1470 87th Drive Extension,Oroquieta,Misamis Occidental,7200,+63 9414048999
4,Christina,Gonzalez,Christina Gonzalez,gonzalez.christina@gmail.com,"Unit 333 Hanna Suites Tower 8, 2849 Sampaguita...",Tangub,Misamis Occidental,7214,+63 9171273413


In [6]:
# Read products csv to pandas dataframe
products_df = pd.read_csv("dataset/fake_products.csv")
products_df.head()

Unnamed: 0,Product SKU,Title,Product Description,Vendor,Product Category,Unit Price,Image Src
0,TOY500,Mega Construx Pokemon Buildable Figures Gift Set,Build the power that inside! The path to becom...,POKEMON,Building & Model Sets,33.55,https://cdn.shopify.com/s/files/1/0731/4400/88...
1,TOY499,Mega Construx Pokemon Rowlet vs. Eevee,Create action-packed battles with your favorit...,POKEMON,Building & Model Sets,14.99,https://cdn.shopify.com/s/files/1/0731/4400/88...
2,TOY498,Limited Edition SDCC 2018 GengarvMega Construx...,This limited-edition special figure for San Di...,POKEMON,Building & Model Sets,40.0,https://cdn.shopify.com/s/files/1/0731/4400/88...
3,TOY497,Mega Construx Pokemon Buildable Figures and En...,Pokemon trainers can practice their attacks wi...,POKEMON,Building & Model Sets,23.98,https://cdn.shopify.com/s/files/1/0731/4400/88...
4,TOY496,Mega Construx Pokemon Squirtle vs. Charmander,Create classic battles with your favorite Poke...,POKEMON,Building & Model Sets,14.99,https://cdn.shopify.com/s/files/1/0731/4400/88...


In [7]:
# Create an instace of faker
fake = Faker('en_PH')

# Define inclusive dates
start_date = datetime(2015, 1, 1)
end_date = datetime(2025, 12, 19)


# Calculate the number of days (inclusive)
number_of_days = (end_date - start_date).days + 1

# Calculate the total orders assuming 800 orders per day average
number_of_orders = number_of_days*800

print(f"Number of inclusive days: {number_of_days} days")
print(f"Total number of orders: {number_of_orders} orders")

Number of inclusive days: 750 days
Total number of orders: 600000 orders


In [8]:
# Generate fake orders
orders_df = generate_orders(
    customers_df,
    products_df,
    number_of_orders,
    number_of_days,
)

In [9]:
orders_df.head()

Unnamed: 0,order_number,order_date,billing_name,lineitem_name,lineitem_qty,payment_method,payment_reference,payment_date,fulfillment_date,first_name,...,address_city,address_province,address_zip,phone,product_sku,product_description,vendor,product_category,unit_price,image_src
0,#718984081914,2025-08-25 18:54:59.946616,Elizabeth Dodson,Shopkins World Vacation Coralee,1,Cash on Delivery,#959088025052,2025-08-25 18:55:15.731941,2025-08-26 20:51:41.123849,Elizabeth,...,Gingoog,Misamis Oriental,9014,+63 9334784300,TOY328,"Includes 2 Exclusive Shopkins, 1 suitcase, 1 P...",SHOPKINS,Animal & Collectible Toys,16.99,https://cdn.shopify.com/s/files/1/0731/4400/88...
1,#718984081914,2025-08-25 18:54:59.946616,Elizabeth Dodson,Hot Wheels Ai Intelligent Race System,2,Cash on Delivery,#959088025052,2025-08-25 18:55:15.731941,2025-08-26 20:51:41.123849,Elizabeth,...,Gingoog,Misamis Oriental,9014,+63 9334784300,TOY104,Experience Hot Wheels racing like never before...,HOT WHEELS,Vehicles & Playsets,88.99,https://cdn.shopify.com/s/files/1/0731/4400/88...
2,#718984081914,2025-08-25 18:54:59.946616,Elizabeth Dodson,Dragon Ball Z SH Figuarts Shenron KO Action Fi...,1,Cash on Delivery,#959088025052,2025-08-25 18:55:15.731941,2025-08-26 20:51:41.123849,Elizabeth,...,Gingoog,Misamis Oriental,9014,+63 9334784300,TOY150,Dragon Ball Z SH Figuarts Shenron KO Action Fi...,DRAGON BALL,Action & Roleplay Toys,46.99,https://cdn.shopify.com/s/files/1/0731/4400/88...
3,#111178111884,2024-11-17 18:54:59.946686,Benjamin Terry,Disney Princess Elena of Avalor 11inch Adventu...,3,PayPal,#540192905380,2024-11-17 22:21:24.227360,2024-11-19 02:57:24.577610,Benjamin,...,Cagayan de Oro,Misamis Oriental,9000,+63 9676956131,TOY454,Disney's Elena of Avalor must learn to be ther...,DISNEY PRINCESS,Dolls & Fashion Playsets,14.03,https://cdn.shopify.com/s/files/1/0731/4400/88...
4,#311115426899,2025-04-02 18:54:59.946741,Ryan Thomas,Nerf Modulus Recon MK11,1,Digital Wallet,#999128533401,2025-04-03 06:57:07.982721,2025-04-04 14:38:03.174377,Ryan,...,Cagayan de Oro,Misamis Oriental,9000,+63 9376104203,TOY39,Measures L 42.5 x W 5 x H 20 cm For ages 8 and...,NERF,Blaster Toys,36.0,https://cdn.shopify.com/s/files/1/0731/4400/88...


## PART 2: Save dataframe as parquet files

In [10]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import math

# Number of files
df = orders_df
n_files = 16
rows_per_file = math.ceil(len(df) / n_files)

for i in range(n_files):
    start = i * rows_per_file
    end = start + rows_per_file
    df_chunk = df.iloc[start:end]
    
    table = pa.Table.from_pandas(df_chunk)
    filename = f"part-{i:05d}-6e3c6bd2-7e8a-4504-918c-d60fcc98c75d-c000.snappy.parquet"
    pq.write_table(table, filename, compression="snappy")