In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import random
import shutil
import sys
import time
from datetime import datetime, timedelta
from urllib.parse import quote

import pandas as pd
from faker import Faker
from sqlalchemy import create_engine

In [2]:
# Read csv files to dataframes
# The project root is taken from the "ecomm" environment variable.
# os.getenv returns None when the variable is unset, which would make
# os.path.join fail with an unhelpful TypeError, so check for it up front.
project_path = os.getenv("ecomm")
if project_path is None:
    raise RuntimeError(
        'Environment variable "ecomm" is not set; it must point to the '
        'project root that contains faker_dataset/faker_csv.'
    )

customer_csv_path = os.path.join(project_path, "faker_dataset", "faker_csv", "fake_customers.csv")
product_csv_path = os.path.join(project_path, "faker_dataset", "faker_csv", "fake_products.csv")
# Cloud Storage destination; not referenced by the other cells visible here.
output_path = "gs://ecomm_bucket001/output_files/from_faker"

# Source tables: later cells read the 'Full Name' column of the customers
# frame and the 'Title' column of the products frame.
df_customers = pd.read_csv(customer_csv_path)
df_products = pd.read_csv(product_csv_path)

In [3]:
# Pools to sample from: every customer name and every product title
customers_list = df_customers['Full Name'].to_list()
products_list = df_products['Title'].to_list()

# Payment options an order can be settled with
payment_methods = [
    "PayPal",
    "Digital Wallet",
    "Cash on Delivery",
    "Bank Transfer",
]

# Time window for the fake data: midnight on 1 Nov 2025 up to the moment
# this cell is executed
start = datetime(year=2025, month=11, day=1)
end = datetime.now()

# Number of whole days in the window (timedelta.days drops any partial day)
delta = end - start
number_of_days = delta.days

# Volume target: a fixed daily average multiplied by the whole days covered
average_daily_transaction = 800
number_of_rows = average_daily_transaction * number_of_days

In [4]:
# Show the planned volume and the number of whole days it covers, one per line
print(number_of_rows, number_of_days, sep="\n")

8800
11


In [5]:
# --- Setup ---
# Faker generator configured with the 'en_PH' locale, and the list that
# collects one dict per generated line item
fake = Faker(locale='en_PH')
data = list()

In [6]:
# Helper that produces random reference/order numbers
def random_number():
    """Return '#' followed by a random 12-digit integer.

    The integer is drawn uniformly from 100000000000 to 999999999999
    inclusive using the module-level `random` generator.
    """
    twelve_digit_value = random.randrange(10**11, 10**12)
    return "#" + str(twelve_digit_value)

# --- Generate Orders ---
# Start from an empty list so that re-running this cell replaces the previous
# batch instead of appending a second one on top of it.
data = []


def _to_snake_case(column_name):
    """Convert a CSV header such as 'Full Name' to 'full_name'."""
    return column_name.lower().replace(' ', '_')


def _build_lookup(frame, key_column):
    """Map each value of `key_column` to the remaining fields of its first row.

    Building this once replaces a full DataFrame scan per generated line item.
    Only the first row is kept for a repeated key, matching the previous
    `.loc[...].to_dict('records')[0]` behaviour. Field names are converted to
    snake_case and the key column itself is left out.
    """
    lookup = {}
    for record in frame.to_dict('records'):
        key = record[key_column]
        if key not in lookup:
            lookup[key] = {
                _to_snake_case(field): value
                for field, value in record.items()
                if field != key_column
            }
    return lookup


customer_lookup = _build_lookup(df_customers, 'Full Name')
product_lookup = _build_lookup(df_products, 'Title')

for _ in range(number_of_rows):
    # Order-level values shared by every line item of this order
    order_number = random_number()
    order_date = fake.date_time_between_dates(datetime_start=start, datetime_end=end)
    billing_name = random.choice(customers_list)
    payment_method = random.choice(payment_methods)
    payment_reference = random_number()
    # The customer does not change within an order, so look it up once here
    customer_fields = customer_lookup[billing_name]

    # Each order has 1–3 products (line items)
    for _line in range(random.randint(1, 3)):
        lineitem_name = random.choice(products_list)
        lineitem_qty = random.randint(1, 3)

        order_dict = {
            'order_number': order_number,
            'order_date': order_date,
            'year': order_date.year,
            'billing_name': billing_name,
            'lineitem_name': lineitem_name,
            'lineitem_qty': lineitem_qty,
            'payment_method': payment_method,
            'payment_reference': payment_reference,
            # Paid within one day of ordering, fulfilled one to two days after
            'payment_date': order_date + timedelta(days=random.uniform(0, 1)),
            'fulfillment_date': order_date + timedelta(days=random.uniform(1, 2)),
        }

        # Merge additional customer & product fields.
        # NOTE(review): a CSV column whose snake_case name equals one of the
        # keys above would overwrite it (unchanged from the previous code).
        order_dict.update(customer_fields)
        order_dict.update(product_lookup[lineitem_name])

        data.append(order_dict)

### Upload data to PostgreSQL database instance

In [9]:
# Build a single DataFrame from the dicts accumulated in `data`
# (one dict per generated line item).
df_orders = pd.DataFrame(data)

In [13]:
# Connection settings for the PostgreSQL instance.
# They are read from environment variables so the password is not stored in
# the notebook. The non-secret settings fall back to their previous values;
# the password has no fallback and must be supplied via ECOMM_DB_PASS
# (an empty password makes the connection attempt fail at authentication).
# NOTE(review): the password that used to be hardcoded here should be rotated.
db_user = os.environ.get("ECOMM_DB_USER", "db_user")
db_pass = os.environ.get("ECOMM_DB_PASS", "")
db_host = os.environ.get("ECOMM_DB_HOST", "10.98.224.3")
db_port = os.environ.get("ECOMM_DB_PORT", "5432")
db_name = os.environ.get("ECOMM_DB_NAME", "db_ecomm")


In [14]:
# Create the SQLAlchemy engine for the PostgreSQL database (psycopg2 driver).
# The user name and password are percent-encoded before being placed in the
# URL: characters such as '%', '@', ':' or '/' in a credential would otherwise
# be misread as URL syntax or as an escape sequence when the URL is parsed.
engine = create_engine(
    "postgresql+psycopg2://"
    f"{quote(db_user, safe='')}:{quote(db_pass, safe='')}"
    f"@{db_host}:{db_port}/{db_name}"
)

In [15]:
# Append the generated rows to the 'orders' table through the engine;
# the DataFrame index is not written as a column.
df_orders.to_sql('orders', engine, if_exists='append', index=False)

475