In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [2]:
# Path to the service-account JSON key used to authenticate against GCS.
creds_location = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not creds_location:
    # Fail here with a clear message: SparkConf.set() stringifies its value, so an
    # unset variable would otherwise be configured as the literal keyfile path "None".
    raise EnvironmentError(
        "GOOGLE_APPLICATION_CREDENTIALS is not set; export the path to the "
        "service-account JSON key before starting the notebook."
    )

In [3]:
# Spark configuration: standalone master, application name, the GCS connector jar,
# and the service-account settings that point at the credentials file.
conf = (
    SparkConf()
    .setMaster("spark://vm-instance-nyc-taxi.asia-southeast1-a.c.de-project-nyc-taxi.internal:7077")
    .setAppName("data_generator")
    .setAll([
        ("spark.jars", "/home/salacjamesrhode23/connectors/gcs-connector-hadoop3-2.2.5.jar"),
        ("google.cloud.auth.service.account.enable", "true"),
        ("google.cloud.auth.service.account.json.keyfile", creds_location),
    ])
)

In [4]:
# Reuse the running SparkContext when this cell is executed again in the same kernel;
# constructing a second SparkContext directly raises. Note that `conf` is only applied
# when a new context is actually created.
sc = SparkContext.getOrCreate(conf=conf)
hadoop_conf = sc._jsc.hadoopConfiguration()

# Register the GCS filesystem implementations for the gs:// scheme and enable
# service-account authentication with the JSON key file.
gcs_hadoop_settings = {
    "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
    "fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "fs.gs.auth.service.account.json.keyfile": creds_location,
    "fs.gs.auth.service.account.enable": "true",
}
for setting_name, setting_value in gcs_hadoop_settings.items():
    hadoop_conf.set(setting_name, setting_value)



25/11/13 10:10:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Create (or fetch) the SparkSession using the configuration of the existing context.
spark = (
    SparkSession.builder
    .config(conf=sc.getConf())
    .getOrCreate()
)

In [6]:
# Show the SparkSession object as the cell output to confirm it was created.
spark

### Generating Synthetic Data

In [7]:
import pandas as pd
import random
import time
import sys
import shutil
from faker import Faker
from datetime import datetime, timedelta

In [8]:
# Resolve the seed CSV locations under the project root (taken from the `ecomm`
# environment variable) and load both files into pandas DataFrames.
project_path = os.getenv("ecomm")
faker_csv_dir = os.path.join(project_path, "faker_dataset", "faker_csv")

customer_csv_path = os.path.join(faker_csv_dir, "fake_customers.csv")
product_csv_path = os.path.join(faker_csv_dir, "fake_products.csv")

# GCS destination for the generated orders
output_path = "gs://ecomm_bucket001/output_files/from_faker"

df_customers = pd.read_csv(customer_csv_path)
df_products = pd.read_csv(product_csv_path)

In [9]:
# Candidate billing names and product titles to sample from
customers_list = df_customers['Full Name'].to_list()
products_list = df_products['Title'].to_list()

# Payment methods an order can use
payment_methods = [
    "PayPal",
    "Digital Wallet",
    "Cash on Delivery",
    "Bank Transfer",
]

# Date window for the fake data; both bounds are midnight timestamps,
# so the window ends at the very start of 2014-12-31.
start = datetime(year=2010, month=1, day=1)
end = datetime(year=2014, month=12, day=31)

# Whole days between the two bounds
delta = end - start
number_of_days = delta.days

# Target number of orders: average daily volume times the number of days
average_daily_transaction = 800
number_of_rows = average_daily_transaction * number_of_days

In [12]:
# Report the target order count and the number of days, one value per line.
print(number_of_rows, number_of_days, sep="\n")

1460000
1825


In [13]:
# --- Setup ---
# `data` collects one dict per generated order line; `fake` is the Faker
# instance for the en_PH locale.
data = list()
fake = Faker(locale='en_PH')

In [14]:
# Start timer and check free storage in GB
free_gb_start = shutil.disk_usage("/").free / (1024 ** 3)
start_time = time.time()


def random_number():
    """Return a random 12-digit reference/order number prefixed with '#'."""
    return f"#{random.randint(100000000000, 999999999999)}"


def _build_lookup(frame, key_column):
    """Map each value of `key_column` to the remaining fields of its row.

    Field names are lower-cased with spaces replaced by underscores. When
    `key_column` contains duplicates the first row wins, which matches what
    `frame.loc[frame[key_column] == key].to_dict('records')[0]` returns.
    """
    lookup = {}
    for record in frame.to_dict('records'):
        key = record[key_column]
        if key not in lookup:
            lookup[key] = {
                name.lower().replace(' ', '_'): value
                for name, value in record.items()
                if name != key_column
            }
    return lookup


# Build the customer/product lookups once. Filtering the DataFrames with a boolean
# mask for every generated line item scans both tables millions of times and
# dominates the run time of this cell.
customer_lookup = _build_lookup(df_customers, 'Full Name')
product_lookup = _build_lookup(df_products, 'Title')

# --- Generate Orders ---
for _ in range(number_of_rows):
    order_number = random_number()
    order_date = fake.date_time_between_dates(datetime_start=start, datetime_end=end)
    billing_name = random.choice(customers_list)
    payment_method = random.choice(payment_methods)
    payment_reference = random_number()

    # Customer fields are the same for every line item of this order
    customer_info = customer_lookup[billing_name]

    # Each order has 1-3 products (line items)
    for _line_item in range(random.randint(1, 3)):
        lineitem_name = random.choice(products_list)
        lineitem_qty = random.randint(1, 3)

        order_dict = {
            'order_number': order_number,
            'order_date': order_date,
            'year': order_date.year,
            'billing_name': billing_name,
            'lineitem_name': lineitem_name,
            'lineitem_qty': lineitem_qty,
            'payment_method': payment_method,
            'payment_reference': payment_reference,
            # Payment lands within a day of the order, fulfillment one to two days after it
            'payment_date': order_date + timedelta(days=random.uniform(0, 1)),
            'fulfillment_date': order_date + timedelta(days=random.uniform(1, 2)),
        }

        # Merge additional customer & product fields (customer first, then product,
        # so a clashing key resolves the same way as before)
        order_dict.update(customer_info)
        order_dict.update(product_lookup[lineitem_name])

        data.append(order_dict)

# --- Compute Metrics ---
elapsed_time = time.time() - start_time

# Estimate total data size in GB using pandas for better accuracy
df_estimate = pd.DataFrame(data)
total_bytes = df_estimate.memory_usage(deep=True).sum()
total_gb = total_bytes / (1024 ** 3)

# Check free storage in GB
free_gb_end = shutil.disk_usage("/").free / (1024 ** 3)

# --- Print Summary ---
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Estimated total data size: {total_gb:.4f} GB")
print(f"Free storage before: {free_gb_start:.4f} GB")
print(f"Free storage after: {free_gb_end:.4f} GB")

25/11/13 10:32:02 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/11/13 10:32:02 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:218)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:923)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:154)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:262)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:169)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [13]:
# NOTE(review): `break` outside a loop is a SyntaxError (see the recorded output).
# It looks like this cell is used to stop "Run All" before the cells below
# execute - confirm the intent and replace it with an explicit guard.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
# Build a Spark DataFrame directly from `data`, the generated list of row dicts
df_orders = spark.createDataFrame(data=data)

In [None]:
# Write the orders as Parquet to `output_path`, partitioned by `year`,
# replacing whatever is already stored there.
(
    df_orders.write
    .mode("overwrite")
    .partitionBy("year")
    .parquet(output_path)
)

# WORKAROUND - DELETE WHEN IN PRODUCTION

In [14]:
# Materialise the generated row dicts as a pandas DataFrame
df_orders = pd.DataFrame(data=data)

In [15]:
# Cast the three timestamp columns to microsecond precision before saving
for timestamp_column in ('order_date', 'payment_date', 'fulfillment_date'):
    df_orders[timestamp_column] = df_orders[timestamp_column].astype('datetime64[us]')

In [16]:
# Make sure the local `pq/` directory exists before writing the part file;
# the write fails if the parent directory is missing.
os.makedirs("pq", exist_ok=True)
df_orders.to_parquet("pq/part-00003-8b63187c-abf5-40d4-afe6-e5f6c289e8fe-c000.snappy.parquet", engine="pyarrow", index=False)

### Parquet file names:
part-00000-8b63187c-abf5-40d4-afe6-e5f6c289e8fe-c000.snappy.parquet <br>
part-00001-8b63187c-abf5-40d4-afe6-e5f6c289e8fe-c000.snappy.parquet <br>
part-00002-8b63187c-abf5-40d4-afe6-e5f6c289e8fe-c000.snappy.parquet <br>
part-00003-8b63187c-abf5-40d4-afe6-e5f6c289e8fe-c000.snappy.parquet 

In [17]:
## Load the Parquet files under pq/ into a Spark DataFrame so they can be written to GCS
df_orders = spark.read.format("parquet").load("pq/")

                                                                                

In [18]:
# GCS destination for the final Parquet output.
# NOTE(review): same value as the `output_path` assigned in the CSV-loading cell;
# presumably repeated so this section runs without that cell - confirm.
output_path = "gs://ecomm_bucket001/output_files/from_faker"

In [19]:
# Redistribute the orders into 15 partitions and write them as Parquet to
# `output_path`, replacing any existing output.
(
    df_orders
    .repartition(15)
    .write
    .mode("overwrite")
    .parquet(output_path)
)

                                                                                