In [0]:
%sh
pip install faker

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Cargo Manifest Generator (Batch)
# MAGIC 
# MAGIC **Purpose**: Generate daily cargo manifest files
# MAGIC 
# MAGIC **Features**:
# MAGIC - Daily batch generation
# MAGIC - Multiple containers per manifest
# MAGIC - Realistic shipping routes and schedules

# COMMAND ----------

from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime, date, timedelta
from faker import Faker
import random
import builtins

In [0]:
# Shared Faker instance; used later to produce synthetic company names.
fake = Faker()

# Register the notebook parameters as text widgets: (name, default, label).
for widget_name, widget_default, widget_label in (
    ("catalog_name", "cargo_fleet_dev", "Catalog"),
    ("num_manifests", "100", "Number of Manifests"),
    ("load_date", str(date.today()), "Load Date"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

# Read the widget values back and convert them to their working types.
catalog_name = dbutils.widgets.get("catalog_name")

num_manifests_text = dbutils.widgets.get("num_manifests")
num_manifests = int(num_manifests_text)

load_date_text = dbutils.widgets.get("load_date")
load_date = datetime.strptime(load_date_text, "%Y-%m-%d").date()

# Landing location for the generated manifest files.
volume_path = f"/Volumes/{catalog_name}/bronze/manifest_landing"

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Define Manifest Schema and Data

# COMMAND ----------

# Schema of one manifest row (one row per container). Each spec below is
# (column name, Spark type, nullable); identifiers, ports, dates and the load
# timestamp are required, the cargo/shipper details are optional.
manifest_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("manifest_id", StringType(), False),
        ("ship_id", StringType(), False),
        ("voyage_number", StringType(), False),
        ("load_port", StringType(), False),
        ("destination_port", StringType(), False),
        ("departure_date", DateType(), False),
        ("estimated_arrival_date", DateType(), False),
        ("container_id", StringType(), False),
        ("cargo_type", StringType(), True),
        ("cargo_weight_kg", DoubleType(), True),
        ("cargo_value_usd", DoubleType(), True),
        ("shipper_name", StringType(), True),
        ("consignee_name", StringType(), True),
        ("special_handling", StringType(), True),
        ("load_timestamp", TimestampType(), False),
    )
])

# Port names available to the generator.
PORTS = [
    "Shanghai, China",
    "Singapore",
    "Rotterdam, Netherlands",
    "Los Angeles, USA",
    "Hamburg, Germany",
    "Dubai, UAE",
    "Hong Kong",
    "Busan, South Korea",
    "Long Beach, USA",
    "Antwerp, Belgium",
]

# Shipping lanes as (load port, destination port, transit time in days).
ROUTES = [
    ("Shanghai, China", "Los Angeles, USA", 14),
    ("Singapore", "Rotterdam, Netherlands", 21),
    ("Dubai, UAE", "Hamburg, Germany", 18),
    ("Hong Kong", "Long Beach, USA", 15),
    ("Busan, South Korea", "Rotterdam, Netherlands", 24),
]

# Cargo categories a container can carry.
CARGO_TYPES = [
    "ELECTRONICS",
    "PHARMACEUTICALS",
    "FOODSTUFFS",
    "CHEMICALS",
    "MACHINERY",
    "TEXTILES",
    "FURNITURE",
    "AUTOMOTIVE_PARTS",
    "APPLIANCES",
    "RAW_MATERIALS",
]

# Special-handling flags. The three leading None entries weight a uniform
# random pick towards "no special handling" (3 out of 7 choices).
SPECIAL_HANDLING = [None] * 3 + [
    "REFRIGERATED",
    "HAZARDOUS",
    "FRAGILE",
    "OVERSIZED",
]

In [0]:
# Indicative USD value per kilogram for selected cargo types; any cargo type
# missing from this table is priced at _DEFAULT_VALUE_PER_KG.
_VALUE_PER_KG = {
    "ELECTRONICS": 50, "PHARMACEUTICALS": 100,
    "FOODSTUFFS": 5, "CHEMICALS": 20,
    "MACHINERY": 30, "TEXTILES": 15,
}
_DEFAULT_VALUE_PER_KG = 10


def generate_manifest_entry(manifest_id, ship_id, route_info):
    """Generate a single manifest entry (one container row).

    Args:
        manifest_id: Identifier of the manifest the container belongs to.
            It also seeds the voyage-level fields, so every entry generated
            for the same manifest_id (and the same load_date) carries the
            same voyage number, departure date and arrival date.
        ship_id: Identifier of the ship carrying the container.
        route_info: Tuple of (load_port, destination_port, transit_days).

    Returns:
        A tuple whose fields follow the column order of manifest_schema:
        manifest_id, ship_id, voyage_number, load_port, destination_port,
        departure_date, estimated_arrival_date, container_id, cargo_type,
        cargo_weight_kg, cargo_value_usd, shipper_name, consignee_name,
        special_handling, load_timestamp.
    """
    # Local import: the notebook's import cell only brings in datetime, date
    # and timedelta from the datetime module.
    from datetime import timezone

    load_port, dest_port, transit_days = route_info

    # Voyage number and sailing date belong to the manifest, not to an
    # individual container. Drawing them from an RNG seeded with the manifest
    # id keeps all containers of one manifest on the same voyage, whereas a
    # fresh draw per call gave each container its own voyage and dates.
    voyage_rng = random.Random(str(manifest_id))
    departure = load_date + timedelta(days=voyage_rng.randint(0, 7))
    arrival = departure + timedelta(days=transit_days)
    voyage_number = f"V{voyage_rng.randint(1000, 9999)}"

    # Container-level fields stay randomised per call.
    cargo_type = random.choice(CARGO_TYPES)
    weight = random.uniform(1000, 25000)  # kg

    # Value = weight x per-kg price for the cargo type, with +/-20% noise.
    value_per_kg = _VALUE_PER_KG.get(cargo_type, _DEFAULT_VALUE_PER_KG)
    value = weight * value_per_kg * random.uniform(0.8, 1.2)

    return (
        manifest_id,
        ship_id,
        voyage_number,
        load_port,
        dest_port,
        departure,
        arrival,
        f"CONT{random.randint(100000, 999999)}",
        cargo_type,
        # builtins.round: the wildcard import of pyspark.sql.functions
        # shadows the built-in round() with the Spark column function.
        builtins.round(weight, 2),
        builtins.round(value, 2),
        fake.company(),
        fake.company(),
        random.choice(SPECIAL_HANDLING),
        # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated and
        # returns a naive value, which loses the fact that it is UTC.
        datetime.now(timezone.utc),
    )



In [0]:
# Generate manifest data: build every row on the driver, then hand the list
# to Spark in a single createDataFrame call.
manifest_data = []

# The date part of the manifest id is the same for every manifest in the batch.
load_date_tag = load_date.strftime('%Y%m%d')

for i in range(num_manifests):
    # MAN + load date + 1-based sequence number, zero-padded to 4 digits
    manifest_id = f"MAN{load_date_tag}{i + 1:04d}"
    ship_id = f"SHIP{random.randint(1, 100):04d}"
    route = random.choice(ROUTES)

    # Each manifest has 1-10 containers, all on the same ship and route
    num_containers = random.randint(1, 10)
    manifest_data.extend(
        generate_manifest_entry(manifest_id, ship_id, route)
        for _ in range(num_containers)
    )

# Create DataFrame
df_manifests = spark.createDataFrame(manifest_data, manifest_schema)

# The DataFrame has exactly one row per generated tuple, so report the local
# list length instead of launching a Spark job with count().
print(f"Generated {len(manifest_data)} manifest entries for {num_manifests} manifests")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Write to Volume

# COMMAND ----------

# Destination inside the landing volume, named after the load date
output_file = f"{volume_path}/manifest_{load_date.strftime('%Y%m%d')}.parquet"

# Persist the batch as Parquet, replacing any previous output for this date
manifest_writer = df_manifests.write.mode("overwrite")
manifest_writer.parquet(output_file)

# Summary of what was written
print(f"✓ Written to: {output_file}")

total_records = df_manifests.count()
print(f"✓ Total records: {total_records}")

unique_manifests = df_manifests.select('manifest_id').distinct().count()
print(f"✓ Unique manifests: {unique_manifests}")

# Show a small sample of the generated rows
sample_rows = df_manifests.limit(10)
display(sample_rows)