In [1]:
import heapq
import os
import random
from collections import defaultdict
from datetime import timedelta, datetime  # Import datetime

import pandas as pd
from faker import Faker

In [2]:
# Output directory for the generated CSV files. The trailing slash matters:
# file names are appended to this value by plain string concatenation when the
# data is persisted at the end of the notebook.
data_path = "./data/big/"

In [3]:
# Seed both random sources (the stdlib `random` module and Faker's shared
# generator) so that Restart & Run All draws the same random sequence each time.
# NOTE: project dates are generated relative to the current time ('-2y'/'-1y'
# windows and datetime.now()), so absolute timestamps still shift from day to day.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

fake = Faker()
Faker.seed(RANDOM_SEED)

In [4]:
# 1 CONFIGURATION
# --- Projects and workstreams ---
NUM_PROJECTS = 100
WORKSTREAMS_MIN, WORKSTREAMS_MAX = 50, 150  # inclusive bounds on workstreams per project

# --- Resources ---
NUM_PEOPLE = 2_000  # labor resources
NUM_EQUIP_TYPES = 30
NUM_MAT_TYPES = 30

# --- Organisations ---
NUM_SUPPLIERS = 80
NUM_TEAMS = 500

# --- Tasks ---
TASKS_PER_PROJECT = 3_000  # for each of the 100 projects => 300k tasks total
COMMISSIONING_PCT = 0.05   # 5% tasks are CommissioningActivity
OVERLAP_PCT = 0.30         # ~30% tasks have some overlapping schedule

In [5]:
# Catalogue of the 35 labor skill types a person can be assigned.
LABOR_SKILLS = [
    "CivilEngineer", "Electrician", "Carpenter", "Plumber", "HVACTech",
    "SteelWorker", "ConcreteSpecialist", "SitePlanner", "InsulationWorker", "Rigger",
    "Painter", "Mason", "IronWorker", "WeldingTech", "Surveyor",
    "SafetyEngineer", "ProjectManager", "StructuralEngineer", "GeotechEngineer", "NetworkEngineer",
    "SysAdmin", "SecurityTech", "CableInstaller", "CraneOperator", "DataCenterArchitect",
    "InstrumentationTech", "HeatVentEngineer", "FireProtectionTech", "LandscapeTech", "ElectricPanelInstaller",
    "PowerDistEngineer", "CoolingSystemTech", "RoboticsTech", "ITIntegrationTech", "BackupSysEngineer",
]

# Catalogue of the 30 equipment types an equipment resource can have.
EQUIPMENT_TYPES = [
    "Crane", "Excavator", "ConcreteMixer", "Generator", "Forklift",
    "Bulldozer", "DumpTruck", "BoomLift", "ScissorLift", "WeldingMachine",
    "AirCompressor", "HydraulicPress", "TowerCrane", "RoadRoller", "PileDriver",
    "CrawlerLoader", "Graders", "Trenchers", "Pumps", "Drills",
    "MobileScaffolding", "DirectionalDrill", "ConcretePump", "PowerSaw", "JackHammer",
    "LaserLevel", "GroundPenetradar", "DronesSurvey", "CablePuller", "PortableLifts",
]

# Catalogue of the 30 material types a material resource can have.
MATERIAL_TYPES = [
    "Concrete", "SteelBeam", "Rebar", "ElectricCable", "Ducting",
    "FiberOptics", "NetworkingRack", "HVACDuct", "InsulationPanels", "GlassPanels",
    "Bricks", "Asphalt", "Sand", "Gravel", "Lumber",
    "RoofingSheets", "PlasticPipes", "CopperTubing", "WiringHarness", "ServerChassis",
    "CoolingPipes", "RaisedFloorPanels", "FireProofing", "EpoxyCoat", "SecuritySensors",
    "DoorFrames", "MetalDoors", "AccessPanels", "FiberPatchCords", "DataCabinets",
]

In [6]:
# 2 Accumulators for the generated records (one list of dicts per entity type).
# Each name is bound to its own independent empty list.
mega_projects, workstreams = [], []
people, equipment_list, material_list = [], [], []
suppliers, teams = [], []
tasks, procurement_orders = [], []

In [7]:
# 3 Generate Projects
# Every project starts somewhere between two years and one year ago and is
# planned to run for 300-800 days; the actual end date is left unset.
for project_idx in range(NUM_PROJECTS):
    project_start = fake.date_time_between(start_date='-2y', end_date='-1y')
    planned_duration = timedelta(days=random.randint(300, 800))  # large range
    planned_end = project_start + planned_duration
    budget = round(random.uniform(10_000_000, 500_000_000), 2)
    mega_projects.append({
        "id": f"proj_{project_idx}",
        "projectID": f"PROJECT-{1000+project_idx}",
        "projectName": f"MegaDataCenter_{project_idx}",
        "overallBudget": budget,
        "startDate": project_start.isoformat(),
        "plannedEndDate": planned_end.isoformat(),
        "actualEndDate": None,  # we might set if "completed"
    })

In [8]:
# 4 Generate WorkStreams
# Each project gets a random number of workstreams. A workstream starts in the
# first half of the project window, lasts 60-200 days and never ends after the
# project's planned end date.
ws_id = 0
for proj in mega_projects:
    stream_count = random.randint(WORKSTREAMS_MIN, WORKSTREAMS_MAX)
    proj_start = datetime.fromisoformat(proj["startDate"])
    proj_end = datetime.fromisoformat(proj["plannedEndDate"])
    # Latest allowed start offset: half of the project length (never negative).
    max_start_offset = max(0, (proj_end - proj_start).days // 2)

    for _ in range(stream_count):
        ws_start = proj_start + timedelta(days=random.randint(0, max_start_offset))
        # clamp the end to the project's planned end
        ws_end = min(ws_start + timedelta(days=random.randint(60, 200)), proj_end)
        allocated = round(random.uniform(500_000, 5_000_000), 2)

        workstreams.append({
            "id": f"ws_{ws_id}",
            "workStreamID": f"WS-{5000 + ws_id}",
            "name": f"Workstream_{ws_id}_{fake.word().title()}",
            "description": fake.sentence(nb_words=8),
            "startDate": ws_start.isoformat(),
            "endDate": ws_end.isoformat(),
            "budgetAllocated": allocated,
            "projectID": proj["id"],
        })
        ws_id += 1

In [9]:
# 5 Generate People
# One labor record per person: a random skill from LABOR_SKILLS and an hourly
# rate drawn uniformly from [10, 100], rounded to cents.
for person_idx in range(NUM_PEOPLE):
    person_skill = random.choice(LABOR_SKILLS)
    hourly = round(random.uniform(10, 100), 2)
    people.append({
        "id": f"person_{person_idx}",
        "personID": f"EMP-{8000 + person_idx}",
        "name": fake.name(),
        "skillType": person_skill,
        "hourlyRate": hourly,
        # resourceType="Labor" in an actual combined structure
    })

In [10]:
# 6 Generate Equipment
# EQUIP_TOTAL individual pieces of equipment, each of a random type from
# EQUIPMENT_TYPES with a daily rental cost in [100, 2000].
EQUIP_TOTAL = 100  # or more
for equip_idx in range(EQUIP_TOTAL):
    kind = random.choice(EQUIPMENT_TYPES)
    rental = round(random.uniform(100, 2000), 2)
    equipment_list.append({
        "id": f"equip_{equip_idx}",
        "equipmentID": f"EQ-{1000 + equip_idx}",
        "equipmentName": f"{kind}_{equip_idx}",
        "equipmentType": kind,
        "dailyRentalCost": rental,
        "capacityOrSpecs": f"{kind} spec details",
        # resourceType="Equipment"
    })

In [11]:
# 7 Generate Materials
# MAT_TOTAL material records, each of a random type from MATERIAL_TYPES with a
# unit cost in [1, 500] and a stock level between 500 and 100000.
# Supplier links are attached in a later step.
MAT_TOTAL = 100
for mat_idx in range(MAT_TOTAL):
    kind = random.choice(MATERIAL_TYPES)
    unit_price = round(random.uniform(1, 500), 2)
    stock = random.randint(500, 100000)
    material_list.append({
        "id": f"mat_{mat_idx}",
        "materialID": f"MAT-{1000 + mat_idx}",
        "materialName": f"{kind}_{mat_idx}",
        "materialType": kind,
        "unitCost": unit_price,
        "quantityOnHand": stock,
        # resourceType="Material"
    })

In [12]:
# 8 Generate Suppliers (NUM_SUPPLIERS of them)
# Each supplier gets a fake company name suffixed with " Supplies" and a fake city.
for supplier_idx in range(NUM_SUPPLIERS):
    company_name = f"{fake.company()} Supplies"
    suppliers.append({
        "id": f"sup_{supplier_idx}",
        "supplierID": f"SUP-{1000 + supplier_idx}",
        "supplierName": company_name,
        "location": fake.city(),
    })

# Attach a random supplier to a subset of materials and equipment.
# Records that are not selected simply have no "supplierID" key.
for material in material_list:
    # 70% chance to have a supplier
    if random.random() >= 0.7:
        continue
    material["supplierID"] = random.choice(suppliers)["id"]

for equipment in equipment_list:
    # 40% chance to have a supplier
    if random.random() >= 0.4:
        continue
    equipment["supplierID"] = random.choice(suppliers)["id"]  # stored for reference

In [13]:
# 9 Generate Teams
# Every team is given 5-20 distinct people, stored by id under "personIDs".
for team_idx in range(NUM_TEAMS):
    team_name = f"Team_{team_idx}_{fake.word().title()}"
    member_count = random.randint(5, 20)
    members = random.sample(people, k=member_count)
    teams.append({
        "id": f"team_{team_idx}",
        "teamID": f"TM-{3000 + team_idx}",
        "teamName": team_name,
        "personIDs": [member["id"] for member in members],
    })

In [14]:
# 10 Generate ProcurementOrders referencing suppliers & resources
# Far fewer orders than tasks. Each order names one supplier, one project and
# 1-3 resources that are either all equipment or all materials.
NUM_PROC_ORDERS = 2000
for order_idx in range(NUM_PROC_ORDERS):
    supplier = random.choice(suppliers)

    # how many resources, and from which pool
    n_resources = random.randint(1, 3)
    pool_name = random.choice(["equip", "mat"])
    pool = equipment_list if pool_name == "equip" else material_list
    picked = random.sample(pool, k=n_resources)

    # the order is dated between the linked project's start and now
    project = random.choice(mega_projects)
    ordered_at = fake.date_time_between_dates(
        datetime_start=datetime.fromisoformat(project["startDate"]),
        datetime_end=datetime.now(),
    )

    # materials carry "unitCost", equipment carries "dailyRentalCost"
    base_prices = [
        res["unitCost"] if "unitCost" in res else res["dailyRentalCost"]
        for res in picked
    ]
    order_total = round(sum(base_prices) * random.uniform(5, 100), 2)

    procurement_orders.append({
        "id": f"po_{order_idx}",
        "orderNumber": f"PO-{10000+order_idx}",
        "orderDate": ordered_at.isoformat(),
        "totalCost": order_total,
        "supplierID": supplier["id"],
        "resourceIDs": [res["id"] for res in picked],
        "belongsToProjectID": project["id"],
    })

In [15]:
# 11 Generate Tasks (including CommissioningActivity for ~5%)
# We have 3000 tasks per project => 300k tasks total. This is huge in memory.
# Each task is placed inside a randomly chosen workstream of its project and
# is always kept within that workstream's [startDate, endDate] window.
tasks = []
task_id_counter = 0

# Group workstreams by project in one pass so that each project does not have
# to rescan the complete workstream list.
ws_by_project = {}
for ws in workstreams:
    ws_by_project.setdefault(ws["projectID"], []).append(ws)

for proj in mega_projects:
    # gather that project's workstreams
    p_ws = ws_by_project.get(proj["id"], [])
    if not p_ws:
        # at least one in theory, but just skip if none
        continue

    for t_count in range(TASKS_PER_PROJECT):
        # random: pick a normal Task or a CommissioningActivity (5% chance)
        is_commission = (random.random() < COMMISSIONING_PCT)

        # pick random ws
        sel_ws = random.choice(p_ws)
        ws_start_dt = datetime.fromisoformat(sel_ws["startDate"])
        ws_end_dt = datetime.fromisoformat(sel_ws["endDate"])

        # generate a start offset; the 10-day floor keeps the randint ranges
        # below valid even for very short (or zero-length) workstreams
        total_ws_days = (ws_end_dt - ws_start_dt).days
        if total_ws_days < 10:
            total_ws_days = 10

        # if we want 30% overlap, we won't necessarily do perfect offset checks,
        # just random: 30% chance we forcibly pick a start that is within an overlapping window
        if random.random() < OVERLAP_PCT:
            # pick a random start in the first half
            start_offset = random.randint(0, total_ws_days // 2)
        else:
            # normal approach
            start_offset = random.randint(0, total_ws_days - 1)

        # Because of the 10-day floor, the offset can point past the end of a
        # short workstream. Clamp the start so the task never begins after the
        # workstream ends (which would otherwise yield endDate < startDate and
        # a negative durationDays once the end is clamped below).
        t_start = min(ws_start_dt + timedelta(days=start_offset), ws_end_dt)
        # random duration, cut off at the workstream end
        dur = random.randint(5, 50)
        t_end = min(t_start + timedelta(days=dur), ws_end_dt)

        cost_est = round(random.uniform(50_000, 1_000_000), 2)
        mil_flag = (random.random() < 0.03)  # 3% tasks are milestone

        # Build the base Task dictionary; dependency and resource fields are
        # left empty here and filled in by later steps.
        base_task = {
            "id": f"task_{task_id_counter}",
            "taskID": f"TK-{10000+task_id_counter}",
            "taskName": f"Task_{task_id_counter}_{fake.bs().title()}",
            "startDate": t_start.isoformat(),
            "endDate": t_end.isoformat(),
            "durationDays": (t_end - t_start).days,
            "costEstimate": cost_est,
            "actualCost": 0.0,
            "isCritical": False,
            "milestoneFlag": mil_flag,
            "workStreamID": sel_ws["id"],
            "dependsOnIDs": [],
            "laborIDs": [],
            "equipmentIDs": [],
            "materialIDs": [],
            "teamID": None
        }

        if is_commission:
            # It's a specialized CommissioningActivity: store the extra fields
            base_task["classType"] = "CommissioningActivity"  # marker
            base_task["commissioningChecklist"] = "Checklist items..."
            base_task["passDate"] = None  # we can fill in if needed
        else:
            base_task["classType"] = "Task"

        tasks.append(base_task)
        task_id_counter += 1

In [16]:
# 12 Build dependencies for tasks within the same project
# Approach:
#  - Group tasks by project (via their workstream).
#  - Walk each project's tasks in start-date order and give every task up to 3
#    dependencies chosen at random from earlier tasks that have already ended
#    by the time it starts.
#
# Because tasks are visited in non-decreasing start order, a task that is
# eligible as a dependency for one task stays eligible for every later task.
# That lets us keep a single growing list of eligible ids, fed from a min-heap
# ordered by end date, instead of rescanning (and re-parsing the dates of) all
# earlier tasks for every task.

# workstream id -> project id (first occurrence wins, like a linear search would)
project_of_ws = {}
for w in workstreams:
    project_of_ws.setdefault(w["id"], w["projectID"])

tasks_by_project = defaultdict(list)
for t in tasks:
    # we can find project by looking up the workstream -> project
    wsid = t["workStreamID"]
    if wsid in project_of_ws:
        tasks_by_project[project_of_ws[wsid]].append(t)

for p_id, tlist in tasks_by_project.items():
    # Sort on the parsed datetime so the start times seen below are guaranteed
    # to be non-decreasing, which the sweep relies on.
    tlist.sort(key=lambda x: datetime.fromisoformat(x["startDate"]))

    not_yet_ended = []  # min-heap of (end datetime, position, task id) for visited tasks
    possible_deps = []  # ids of visited tasks that end at or before the current start
    for idx, tsk in enumerate(tlist):
        start_t = datetime.fromisoformat(tsk["startDate"])

        # promote every visited task that has ended by now
        while not_yet_ended and not_yet_ended[0][0] <= start_t:
            possible_deps.append(heapq.heappop(not_yet_ended)[2])

        # pick up to 3
        nd = random.randint(0, min(3, len(possible_deps)))
        if nd > 0:
            tsk["dependsOnIDs"] = random.sample(possible_deps, nd)

        # Only now does this task become visible to later ones, so a task can
        # never depend on itself. `idx` breaks ties between equal end dates.
        heapq.heappush(
            not_yet_ended,
            (datetime.fromisoformat(tsk["endDate"]), idx, tsk["id"]),
        )

In [17]:
# 13 Resource assignment to tasks
# Purely random: 1-10 people, 0-2 pieces of equipment, 0-4 materials and,
# for roughly one task in five, a team.
for task in tasks:
    # labor
    assigned_people = random.sample(people, random.randint(1, 10))
    task["laborIDs"] = [person["id"] for person in assigned_people]

    # equip
    assigned_equipment = random.sample(equipment_list, random.randint(0, 2))
    task["equipmentIDs"] = [equipment["id"] for equipment in assigned_equipment]

    # material
    assigned_materials = random.sample(material_list, random.randint(0, 4))
    task["materialIDs"] = [material["id"] for material in assigned_materials]

    # maybe assign a team (20% chance); otherwise "teamID" is left untouched
    if random.random() < 0.2:
        task["teamID"] = random.choice(teams)["id"]

In [18]:
# 14 Approximate "critical path" marking
# Naive "longest path" approach per project. This part only builds the lookup
# tables the search uses:
#   dist_cache   - memo of results per task id
#   task_map     - task id -> task dict
#   children_map - task id -> ids of the tasks that depend on it
dist_cache = {}
task_map = {t["id"]: t for t in tasks}

children_map = defaultdict(list)
for t in tasks:
    dependent_id = t["id"]
    for prerequisite_id in t["dependsOnIDs"]:
        children_map[prerequisite_id].append(dependent_id)

def dfs_longest(tid):
    """Return the longest dependency chain that starts at task ``tid``.

    Follows ``children_map`` (task id -> ids of dependent tasks) depth-first
    and memoises every result in ``dist_cache``.

    Returns a ``(length, chain)`` tuple where ``length`` is the number of
    edges on the chain and ``chain`` is the list of task ids along it,
    beginning with ``tid``. When several children tie for the longest chain,
    the one listed first in ``children_map[tid]`` is used.
    """
    memo = dist_cache.get(tid)
    if memo is not None:
        return memo

    dependents = children_map[tid]
    if dependents:
        # max() keeps the first maximal element, so ties go to the earliest child
        depth, tail = max(
            (dfs_longest(child) for child in dependents),
            key=lambda found: found[0],
        )
        outcome = (depth + 1, [tid] + tail)
    else:
        # a task nothing depends on is a chain of its own
        outcome = (0, [tid])

    dist_cache[tid] = outcome
    return outcome

# Per project: run the search from every root task, then flag the tasks on the
# longest chain found as critical.
for p_id, tlist in tasks_by_project.items():
    # roots are tasks without any dependency
    for root in (task for task in tlist if not task["dependsOnIDs"]):
        dfs_longest(root["id"])

    # longest cached chain among this project's tasks (first one wins on ties)
    local_max = -1
    local_chain = []
    for t in tlist:
        cached = dist_cache.get(t["id"])
        if cached is not None and cached[0] > local_max:
            local_max, local_chain = cached

    # Mark tasks in local_chain as isCritical
    for ctid in local_chain:
        task_map[ctid]["isCritical"] = True

In [19]:
# Summaries: one line per entity type with the number of generated records.
print("\n--- Data Generation Complete ---")
print("Created Data Summary:\n")
summary_rows = [
    (" MegaProjects:", mega_projects),
    (" WorkStreams:", workstreams),
    (" People(Labor):", people),
    (" EquipmentResources:", equipment_list),
    (" MaterialResources:", material_list),
    (" Suppliers:", suppliers),
    (" Teams:", teams),
    (" ProcurementOrders:", procurement_orders),
]
for label, records in summary_rows:
    print(label, len(records))
print(" Tasks:", len(tasks), " (approx. 3000 per project => ~300k total)")


--- Data Generation Complete ---
Created Data Summary:

 MegaProjects: 100
 WorkStreams: 9734
 People(Labor): 2000
 EquipmentResources: 100
 MaterialResources: 100
 Suppliers: 80
 Teams: 500
 ProcurementOrders: 2000
 Tasks: 300000  (approx. 3000 per project => ~300k total)


In [20]:
# Show the first record of every entity type as a quick sanity check.
sample_sources = (
    ("\nSample MegaProject:\n", mega_projects),
    ("\nSample WorkStream:\n", workstreams),
    ("\nSample Person (Labor):\n", people),
    ("\nSample Equipment:\n", equipment_list),
    ("\nSample Material:\n", material_list),
    ("\nSample Supplier:\n", suppliers),
    ("\nSample Team:\n", teams),
    ("\nSample ProcurementOrder:\n", procurement_orders),
    ("\nSample Task:\n", tasks),
)
for heading, records in sample_sources:
    print(heading, records[0])


Sample MegaProject:
 {'id': 'proj_0', 'projectID': 'PROJECT-1000', 'projectName': 'MegaDataCenter_0', 'overallBudget': 460459136.18, 'startDate': '2023-11-14T06:25:17', 'plannedEndDate': '2025-08-30T06:25:17', 'actualEndDate': None}

Sample WorkStream:
 {'id': 'ws_0', 'workStreamID': 'WS-5000', 'name': 'Workstream_0_Song', 'description': 'Degree compare much tough pick effect political.', 'startDate': '2024-08-20T06:25:17', 'endDate': '2024-10-26T06:25:17', 'budgetAllocated': 639931.73, 'projectID': 'proj_0'}

Sample Person (Labor):
 {'id': 'person_0', 'personID': 'EMP-8000', 'name': 'Robin Anderson', 'skillType': 'CraneOperator', 'hourlyRate': 92.18}

Sample Equipment:
 {'id': 'equip_0', 'equipmentID': 'EQ-1000', 'equipmentName': 'AirCompressor_0', 'equipmentType': 'AirCompressor', 'dailyRentalCost': 350.36, 'capacityOrSpecs': 'AirCompressor spec details', 'supplierID': 'sup_13'}

Sample Material:
 {'id': 'mat_0', 'materialID': 'MAT-1000', 'materialName': 'AccessPanels_0', 'materialT

In [22]:
# persist the data
# Each entity list is written to "<data_path><name>.csv" with identical CSV
# options. The output directory is created first so a missing folder does not
# abort the write after the (long) generation run.
os.makedirs(data_path, exist_ok=True)

datasets_to_save = {
    "mega_projects": mega_projects,
    "workstreams": workstreams,
    "people": people,
    "equipment_list": equipment_list,
    "material_list": material_list,
    "suppliers": suppliers,
    "teams": teams,
    "procurement_orders": procurement_orders,
    "tasks": tasks,
}
for file_stem, records in datasets_to_save.items():
    pd.DataFrame(records).to_csv(
        data_path + file_stem + ".csv",
        encoding="utf-8",
        escapechar="\"",
        index=False,
    )