In [1]:
import os
import random
from collections import defaultdict
from datetime import timedelta, datetime  # Import datetime

import pandas as pd
from faker import Faker

In [2]:
# Output folder for the generated CSV files, relative to the working directory.
# The trailing slash matters: file names are appended by plain string concatenation.
data_path = "./data/"

In [3]:
# Fixed seed so that Restart Kernel -> Run All regenerates the same random draws.
# NOTE: values derived from the current clock (datetime.now(), relative
# '-1y'/'-6m' date ranges) still shift from one run to the next.
SEED = 42
random.seed(SEED)   # stdlib RNG: budgets, offsets, sampling, choices

fake = Faker()
Faker.seed(SEED)    # Faker's own RNG: names, words, companies, cities, dates

In [4]:
# Dataset size knobs read by the generation cells below.
NUM_WORKSTREAMS = 200     # workstreams created for the project
NUM_PEOPLE = 1000         # labor records (people)
NUM_EQUIP_TYPES = 30      # NOTE(review): not read by any visible cell; equipment count comes from len(EQUIPMENT_TYPES)
NUM_MAT_TYPES = 30        # NOTE(review): not read by any visible cell; material count comes from len(MATERIAL_TYPES)
NUM_TASKS = 2000          # tasks spread randomly across the workstreams

In [5]:
# The 35 labor skill types a person can be assigned (one is picked at random per person)
LABOR_SKILLS = [
    "CivilEngineer",
    "Electrician",
    "Carpenter",
    "Plumber",
    "HVACTech",
    "SteelWorker",
    "ConcreteSpecialist",
    "SitePlanner",
    "InsulationWorker",
    "Rigger",
    "Painter",
    "Mason",
    "IronWorker",
    "WeldingTech",
    "Surveyor",
    "SafetyEngineer",
    "ProjectManager",
    "StructuralEngineer",
    "GeotechEngineer",
    "NetworkEngineer",
    "SysAdmin",
    "SecurityTech",
    "CableInstaller",
    "CraneOperator",
    "DataCenterArchitect",
    "InstrumentationTech",
    "HeatVentEngineer",
    "FireProtectionTech",
    "LandscapeTech",
    "ElectricPanelInstaller",
    "PowerDistEngineer",
    "CoolingSystemTech",
    "RoboticsTech",
    "ITIntegrationTech",
    "BackupSysEngineer",
]

# The 30 equipment types; one equipment record is generated per entry
EQUIPMENT_TYPES = [
    "Crane",
    "Excavator",
    "ConcreteMixer",
    "Generator",
    "Forklift",
    "Bulldozer",
    "DumpTruck",
    "BoomLift",
    "ScissorLift",
    "WeldingMachine",
    "AirCompressor",
    "HydraulicPress",
    "TowerCrane",
    "RoadRoller",
    "PileDriver",
    "CrawlerLoader",
    "Graders",
    "Trenchers",
    "Pumps",
    "Drills",
    "MobileScaffolding",
    "DirectionalDrill",
    "ConcretePump",
    "PowerSaw",
    "JackHammer",
    "LaserLevel",
    "GroundPenetradar",
    "DronesSurvey",
    "CablePuller",
    "PortableLifts",
]

# The 30 material types; one material record is generated per entry
MATERIAL_TYPES = [
    "Concrete",
    "SteelBeam",
    "Rebar",
    "ElectricCable",
    "Ducting",
    "FiberOptics",
    "NetworkingRack",
    "HVACDuct",
    "InsulationPanels",
    "GlassPanels",
    "Bricks",
    "Asphalt",
    "Sand",
    "Gravel",
    "Lumber",
    "RoofingSheets",
    "PlasticPipes",
    "CopperTubing",
    "WiringHarness",
    "ServerChassis",
    "CoolingPipes",
    "RaisedFloorPanels",
    "FireProofing",
    "EpoxyCoat",
    "SecuritySensors",
    "DoorFrames",
    "MetalDoors",
    "AccessPanels",
    "FiberPatchCords",
    "DataCabinets",
]

In [6]:
# 1) Create the single "MegaProject" that every workstream is attached to
mega_project = {
    "id": "proj_0",
    "projectID": "MEGA-DATACENTER",
    "projectName": "Mega DataCenter Construction",
    # Rounded to cents, like every other monetary field generated in this notebook
    "overallBudget": round(random.uniform(10_000_000, 500_000_000), 2),
    # Started somewhere between one year and six months ago
    "startDate": fake.date_time_between(start_date='-1y', end_date='-6m').isoformat(),
    # One year from now, truncated to whole seconds so its ISO format matches
    # the second-precision timestamps produced everywhere else
    "plannedEndDate": (datetime.now() + timedelta(days=365)).isoformat(timespec="seconds"),
    "actualEndDate": None  # not completed yet
}

In [7]:
# 2) Build NUM_WORKSTREAMS workstreams, all attached to the mega project
proj_start_dt = datetime.fromisoformat(mega_project["startDate"])


def _build_workstream(index):
    """Return one workstream record.

    The window opens 0-180 days after the project start and stays open for
    60-180 days; the budget is a random amount rounded to cents.
    """
    begins = proj_start_dt + timedelta(days=random.randint(0, 180))
    finishes = begins + timedelta(days=random.randint(60, 180))
    allocated = round(random.uniform(500_000, 5_000_000), 2)
    return {
        "id": f"ws_{index}",
        "workStreamID": f"WS-{1000 + index}",
        "name": f"Workstream_{index}_{fake.word().title()}",
        "description": fake.sentence(nb_words=8),
        "startDate": begins.isoformat(),
        "endDate": finishes.isoformat(),
        "budgetAllocated": allocated,
        "projectID": mega_project["id"],
    }


workstreams = [_build_workstream(n) for n in range(NUM_WORKSTREAMS)]

In [8]:
# 3) Supplier catalogue: NUM_SUPPLIERS fake companies, each with a fake city as location
NUM_SUPPLIERS = 40
suppliers = [
    {
        "id": f"sup_{n}",
        "supplierID": f"SUP-{1000 + n}",
        "supplierName": fake.company() + " Supplies",
        "location": fake.city(),
    }
    for n in range(NUM_SUPPLIERS)
]

In [9]:
# 4) Labor pool: NUM_PEOPLE employees, each with one skill and an hourly rate
def _iter_people(count):
    """Yield `count` labor records with a random skill and an hourly rate in [10, 100]."""
    for n in range(count):
        # Draw skill before rate so the sequence of RNG calls stays fixed
        trade = random.choice(LABOR_SKILLS)
        rate = round(random.uniform(10, 100), 2)
        yield {
            "id": f"person_{n}",
            "personID": f"EMP-{5000 + n}",
            "name": fake.name(),
            "skillType": trade,
            "hourlyRate": rate,
        }


people = list(_iter_people(NUM_PEOPLE))

In [10]:
# 5) Equipment: exactly one record per entry in EQUIPMENT_TYPES
equipment_list = [
    {
        "id": f"equip_{idx}",
        "equipmentID": f"EQ-{1000 + idx}",
        "equipmentName": kind + f"_{idx}",
        "equipmentType": kind,
        "dailyRentalCost": round(random.uniform(100, 2000), 2),
        "capacityOrSpecs": f"{kind} spec details",
    }
    for idx, kind in enumerate(EQUIPMENT_TYPES)
]

In [11]:
# 6) Materials: exactly one record per entry in MATERIAL_TYPES, each sourced from a random supplier
material_list = []
for idx, kind in enumerate(MATERIAL_TYPES):
    # Price and supplier are drawn before the dict is built so the stock
    # quantity remains the last random draw for each material.
    price = round(random.uniform(1, 500), 2)
    vendor = random.choice(suppliers)
    record = {
        "id": f"mat_{idx}",
        "materialID": f"MAT-{1000 + idx}",
        "materialName": kind + f"_{idx}",
        "materialType": kind,
        "unitCost": price,
        "quantityOnHand": random.randint(100, 100000),
        "supplierID": vendor["id"],
    }
    material_list.append(record)

# Optionally unify them all as a single Resource concept
# We'll keep them separate for clarity, but we can unify if needed.

In [12]:
# 7) Generate NUM_TASKS tasks
#   Each belongs to a randomly chosen workstream and gets a random start,
#   duration and cost estimate.
#   NOTE(review): start offset (<= 60 days) plus duration (<= 60 days) is not
#   clamped to the workstream's endDate, so a task can outlast its workstream.
workstreams_sorted = sorted(workstreams, key=lambda ws_rec: ws_rec["startDate"])


def _build_task(index):
    """Return one task record attached to a random workstream."""
    parent_ws = random.choice(workstreams_sorted)
    # Start 0-60 days after the parent workstream opens
    begins = datetime.fromisoformat(parent_ws["startDate"]) + timedelta(
        days=random.randint(0, 60)
    )
    span_days = random.randint(5, 60)
    estimate = round(random.uniform(50_000, 1_000_000), 2)
    return {
        "id": f"task_{index}",
        "taskID": f"TK-{10000 + index}",
        "taskName": f"Task_{index}_{fake.bs().title()}",
        "startDate": begins.isoformat(),
        "endDate": (begins + timedelta(days=span_days)).isoformat(),
        "durationDays": span_days,
        "costEstimate": estimate,
        "actualCost": 0.0,  # nothing spent yet
        "isCritical": False,  # set later by the critical-path cell
        "milestoneFlag": random.random() < 0.05,  # 5% chance it's a milestone
        "workStreamID": parent_ws["id"],
    }


tasks = [_build_task(n) for n in range(NUM_TASKS)]
all_task_ids = [task["id"] for task in tasks]

In [13]:
# 8) Assign Dependencies
# A task may depend on up to 3 tasks that ended at or before its own start.
# Because a dependency always finishes before its dependent begins, the
# resulting graph cannot contain cycles.
tasks_sorted = sorted(tasks, key=lambda x: x["startDate"])

# Parse every end date exactly once; the inner scan below would otherwise
# re-parse the same strings for every later task (about n^2 / 2 parses).
end_dts = [datetime.fromisoformat(t["endDate"]) for t in tasks_sorted]

for idx, tsk in enumerate(tasks_sorted):
    tsk_start = datetime.fromisoformat(tsk["startDate"])

    # Candidates: tasks sorted earlier whose end is not after this task's start
    possible_deps = [
        tasks_sorted[j]["id"] for j in range(idx) if end_dts[j] <= tsk_start
    ]

    # pick up to 3 (random.sample with k=0 returns [] for an empty pool)
    num_deps = random.randint(0, min(3, len(possible_deps)))
    chosen_deps = random.sample(possible_deps, k=num_deps)
    tsk["dependsOnIDs"] = chosen_deps

In [14]:
# 9) Assign Resources to Each Task
# Per task: 1-8 people (labor), 0-2 pieces of equipment, 0-3 materials
def _pick_ids(pool, lo, hi):
    """Sample between `lo` and `hi` (inclusive) distinct records from `pool`; return their ids."""
    how_many = random.randint(lo, hi)
    return [rec["id"] for rec in random.sample(pool, k=how_many)]


for task in tasks:
    task["laborIDs"] = _pick_ids(people, 1, 8)
    task["equipmentIDs"] = _pick_ids(equipment_list, 0, 2)
    task["materialIDs"] = _pick_ids(material_list, 0, 3)

In [15]:
# 10) Compute a simplistic "longest path" to mark isCritical
# "Longest" is measured in dependency hops (number of edges), not in summed
# task durations. A memoized DFS from every root finds the deepest chain.

# Index tasks by id, and invert dependsOnIDs into a prerequisite -> dependents map.
task_map = {t["id"]: t for t in tasks}
children_map = defaultdict(list)
for t in tasks:
    for dep_id in t["dependsOnIDs"]:
        children_map[dep_id].append(t["id"])

# task id -> (hops to its deepest descendant, chain of task ids starting at that task)
dist_cache = {}


def dfs_longest(task_id):
    """Return ``(hops, chain)`` for the longest dependency chain starting at ``task_id``.

    ``hops`` is the number of edges on the chain (0 for a task nothing depends
    on) and ``chain`` is the list of task ids along it, beginning with
    ``task_id``. Results are memoized in ``dist_cache``. When several children
    tie, the first one listed in ``children_map`` wins.

    The function recurses once per level of the chain, so recursion depth
    equals the chain length.
    """
    if task_id in dist_cache:
        return dist_cache[task_id]
    max_dist = -1
    best_chain = []
    # .get() avoids inserting empty lists into the defaultdict for leaf tasks
    for c_id in children_map.get(task_id, ()):
        d, chain = dfs_longest(c_id)
        if d > max_dist:
            max_dist = d
            best_chain = chain
    # A leaf keeps max_dist == -1, which yields (0, [task_id])
    dist_cache[task_id] = (1 + max_dist, [task_id] + best_chain)
    return dist_cache[task_id]


# tasks with no dependencies are "roots"; every other task is reachable from one
roots = [t for t in tasks if len(t["dependsOnIDs"]) == 0]

for r in roots:
    dfs_longest(r["id"])

# find global max (max() keeps the first entry on ties; the default covers an empty task list)
global_dist, global_chain = max(
    dist_cache.values(), key=lambda entry: entry[0], default=(-1, [])
)

# Clear flags left over from an earlier run of this cell, so re-running it
# after the dependencies were regenerated cannot leave a stale chain marked.
for t in tasks:
    t["isCritical"] = False

# Mark those tasks in global_chain as isCritical
for tid in global_chain:
    task_map[tid]["isCritical"] = True

In [16]:
# Summaries: record count for every generated collection
print("\n--- Data Generation Complete ---")
print("Project: 1 (MegaProject)")
for label, records in (
    ("Workstreams:", workstreams),
    ("Suppliers:", suppliers),
    ("People (Labor):", people),
    ("Equipment Resources:", equipment_list),
    ("Material Resources:", material_list),
):
    print(label, len(records))
print("Tasks:", len(tasks), "\n")


--- Data Generation Complete ---
Project: 1 (MegaProject)
Workstreams: 200
Suppliers: 40
People (Labor): 1000
Equipment Resources: 30
Material Resources: 30
Tasks: 2000 



In [17]:
# Show the first record of each collection as a quick sanity check
for label, records in (
    ("Sample WorkStream:", workstreams),
    ("Sample Supplier:", suppliers),
    ("Sample Person (Labor):", people),
    ("Sample Equipment:", equipment_list),
    ("Sample Material:", material_list),
):
    print(label, records[0])
print("\nSample Task with references:\n", tasks[0])

Sample WorkStream: {'id': 'ws_0', 'workStreamID': 'WS-1000', 'name': 'Workstream_0_Admit', 'description': 'Growth share national story stuff house cold its federal.', 'startDate': '2024-05-20T21:54:27', 'endDate': '2024-10-27T21:54:27', 'budgetAllocated': 3007086.15, 'projectID': 'proj_0'}
Sample Supplier: {'id': 'sup_0', 'supplierID': 'SUP-1000', 'supplierName': 'Thomas-Wright Supplies', 'location': 'East Louis'}
Sample Person (Labor): {'id': 'person_0', 'personID': 'EMP-5000', 'name': 'Andrea Gilbert', 'skillType': 'LandscapeTech', 'hourlyRate': 87.61}
Sample Equipment: {'id': 'equip_0', 'equipmentID': 'EQ-1000', 'equipmentName': 'Crane_0', 'equipmentType': 'Crane', 'dailyRentalCost': 1557.6, 'capacityOrSpecs': 'Crane spec details'}
Sample Material: {'id': 'mat_0', 'materialID': 'MAT-1000', 'materialName': 'Concrete_0', 'materialType': 'Concrete', 'unitCost': 220.19, 'quantityOnHand': 97953, 'supplierID': 'sup_37'}

Sample Task with references:
 {'id': 'task_0', 'taskID': 'TK-10000',

In [19]:
# persist the data
# Create the output folder up front so a fresh checkout does not fail on the first write.
os.makedirs(data_path, exist_ok=True)

# Output file name -> list of record dicts to write
csv_exports = {
    "workstreams.csv": workstreams,
    "suppliers.csv": suppliers,
    "people.csv": people,
    "equipment_list.csv": equipment_list,
    "material_list.csv": material_list,
    "tasks.csv": tasks,
}

# No escapechar: the previous value ('"') was the same character as the default
# quotechar. With the default doublequote=True, embedded quotes are already
# written as "", so the argument had no effect on the output, and using one
# character in two dialect roles may be rejected by stricter csv validation.
# NOTE: list-valued task columns (dependsOnIDs, laborIDs, ...) are written as
# their string representation and must be parsed back by the reader.
for file_name, records in csv_exports.items():
    pd.DataFrame(records).to_csv(data_path + file_name, encoding="utf-8", index=False)