In [16]:
import os
import random
from datetime import timedelta, datetime  # Import datetime

import pandas as pd
from faker import Faker

In [2]:
# Relative directory prefix that the generated CSV file names are appended to when persisting
data_path = "./data/"

In [3]:
# Seed both random sources (the stdlib `random` module and Faker's shared generator)
# so that re-running the notebook draws the same sequence of names, ids and quantities.
# Note: values derived from relative dates ('now', 'today', '+30d', ...) still move with the run date.
SEED = 42
fake = Faker()
random.seed(SEED)
Faker.seed(SEED)

In [4]:
# Configuration: number of records to generate for each entity type
NUM_MANUFACTURERS = 10
NUM_DISTRIBUTORS = 15
NUM_PHARMACIES = 20
NUM_PRODUCTS = 30
NUM_BATCHES = 40
NUM_SHIPMENTS = 50
NUM_PRESCRIPTIONS = 60
NUM_APPROVALS = 25

# Pool of RxNorm codes; each generated product is assigned one of these at random.
# NOTE(review): presumably these are meant to match codes in a separate "clinical" dataset - confirm.
rxNormPool = ["1049630", "860975", "314076", "228933"]  # minimal example

In [5]:
# 1 Generate PharmaManufacturer
# One record per manufacturer: sequential id, fake company name with a " Pharma" suffix,
# fake city, and a random five-digit license number.
manufacturers = [
    {
        "id": f"mfg_{idx}",
        "manufacturerName": fake.company() + " Pharma",
        "location": fake.city(),
        "licenseNumber": f"MFG-{random.randint(10000,99999)}",
    }
    for idx in range(NUM_MANUFACTURERS)
]

In [6]:
# 2 Generate Distributors
def _make_distributor(index):
    """Build one distributor record: sequential id, fake name and city, random five-digit distributor ID."""
    name = fake.company() + " Distribution"
    city = fake.city()
    code = f"DIST-{random.randint(10000,99999)}"
    return {
        "id": f"dist_{index}",
        "distributorName": name,
        "location": city,
        "distributorID": code,
    }

distributors = [_make_distributor(n) for n in range(NUM_DISTRIBUTORS)]

In [7]:
# 3 Generate PharmacyOrHospital
facilityTypes = ["RetailPharmacy", "HospitalPharmacy"]
# Name suffix that belongs to each facility type. Deriving the suffix from the type
# (instead of drawing both independently) prevents contradictory records such as
# a facility named "... Hospital" whose type is "RetailPharmacy".
facilityNameSuffix = {"RetailPharmacy": "Pharmacy", "HospitalPharmacy": "Hospital"}
pharmacies = []
for i in range(NUM_PHARMACIES):
    facility_type = random.choice(facilityTypes)
    pharmacies.append({
        "id": f"ph_{i}",
        "facilityName": fake.company() + " " + facilityNameSuffix[facility_type],
        "facilityType": facility_type,
        "location": fake.city()
    })

In [8]:
# 4 Generate MedicationProduct
# Brand and generic names are random words; the RxNorm code is drawn from rxNormPool,
# the strength is a random whole number of milligrams and the form is one of `forms`.
forms = ["Tablet", "Capsule", "Solution"]
products = []
for idx in range(NUM_PRODUCTS):
    brand_name = fake.word().title()
    generic_name = fake.word().lower()
    rx_code = random.choice(rxNormPool)
    dose_label = f"{random.randint(5,500)}mg"
    record = dict(
        id=f"prod_{idx}",
        internalProductCode=f"INT-{random.randint(1000,9999)}",
        brandName=brand_name,
        genericName=generic_name,
        strength=dose_label,
        form=random.choice(forms),
        rxNormCode=rx_code,
    )
    products.append(record)

In [9]:
# 5 Generate BatchOrLot
# Every batch belongs to one randomly chosen product and gets a random lot number,
# an expiry date between 30 and 730 days from today, and a produced quantity.
batches = []
for idx in range(NUM_BATCHES):
    parent_product = random.choice(products)
    lot_number = f"LOT-{random.randint(10000,99999)}"
    expires_on = fake.date_between(start_date='+30d', end_date='+730d')
    units_made = random.randint(1000, 50000)
    batches.append(dict(
        id=f"batch_{idx}",
        batchNumber=lot_number,
        expiryDate=expires_on.isoformat(),
        quantityProduced=units_made,
        productID=parent_product["id"],  # foreign key to the product record
    ))

In [10]:
# 6 Generate Shipments
# A shipment goes from a manufacturer or distributor to a distributor or pharmacy/hospital.
# pick_from_to() draws such a pair at random and makes sure both ends are different entities.
shipments = []
def pick_from_to():
    """Return a random (sender, receiver) pair of entity records.

    The sender is drawn from manufacturers + distributors, the receiver from
    distributors + pharmacies. The receiver is re-drawn until its id differs
    from the sender's id, so a distributor never ships to itself.
    """
    sender = random.choice(manufacturers + distributors)
    receiver_pool = distributors + pharmacies
    while True:
        receiver = random.choice(receiver_pool)
        if receiver["id"] != sender["id"]:
            return sender, receiver

for seq in range(NUM_SHIPMENTS):
    tracking_code = f"SHP-{random.randint(10000,99999)}"
    # Shipped some time in the past year; received 2 to 20 days after shipping
    shipped_at = fake.date_time_between(start_date='-1y', end_date='now')
    transit_time = timedelta(days=random.randint(2, 20))
    received_at = shipped_at + transit_time
    origin, destination = pick_from_to()
    # Each shipment carries between one and three distinct batches
    batch_count = random.randint(1,3)
    carried_ids = [batch["id"] for batch in random.sample(batches, k=batch_count)]

    shipments.append(dict(
        id=f"ship_{seq}",
        shipmentID=tracking_code,
        shipDate=shipped_at.isoformat(),
        receiveDate=received_at.isoformat(),
        fromEntityID=origin["id"],
        toEntityID=destination["id"],
        batchIDs=carried_ids,
    ))

In [11]:
# 7 Generate Prescription
# Each prescription references a product. Patient data is deliberately not stored in detail,
# only a minimal script: id, prescribed date and quantity.
prescriptions = []
for i in range(NUM_PRESCRIPTIONS):
    pID = f"RX-{random.randint(10000,99999)}"
    # The date is named prescribed_date (not `pd`) so the pandas alias `pd` is not
    # overwritten; otherwise any later pd.DataFrame(...) call fails on a fresh run.
    # The window is given in days because Faker reads a lowercase 'm' suffix as
    # minutes, not months: '-6m' collapsed the range to about the last day.
    prescribed_date = fake.date_between(start_date='-180d', end_date='today')
    qty = random.randint(10, 90)
    pr = random.choice(products)
    prescriptions.append({
        "id": f"rx_{i}",
        "prescriptionID": pID,
        "prescribedDate": prescribed_date.isoformat(),
        "quantity": qty,
        "productID": pr["id"]  # references product
    })

In [12]:
# 8 Generate RegulatoryApproval
# Each approval references a product and names the issuing agency.
# Matching the agency to a location (e.g. FDA for the USA) is skipped for brevity.
agencies = ["FDA", "EMA", "MHRA"]  # US, Europe, UK agencies
approvals = []
for i in range(NUM_APPROVALS):
    product_ref = random.choice(products)
    appID = f"APP-{random.randint(10000,99999)}"
    agency = random.choice(agencies)
    # Approved between about three years and about one month ago. The upper bound is
    # given in days because Faker reads a lowercase 'm' suffix as minutes, not months,
    # so '-1m' placed the end of the window at roughly yesterday.
    approvalDate = fake.date_between(start_date='-3y', end_date='-30d')
    approvals.append({
        "id": f"approval_{i}",
        "approvalID": appID,
        "agencyName": agency,
        "approvalDate": approvalDate.isoformat(),
        "productID": product_ref["id"]
    })

In [13]:
# Summaries: print how many records were generated for each entity type
entity_counts = [
    ("Manufacturers", manufacturers),
    ("Distributors", distributors),
    ("Pharmacies/Hospitals", pharmacies),
    ("Products", products),
    ("Batches", batches),
    ("Shipments", shipments),
    ("Prescriptions", prescriptions),
    ("Approvals", approvals),
]
print("\nGenerated Entities:")
for label, records in entity_counts:
    print(f" {label}:", len(records))


Generated Entities:
 Manufacturers: 10
 Distributors: 15
 Pharmacies/Hospitals: 20
 Products: 30
 Batches: 40
 Shipments: 50
 Prescriptions: 60
 Approvals: 25


In [14]:
# Show sample data: print the first record of every entity list
sample_sources = [
    ("Manufacturer", manufacturers),
    ("Distributor", distributors),
    ("Pharmacy/Hospital", pharmacies),
    ("Product", products),
    ("Batch", batches),
    ("Shipment", shipments),
    ("Prescription", prescriptions),
    ("Approval", approvals),
]
for position, (label, records) in enumerate(sample_sources):
    # Only the very first line is preceded by a blank line
    lead = "\n" if position == 0 else ""
    print(f"{lead}Sample {label}:", records[0])


Sample Manufacturer: {'id': 'mfg_0', 'manufacturerName': 'Thomas PLC Pharma', 'location': 'West Sierraside', 'licenseNumber': 'MFG-96545'}
Sample Distributor: {'id': 'dist_0', 'distributorName': 'Spence-Washington Distribution', 'location': 'Gabriellestad', 'distributorID': 'DIST-34612'}
Sample Pharmacy/Hospital: {'id': 'ph_0', 'facilityName': 'Peterson-Dyer Hospital', 'facilityType': 'HospitalPharmacy', 'location': 'West Jose'}
Sample Product: {'id': 'prod_0', 'internalProductCode': 'INT-6655', 'brandName': 'Computer', 'genericName': 'down', 'strength': '135mg', 'form': 'Tablet', 'rxNormCode': '860975'}
Sample Batch: {'id': 'batch_0', 'batchNumber': 'LOT-93721', 'expiryDate': '2025-05-20', 'quantityProduced': 24524, 'productID': 'prod_26'}
Sample Shipment: {'id': 'ship_0', 'shipmentID': 'SHP-96123', 'shipDate': '2024-12-14T16:47:59', 'receiveDate': '2025-01-03T16:47:59', 'fromEntityID': 'dist_7', 'toEntityID': 'ph_4', 'batchIDs': ['batch_10', 'batch_7']}
Sample Prescription: {'id': '

In [17]:
# Persist the data: write every generated entity list to "<data_path>/<name>.csv" as UTF-8.
# The output directory is created first because to_csv does not create missing directories.
os.makedirs(data_path, exist_ok=True)

entity_tables = {
    "manufacturers": manufacturers,
    "distributors": distributors,
    "pharmacies": pharmacies,
    "products": products,
    "batches": batches,
    "shipments": shipments,
    "prescriptions": prescriptions,
    "approvals": approvals,
}
for table_name, records in entity_tables.items():
    # Same writer options for every table; index=False keeps the row index out of the file
    pd.DataFrame(records).to_csv(
        os.path.join(data_path, f"{table_name}.csv"),
        encoding="utf-8",
        escapechar="\"",
        index=False,
    )