In [1]:
import os
import random
from datetime import timedelta, datetime  # Import datetime

import pandas as pd
from faker import Faker

In [2]:
# Output directory for the generated CSV files. The trailing slash matters:
# file names are appended to this value by plain string concatenation.
data_path = "./data/"

In [3]:
# Seed both random sources so that Restart Kernel -> Run All draws the same
# random sequence every time: the stdlib `random` module drives the numeric
# fields, Faker drives names, addresses, companies and dates.
# NOTE: Faker's relative dates ('-3y', 'today', age ranges) are anchored on
# the current day, so date fields can still shift when run on a different day.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Shared Faker instance used by every generator cell below.
fake = Faker()

In [4]:
# Configuration: how many synthetic records to generate for each entity.
# Record ids are built from the loop index (e.g. "holder_0" .. "holder_599").
NUM_POLICYHOLDERS = 600  # people who own policies
NUM_INSURERS = 10        # insurance companies
NUM_UNDERWRITERS = 20    # underwriters, one is attached to each policy
NUM_AGENTS = 20          # selling agents, one is attached to each policy
NUM_COVERAGES = 15       # coverage records that policies reference by id
NUM_POLICIES = 1200      # policies, each linked to a random holder/insurer/underwriter/agent
NUM_CLAIMS = 800         # claims, each filed against a randomly chosen policy

In [5]:
# 1. Generate PolicyHolders
# One record per holder; ids are sequential ("holder_<n>"), everything else
# comes from Faker. Multi-line addresses are flattened onto a single line.
policyholders = [
    {
        "id": f"holder_{holder_idx}",
        "name": fake.name(),
        "dateOfBirth": fake.date_of_birth(minimum_age=18, maximum_age=85).isoformat(),
        "address": ", ".join(fake.address().split("\n")),
        "phoneNumber": fake.phone_number(),
    }
    for holder_idx in range(NUM_POLICYHOLDERS)
]

In [6]:
# 2. Generate Insurers
# Company name and headquarters city come from Faker; the rating is a
# uniform draw on a 2.0 to 5.0 scale, rounded to one decimal.
insurers = [
    {
        "id": f"insurer_{insurer_idx}",
        "insurerName": f"{fake.company()} Insurance",
        "headquartersLocation": fake.city(),
        "industryRating": round(random.uniform(2.0, 5.0), 1),
    }
    for insurer_idx in range(NUM_INSURERS)
]

In [7]:
# 3. Generate Underwriters
# NOTE: each licenseID is an independent 4-digit draw, so two underwriters
# can occasionally end up with the same licenseID.
underwriters = [
    {
        "id": f"underwriter_{uw_idx}",
        "name": fake.name(),
        "licenseID": f"UW-{random.randint(1000, 9999)}",
        "experienceYears": random.randint(1, 30),
    }
    for uw_idx in range(NUM_UNDERWRITERS)
]

In [8]:
# 4. Generate Agents
# NOTE: each agentLicense is an independent 4-digit draw, so two agents can
# occasionally end up with the same license string.
agents = [
    {
        "id": f"agent_{agent_idx}",
        "name": fake.name(),
        "agencyName": f"{fake.company()} Agency",
        "agentLicense": f"AG-{random.randint(1000, 9999)}",
    }
    for agent_idx in range(NUM_AGENTS)
]

In [9]:
# 5. Generate Coverage types
# Templates as (name, (min limit, max limit), (min deductible, max deductible)).
_coverage_specs = [
    ("Liability Coverage", (10000, 500000), (100, 1000)),
    ("Collision Coverage", (5000, 100000), (250, 2000)),
    ("Comprehensive Coverage", (5000, 200000), (250, 2000)),
    ("Fire Coverage", (10000, 300000), (500, 5000)),
    ("Flood Coverage", (20000, 400000), (500, 5000)),
    ("Personal Property", (5000, 100000), (250, 2000)),
    ("Bodily Injury Liability", (20000, 500000), (0, 500)),
    ("Property Damage Liability", (10000, 300000), (0, 500)),
    ("Uninsured Motorist", (10000, 200000), (0, 500)),
    ("Underinsured Motorist", (10000, 200000), (0, 500)),
    ("Theft Coverage", (5000, 150000), (500, 3000)),
    ("Vandalism Coverage", (5000, 100000), (250, 2000)),
    ("Wind/Hail Coverage", (20000, 300000), (500, 5000)),
    ("Personal Injury Protection", (5000, 50000), (0, 500)),
    ("Loss of Use", (1000, 20000), (0, 250)),
]
possible_coverages = [
    {"coverageName": spec_name, "limitRange": limit_bounds, "deductibleRange": deductible_bounds}
    for spec_name, limit_bounds, deductible_bounds in _coverage_specs
]


def _build_coverage(coverage_idx):
    """Create one coverage record from a randomly chosen template.

    Templates are picked with replacement, so the same coverageName can
    appear under several coverage ids with different limits/deductibles.
    """
    chosen = random.choice(possible_coverages)
    limit_low, limit_high = chosen["limitRange"]
    limit_amount = random.randint(limit_low, limit_high)
    deductible_low, deductible_high = chosen["deductibleRange"]
    deductible_amount = random.randint(deductible_low, deductible_high)
    return {
        "id": f"coverage_{coverage_idx}",
        "coverageName": chosen["coverageName"],
        "coverageLimit": float(limit_amount),
        "deductible": float(deductible_amount),
    }


coverages = [_build_coverage(coverage_idx) for coverage_idx in range(NUM_COVERAGES)]

In [10]:
# 6. Generate Policies
# Each policy is linked to one random holder, insurer, underwriter and agent,
# and references a handful of distinct coverage records by id.
policies = []
policy_types = ["Auto", "Home", "Condo", "Renter", "Landlord", "Umbrella"]
status_options = ["Active", "Lapsed", "Cancelled", "Expired"]

# Keep status consistent with the policy term: a policy whose end date has
# already passed cannot be "Active", and one still inside its term cannot be
# "Expired".
statuses_after_term = [s for s in status_options if s != "Active"]
statuses_within_term = [s for s in status_options if s != "Expired"]
today = datetime.now().date()

# random.sample() raises ValueError when asked for more items than exist, so
# cap the per-policy coverage count at the number of coverages available.
max_covers = min(4, len(coverages))

for i in range(NUM_POLICIES):
    p_holder = random.choice(policyholders)
    p_insurer = random.choice(insurers)
    p_underwriter = random.choice(underwriters)
    p_agent = random.choice(agents)

    # Term starts within the last three years and runs 30 days to 3 years,
    # so the end date may lie in the past or in the future.
    start_date = fake.date_between(start_date='-3y', end_date='today')
    end_date = start_date + timedelta(days=random.randint(30, 3 * 365))

    # pick some coverage lines (1 to 4, or fewer if not enough coverages exist)
    num_covers = random.randint(1, max_covers) if max_covers else 0
    chosen_coverages = random.sample(coverages, k=num_covers)
    coverage_ids = [c["id"] for c in chosen_coverages]

    allowed_statuses = statuses_after_term if end_date < today else statuses_within_term

    policy_data = {
        "id": f"policy_{i}",
        # the index suffix keeps policy numbers unique even if the random part repeats
        "policyNumber": f"PN-{random.randint(1000, 9999)}-{i}",
        "policyType": random.choice(policy_types),
        "startDate": start_date.isoformat(),
        "endDate": end_date.isoformat(),
        "premiumAmount": round(random.uniform(200.0, 2000.0), 2),
        "status": random.choice(allowed_statuses),
        "policyHolderID": p_holder["id"],
        "insurerID": p_insurer["id"],
        "underwriterID": p_underwriter["id"],
        "coverageIDs": coverage_ids,  # simplistic approach: a plain list of coverage ids
        "agentID": p_agent["id"]
    }
    policies.append(policy_data)

In [11]:
# 7. Generate Claims
# Every claim is filed against one policy and copies that policy's holder and
# insurer ids so the three references always agree.
claims = []
claim_types = ["Accident", "Fire", "Theft", "Water Damage", "Liability", "Weather", "Vandalism"]
claim_statuses = ["Open", "Pending Review", "Settled", "Denied"]

# Draw the 5-digit claim numbers without replacement. Independent draws from
# only 90,000 values would almost certainly produce duplicate claim numbers
# at this volume. Raises ValueError if NUM_CLAIMS exceeds 90,000.
claim_numbers = random.sample(range(10000, 100000), k=NUM_CLAIMS)

# Upper bound for claim dates: policy terms can extend past today, but a
# claim cannot have been filed in the future.
now = datetime.now()

for i in range(NUM_CLAIMS):
    claim_id = f"claim_{i}"
    pol = random.choice(policies)
    if pol["status"] not in ["Active", "Lapsed"]:
        # Claims on Cancelled/Expired policies are allowed, but 40% of the time
        # the pick is replaced by a fresh random one, which mildly biases the
        # data towards Active/Lapsed policies.
        if random.random() < 0.4:
            pol = random.choice(policies)

    # holder and insurer are taken from the chosen policy
    holder_id = pol["policyHolderID"]
    ins_id = pol["insurerID"]

    # Policy dates are stored as ISO strings; parse them back before sampling.
    # The window is the policy term, clipped so it never extends beyond now.
    window_start = datetime.fromisoformat(pol["startDate"])
    window_end = min(datetime.fromisoformat(pol["endDate"]), now)
    c_date = fake.date_between(start_date=window_start, end_date=window_end)

    claim_type = random.choice(claim_types)
    amount_claimed = round(random.uniform(500.0, 30000.0), 2)

    claim_status = random.choice(claim_statuses)
    if claim_status == "Denied":
        # a denied claim pays out nothing
        amount_settled = 0.0
    else:
        # settlement is a random fraction of the claimed amount
        amount_settled = round(amount_claimed * random.uniform(0, 1.0), 2)
        # if it's "Settled", ensure amountSettled > 0 (a tiny fraction can round to 0.00)
        if claim_status == "Settled" and amount_settled == 0:
            amount_settled = round(amount_claimed * random.uniform(0.3, 1.0), 2)

    claims.append({
        "id": claim_id,
        "claimNumber": f"CL-{claim_numbers[i]}",
        "claimDate": c_date.isoformat(),
        "claimType": claim_type,
        "amountClaimed": amount_claimed,
        "amountSettled": amount_settled,
        "status": claim_status,
        "policyID": pol["id"],
        "policyHolderID": holder_id,
        "insurerID": ins_id
    })

In [12]:
# Summaries
# Report how many records were generated per entity, one line each.
generated_by_label = {
    "PolicyHolders": policyholders,
    "Insurers": insurers,
    "Underwriters": underwriters,
    "Agents": agents,
    "Coverages": coverages,
    "Policies": policies,
    "Claims": claims,
}
count_report = "\n".join(
    f"Number of {label}: {len(records)}" for label, records in generated_by_label.items()
)
# The extra "\n" argument leaves a blank line after the report.
print(count_report, "\n")

Number of PolicyHolders: 600
Number of Insurers: 10
Number of Underwriters: 20
Number of Agents: 20
Number of Coverages: 15
Number of Policies: 1200
Number of Claims: 800 



In [13]:
# Print samples
# Show the first generated record of every entity as a quick sanity check.
first_record_sources = [
    ("PolicyHolder", policyholders),
    ("Insurer", insurers),
    ("Underwriter", underwriters),
    ("Agent", agents),
    ("Coverage", coverages),
    ("Policy", policies),
    ("Claim", claims),
]
for sample_label, sample_records in first_record_sources:
    print(f"Sample {sample_label}: {sample_records[0]}")

Sample PolicyHolder: {'id': 'holder_0', 'name': 'Richard Gonzales', 'dateOfBirth': '2001-10-09', 'address': '4805 Porter Lodge Apt. 121, Thomasberg, CT 16719', 'phoneNumber': '(585)238-6800'}
Sample Insurer: {'id': 'insurer_0', 'insurerName': 'Schneider-Hicks Insurance', 'headquartersLocation': 'Port Ronaldton', 'industryRating': 4.6}
Sample Underwriter: {'id': 'underwriter_0', 'name': 'Michael Wyatt', 'licenseID': 'UW-2150', 'experienceYears': 20}
Sample Agent: {'id': 'agent_0', 'name': 'Karen Gomez', 'agencyName': 'Spencer-Garcia Agency', 'agentLicense': 'AG-6313'}
Sample Coverage: {'id': 'coverage_0', 'coverageName': 'Personal Injury Protection', 'coverageLimit': 20779.0, 'deductible': 171.0}
Sample Policy: {'id': 'policy_0', 'policyNumber': 'PN-5216-0', 'policyType': 'Auto', 'startDate': '2022-06-21', 'endDate': '2023-11-23', 'premiumAmount': 278.33, 'status': 'Active', 'policyHolderID': 'holder_194', 'insurerID': 'insurer_5', 'underwriterID': 'underwriter_10', 'coverageIDs': ['cov

In [14]:
# persist the data
# Writes one CSV per entity into data_path, named after the entity.
#
# escapechar is intentionally left at its default: the previous value '"' is
# the same character as the default quotechar, which Python 3.13+'s csv module
# rejects with ValueError. Embedded quotes are already handled by the default
# quote doubling, so the written files are unchanged.

# Create the output directory first; to_csv does not create missing folders.
if data_path:
    os.makedirs(data_path, exist_ok=True)

tables_to_write = {
    "policyholders": policyholders,
    "insurers": insurers,
    "underwriters": underwriters,
    "agents": agents,
    "coverages": coverages,
    "policies": policies,  # coverageIDs is written as the list's string form
    "claims": claims,
}
for table_name, records in tables_to_write.items():
    pd.DataFrame(records).to_csv(data_path + table_name + ".csv", encoding="utf-8", index=False)