In [None]:
import pandas as pd
import numpy as np
import uuid
import random
import os
import json
from datetime import date, datetime, timedelta
from dateutil.relativedelta import relativedelta

# Parameters
# Simulation window: from the start of 2024 up to the moment the script runs.
start_date = datetime(2024, 1, 1)
end_date = datetime.now()
# One anchor timestamp per simulated week; pandas' 'W' alias is weekly, anchored on Sundays.
weeks = pd.date_range(start=start_date, end=end_date, freq='W')
# Target volume of new policies; the weekly rate is truncated to an int (20 * 12 / 52 -> 4).
policies_per_month = 20
policies_per_week = int(policies_per_month * 12 / 52)
# Folder that receives the weekly JSON files; created up front if it does not exist yet.
output_dir = '/shared/sourcedata/insurance_data'
os.makedirs(output_dir, exist_ok=True)

# Helper functions
def random_date(start, end):
    """Return ``start`` shifted by a random whole number of days, at most up to ``end``.

    The offset is drawn uniformly from 0 to the number of whole days between
    ``start`` and ``end`` (both bounds included).
    """
    span_days = (end - start).days
    offset = random.randint(0, span_days)
    return start + timedelta(days=offset)

def random_name():
    """Return a random "First Last" name built from two small fixed name pools."""
    first_names = ['John', 'Jane', 'Alex', 'Emily', 'Chris', 'Katie', 'Mike', 'Laura']
    last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller']
    # The first name is drawn before the last name
    given = random.choice(first_names)
    family = random.choice(last_names)
    return " ".join((given, family))

def random_address():
    """Return a random address formatted as "<number> <street>, <city>"."""
    streets = ['Main St', 'High St', 'Park Ave', 'Oak St', 'Pine St']
    cities = ['Springfield', 'Riverside', 'Greenville', 'Franklin', 'Bristol']
    # Draw order: street, then house number (1-999), then city
    street_name = random.choice(streets)
    house_no = random.randint(1, 999)
    town = random.choice(cities)
    return "{} {}, {}".format(house_no, street_name, town)

def random_vehicle():
    """Return a random ``(make, model, year)`` tuple.

    The make is drawn with a per-brand weight, the model uniformly from that
    brand's models, and the year uniformly from 1990 to 2023 (inclusive).
    """
    # (brand, selection weight, models)
    catalogue = [
        ("Toyota", 2, ["Corolla", "Camry", "RAV4", "Prius"]),
        ("Honda", 1, ["Civic", "Accord", "CR-V", "Fit"]),
        ("Ford", 2, ["F-150", "Mustang", "Explorer", "Focus"]),
        ("Chevrolet", 1, ["Silverado", "Malibu", "Equinox", "Camaro"]),
        ("BMW", 1, ["3 Series", "5 Series", "X5"]),
        ("Mercedes-Benz", 1, ["C-Class", "E-Class", "GLC"]),
        ("Volkswagen", 1, ["Golf", "Passat", "Tiguan"]),
        ("Hyundai", 2, ["Elantra", "Sonata", "Tucson"]),
        ("Nissan", 1, ["Altima", "Sentra", "Rogue"]),
        ("Kia", 3, ["Optima", "Sorento", "Sportage"]),
        ("Tesla", 1, ["Model 3", "Model Y"]),
    ]
    factors = [factor for _, factor, _ in catalogue]

    # Weighted draw of the brand, then uniform draws for the model and the year
    make, _, models = random.choices(catalogue, weights=factors, k=1)[0]
    model = random.choice(models)
    year = random.randint(1990, 2023)
    return make, model, year

def random_home():
    """Return a random ``(home type, floor area in sqm)`` pair."""
    home_type = random.choice(['Apartment', 'House', 'Townhouse'])
    floor_area = random.randint(50, 300)  # sqm
    return home_type, floor_area

def random_vin(existing_vins):
    """Return a 17-character identifier that is not yet in ``existing_vins``.

    Characters are drawn from upper-case letters and digits. The returned
    value is added to ``existing_vins`` (the set is mutated in place) so that
    later calls cannot hand it out again.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    candidate = ''.join(random.choices(alphabet, k=17))
    # Redraw until the candidate has not been issued before
    while candidate in existing_vins:
        candidate = ''.join(random.choices(alphabet, k=17))
    existing_vins.add(candidate)
    return candidate
        
def random_office_timestamp(week):
    """Return a random office-hours timestamp on a working day of ``week``'s week.

    Args:
        week: Any ``datetime``-like value (``datetime`` or ``pandas.Timestamp``)
            that falls inside the target week. It does not have to be a Monday.

    Returns:
        A ``datetime`` on Monday-Friday of the Monday-to-Sunday week that
        contains ``week``, with a time between 09:00:00 and 16:59:59.
    """
    # Office hours
    office_start = 9  # 09:00
    office_end = 17   # 17:00

    # Snap to the Monday of the week containing ``week``. Adding the weekday
    # offset to the anchor itself only works when the anchor is a Monday; for a
    # Sunday anchor it would yield Sunday-Thursday.
    monday = week - timedelta(days=week.weekday())

    # Pick a random weekday (0=Monday, 4=Friday)
    weekday_offset = random.randint(0, 4)
    day = monday + timedelta(days=weekday_offset)

    # Pick a random hour, minute and second within office hours
    hour = random.randint(office_start, office_end - 1)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)

    # Combine into a plain datetime object
    return datetime(
        year=day.year,
        month=day.month,
        day=day.day,
        hour=hour,
        minute=minute,
        second=second,
    )

def random_related_date(modification_timestamp):
    """Return a date near the calendar date of ``modification_timestamp``.

    Distribution of the result:
      * 65%: the same date
      * 30%: 1-21 days later
      *  5%: 1-7 days earlier
    """
    roll = random.random()
    anchor = modification_timestamp.date()

    if roll >= 0.95:
        # Top 5% of the roll: back-dated by up to a week
        return anchor - timedelta(days=random.randint(1, 7))
    if roll >= 0.65:
        # Next 30%: forward-dated by up to three weeks
        return anchor + timedelta(days=random.randint(1, 21))
    # Remaining 65%: unchanged
    return anchor
    
def random_new_amount(base_amount):
    """Return a randomly adjusted version of ``base_amount``.

    Distribution of the result:
      * 30%: ``base_amount`` returned unchanged (not rounded)
      * 20%: decreased by up to 15%, rounded to 2 decimals
      * 50%: increased by up to 25%, rounded to 2 decimals
    """
    roll = random.random()
    if roll < 0.3:
        return base_amount

    if roll < 0.5:
        multiplier = 1 - random.uniform(0, 0.15)
    else:
        multiplier = 1 + random.uniform(0, 0.25)
    return round(base_amount * multiplier, 2)
    
class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that writes ``date`` and ``datetime`` values as ISO 8601 strings."""

    def default(self, obj):
        """Serialize dates via ``isoformat()``; defer everything else to the base class."""
        # datetime is a subclass of date, so one check covers both types
        if not isinstance(obj, date):
            return super().default(obj)
        return obj.isoformat()

# Static tables
# Reference data that is written unchanged into every weekly file.
# ``modification_ts`` is stored as a datetime in BOTH tables so that the JSON
# encoder emits a single timestamp format (ISO 8601) for this column; mixing
# ``str(datetime)`` and datetime objects produced two different formats.
cover_types = [
    {'cover_type_id': 1, 'name': 'Fire', 'modification_ts': start_date},
    {'cover_type_id': 2, 'name': 'Theft', 'modification_ts': start_date},
    {'cover_type_id': 3, 'name': 'Liability', 'modification_ts': start_date},
    {'cover_type_id': 4, 'name': 'Collision', 'modification_ts': start_date},
]

products = [
    {'product_id': 1, 'name': 'Home Basic', 'category': 'Home', 'modification_ts': start_date},
    {'product_id': 2, 'name': 'Home Premium', 'category': 'Home', 'modification_ts': start_date},
    {'product_id': 3, 'name': 'Car TPL', 'category': 'Car', 'modification_ts': start_date},
    {'product_id': 4, 'name': 'Car OD', 'category': 'Car', 'modification_ts': start_date},
    {'product_id': 5, 'name': 'Car OD Premium', 'category': 'Car', 'modification_ts': start_date},
]

# Cover names included in each product, keyed by product name
product_covers = {
    'Home Basic': ['Fire'],
    'Home Premium': ['Fire', 'Theft'],
    'Car TPL': ['Liability'],
    'Car OD': ['Liability', 'Collision'],
    'Car OD Premium': ['Liability', 'Collision', 'Theft'],
}

# Lookup from cover name to its cover_type_id
cover_name_to_id = {c['name']: c['cover_type_id'] for c in cover_types}

# Data containers
# Cumulative record stores, extended at the end of every simulated week
all_policies, all_policy_covers = [], []
all_holders, all_addresses = [], []
all_vehicles, all_homes = [], []

# Holders created so far (candidates for reuse) and VINs already issued
holder_list = []
existing_vins = set()

# Next sequential business numbers to hand out
policy_nr_counter = 100000
customer_no_counter = 10000

# Weekly simulation: create new policies, renew or cancel existing ones, and
# write one JSON file per simulated week.
for week_idx, week in enumerate(weeks):
    # Per-week buffers; these hold exactly the records written to this week's file
    new_policies = []
    renewed_policies = []
    cancelled_policies = []
    week_holders = []
    week_addresses = []
    week_vehicles = []
    week_homes = []
    week_policy_covers = []

    # Add new policies
    for _ in range(policies_per_week):
        modification_ts = random_office_timestamp(week)
        # 10% chance to reuse an existing holder (a reused holder is not written again)
        if holder_list and random.random() < 0.1:
            holder = random.choice(holder_list)
            holder_id = holder['holder_id']
            customer_no = holder['customer_no']
        else:
            holder_id = str(uuid.uuid4())
            customer_no = customer_no_counter
            customer_no_counter += 1
            holder = {
                'holder_id': holder_id,
                'customer_no': customer_no,
                'name': random_name(),
                'dob': str(random_date(datetime(1950, 1, 1), datetime(2000, 1, 1)).date()),
                'modification_ts': modification_ts
            }
            holder_list.append(holder)
            week_holders.append(holder)

        policy_id = str(uuid.uuid4())
        policy_nr = policy_nr_counter
        policy_nr_counter += 1
        product = random.choice(products)
        address_id = str(uuid.uuid4())
        # Cover period: starts on or near the entry date and runs for one year
        start = random_related_date(modification_ts)
        end = start + relativedelta(years=1)

        # Policy
        policy = {
            'policy_id': policy_id,
            'policy_nr': policy_nr,
            'product_id': product['product_id'],
            'holder_id': holder_id,
            'customer_no': customer_no,
            'address_id': address_id,
            'start_date': start,
            'end_date': end,
            'cancel_date': None,
            'status': 'active',
            'vehicle_id': None,
            'home_id': None,
            'modification_ts': modification_ts
        }
        new_policies.append(policy)

        # Address
        address = {
            'address_id': address_id,
            'address': random_address(),
            'modification_ts': modification_ts
        }
        week_addresses.append(address)

        # Product-specific tables: a Car product insures a vehicle, anything else a home
        if product['category'] == 'Car':
            vehicle_id = str(uuid.uuid4())
            policy['vehicle_id'] = vehicle_id
            make, model, year = random_vehicle()
            vin = random_vin(existing_vins)
            vehicle = {
                'vehicle_id': vehicle_id,
                'make': make,
                'model': model,
                'year': year,
                'vin': vin,
                'modification_ts': modification_ts
            }
            week_vehicles.append(vehicle)
        else:
            home_id = str(uuid.uuid4())
            policy['home_id'] = home_id
            home_type, sqm = random_home()
            home = {
                'home_id': home_id,
                'type': home_type,
                'sqm': sqm,
                'modification_ts': modification_ts
            }
            week_homes.append(home)

        # Policy covers (based on product)
        covers_for_product = product_covers[product['name']]
        for cover_name in covers_for_product:
            gross_premium_amt = round(random.uniform(50, 500), 2)
            policy_cover = {
                'policy_id': policy_id,
                'cover_type_id': cover_name_to_id[cover_name],
                'gross_premium_amt': gross_premium_amt,
                'modification_ts': modification_ts
            }
            week_policy_covers.append(policy_cover)

        # TODO
        # 0.1% chance of cancellation in the same week

    # Bounds shared by the renewal and cancellation checks. They depend only on
    # the week, so they are computed once instead of once per policy.
    week_date = week.date()
    horizon_date = (week + timedelta(days=67)).date()
    renewals_enabled = week_idx > 52  # After 1 year

    # Only policies from earlier weeks are considered; this week's new policies
    # are appended to all_policies further down.
    for cur_policy in all_policies:
        # Simulate renewals
        # A policy is eligible while its end date lies between this week's
        # anchor date and 67 days after it.
        if (
            renewals_enabled
            and cur_policy['status'] == 'active'
            and week_date <= cur_policy['end_date'] <= horizon_date
        ):
            # 10% chance for renewal each week in eligible period
            if random.random() < 0.1:
                # Renew: a new policy record that continues where the old term ends
                modification_ts = random_office_timestamp(week)
                new_policy_id = str(uuid.uuid4())
                renewed_policy = {
                    **cur_policy,
                    'policy_id': new_policy_id,
                    'start_date': cur_policy['end_date'],
                    'end_date': cur_policy['end_date'] + relativedelta(years=1),
                    'cancel_date': None,
                    'status': 'active',
                    'modification_ts': modification_ts
                }
                renewed_policies.append(renewed_policy)

                # Copy the covers, letting each premium drift
                cur_covers = [pc for pc in all_policy_covers if pc["policy_id"] == cur_policy["policy_id"]]
                for cur_cover in cur_covers:
                    renewed_cover = {
                        'policy_id': new_policy_id,
                        'cover_type_id': cur_cover["cover_type_id"],
                        'gross_premium_amt': random_new_amount(cur_cover["gross_premium_amt"]),
                        'modification_ts': modification_ts
                    }
                    week_policy_covers.append(renewed_cover)

                # Move the tracked policy to the renewed term so it is not renewed
                # again for the same period. The dates must come from
                # renewed_policy; `policy` is a leftover of the new-policy loop
                # above and belongs to an unrelated policy.
                # NOTE(review): the tracked entry keeps the ORIGINAL policy_id, so a
                # later renewal copies the original covers and a later cancellation
                # is emitted under the original id - confirm this is intended.
                cur_policy['start_date'] = renewed_policy['start_date']
                cur_policy['end_date'] = renewed_policy['end_date']

                print(f"renewed week:{week} :policy_id {new_policy_id}")

        # Simulate cancellations
        # Only active policies that have started and still run past the horizon
        if (
            cur_policy['status'] == 'active'
            and cur_policy['start_date'] <= week_date
            and cur_policy['end_date'] >= horizon_date
        ):
            # 0.5 % chance for cancellation each week
            if random.random() < 0.005:
                # Cancel: emit a changed copy of the policy record
                modification_ts = random_office_timestamp(week)
                cancelled_policy = {
                    **cur_policy,
                    'cancel_date': modification_ts.date(),
                    'status': 'cancelled',
                    'modification_ts': modification_ts
                }
                cancelled_policies.append(cancelled_policy)

                # modify old policy to avoid cancelling multiple times
                cur_policy['status'] = 'cancelled'

                print(f"cancelled week:{week} :policy_nr {cur_policy['policy_nr']}")

    # Add to all data (renewal and cancellation records are deliberately not
    # tracked as separate entries in all_policies)
    all_policies.extend(new_policies)
    all_holders.extend(week_holders)
    all_addresses.extend(week_addresses)
    all_vehicles.extend(week_vehicles)
    all_homes.extend(week_homes)
    all_policy_covers.extend(week_policy_covers)

    # Save weekly JSON files: new, renewed and cancelled policy records share one list
    week_str = week.strftime('%Y-%m-%d')
    data = {
        'policy': new_policies + renewed_policies + cancelled_policies,
        'cover_type': cover_types,
        'policy_cover': week_policy_covers,
        'product': products,
        'holder': week_holders,
        'address': week_addresses,
        'vehicle': week_vehicles,
        'home': week_homes
    }
    with open(os.path.join(output_dir, f'data_{week_str}.json'), 'w') as f:
        json.dump(data, f, cls=DateTimeEncoder, indent=2)

print("Data generation complete. JSON files are in the 'insurance_data' directory.")
