In [0]:
import random
import pandas as pd
from datetime import datetime, timedelta

# ---------- Config ----------
# Pool of full names that generated records draw from.
names = [
    "Alice Sharma",
    "Rohan Singh",
    "Meera Nair",
    "Vikram Rao",
    "Sneha Kapoor",
    "Arjun Mehta",
    "Pooja Reddy",
    "Kiran Das",
    "Raj Malhotra",
    "Neha Jain",
]

# Pool of regions that generated records draw from.
regions = [
    "North",
    "South",
    "East",
    "West",
]

# Maps each (name, region, email) combination to the customer ID assigned to it,
# so a repeated combination reuses its ID instead of getting a new one.
customer_map = {}

def random_date(start, end):
    """Pick a random datetime between ``start`` and ``end`` (both inclusive).

    The result is ``start`` plus a whole number of seconds; any fractional
    second in the span between the two datetimes is dropped by ``int()``.
    """
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start + offset

# Timestamp layout shared by every date field in a generated record.
_TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"


def _email_for(name, domain):
    """Build ``first.last@domain`` from a space-separated full name."""
    return name.lower().replace(" ", ".") + "@" + domain


def _customer_id_for(key):
    """Return the ID mapped to ``key``, allocating the next sequential one if unseen."""
    customer_id = customer_map.get(key)
    if customer_id is None:
        # Number from the size of the shared map rather than a per-call counter:
        # the map outlives a single generate_records() call, and a counter that
        # restarts at 1 would hand out IDs that are already taken.
        customer_id = f"CUST{len(customer_map) + 1:03d}"
        customer_map[key] = customer_id
    return customer_id


def generate_records(n=10):
    """Generate synthetic customer records driven by random insert/update/delete actions.

    Each of the ``n`` iterations emits one base record with a random name and
    region, an ``@example.com`` email derived from the name, and an effective
    date between 2023-01-01 and 2024-01-01. The randomly chosen action then
    decides what else happens:

    * ``insert`` - nothing more; ``end_date`` is ``None``.
    * ``delete`` - ``end_date`` is set 30-365 days after the effective date.
    * ``update`` - a second record is appended with a freshly drawn name and
      region, an ``@newmail.com`` email, and an effective date 30-300 days
      after the base record's.

    Customer IDs are looked up in (and added to) the module-level
    ``customer_map``, so the same (name, region, email) combination always
    maps to the same ID, including across repeated calls.

    Args:
        n: Number of base records to generate. The returned list is longer
            than ``n`` whenever "update" actions occur.

    Returns:
        A list of dicts with keys ``customer_id``, ``customer_name``,
        ``email``, ``region``, ``effective_date`` and ``end_date``. Dates are
        strings formatted as ``%Y-%m-%dT%H:%M:%SZ``; ``end_date`` may be ``None``.
    """
    records = []

    for _ in range(n):
        name = random.choice(names)
        region = random.choice(regions)
        email = _email_for(name, "example.com")
        customer_id = _customer_id_for((name, region, email))

        effective_date = random_date(datetime(2023, 1, 1), datetime(2024, 1, 1))
        action = random.choice(["insert", "update", "delete"])

        # Only a deleted record is closed off with an end date.
        end_date = None
        if action == "delete":
            closed_at = effective_date + timedelta(days=random.randint(30, 365))
            end_date = closed_at.strftime(_TIMESTAMP_FORMAT)

        records.append({
            "customer_id": customer_id,
            "customer_name": name,
            "email": email,
            "region": region,
            "effective_date": effective_date.strftime(_TIMESTAMP_FORMAT),
            "end_date": end_date,
        })

        if action == "update":
            # NOTE(review): the follow-up row draws a fresh name/region and uses
            # a @newmail.com address, so it is keyed separately in customer_map
            # and will generally carry a different customer_id than the row it
            # follows. Confirm this is intended if the follow-up is meant to be
            # a new version of the same customer.
            new_name = random.choice(names)
            new_region = random.choice(regions)
            new_effective = effective_date + timedelta(days=random.randint(30, 300))
            new_email = _email_for(new_name, "newmail.com")

            records.append({
                "customer_id": _customer_id_for((new_name, new_region, new_email)),
                "customer_name": new_name,
                "email": new_email,
                "region": new_region,
                "effective_date": new_effective.strftime(_TIMESTAMP_FORMAT),
                "end_date": None,
            })

    return records

def upload_to_s3(records):
    """Display ``records`` and write them as a headered CSV to a dated S3 path via Spark.

    The output location is
    ``s3://siva-databricks-files/cyber/<YYYY>/<MM>/<DD>/file_<HH-MM-SS>.csv``,
    built from the local time at the moment of the call.

    Relies on a ``spark`` session being available as a global; it is not
    defined in this file (presumably supplied by the notebook runtime).

    Args:
        records: Non-empty list of dicts, one per output row.

    Raises:
        ValueError: If ``records`` is empty.
    """
    if not records:
        # Spark cannot infer a schema from zero rows; fail early with a clear message.
        raise ValueError("records is empty; nothing to upload")

    df = pd.DataFrame(records)

    # A column holding only nulls (e.g. end_date when no record was deleted)
    # gives Spark nothing to infer a type from, which breaks DataFrame creation
    # or the CSV write. Build the DataFrame from the inferable columns and add
    # the all-null ones back as typed string NULLs, keeping the column order.
    untyped = [col for col in df.columns if df[col].isna().all()]
    if untyped:
        typed = [col for col in df.columns if col not in untyped]
        select_exprs = [
            f"CAST(NULL AS STRING) AS `{col}`" if col in untyped else f"`{col}`"
            for col in df.columns
        ]
        spark_df = spark.createDataFrame(df[typed]).selectExpr(*select_exprs)
    else:
        spark_df = spark.createDataFrame(df)
    spark_df.show(truncate=False)

    # Take a single timestamp so the date folders and the file name agree.
    now = datetime.now()
    base_path = "s3://siva-databricks-files/cyber"
    output_path = f"{base_path}/{now:%Y}/{now:%m}/{now:%d}/file_{now:%H-%M-%S}.csv"

    # coalesce(1) funnels all rows into one partition so a single part file is
    # produced. NOTE: Spark's CSV writer treats output_path as a directory and
    # places that part file inside it.
    (spark_df
        .coalesce(1)
        .write
        .mode("overwrite")
        .option("header", "true")
        .csv(output_path))

    print(f"✅ File uploaded to {output_path}")


if __name__ == "__main__":
    # Script entry point: build a batch from 7 base records (every "update"
    # action appends a follow-up row, so the batch can be longer than 7) and
    # upload it.
    upload_to_s3(generate_records(7))