In [1]:
#Required Libraries
import pandas as pd
import numpy as np
from faker import Faker
import random
import os
import datetime
from src.Generic_Errors import apply_generic_chaos
from src.Format_Errors import chaotic_date
from src.Format_Errors import chaotic_case

In [None]:
# CONFIGURATION
# Settings shared by every dataset cell below.
DATASET_SIZE = 500  # Number of rows per file
OUTPUT_DIR = "Generated_Data"  # Folder that receives the CSV exports
rows = DATASET_SIZE

# Create the export folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One shared Faker instance, then the same fixed seed for each random source
# the dataset cells draw from (Faker, the stdlib `random` module, NumPy).
fake = Faker()
for _seed_source in (Faker.seed, random.seed, np.random.seed):
    _seed_source(42)

In [3]:
# Retail Dataset 
# Builds a synthetic retail-transactions table, injects deliberate data-quality
# problems ("chaos"), and exports it to <OUTPUT_DIR>/Retail_dataset.csv.
# Relies on `rows`, `fake` and `OUTPUT_DIR` from the configuration cell.
print(f"Generating Retail Dataset with {rows} rows.")

# Setup Lists
categories = ['Electronics', 'Clothing', 'Home', 'Grocery', 'Toys', 'Books']
payment_methods = ['Credit Card', 'Cash', 'UPI', 'Debit Card', 'Wallet']

# Each column is an independent list of `rows` values. The comprehensions run
# top to bottom, so reordering columns changes what the seeded generators emit.
df_ret = pd.DataFrame({
    "Trans_ID": [fake.uuid4()[:8] for _ in range(rows)],  # first 8 chars of a UUID
    "Date": [fake.date_this_year() for _ in range(rows)],
    "Customer_Name": [fake.name() for _ in range(rows)],
    "Customer_Email": [fake.email() for _ in range(rows)],
    "Store_City": [fake.city() for _ in range(rows)],
    "Payment_Method": [random.choice(payment_methods) for _ in range(rows)],
    "Is_Member": [random.choice([True, False]) for _ in range(rows)],
    # 0 appears twice in the pool, so "no discount" is the most likely draw
    "Discount_Pct": [random.choice([0, 0, 0.05, 0.10, 0.15, 0.20, 0.50]) for _ in range(rows)],
    "Category": [random.choice(categories) for _ in range(rows)],
    # A quantity of 0 is a possible draw
    "Qty": [random.choice([1, 2, 3, 5, 10, 0]) for _ in range(rows)],
    "Price": [round(random.uniform(10, 500), 2) for _ in range(rows)]
})

# Calculate Total with Discount applied
# NOTE: Total is computed once here. The overrides below change Qty and
# Discount_Pct afterwards without recomputing it, so those rows end up with a
# Total that no longer matches their inputs.
df_ret["Total"] = (df_ret["Price"] * df_ret["Qty"]) * (1 - df_ret["Discount_Pct"])
df_ret["Total"] = df_ret["Total"].round(2)

# 1. Existing Chaos
# .loc slices are label-based and include both endpoints (0:50 covers 51 rows).
df_ret.loc[0:50, "Total"] += 100  # Total overstated by 100
df_ret.loc[51:100, "Qty"] = -5  # negative quantity
df_ret.loc[101:150, "Date"] = datetime.date(2099, 1, 1)  # date far in the future

# 2. New Chaos for new columns
# Make some discounts greater than 100% (Math error testing)
df_ret.loc[151:170, "Discount_Pct"] = 1.50 

# Make some Payment Methods NaN/Null
df_ret.loc[171:200, "Payment_Method"] = None

# Corrupt some email formats
df_ret.loc[201:220, "Customer_Email"] = "user_at_gmail.com" # Missing @

# 3. Apply Styling Chaos
# chaotic_case, chaotic_date and apply_generic_chaos come from src/ and are not
# visible here; presumably they randomise letter case, date formatting and
# add generic noise respectively -- TODO confirm against their implementations.
df_ret["Category"] = df_ret["Category"].apply(chaotic_case) 
df_ret["Date"] = df_ret["Date"].apply(chaotic_date) 
df_ret = apply_generic_chaos(df_ret)

# Export
df_ret.to_csv(f"{OUTPUT_DIR}/Retail_dataset.csv", index=False)
print("Retail Dataset generated and saved.")

Generating Retail Dataset with 10000 rows.
Retail Dataset generated and saved.


In [4]:
# Finance Dataset
# Synthetic loan applications with deliberately seeded inconsistencies,
# exported to <OUTPUT_DIR>/Finance_dataset.csv.
print(f"Generating Finance Dataset with {rows} rows.")

# Value pools for the categorical columns
loan_purposes = ['Debt Consolidation', 'Home Improvement', 'Business', 'Education', 'Car']
emp_lengths = ['< 1 year', '1-3 years', '4-7 years', '10+ years', 'Unemployed']

# Columns are filled one at a time, in CSV column order, so the random
# generators are consumed in the same sequence on every run.
finance_columns = {}
finance_columns["App_ID"] = [f"LN-{random.randint(1000,9999)}" for _ in range(rows)]
finance_columns["App_Date"] = [fake.date_this_year() for _ in range(rows)]
finance_columns["Applicant_Name"] = [fake.name() for _ in range(rows)]
finance_columns["State"] = [fake.state_abbr() for _ in range(rows)]
finance_columns["Employment_Length"] = [random.choice(emp_lengths) for _ in range(rows)]
# The lower bound is negative on purpose: some applicants get a negative income.
finance_columns["Annual_Income"] = [random.randint(-5000, 150000) for _ in range(rows)]
finance_columns["Loan_Amount"] = [random.randint(1000, 50000) for _ in range(rows)]
finance_columns["Loan_Term_Months"] = [random.choice([12, 24, 36, 60, 360, 999]) for _ in range(rows)]
finance_columns["Interest_Rate"] = [round(random.uniform(2.5, 35.0), 2) for _ in range(rows)]
finance_columns["Purpose"] = [random.choice(loan_purposes) for _ in range(rows)]
finance_columns["Credit_Score"] = [random.randint(300, 950) for _ in range(rows)]
finance_columns["Current_Debt"] = [random.randint(0, 50000) for _ in range(rows)]
finance_columns["Status"] = [random.choice(['Approved', 'Rejected', 'Pending']) for _ in range(rows)]
df_fin = pd.DataFrame(finance_columns)

# Step 1 - zero-division chaos: blank out income on the leading rows so the
# ratio computed next contains division-by-zero results.
df_fin.loc[0:20, "Annual_Income"] = 0

# Step 2 - debt-to-income ratio. pandas does not raise on a zero denominator;
# it yields inf (or NaN for 0/0), which is exactly what we want in the file.
df_fin["DTI_Ratio"] = df_fin["Current_Debt"].div(df_fin["Annual_Income"])

# Step 3 - logic mismatch: first ten applicants scoring above 800 get rejected.
good_credit_indices = df_fin.loc[df_fin["Credit_Score"] > 800].index
df_fin.loc[good_credit_indices[:10], "Status"] = "Rejected"

# Step 4 - logic mismatch: first ten applicants scoring below 500 get approved.
bad_credit_indices = df_fin.loc[df_fin["Credit_Score"] < 500].index
df_fin.loc[bad_credit_indices[:10], "Status"] = "Approved"

# Step 5 - impossible value: negative interest rate.
df_fin.loc[50:60, "Interest_Rate"] = -5.0

# Step 6 - type chaos: switch every column to object dtype, then drop a text
# placeholder into the otherwise numeric loan amount.
df_fin = df_fin.astype(object)
df_fin.loc[61:70, "Loan_Amount"] = "TBD"

# Step 7 - generic chaos from src.Generic_Errors (implementation not shown here).
df_fin = apply_generic_chaos(df_fin)

# Export
df_fin.to_csv(f"{OUTPUT_DIR}/Finance_dataset.csv", index=False)
print("Finance Dataset generated and saved.")

Generating Finance Dataset with 10000 rows.
Finance Dataset generated and saved.


In [5]:
# Supply Chain Dataset
# Builds a synthetic shipments table, injects logical and type errors, writes it
# to <OUTPUT_DIR>/Supply_chain_dataset.csv, then appends two malformed raw lines
# to the file by hand. Relies on `rows`, `fake` and `OUTPUT_DIR` from the
# configuration cell.
print(f"Generating Supply Chain Dataset with {rows} rows.")

carriers = ['Maersk', 'DHL', 'FedEx', 'UPS', 'Old Dominion', 'Generic_Trucking']
statuses = ['In Transit', 'Delivered', 'Delayed', 'Customs Hold', 'Lost']
products = ['Electronics', 'Perishables', 'Raw Materials', 'Hazmat', 'Textiles']

df_sc = pd.DataFrame({
    "Shipment_ID": [fake.bothify('??-####-####') for _ in range(rows)],
    "SKU": [fake.ean13() for _ in range(rows)],
    "Origin": [fake.city() for _ in range(rows)],
    # Destination mixes granularities: a city, a state, a country, or "Unknown".
    # A fresh city and state are generated for every row even when not picked.
    "Destination": [random.choice([fake.city(), fake.state(), "USA", "India", "Unknown"]) for _ in range(rows)],
    
    # --- NEW COLUMNS ---
    "Carrier": [random.choice(carriers) for _ in range(rows)],
    "Product_Type": [random.choice(products) for _ in range(rows)],
    "Shipping_Cost": [round(random.uniform(50, 5000), 2) for _ in range(rows)],
    "Temperature_C": [round(random.uniform(-30, 30), 1) for _ in range(rows)], # For cold chain testing
    "Fragile": [random.choice([True, False, "Yes", "No", 1, 0]) for _ in range(rows)], # Mixed boolean types
    # -------------------

    # Weight: Mixed units logic (kg, lbs, or no unit)
    "Weight": [f"{random.randint(10,500)}{random.choice(['kg', ' kg', 'lbs', 'Lbs', ''])}" for _ in range(rows)],
    
    # Ship and arrival dates are drawn independently of each other.
    "Ship_Date": [fake.date_this_year() for _ in range(rows)],
    "Arrival_Date": [fake.date_this_year() for _ in range(rows)],
    "Status": [random.choice(statuses) for _ in range(rows)]
})

# --- LOGIC & CHAOS UPDATES ---

# 1. Time Travel (Arrival before Shipping)
# Rows labelled 0-20 (inclusive) arrive exactly 10 days before they ship.
df_sc.loc[0:20, "Arrival_Date"] = df_sc.loc[0:20, "Ship_Date"] - datetime.timedelta(days=10)

# 2. Cold Chain Failure (Perishables that got too hot)
# The mask selects EVERY 'Perishables' row, not a sample. The lambda ignores the
# previous reading and draws a new one between 20 and 40; unlike the original
# column, the replacement is not rounded.
mask_hot = (df_sc['Product_Type'] == 'Perishables')
df_sc.loc[mask_hot, "Temperature_C"] = df_sc.loc[mask_hot, "Temperature_C"].apply(lambda x: random.uniform(20, 40))

# 3. Negative Shipping Costs (Billing Error)
df_sc.loc[21:30, "Shipping_Cost"] = -150.00

# 4. Ghost Shipments (Status = Delivered, but Arrival_Date is Null)
df_sc.loc[31:50, "Status"] = "Delivered"
df_sc.loc[31:50, "Arrival_Date"] = np.nan

# 5. Apply Generic Chaos (Typos, Encoding, etc.)
# We cast to object first to ensure safety, as per previous fixes
# (apply_generic_chaos lives in src.Generic_Errors and is not visible here).
df_sc = df_sc.astype(object)
df_sc = apply_generic_chaos(df_sc)

# --- EXPORT & MANUAL INJECTION ---

# Step 1: Write the main CSV first (to_csv creates or overwrites the file)
csv_path = f"{OUTPUT_DIR}/Supply_chain_dataset.csv"
df_sc.to_csv(csv_path, index=False)

# Step 2: Append the "Bad Row" (mode='a' adds to the end)
# This simulates a file transfer error or a jagged row (missing columns).
# Must run after Step 1, otherwise to_csv would overwrite the appended lines.
with open(csv_path, "a") as f:
    # 5 comma-separated fields (the comma inside "New York, NY" splits it in two)
    f.write("\nBAD_ROW_DATA,New York, NY,50kg,2023-01-01") # Deliberately missing columns
    # 15 commas -> 16 fields, more than the 13 columns defined above
    f.write("\nANOTHER_BAD_ROW" + "," * 15) # Row with way too many commas

print("Supply Chain Dataset generated and saved.")

Generating Supply Chain Dataset with 10000 rows.
Supply Chain Dataset generated and saved.


In [6]:
# Healthcare Dataset
# Builds a synthetic patient-visit table, injects impossible values and mixed
# formats, and exports it to <OUTPUT_DIR>/Healthcare_dataset.csv.
# Relies on `rows`, `fake` and `OUTPUT_DIR` from the configuration cell.
print(f"Generating Healthcare Dataset with {rows} rows.")

diagnoses = ['Hypertension', 'Diabetes Type 2', 'Fracture', 'Viral Infection', 'COVID-19', 'Migraine']
blood_types = ['A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB+', 'AB-']
insurance_providers = ['BlueCross', 'Aetna', 'Medicare', 'Private', 'Uninsured']

df_hlth = pd.DataFrame({
    "Patient_ID": [f"P-{random.randint(10000, 99999)}" for _ in range(rows)],
    
    # --- NEW COLUMNS ---
    "Patient_Name": [fake.name() for _ in range(rows)], # PII for de-anonymization testing
    "SSN_Last4": [f"{random.randint(1000,9999)}" for _ in range(rows)],  # stored as text
    "Blood_Type": [random.choice(blood_types) for _ in range(rows)],
    "Diagnosis": [random.choice(diagnoses) for _ in range(rows)],
    "Bill_Amount": [round(random.uniform(100.0, 50000.0), 2) for _ in range(rows)],
    "Insurance": [random.choice(insurance_providers) for _ in range(rows)],
    
    # Blood Pressure: valid "systolic/diastolic" text here; some rows are corrupted below
    "Blood_Pressure": [f"{random.randint(90,160)}/{random.randint(60,100)}" for _ in range(rows)],
    # -------------------

    # Age: wide but non-negative range; negative ages are injected below
    "Age": [random.randint(0, 110) for _ in range(rows)], 
    
    # Gender: intentionally inconsistent encodings of the same values
    "Gender": [random.choice(['M', 'F', 'Male', 'Female', 'm', 'f', 'NB', 'Other']) for _ in range(rows)], 
    
    # Admit and discharge dates are drawn independently of each other
    "Admit_Date": [fake.date_this_year() for _ in range(rows)],
    "Discharge_Date": [fake.date_this_year() for _ in range(rows)]
})

# --- LOGIC & CHAOS UPDATES ---

# 1. Impossible Dates (Discharge BEFORE Admit) - roughly 5% of rows
# Each row is flagged independently with probability 0.05 (NumPy global RNG).
bad_dates_mask = np.random.rand(len(df_hlth)) < 0.05
df_hlth.loc[bad_dates_mask, "Discharge_Date"] = df_hlth.loc[bad_dates_mask, "Admit_Date"] - datetime.timedelta(days=5)

# 2. Impossible Data (Negative Age)
# One replacement is drawn per row actually selected by the 0:10 label slice,
# so this no longer fails with a length mismatch when DATASET_SIZE is below 11.
# With 11 or more rows the values drawn are the same as before.
negative_age_rows = df_hlth.loc[0:10].index
if len(negative_age_rows) > 0:
    df_hlth.loc[negative_age_rows, "Age"] = [random.randint(-50, -1) for _ in negative_age_rows]

# 3. Dirty Data (Blood Pressure Format)
# Mix in different separators or just text
df_hlth.loc[11:20, "Blood_Pressure"] = "High" 
df_hlth.loc[21:30, "Blood_Pressure"] = "140-90" # Wrong separator
df_hlth.loc[31:40, "Blood_Pressure"] = "120 / 80" # Extra spaces

# 4. Mixed Types in Billing (Currency Symbols)
# We cast to object first to allow string injection into float column
df_hlth = df_hlth.astype(object) 
df_hlth.loc[41:50, "Bill_Amount"] = "$500.00"

# 5. Apply Generic Chaos (helper from src.Generic_Errors; not visible here)
df_hlth = apply_generic_chaos(df_hlth)

# Export
df_hlth.to_csv(f"{OUTPUT_DIR}/Healthcare_dataset.csv", index=False)
print("Healthcare Dataset generated and saved.")

Generating Healthcare Dataset with 10000 rows.
Healthcare Dataset generated and saved.


In [7]:
# Marketing Dataset
# Builds a synthetic ad-campaign table, injects funnel and formatting errors,
# and exports it to <OUTPUT_DIR>/Marketing_dataset.csv.
# Relies on `rows`, `fake` and `OUTPUT_DIR` from the configuration cell.
print(f"Generating Marketing Dataset with {rows} rows.")

platforms = ['Facebook', 'Google Ads', 'TikTok', 'Email', 'LinkedIn', 'Instagram']
ad_types = ['Video', 'Carousel', 'Static Image', 'Story', 'Reel']

df_mkt = pd.DataFrame({
    "Campaign_ID": [fake.uuid4()[:12] for _ in range(rows)],
    "Date": [fake.date_this_year() for _ in range(rows)],
    
    # --- NEW COLUMNS ---
    "Channel": [random.choice(platforms) for _ in range(rows)],
    "Ad_Type": [random.choice(ad_types) for _ in range(rows)],
    "Target_Region": [random.choice(['NA', 'EMEA', 'APAC', 'LATAM', 'Global']) for _ in range(rows)],
    
    # Messy URL with parameters (Great for Regex testing)
    # utm_source is drawn independently of the Channel column, so the two can disagree.
    "Landing_URL": [f"https://mysite.com/shop?utm_source={random.choice(platforms).lower()}&utm_id={fake.uuid4()[:5]}" for _ in range(rows)],
    
    # JSON-like string (Simulating raw API data)
    # str() of a dict uses single quotes, so this is a Python repr, not strict JSON.
    "Ad_Parameters": [str({'target_age': f"{random.randint(18,60)}+", 'bidding_strategy': 'auto'}) for _ in range(rows)],
    # -------------------

    # The three funnel counts are drawn independently of one another
    "Impressions": [random.randint(1000, 50000) for _ in range(rows)],
    "Clicks": [random.randint(10, 5000) for _ in range(rows)],
    "Conversions": [random.randint(0, 500) for _ in range(rows)],
    "Spend": [round(random.uniform(100, 10000), 2) for _ in range(rows)]
})

# --- LOGIC & CHAOS UPDATES ---

# 1. Impossible Funnel: Clicks > Impressions (CTR > 100%)
# Despite the name, bad_ctr_mask holds row labels (a 10% sample), not booleans.
bad_ctr_mask = df_mkt.sample(frac=0.1).index
df_mkt.loc[bad_ctr_mask, "Clicks"] = df_mkt.loc[bad_ctr_mask, "Impressions"] * 2

# 2. Impossible Funnel: Conversions > Clicks (Conversion Rate > 100%)
# This implies users bought the product without clicking the ad.
# Runs after step 1, so it uses the already-inflated Clicks where samples overlap.
bad_cvr_mask = df_mkt.sample(frac=0.1).index
df_mkt.loc[bad_cvr_mask, "Conversions"] = df_mkt.loc[bad_cvr_mask, "Clicks"] + 50

# 3. Negative Spend (Billing error)
# One value is drawn per row actually selected by the 0:20 label slice, so this
# no longer fails with a length mismatch when DATASET_SIZE is below 21.
# With 21 or more rows the values drawn are the same as before.
negative_spend_rows = df_mkt.loc[0:20].index
if len(negative_spend_rows) > 0:
    df_mkt.loc[negative_spend_rows, "Spend"] = [random.randint(-500, -1) for _ in negative_spend_rows]

# 4. JSON Corruption (Broken Syntax)
# We cast to object first to be safe
df_mkt = df_mkt.astype(object)
df_mkt.loc[21:30, "Ad_Parameters"] = "{'target_age': '18-24', 'bidding': 'aut" # Cut off string

# 5. Broken URLs
df_mkt.loc[31:40, "Landing_URL"] = "http://site.com/??error=true"

# 6. Apply Generic Chaos (helper from src.Generic_Errors; not visible here)
df_mkt = apply_generic_chaos(df_mkt)

# Export
df_mkt.to_csv(f"{OUTPUT_DIR}/Marketing_dataset.csv", index=False)
print("Marketing Dataset generated and saved.")

Generating Marketing Dataset with 10000 rows.
Marketing Dataset generated and saved.


In [8]:
# HR Dataset
# Builds a synthetic employee table, injects logical and formatting errors,
# and exports it to <OUTPUT_DIR>/HR_dataset.csv.
# Relies on `rows`, `fake` and `OUTPUT_DIR` from the configuration cell.
print(f"Generating HR Dataset with {rows} rows.")

depts = ['Engineering', 'Sales', 'HR', 'Finance', 'Legal', 'Operations']
titles = ['Intern', 'Associate', 'Manager', 'Director', 'VP', 'C-Level']

df_hr = pd.DataFrame({
    "Emp_ID": [f"E-{random.randint(1000, 9999)}" for _ in range(rows)],
    "Name": [fake.name() for _ in range(rows)],
    
    # --- NEW COLUMNS ---
    "Email": [fake.company_email() for _ in range(rows)],
    "Department": [random.choice(depts) for _ in range(rows)],
    "Title": [random.choice(titles) for _ in range(rows)],
    "Salary": [random.randint(30000, 250000) for _ in range(rows)],
    # NaN is one of six equally likely draws, i.e. a missing score
    "Performance_Score": [random.choice([1, 2, 3, 4, 5, np.nan]) for _ in range(rows)],
    # -------------------

    "Age": [random.randint(18, 65) for _ in range(rows)],
    "Marital_Status": [random.choice(['Single', 'Married', 'Divorced', 'Widowed']) for _ in range(rows)],
    
    "Join_Date": [fake.date_this_year() for _ in range(rows)],
    # Three of the four choices are NaN, so most employees have no exit date
    "Exit_Date": [random.choice([fake.date_this_year(), np.nan, np.nan, np.nan]) for _ in range(rows)]
})

# --- LOGIC & CHAOS UPDATES ---

# 1. Child Labor & Child Marriage
# 5-year-olds who are married. The sample size is capped at the frame length
# because DataFrame.sample raises ValueError when asked for more rows than
# exist; with 20 or more rows the selection is the same as before.
bad_hr_indices = df_hr.sample(min(20, len(df_hr))).index
df_hr.loc[bad_hr_indices, "Age"] = 5
df_hr.loc[bad_hr_indices, "Marital_Status"] = "Married" 

# 2. Time Travel (Exit Date BEFORE Join Date)
# Find people who have an exit date
has_exit = df_hr[df_hr["Exit_Date"].notna()].index
# Take the first 10 of them and make them leave 100 days before they joined
df_hr.loc[has_exit[:10], "Exit_Date"] = df_hr.loc[has_exit[:10], "Join_Date"] - datetime.timedelta(days=100)

# 3. Future Hires (Join Date in 2099)
# Applied after step 2, so exit dates computed above keep the original join date.
df_hr.loc[0:10, "Join_Date"] = datetime.date(2099, 1, 1)

# 4. Salary Formatting Chaos (Strings in Integer column)
# Cast to object first so we can mix strings and ints
df_hr = df_hr.astype(object) 
df_hr.loc[11:20, "Salary"] = "$100,000" # Currency formatting
df_hr.loc[21:30, "Salary"] = "TBD"      # Text placeholder

# 5. Invalid Emails (Missing @ symbol)
df_hr.loc[31:40, "Email"] = "user_company.com"

# 6. Apply Generic Chaos (helper from src.Generic_Errors; not visible here)
df_hr = apply_generic_chaos(df_hr)

# Export
df_hr.to_csv(f"{OUTPUT_DIR}/HR_dataset.csv", index=False)
print("HR Dataset generated and saved.")

Generating HR Dataset with 10000 rows.
HR Dataset generated and saved.


In [9]:
# Logistics Dataset
# Builds a synthetic vehicle-trip table, injects logical and type errors, and
# exports it to <OUTPUT_DIR>/Logistics_dataset.csv.
# Relies on `rows`, `fake` and `OUTPUT_DIR` from the configuration cell.
print(f"Generating Logistics Dataset with {rows} rows.")

vehicles = ['Semi-Truck', 'Van', 'Pickup', 'Drone', 'Cargo Ship']
# NOTE: `statuses` is also assigned by other dataset cells; this cell
# overwrites it with its own list before use.
statuses = ['In Transit', 'Idling', 'Maintenance', 'Delivered', 'Accident']

df_log = pd.DataFrame({
    "Trip_ID": [f"TR-{fake.uuid4()[:8]}" for _ in range(rows)],
    "Driver_Name": [fake.name() for _ in range(rows)],
    "Vehicle_Type": [random.choice(vehicles) for _ in range(rows)],
    
    # --- NEW COLUMNS ---
    "Origin": [fake.city() for _ in range(rows)],
    "Destination": [fake.city() for _ in range(rows)],
    
    # Coordinates: formatted as "Lat, Long" string for parsing tests
    "GPS_Coordinates": [f"{fake.latitude()}, {fake.longitude()}" for _ in range(rows)],
    
    "Distance_Miles": [random.randint(0, 3000) for _ in range(rows)],
    # Fuel is left unrounded (full float precision)
    "Fuel_Gallons": [random.uniform(0, 500) for _ in range(rows)],
    "Status": [random.choice(statuses) for _ in range(rows)]
})

# --- LOGIC & CHAOS UPDATES ---
# The steps below are order-sensitive: samples and slices can overlap, and a
# later assignment overwrites an earlier one on the shared rows.

# 1. Teleportation (Distance is 0, but Origin != Destination)
# This breaks logic checks that assume Dist=0 implies same location
# (teleport_mask holds the row labels of a 5% sample, not booleans.)
teleport_mask = df_log.sample(frac=0.05).index
df_log.loc[teleport_mask, "Distance_Miles"] = 0

# 2. Infinite Efficiency (Distance > 0, but Fuel = 0)
# This causes Division By Zero errors if you calculate MPG (Miles Per Gallon)
# Drawn independently of step 1; rows in both samples end with distance 500.
magic_truck_mask = df_log.sample(frac=0.05).index
df_log.loc[magic_truck_mask, "Distance_Miles"] = 500
df_log.loc[magic_truck_mask, "Fuel_Gallons"] = 0

# 3. Circular Trips (Origin == Destination)
# Legitimate in some cases, but often a data error for long-haul
df_log.loc[0:20, "Origin"] = df_log.loc[0:20, "Destination"]

# 4. Impossible Coordinates (Lat > 90)
# This will break mapping software like Tableau or PowerBI
df_log.loc[21:30, "GPS_Coordinates"] = "99.9999, 150.0000"

# 5. Data Type Chaos (String in Numeric Column)
# Convert to object first to handle the string injection
df_log = df_log.astype(object)
df_log.loc[31:40, "Distance_Miles"] = "Unknown" 
df_log.loc[41:50, "Fuel_Gallons"] = -50.0 # Negative fuel (Production of fuel?)

# 6. Apply Generic Chaos (helper from src.Generic_Errors; not visible here)
df_log = apply_generic_chaos(df_log)

# Export
df_log.to_csv(f"{OUTPUT_DIR}/Logistics_dataset.csv", index=False)
print("Logistics Dataset generated and saved.")

Generating Logistics Dataset with 10000 rows.
Logistics Dataset generated and saved.


In [10]:
# Customer Service Dataset
# Builds a synthetic support-ticket table, injects logical and type errors, and
# exports it to <OUTPUT_DIR>/Customer_service_dataset.csv.
# Relies on `rows`, `fake` and `OUTPUT_DIR` from the configuration cell.
print(f"Generating Customer Service Dataset with {rows} rows.")

topics = ['Billing', 'Technical Support', 'Feature Request', 'Login Issue', 'Refund']
channels = ['Email', 'Chat', 'Phone', 'Social Media', 'Pigeon Carrier']
statuses = ['Open', 'In Progress', 'Resolved', 'Closed', 'Escalated']

df_cs = pd.DataFrame({
    "Ticket_ID": [f"TKT-{random.randint(10000, 99999)}" for _ in range(rows)],
    "Customer_Email": [fake.email() for _ in range(rows)],
    
    # --- NEW COLUMNS ---
    "Customer_Name": [fake.name() for _ in range(rows)],
    "Topic": [random.choice(topics) for _ in range(rows)],
    "Channel": [random.choice(channels) for _ in range(rows)],
    # Agent is a human first name, the literal "Bot", or missing (NaN)
    "Agent_Name": [random.choice([fake.first_name(), "Bot", np.nan]) for _ in range(rows)],
    
    # CSAT Score: Should be 1-5, but we'll break this rule later
    "CSAT_Score": [random.randint(1, 5) for _ in range(rows)],
    
    # Priority: Intentionally inconsistent labels (High/H, Medium/Med, Low/L, Urgent)
    "Priority": [random.choice(['High', 'H', 'Medium', 'Med', 'Low', 'L', 'Urgent']) for _ in range(rows)],
    # -------------------

    # Created and resolved dates are drawn independently of each other
    "Created_Date": [fake.date_this_year() for _ in range(rows)],
    "Resolved_Date": [fake.date_this_year() for _ in range(rows)],
    "Resolution_Hours": [round(random.uniform(0.5, 72.0), 2) for _ in range(rows)],
    "Status": [random.choice(statuses) for _ in range(rows)]
})

# --- LOGIC & CHAOS UPDATES ---

# 1. Time Travel Resolution (Resolved Date BEFORE Created Date)
# A classic database error that messes up SLA calculations
# (time_travel_mask holds the row labels of a 5% sample, not booleans.)
time_travel_mask = df_cs.sample(frac=0.05).index
df_cs.loc[time_travel_mask, "Resolved_Date"] = df_cs.loc[time_travel_mask, "Created_Date"] - datetime.timedelta(days=2)

# 2. Negative Resolution Time (Finished before starting)
# One value is drawn per row actually selected by the 0:20 label slice, so this
# no longer fails with a length mismatch when DATASET_SIZE is below 21.
# With 21 or more rows the values drawn are the same as before.
negative_hours_rows = df_cs.loc[0:20].index
if len(negative_hours_rows) > 0:
    df_cs.loc[negative_hours_rows, "Resolution_Hours"] = [random.randint(-50, -1) for _ in negative_hours_rows]

# 3. Ghost Resolutions (Status = Closed, but Agent is Null)
# Implies the ticket closed itself?
ghost_mask = df_cs.sample(frac=0.05).index
df_cs.loc[ghost_mask, "Status"] = "Closed"
df_cs.loc[ghost_mask, "Agent_Name"] = np.nan

# 4. CSAT Out of Bounds (Score of 10 or 0 on a 1-5 scale)
df_cs.loc[21:30, "CSAT_Score"] = 10 
df_cs.loc[31:40, "CSAT_Score"] = 0 

# 5. Data Type Chaos (Text in CSAT Score)
# Cast to object first so we can mix text into the number column
df_cs = df_cs.astype(object)
df_cs.loc[41:50, "CSAT_Score"] = "Happy" 
df_cs.loc[51:60, "CSAT_Score"] = "Angry"

# 6. Apply Generic Chaos (helper from src.Generic_Errors; not visible here)
df_cs = apply_generic_chaos(df_cs)

# Export
df_cs.to_csv(f"{OUTPUT_DIR}/Customer_service_dataset.csv", index=False)
print("Customer Service Dataset generated and saved.")

Generating Customer Service Dataset with 10000 rows.
Customer Service Dataset generated and saved.


In [11]:
# Insurance Dataset
# Synthetic policy/claim records with deliberately seeded inconsistencies,
# exported to <OUTPUT_DIR>/Insurance_dataset.csv.
print(f"Generating Insurance Dataset with {rows} rows.")

# Value pools for the categorical columns
policy_types = ['Auto', 'Home', 'Life', 'Health', 'Cyber', 'Pet']
statuses = ['Approved', 'Denied', 'Under Investigation', 'Pending', 'Fraud Alert']

# Columns are filled one at a time, in CSV column order, so the random
# generators are consumed in the same sequence on every run.
insurance_columns = {}
insurance_columns["Policy_ID"] = [f"POL-{fake.uuid4()[:8]}" for _ in range(rows)]
insurance_columns["Policy_Holder"] = [fake.name() for _ in range(rows)]
insurance_columns["Policy_Type"] = [random.choice(policy_types) for _ in range(rows)]
# Start and incident dates are independent draws from the current year.
insurance_columns["Start_Date"] = [fake.date_this_year() for _ in range(rows)]
insurance_columns["Incident_Date"] = [fake.date_this_year() for _ in range(rows)]
insurance_columns["Premium_Monthly"] = [round(random.uniform(50, 500), 2) for _ in range(rows)]
insurance_columns["Deductible"] = [random.choice([500, 1000, 2500, 5000]) for _ in range(rows)]
insurance_columns["Risk_Score"] = [random.randint(1, 100) for _ in range(rows)]
insurance_columns["Coverage_Limit"] = [random.choice([10000, 50000, 100000, 500000, 1000000]) for _ in range(rows)]
insurance_columns["Claim_Amount"] = [round(random.uniform(100, 50000), 2) for _ in range(rows)]
insurance_columns["Status"] = [random.choice(statuses) for _ in range(rows)]
df_ins = pd.DataFrame(insurance_columns)

# Step 1 - claims above coverage: a 5% sample claims 1.5x its coverage limit.
# Handy for exercising "limit reached" / "total loss" checks.
bad_claim_mask = df_ins.sample(frac=0.05).index
over_limit_claims = df_ins.loc[bad_claim_mask, "Coverage_Limit"].mul(1.5)
df_ins.loc[bad_claim_mask, "Claim_Amount"] = over_limit_claims

# Step 2 - incident before policy start: a second 5% sample gets an incident
# dated 30 days ahead of the policy start (a typical fraud signal).
fraud_mask = df_ins.sample(frac=0.05).index
backdated_incidents = df_ins.loc[fraud_mask, "Start_Date"] - datetime.timedelta(days=30)
df_ins.loc[fraud_mask, "Incident_Date"] = backdated_incidents

# Step 3 - negative premium on the leading rows.
df_ins.loc[0:20, "Premium_Monthly"] = -150.0

# Step 4 - type chaos: switch every column to object dtype, then drop text
# into two otherwise numeric columns.
df_ins = df_ins.astype(object)
df_ins.loc[21:30, "Coverage_Limit"] = "Unlimited"
df_ins.loc[31:40, "Deductible"] = "Waived"

# Step 5 - generic chaos from src.Generic_Errors (implementation not shown here).
df_ins = apply_generic_chaos(df_ins)

# Export
df_ins.to_csv(f"{OUTPUT_DIR}/Insurance_dataset.csv", index=False)
print("Insurance Dataset generated and saved.")

Generating Insurance Dataset with 10000 rows.
Insurance Dataset generated and saved.


In [12]:
# Real Estate Dataset
# Synthetic property listings with deliberately seeded inconsistencies,
# exported to <OUTPUT_DIR>/Real_estate_dataset.csv.
print(f"Generating Real Estate Dataset with {rows} rows.")

# Value pools for the categorical columns
prop_types = ['Single Family', 'Condo', 'Townhouse', 'Multi-Family', 'Land']
status_list = ['Active', 'Sold', 'Pending', 'Withdrawn', 'Foreclosure']

# Columns are filled one at a time, in CSV column order, so the random
# generators are consumed in the same sequence on every run.
real_estate_columns = {}
real_estate_columns["Property_ID"] = [f"MLS-{random.randint(100000, 999999)}" for _ in range(rows)]
real_estate_columns["Address"] = [fake.street_address() for _ in range(rows)]
real_estate_columns["City"] = [fake.city() for _ in range(rows)]
real_estate_columns["Zip_Code"] = [fake.zipcode() for _ in range(rows)]
real_estate_columns["Prop_Type"] = [random.choice(prop_types) for _ in range(rows)]
real_estate_columns["Year_Built"] = [random.randint(1900, 2024) for _ in range(rows)]
real_estate_columns["Bathrooms"] = [random.choice([1, 1.5, 2, 2.5, 3, 4, 5]) for _ in range(rows)]
# Garage deliberately mixes booleans, yes/no text and 0/1 flags.
real_estate_columns["Garage"] = [random.choice([True, False, "Yes", "No", 0, 1]) for _ in range(rows)]
real_estate_columns["Price"] = [random.randint(50000, 2000000) for _ in range(rows)]
real_estate_columns["Bedrooms"] = [random.randint(1, 6) for _ in range(rows)]
real_estate_columns["SqFt"] = [random.randint(500, 10000) for _ in range(rows)]
real_estate_columns["Listed_Date"] = [fake.date_this_year() for _ in range(rows)]
# Half of the two choices is NaN, i.e. the listing has no sale date.
real_estate_columns["Sold_Date"] = [random.choice([fake.date_this_year(), np.nan]) for _ in range(rows)]
real_estate_columns["Status"] = [random.choice(status_list) for _ in range(rows)]
df_re = pd.DataFrame(real_estate_columns)

# Step 1 - "shoebox mansion": a 5% sample gets 100 bedrooms in 500 sq ft.
impossible_mask = df_re.sample(frac=0.05).index
df_re.loc[impossible_mask, "Bedrooms"] = 100
df_re.loc[impossible_mask, "SqFt"] = 500

# Step 2 - sold before listed: 10% of the listings that have a sale date get
# that date moved to 100 days ahead of the listing date.
listings_with_sale = df_re[df_re["Sold_Date"].notna()]
sold_mask = listings_with_sale.sample(frac=0.1).index
early_sale_dates = df_re.loc[sold_mask, "Listed_Date"] - datetime.timedelta(days=100)
df_re.loc[sold_mask, "Sold_Date"] = early_sale_dates

# Step 3 - buildings from the future.
df_re.loc[0:10, "Year_Built"] = 2099

# Step 4 - negative floor area.
df_re.loc[11:20, "SqFt"] = -1000

# Step 5 - type chaos: switch every column to object dtype, then drop text
# into the otherwise numeric price column.
df_re = df_re.astype(object)
df_re.loc[21:30, "Price"] = "Call for Price"
df_re.loc[31:40, "Price"] = "$1M+"

# Step 6 - generic chaos from src.Generic_Errors (implementation not shown here).
df_re = apply_generic_chaos(df_re)

# Export
df_re.to_csv(f"{OUTPUT_DIR}/Real_estate_dataset.csv", index=False)
print("Real Estate Dataset generated and saved.")

Generating Real Estate Dataset with 10000 rows.
Real Estate Dataset generated and saved.
