In [6]:
#Required Libraries
import pandas as pd
import numpy as np
from faker import Faker
import random
import os
import datetime
from src.Generic_Errors import apply_generic_chaos
from src.Format_Errors import chaotic_date
from src.Format_Errors import chaotic_case

In [7]:
# CONFIGURATION
# Shared settings for the dataset cells below: output size, output directory,
# the Faker instance, and the random seeds. Run this cell before any other.
DATASET_SIZE = 5000  # Number of rows per file
OUTPUT_DIR = "Generated_Data"  # directory that receives the generated CSV files

fake = Faker()
# Seed Faker, the stdlib random module and NumPy's global generator with the
# same constant; intended to make a fresh-kernel run draw the same values.
Faker.seed(42)
random.seed(42)
np.random.seed(42)
os.makedirs(OUTPUT_DIR, exist_ok=True)  # exist_ok: re-running the cell does not fail

# Row count read by the dataset cells; a global alias of DATASET_SIZE.
rows = DATASET_SIZE 

print(f"‚ò¢Ô∏è  INITIALIZING DATA Generation ({rows} rows/file)...")

‚ò¢Ô∏è  INITIALIZING DATA Generation (5000 rows/file)...


In [None]:
# 1. Retail
# Builds a transaction table, derives the discounted total, deliberately
# corrupts fixed row ranges, and writes the result to 01_retail_dataset.csv.
print(f"Generating Retail Dataset with {rows} rows.")

# Value pools sampled for the categorical columns.
categories = ['Electronics', 'Clothing', 'Home', 'Grocery', 'Toys', 'Books']
payment_methods = ['Credit Card', 'Cash', 'UPI', 'Debit Card', 'Wallet']

# Columns are generated top to bottom; keep this order so the seeded random
# streams hand out the same values on every run.
df_ret = pd.DataFrame(dict(
    Trans_ID=[fake.uuid4()[:8] for _ in range(rows)],
    Date=[fake.date_this_year() for _ in range(rows)],
    # Customer / store attributes
    Customer_Name=[fake.name() for _ in range(rows)],
    Customer_Email=[fake.email() for _ in range(rows)],
    Store_City=[fake.city() for _ in range(rows)],
    Payment_Method=[random.choice(payment_methods) for _ in range(rows)],
    Is_Member=[random.choice([True, False]) for _ in range(rows)],
    # 0 is listed twice, so "no discount" is drawn more often than the others.
    Discount_Pct=[random.choice([0, 0, 0.05, 0.10, 0.15, 0.20, 0.50]) for _ in range(rows)],
    # Line-item attributes (Qty may be 0)
    Category=[random.choice(categories) for _ in range(rows)],
    Qty=[random.choice([1, 2, 3, 5, 10, 0]) for _ in range(rows)],
    Price=[round(random.uniform(10, 500), 2) for _ in range(rows)],
))

# Discounted line total, rounded to cents.
df_ret["Total"] = (
    (df_ret["Price"] * df_ret["Qty"]) * (1 - df_ret["Discount_Pct"])
).round(2)

# Fault injection on fixed label ranges (.loc slices include both endpoints).
# Total no longer matches Price * Qty * (1 - Discount_Pct).
df_ret.loc[0:50, "Total"] = df_ret.loc[0:50, "Total"] + 100
# Negative quantity.
df_ret.loc[51:100, "Qty"] = -5
# Date far in the future.
df_ret.loc[101:150, "Date"] = datetime.date(2099, 1, 1)
# Discount above 100%; Total was computed earlier and is not recomputed.
df_ret.loc[151:170, "Discount_Pct"] = 1.50
# Missing payment method.
df_ret.loc[171:200, "Payment_Method"] = None
# E-mail address without an "@".
df_ret.loc[201:220, "Customer_Email"] = "user_at_gmail.com"

# Formatting faults from the project helpers, applied per value, followed by
# the generic chaos pass over the whole frame.
# NOTE(review): helper behaviour is defined in src/ and not visible here.
df_ret["Category"] = df_ret["Category"].apply(chaotic_case)
df_ret["Date"] = df_ret["Date"].apply(chaotic_date)
df_ret = apply_generic_chaos(df_ret)

# Write the result without the index column.
df_ret.to_csv(f"{OUTPUT_DIR}/01_retail_dataset.csv", index=False)
print("Retail Dataset generated and saved.")

Generating Retail Dataset with 5000 rows.


  df.loc[mask, col] = np.nan


In [None]:
# 2. Finance
# Builds loan applications, derives a debt-to-income ratio, flips some
# low-score applications to "Approved", and writes 02_finance_chaos.csv.
print("Generating Finance...")
df_fin = pd.DataFrame(dict(
    App_ID=[f"LN-{random.randint(1000,9999)}" for _ in range(rows)],
    # The range includes negative and zero incomes.
    Income=[random.randint(-5000, 150000) for _ in range(rows)],
    Credit_Score=[random.randint(300, 950) for _ in range(rows)],
    Debt=[random.randint(0, 50000) for _ in range(rows)],
    Status=[random.choice(['Approved', 'Rejected']) for _ in range(rows)],
))
# Element-wise Debt / Income.
df_fin["DTI_Ratio"] = df_fin["Debt"].div(df_fin["Income"])
# Force "Approved" on a random 20% of the applications scoring below 500.
bad_idx = df_fin.loc[df_fin["Credit_Score"] < 500].sample(frac=0.2).index
df_fin.loc[bad_idx, "Status"] = "Approved"
df_fin = apply_generic_chaos(df_fin)
df_fin.to_csv(f"{OUTPUT_DIR}/02_finance_chaos.csv", index=False)

In [None]:
# 3. Supply chain
# Builds shipment records with mixed location granularity and inconsistent
# weight units, makes some arrivals precede shipment, writes
# 03_supply_chain_chaos.csv, and finally appends one malformed line to it.
print("Generating Supply Chain...")
df_sc = pd.DataFrame({
    "ID": [fake.bothify('??-####') for _ in range(rows)],
    # Each row picks one of: a city, a state, or a fixed country name.
    "Location": [random.choice([fake.city(), fake.state(), "USA", "India"]) for _ in range(rows)],
    # Weight is stored as text with one of several unit spellings, or none.
    "Weight": [f"{random.randint(10,500)} {random.choice(['kg','lbs','Lbs',''])}" for _ in range(rows)],
    "Ship_Date": [fake.date_this_year() for _ in range(rows)],
    "Arrival_Date": [fake.date_this_year() for _ in range(rows)]
})
# Rows 0-20: arrival is set to 10 days before the ship date.
df_sc.loc[0:20, "Arrival_Date"] = df_sc.loc[0:20, "Ship_Date"] - datetime.timedelta(days=10)
df_sc = apply_generic_chaos(df_sc)

sc_path = f"{OUTPUT_DIR}/03_supply_chain_chaos.csv"
# Write the frame FIRST. mode='w' truncates the file, so appending the bad
# row before this call (as the cell previously did) silently discarded it.
df_sc.to_csv(sc_path, index=False, mode='w')
# Manual delimiter injection: the unquoted comma in "New York, NY" gives this
# line six fields against a five-column header. The leading "\n" leaves one
# blank line before it because to_csv already ends the file with a newline.
with open(sc_path, "a") as f:
    f.write("\nBAD_ROW,New York, NY,50kg,2023-01-01,2023-01-02")

In [None]:
# 4. Healthcare
# Builds patient stays, makes a random 5% of them end before they start,
# and writes 04_healthcare_chaos.csv.
print("Generating Healthcare...")
df_hlth = pd.DataFrame(dict(
    Pid=range(rows),
    Age=[random.randint(1, 150) for _ in range(rows)],
    # Several spellings for the same two values.
    Gender=[random.choice(['M', 'F', 'Male', 'Female', 'm', 'f']) for _ in range(rows)],
    Admit=[fake.date_this_year() for _ in range(rows)],
    Discharge=[fake.date_this_year() for _ in range(rows)],
))
# For the sampled rows, discharge becomes five days before admission.
bad_dates = df_hlth.sample(frac=0.05).index
for i in bad_dates:
    admitted_on = df_hlth.at[i, "Admit"]
    df_hlth.at[i, "Discharge"] = admitted_on - datetime.timedelta(days=5)
df_hlth = apply_generic_chaos(df_hlth)
df_hlth.to_csv(f"{OUTPUT_DIR}/04_healthcare_chaos.csv", index=False)

In [None]:
# 5. Marketing
# Builds campaign metrics, makes a random 10% of rows report more clicks
# than impressions, and writes 05_marketing_chaos.csv.
print("Generating Marketing...")
df_mkt = pd.DataFrame(dict(
    Campaign=[fake.catch_phrase() for _ in range(rows)],
    Impressions=[random.randint(1000, 10000) for _ in range(rows)],
    Clicks=[random.randint(100, 5000) for _ in range(rows)],
    # The range allows negative spend.
    Spend=[random.randint(-100, 5000) for _ in range(rows)],
))
bad_mkt = df_mkt.sample(frac=0.1).index
# Clicks = Impressions + 500 on the sampled rows.
df_mkt.loc[bad_mkt, "Clicks"] = df_mkt.loc[bad_mkt, "Impressions"].add(500)
df_mkt = apply_generic_chaos(df_mkt)
df_mkt.to_csv(f"{OUTPUT_DIR}/05_marketing_chaos.csv", index=False)

In [None]:
# 6. HR
# Builds employee records, injects an implausible age/marital-status pair
# and future join dates, and writes 06_hr_chaos.csv.
print("Generating HR...")
df_hr = pd.DataFrame({
    "Name": [fake.name() for _ in range(rows)],
    "Age": [random.randint(18, 65) for _ in range(rows)],
    "Marital": [random.choice(['Single', 'Married', 'Divorced']) for _ in range(rows)],
    "Join_Date": [fake.date_this_year() for _ in range(rows)]
})
# Pick up to 20 rows for the corruption. Capping at len(df_hr) keeps the
# cell working when DATASET_SIZE < 20, where a plain sample(20) raises
# ValueError; for 20 or more rows the behaviour is unchanged.
bad_hr = df_hr.sample(min(20, len(df_hr))).index
# Sampled rows become 5-year-olds recorded as married.
df_hr.loc[bad_hr, "Age"] = 5
df_hr.loc[bad_hr, "Marital"] = "Married"
# Rows 0-10 get a join date far in the future.
df_hr.loc[0:10, "Join_Date"] = datetime.date(2099, 1, 1)
df_hr = apply_generic_chaos(df_hr)
df_hr.to_csv(f"{OUTPUT_DIR}/06_hr_chaos.csv", index=False)

In [None]:
# 7. Logistics
# Builds route records, zeroes some distances, makes some routes start and
# end in the same city, and writes 07_logistics_chaos.csv.
print("Generating Logistics...")
df_log = pd.DataFrame(dict(
    Origin=[fake.city() for _ in range(rows)],
    Dest=[fake.city() for _ in range(rows)],
    Distance=[random.randint(0, 5000) for _ in range(rows)],
    Fuel=[random.randint(0, 500) for _ in range(rows)],
))
# Rows 0-20: zero distance, whatever the fuel value.
df_log.loc[0:20, "Distance"] = 0
# Rows 21-40: origin copied from destination.
df_log.loc[21:40, "Origin"] = df_log.loc[21:40, "Dest"]
df_log = apply_generic_chaos(df_log)
df_log.to_csv(f"{OUTPUT_DIR}/07_logistics_chaos.csv", index=False)

In [None]:
# 8. Customer service
# Builds support tickets with inconsistent priority labels and possibly
# negative resolution times, and writes 08_customer_service_chaos.csv.
print("Generating CS...")
df_cs = pd.DataFrame(dict(
    Email=[fake.email() for _ in range(rows)],
    # Full and abbreviated spellings of the same three levels.
    Priority=[random.choice(['High', 'H', 'Low', 'L', 'Medium', 'Med']) for _ in range(rows)],
    # The range starts below zero.
    Resolution_Hours=[random.uniform(-10, 72) for _ in range(rows)],
))
df_cs = apply_generic_chaos(df_cs)
df_cs.to_csv(f"{OUTPUT_DIR}/08_customer_service_chaos.csv", index=False)

In [None]:
# 9. Insurance
# Builds claims against a constant coverage value, pins the first rows to a
# fixed claim amount, and writes 09_insurance_chaos.csv.
print("Generating Insurance...")
df_ins = pd.DataFrame(dict(
    Policy=[random.choice(['Auto', 'Home', 'Life']) for _ in range(rows)],
    Claim_Amt=[random.uniform(100, 10000) for _ in range(rows)],
    # Same coverage on every row.
    Coverage=[1000] * rows,
))
# Rows 0-50: claim of 5000 against a coverage of 1000.
df_ins.loc[0:50, "Claim_Amt"] = 5000
df_ins = apply_generic_chaos(df_ins)
df_ins.to_csv(f"{OUTPUT_DIR}/09_insurance_chaos.csv", index=False)

In [None]:
# 10. Real estate
# Builds property listings, overwrites fixed row ranges with impossible
# values, and writes 10_real_estate_chaos.csv.
print("Generating Real Estate...")
df_re = pd.DataFrame(dict(
    Price=[random.randint(100000, 500000) for _ in range(rows)],
    Bedrooms=[random.randint(1, 5) for _ in range(rows)],
    SqFt=[random.randint(500, 5000) for _ in range(rows)],
))
# Rows 0-20: zero price.
df_re.loc[0:20, "Price"] = 0
# Rows 21-30: 100 bedrooms.
df_re.loc[21:30, "Bedrooms"] = 100
# Rows 31-40: negative floor area.
df_re.loc[31:40, "SqFt"] = -500
df_re = apply_generic_chaos(df_re)
df_re.to_csv(f"{OUTPUT_DIR}/10_real_estate_chaos.csv", index=False)