In [6]:
import pandas as pd
import numpy as np

# Demonstration geography, declared once as state code -> county -> cities.
# Insertion order matters: the containers derived below are iterated in order
# by code that draws from a seeded random generator.
_COUNTY_CITIES_BY_STATE = {
    "US-CA": {
        "Los Angeles": ["Los Angeles", "Long Beach", "Glendale"],
        "San Diego": ["San Diego", "Chula Vista", "Oceanside"],
        "San Francisco": ["San Francisco", "Daly City", "South San Francisco"],
    },
    "US-TX": {
        "Harris": ["Houston", "Pasadena"],
        "Dallas": ["Dallas", "Irving"],
        "Tarrant": ["Fort Worth", "Arlington"],
    },
    "US-NY": {
        "Kings": ["Brooklyn", "Williamsburg"],
        "Queens": ["Queens", "Astoria"],
        "New York": ["Manhattan", "Harlem"],
    },
    "US-FL": {
        "Miami-Dade": ["Miami", "Hialeah"],
        "Broward": ["Fort Lauderdale", "Pembroke Pines"],
        "Palm Beach": ["West Palm Beach", "Boca Raton"],
    },
    "US-IL": {
        "Cook": ["Chicago", "Cicero"],
        "DuPage": ["Naperville", "Wheaton"],
        "Lake": ["Waukegan", "Mundelein"],
    },
    "US-PA": {
        "Philadelphia": ["Philadelphia", "Chestnut Hill"],
        "Allegheny": ["Pittsburgh", "Monroeville"],
        "Montgomery": ["Norristown", "Pottstown"],
    },
    "US-OH": {
        "Cuyahoga": ["Cleveland", "Parma"],
        "Franklin": ["Columbus", "Dublin"],
        "Hamilton": ["Cincinnati", "Norwood"],
    },
    "US-GA": {
        "Fulton": ["Atlanta", "Sandy Springs"],
        "Gwinnett": ["Lawrenceville", "Duluth"],
        "Cobb": ["Marietta", "Smyrna"],
    },
    "US-NC": {
        "Mecklenburg": ["Charlotte", "Matthews"],
        "Wake": ["Raleigh", "Cary"],
        "Guilford": ["Greensboro", "High Point"],
    },
    "US-MI": {
        "Wayne": ["Detroit", "Livonia"],
        "Oakland": ["Troy", "Pontiac"],
        "Macomb": ["Warren", "Sterling Heights"],
    },
}

# (state code, [county names]) pairs, in declaration order.
states = [
    (state_code, list(county_map))
    for state_code, county_map in _COUNTY_CITIES_BY_STATE.items()
]

# Entity names sampled for each generated record.
entities = ["Reliance", "DeMart", "PizzaHut"]

# Flat county -> [city names] lookup, in the same county order as `states`.
cities = {
    county_name: city_names
    for county_map in _COUNTY_CITIES_BY_STATE.values()
    for county_name, city_names in county_map.items()
}

# Number of synthetic rows to generate.
num_records = 1000
# Seed NumPy's global generator so repeated runs draw the same values.
np.random.seed(42)

# One tax rate per city, drawn uniformly between 5% and 15% and rounded to
# four decimals. Cities are visited in the insertion order of `cities`, one
# draw per city, so the sequence of random draws is fixed by that order.
tax_rates = {
    city: np.round(np.random.uniform(0.05, 0.15), 4)
    for city_list in cities.values()
    for city in city_list
}

# Generated rows, each laid out as:
# state, city, county, entity, gross, taxable, unreported tax, tax rate,
# year, month.
data = []

# Choose up front, without replacement, the 1% of row positions that carry a
# non-zero unreported tax.
unreported_indices = set(
    np.random.choice(num_records, int(num_records * 0.01), replace=False)
)

# Every generated row is stamped with the same year.
year = 2025

for i in range(num_records):
    # Walk down the hierarchy: state -> county -> city, plus a random entity.
    state, counties = states[np.random.randint(0, len(states))]
    county = np.random.choice(counties)
    entity = np.random.choice(entities)
    city = np.random.choice(cities[county])
    tax_rate = tax_rates[city]

    # `district` is not written to the row. The draw is kept because dropping
    # it would shift every later value taken from the seeded generator.
    district = f"District-{np.random.randint(1, 100)}"

    gross = np.round(np.random.uniform(100, 10000), 2)
    # Stored under the "Taxable" column: gross times the city's rate,
    # rounded to two decimals.
    taxable = np.round(gross * tax_rate, 2)

    # A random amount is drawn only for the preselected rows; all others
    # get exactly 0.0 and consume no random number here.
    unreportedtax = (
        np.round(np.random.uniform(0.01, 1000), 2)
        if i in unreported_indices
        else 0.0
    )

    month = np.random.randint(1, 13)  # upper bound is exclusive: 1..12

    data.append(
        [
            state,
            city,
            county,
            entity,
            gross,
            taxable,
            unreportedtax,
            tax_rate,
            year,
            month,
        ]
    )

# Column labels, in the same order as the values inside each row of `data`.
columns = [
    "Region",
    "City",
    "County",
    "Entity",
    "Gross",
    "Taxable",
    "UnreportedTax",
    "TaxRate",
    "Year",
    "Month",
]

df = pd.DataFrame(data, columns=columns)

# Destination is relative to the current working directory; the file name is
# kept exactly as-is so existing readers of this CSV keep working.
output_path = "../data/reconcillation_synthetic.csv"
df.to_csv(output_path, index=False)  # index=False: omit the row-number column
print(f"Generated {output_path} ")

Generated ../data/reconcillation_synthetic.csv 
