In [5]:
import numpy as np
import pandas as pd

def seed_everything(seed: int = 42):
    """Reproducible RNG across NumPy and pandas-related randomness."""
    np.random.seed(seed)

def generate_housing_data_au(n: int = 10000, seed: int = 42) -> pd.DataFrame:
    """
    Generate a realistic synthetic Australian housing dataset.

    Columns:
      - city, state, property_type, postcode_region
      - bedrooms, bathrooms, car_spaces
      - building_area_m2, land_size_m2
      - distance_to_cbd_km
      - year_built, renovated_recently
      - walk_score, public_transport_score, school_quality, crime_rate_per_1k
      - has_water_view, has_garden, strata_fees_monthly
      - price (AUD)
    """
    seed_everything(seed)
    rng = np.random.default_rng(seed)

    # City profiles (tune freely)
    city_profiles = {
        "Sydney":   {"state": "NSW", "base": 950_000, "p_bed": 120_000, "p_bath": 90_000, "p_car": 45_000,
                     "p_build": 5_000, "p_land": 1_200, "decay": 30_000, "coastal": 180_000,
                     "unit_disc": -140_000, "town_disc": -60_000, "dist_shape": 2.2, "dist_scale": 4.0},
        "Melbourne":{"state": "VIC", "base": 800_000, "p_bed": 105_000, "p_bath": 80_000, "p_car": 35_000,
                     "p_build": 4_300, "p_land": 1_000, "decay": 22_000, "coastal": 120_000,
                     "unit_disc": -120_000, "town_disc": -50_000, "dist_shape": 2.0, "dist_scale": 4.5},
        "Brisbane": {"state": "QLD", "base": 720_000, "p_bed": 90_000, "p_bath": 70_000, "p_car": 30_000,
                     "p_build": 3_800, "p_land": 900,   "decay": 16_000, "coastal": 100_000,
                     "unit_disc": -110_000, "town_disc": -45_000, "dist_shape": 2.0, "dist_scale": 5.2},
        "Perth":    {"state": "WA",  "base": 650_000, "p_bed": 85_000, "p_bath": 60_000, "p_car": 25_000,
                     "p_build": 3_500, "p_land": 800,   "decay": 12_000, "coastal": 90_000,
                     "unit_disc": -90_000, "town_disc": -40_000, "dist_shape": 1.8, "dist_scale": 6.0},
        "Adelaide": {"state": "SA",  "base": 680_000, "p_bed": 85_000, "p_bath": 60_000, "p_car": 25_000,
                     "p_build": 3_600, "p_land": 850,   "decay": 13_000, "coastal": 85_000,
                     "unit_disc": -95_000, "town_disc": -40_000, "dist_shape": 1.8, "dist_scale": 5.2},
        "Canberra": {"state": "ACT", "base": 820_000, "p_bed": 100_000, "p_bath": 80_000, "p_car": 35_000,
                     "p_build": 4_100, "p_land": 950,   "decay": 14_000, "coastal": 40_000,
                     "unit_disc": -120_000, "town_disc": -55_000, "dist_shape": 2.4, "dist_scale": 3.5},
        "Hobart":   {"state": "TAS", "base": 620_000, "p_bed": 80_000, "p_bath": 55_000, "p_car": 22_000,
                     "p_build": 3_200, "p_land": 750,   "decay": 11_000, "coastal": 70_000,
                     "unit_disc": -80_000,  "town_disc": -35_000, "dist_shape": 1.6, "dist_scale": 4.0},
        "Darwin":   {"state": "NT",  "base": 580_000, "p_bed": 75_000, "p_bath": 50_000, "p_car": 20_000,
                     "p_build": 3_000, "p_land": 700,   "decay": 9_000,  "coastal": 60_000,
                     "unit_disc": -75_000,  "town_disc": -30_000, "dist_shape": 1.6, "dist_scale": 4.0},
    }

    cities = list(city_profiles.keys())
    # Rough sampling weights by relative market size (tweak as needed)
    weights = np.array([0.28, 0.26, 0.14, 0.10, 0.08, 0.06, 0.04, 0.04])
    weights = weights / weights.sum()

    chosen_idx = rng.choice(len(cities), size=n, p=weights)
    city = np.array([cities[i] for i in chosen_idx])

    # Vectorize city parameter lookup
    def pluck(param):
        return np.array([city_profiles[c][param] for c in city])

    state = np.array([city_profiles[c]["state"] for c in city])
    base = pluck("base")
    p_bed = pluck("p_bed")
    p_bath = pluck("p_bath")
    p_car = pluck("p_car")
    p_build = pluck("p_build")
    p_land = pluck("p_land")
    decay = pluck("decay")
    coastal_bonus = pluck("coastal")
    unit_disc = pluck("unit_disc")
    town_disc = pluck("town_disc")
    dist_shape = pluck("dist_shape")
    dist_scale = pluck("dist_scale")

    # Property type distribution varies by city density
    # Higher unit share in Sydney/Melbourne/Brisbane, more houses elsewhere
    type_probs = {
        "Sydney":   [0.50, 0.35, 0.15],  # house, unit, townhouse
        "Melbourne":[0.52, 0.33, 0.15],
        "Brisbane": [0.60, 0.25, 0.15],
        "Perth":    [0.70, 0.15, 0.15],
        "Adelaide": [0.68, 0.17, 0.15],
        "Canberra": [0.55, 0.25, 0.20],
        "Hobart":   [0.72, 0.14, 0.14],
        "Darwin":   [0.65, 0.20, 0.15],
    }
    pt_map = {"house": 0, "unit": 1, "townhouse": 2}
    property_type = np.empty(n, dtype=object)
    for i in range(n):
        probs = type_probs[city[i]]
        property_type[i] = rng.choice(["house", "unit", "townhouse"], p=probs)

    # Distance to CBD (km): gamma-distributed per city
    distance_to_cbd_km = rng.gamma(shape=dist_shape, scale=dist_scale)

    # Bedrooms: distributions by property type
    def sample_bedrooms(pt):
        if pt == "house":
            return rng.choice([2, 3, 4, 5, 6], p=[0.10, 0.35, 0.32, 0.18, 0.05])
        if pt == "townhouse":
            return rng.choice([2, 3, 4], p=[0.30, 0.55, 0.15])
        # unit
        return rng.choice([1, 2, 3], p=[0.30, 0.55, 0.15])
    bedrooms = np.array([sample_bedrooms(pt) for pt in property_type])

    # Bathrooms: correlated with bedrooms
    bathrooms = np.maximum(1, np.round(
        np.clip(bedrooms + rng.normal(0.0, 0.4, size=n), 1, None)
    )).astype(int)

    # Car spaces: fewer for units
    car_spaces = np.zeros(n, dtype=int)
    for i, pt in enumerate(property_type):
        if pt == "unit":
            car_spaces[i] = rng.choice([0, 1, 2], p=[0.25, 0.65, 0.10])
        elif pt == "townhouse":
            car_spaces[i] = rng.choice([1, 2], p=[0.60, 0.40])
        else:  # house
            car_spaces[i] = rng.choice([1, 2, 3], p=[0.35, 0.55, 0.10])

    # Building area (m^2): log-normal; larger for houses, scales with bedrooms
    base_build_mu = np.where(property_type == "house", 5.1, np.where(property_type == "townhouse", 4.8, 4.5))
    base_build_sigma = 0.25
    build_bed_scale = 1 + (bedrooms - 3) * 0.12
    building_area_m2 = np.exp(rng.normal(base_build_mu, base_build_sigma, size=n)) * build_bed_scale
    building_area_m2 = np.clip(building_area_m2, 35, None)

    # Land size (m^2): houses >> townhouses; zero for units
    land_mu = np.where(property_type == "house", 6.2, np.where(property_type == "townhouse", 5.0, 0.0))
    land_sigma = 0.5
    land_size_m2 = np.where(
        property_type == "unit",
        0.0,
        np.exp(rng.normal(land_mu, land_sigma, size=n))
    )
    land_size_m2 = np.where(property_type == "townhouse", np.clip(land_size_m2, 60, 300), land_size_m2)
    land_size_m2 = np.where(property_type == "house", np.clip(land_size_m2, 150, None), land_size_m2)

    # Year built and renovation
    current_year = 2025
    year_built = rng.integers(1950, current_year + 1, size=n)
    renovated_recently = rng.random(n) < np.clip((current_year - year_built) / 100, 0.05, 0.6)

    # Amenities driven by distance to CBD (closer => higher walk/PT)
    noise = lambda s: rng.normal(0, s, size=n)
    walk_score = np.clip(85 - 4.0 * distance_to_cbd_km + noise(8), 0, 100)
    public_transport_score = np.clip(80 - 3.2 * distance_to_cbd_km + noise(9), 0, 100)

    # School quality: slightly higher nearer CBD and in higher-base cities
    school_quality = np.clip(60 + (base - base.mean()) / 10000 - 0.6 * distance_to_cbd_km + noise(10), 20, 100)

    # Crime rate per 1k residents: modestly higher further from CBD, varies by city
    city_crime_bias = np.interp(base, (base.min(), base.max()), (4, -4))
    crime_rate_per_1k = np.clip(32 + 0.8 * distance_to_cbd_km + noise(4) + city_crime_bias, 5, 80)

    # Water view, garden
    has_water_view = (rng.random(n) < np.clip(0.10 + (coastal_bonus / coastal_bonus.max()) * 0.10 - 0.02 * (distance_to_cbd_km > 20), 0.02, 0.25)).astype(int)
    has_garden = ((property_type != "unit") & (rng.random(n) < 0.75)).astype(int)

    # Strata fees (units only)
    strata_fees_monthly = np.where(
        property_type == "unit",
        np.clip(np.exp(rng.normal(4.8, 0.35, size=n)), 60, 800),  # ~120–800
        0.0
    )

    # Postcode-like region buckets for fixed effects (per city)
    postcode_region = rng.integers(2000, 9999, size=n)

    # Pricing model
    # Diminishing returns for size via sqrt, distance penalty, amenities, etc.
    base_price = base
    pt_adjust = np.where(property_type == "unit", unit_disc,
                 np.where(property_type == "townhouse", town_disc, 0.0))
    size_effect = (
        p_build * np.sqrt(building_area_m2) +
        p_land * np.sqrt(np.clip(land_size_m2, 0, None)) * np.where(property_type == "house", 1.0, 0.5)
    )
    layout_effect = p_bed * bedrooms + p_bath * bathrooms + p_car * car_spaces
    loc_effect = (
        coastal_bonus * has_water_view -
        decay * (distance_to_cbd_km ** 1.15)
    )
    amenity_effect = (
        3_000 * (walk_score - 50) +
        2_500 * (public_transport_score - 50) +
        2_800 * (school_quality - 60) -
        5_000 * (crime_rate_per_1k - 30)
    )
    age_effect = (
        -2_200 * (current_year - year_built) +
        45_000 * renovated_recently
    )
    garden_effect = 25_000 * has_garden
    strata_effect = -150 * strata_fees_monthly  # capitalized into price

    price_det = (
        base_price + pt_adjust + size_effect + layout_effect +
        loc_effect + amenity_effect + age_effect + garden_effect + strata_effect
    )

    # Heteroskedastic multiplicative noise
    noise_mult = rng.lognormal(mean=0.0, sigma=0.12, size=n)
    price = price_det * noise_mult

    # Mild outliers (high and low)
    outlier_mask = rng.random(n) < 0.02
    outlier_factor = rng.choice([0.6, 1.5], size=n, p=[0.5, 0.5])
    price = np.where(outlier_mask, price * outlier_factor, price)

    # Floor and rounding to dollars
    price = np.clip(price, 220_000, None)
    price = np.round(price, -2)

    df = pd.DataFrame({
        "city": city,
        "state": state,
        "postcode_region": postcode_region,
        "property_type": property_type,
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "car_spaces": car_spaces,
        "building_area_m2": np.round(building_area_m2, 1),
        "land_size_m2": np.round(land_size_m2, 1),
        "distance_to_cbd_km": np.round(distance_to_cbd_km, 2),
        "year_built": year_built,
        "renovated_recently": renovated_recently.astype(int),
        "walk_score": np.round(walk_score, 0).astype(int),
        "public_transport_score": np.round(public_transport_score, 0).astype(int),
        "school_quality": np.round(school_quality, 0).astype(int),
        "crime_rate_per_1k": np.round(crime_rate_per_1k, 1),
        "has_water_view": has_water_view,
        "has_garden": has_garden,
        "strata_fees_monthly": np.round(strata_fees_monthly, 2),
        "price": price.astype(int),
    })

    # Optional: shuffle rows
    df = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    return df

if __name__ == "__main__":
    # Example usage
    df = generate_housing_data_au(n=200000, seed=42)
    df.to_csv("house_prices_au.csv", index=False)
    print(df.head())
    print(f"Saved {len(df)} rows to house_prices_au.csv")

        city state  postcode_region property_type  bedrooms  bathrooms  \
0     Sydney   NSW             2117         house         5          5   
1      Perth    WA             6685         house         3          3   
2  Melbourne   VIC             4386          unit         3          4   
3     Sydney   NSW             8460         house         3          3   
4  Melbourne   VIC             4572     townhouse         3          3   

   car_spaces  building_area_m2  land_size_m2  distance_to_cbd_km  year_built  \
0           1             343.7         440.8                4.86        1985   
1           2             142.7         350.7                6.79        1983   
2           1             144.7           0.0               19.92        1957   
3           2             196.7         541.7                0.80        1958   
4           1              80.4         103.4               13.63        2010   

   renovated_recently  walk_score  public_transport_score  school_qu