In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so every np.random.* call below is repeatable.
# The order of the random draws in this cell therefore matters: reordering
# statements changes the generated dataset.
np.random.seed(42)
n = 1000  # rows generated before the duplicate rows are appended

# Core structured data
df = pd.DataFrame({
    "Price": np.random.normal(300000, 75000, n).round(-3),  # rounded to the nearest 1000
    "Area": np.random.normal(2000, 500, n).round(),
    "Bedrooms": np.random.randint(2, 6, n),    # randint upper bound is exclusive: 2..5
    "Bathrooms": np.random.randint(1, 4, n),   # 1..3
    "Garage": np.random.randint(0, 3, n),      # 0..2
    "Age": np.random.randint(0, 50, n),        # 0..49
    # Deliberately inconsistent spellings/casing of the same places.
    "Location": np.random.choice(
        ["Downtown", "downtown", "Down Town", "Suburb", "suburb", "Rural", "Midtown"], n
    ),
    "Distance_to_city": np.random.normal(10, 5, n).clip(0.5).round(2),  # floor at 0.5
    "Crime_Rate": np.random.normal(3, 1.5, n).clip(0).round(2),         # no negatives
    "School_Rating": np.random.randint(1, 11, n)  # 1..10
})

# Date field with noise: a random offset of 0..729 days from 2023-01-01.
df["Date_Sold"] = pd.to_datetime("2023-01-01") + pd.to_timedelta(np.random.randint(0, 730, n), unit='D')
df.loc[np.random.choice(df.index, 25, replace=False), "Date_Sold"] = None  # inject missing dates

# HOA Fees with missing values.
# Normalise Location before matching: lower-case AND drop spaces, so that the
# "Down Town" spelling is recognised as downtown. Lower-casing alone yields
# "down town", which does not contain "downtown", and those rows would get no fee.
location_key = df["Location"].str.lower().str.replace(" ", "", regex=False)
is_downtown = location_key.str.contains("downtown")
is_suburb = location_key.str.contains("suburb")
# Draw the downtown fees first, then the suburb fees (same draw order as a
# nested np.where would evaluate them).
downtown_fees = np.random.normal(250, 50, n)
suburb_fees = np.random.normal(150, 30, n)
# Locations that are neither downtown nor suburb get NaN (no HOA fee recorded).
df["HOA_Fees"] = np.where(is_downtown,
                          downtown_fees,
                          np.where(is_suburb, suburb_fees, np.nan)).round(2)
df.loc[np.random.choice(df.index, 50, replace=False), "HOA_Fees"] = None  # extra NaNs

# Binary flags (some typos + noise). The None entry makes NumPy build an
# object array, so missing values stay real nulls instead of strings.
df["Has_Basement"] = np.random.choice(["Yes", "No", "Y", "N", "yes", "no", None], n, p=[0.3, 0.3, 0.1, 0.1, 0.05, 0.1, 0.05])
df["Pool"] = np.random.choice(["Yes", "No", "Unknown", "yes", "no", None], n, p=[0.15, 0.65, 0.05, 0.05, 0.05, 0.05])

# Text field with mess.
# The options are held in an explicit object-dtype array with None as the
# missing marker. A plain list mixing str and np.nan is coerced by NumPy to a
# unicode array, which turns the missing marker into the literal text "nan".
description_options = np.array([
    "Recently renovated with modern finishes.",
    "Needs some TLC but great potential.",
    "close to public transport and schools.",
    "LARGE BACKYARD and spacious living area.",
    None,
    "Quiet neighborhood; low crime-rate!",
    "needs-work"
], dtype=object)
df["Description"] = np.random.choice(description_options, n)

# Outliers for Price and Area: the same 10 rows get 4x price and 3x area.
outlier_indices = np.random.choice(df.index, 10, replace=False)
df.loc[outlier_indices, "Price"] *= 4
df.loc[outlier_indices, "Area"] *= 3

# Duplicates: append 5 exact copies of existing rows (fixed random_state so the
# same rows are picked each run); ignore_index renumbers the result 0..len-1.
dupes = df.sample(5, random_state=1)
df = pd.concat([df, dupes], ignore_index=True)

# Save to CSV (written to the current working directory, without the index column).
df.to_csv("simulated_housing_data_dirty.csv", index=False)
print("✅ Dataset with cleaning challenges saved as simulated_housing_data_dirty.csv")


✅ Dataset with cleaning challenges saved as simulated_housing_data_dirty.csv
