In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide display and plotting defaults.
pd.set_option("display.max_columns", None)  # None = never truncate columns
plt.style.use("seaborn-v0_8-whitegrid")

# Make sure the raw and processed data folders exist (no-op when they already do).
for data_dir in ("../data/raw/property_data", "../data/processed/property_data"):
    Path(data_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Create sample property data for Atlanta ZIP codes.
#
# Every feature is drawn independently and uniformly, so this is placeholder
# data only: any relationship seen between columns downstream is noise.
# The global seed is kept (and the order of np.random calls below is left
# untouched) so all randomly drawn columns reproduce the same values run to run.
np.random.seed(42)

zip_codes = [30305, 30306, 30307, 30308, 30309]
n_properties = 500

# Sale dates are deterministic: one sale per day starting 2020-01-01.
# They are built first so year_built can be capped at each row's sale year.
sale_dates = pd.date_range('2020-01-01', periods=n_properties, freq='D')

# Dict entries are evaluated top to bottom, so the order of the np.random
# calls here decides which random numbers each column receives.
sample_data = {
    'property_id': range(1, n_properties + 1),
    'zip_code': np.random.choice(zip_codes, n_properties),
    'bedrooms': np.random.randint(1, 6, n_properties),       # 1-5
    # Whole baths (1-3) plus an optional half bath -> 1.0, 1.5, ..., 3.5
    'bathrooms': np.random.randint(1, 4, n_properties) + np.random.choice([0, 0.5], n_properties),
    'sqft': np.random.randint(800, 4000, n_properties),
    'lot_size': np.random.randint(2000, 10000, n_properties),
    # Drawn from 1950-2023, then capped at the sale year: without the cap a
    # property could be "built" after the date it was sold (e.g. built 2023,
    # sold 2020), which would give negative ages in any derived feature.
    'year_built': np.minimum(
        np.random.randint(1950, 2024, n_properties),
        sale_dates.year.to_numpy(),
    ),
    'garage': np.random.randint(0, 3, n_properties),         # 0-2
    'sale_price': np.random.randint(200000, 800000, n_properties),
    'sale_date': sale_dates,
}

df_properties = pd.DataFrame(sample_data)
print(f"Created sample dataset with {len(df_properties)} properties")
df_properties.head(10)

In [None]:
# Basic Data Exploration
# Plain-text overview: shape first, then per-column dtypes and null counts.
print("Dataset shape:", df_properties.shape)
for heading, summary in (
    ("\nData types:", df_properties.dtypes),
    ("\nMissing values:", df_properties.isnull().sum()),
):
    print(heading)
    print(summary)

# describe() is left as the final bare expression so the notebook renders it
# as the cell's rich output.
print("\nBasic statistics:")
df_properties.describe()

In [None]:
# Distribution of Key Features
# 2x3 grid: five histograms plus a price-vs-size scatter in the last panel.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))


def _centered_bins(values, step):
    """Return histogram bin edges that centre each discrete value in its own bin.

    `values` is a numeric Series whose entries are multiples of `step`.
    Edges run from min - step/2 to max + step/2, so every distinct value gets
    one equally wide bar centred on it.
    """
    return np.arange(values.min() - step / 2, values.max() + step, step)


# (axes, column, bins, title, x-label) for each histogram panel.
# Bedrooms and bathrooms are discrete counts (whole and half steps); a generic
# bins=10 splits their range into bins that do not line up with the values,
# which yields off-centre, unevenly spaced bars. Continuous columns keep a
# fixed bin count.
histogram_specs = [
    (axes[0, 0], 'bedrooms', _centered_bins(df_properties['bedrooms'], 1),
     'Bedrooms Distribution', 'Number of Bedrooms'),
    (axes[0, 1], 'bathrooms', _centered_bins(df_properties['bathrooms'], 0.5),
     'Bathrooms Distribution', 'Number of Bathrooms'),
    (axes[0, 2], 'sqft', 20, 'Square Footage Distribution', 'Square Feet'),
    (axes[1, 0], 'year_built', 20, 'Year Built Distribution', 'Year'),
    (axes[1, 1], 'sale_price', 20, 'Sale Price Distribution', 'Price ($)'),
]

for ax, column, bins, title, xlabel in histogram_specs:
    ax.hist(df_properties[column], bins=bins, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')  # label the y-axis so each panel stands alone

# Last panel: relationship between living area and sale price.
axes[1, 2].scatter(df_properties['sqft'], df_properties['sale_price'], alpha=0.5)
axes[1, 2].set_title('Price vs Square Footage')
axes[1, 2].set_xlabel('Square Feet')
axes[1, 2].set_ylabel('Price ($)')

plt.tight_layout()
plt.show()

In [None]:
# Price by ZIP Code
# One box per ZIP code, drawn on an explicitly created axes.
fig_zip, ax_zip = plt.subplots(figsize=(10, 6))
df_properties.boxplot(column='sale_price', by='zip_code', ax=ax_zip)

ax_zip.set_title('Sale Price Distribution by ZIP Code')
fig_zip.suptitle('')  # Remove default title
ax_zip.set_xlabel('ZIP Code')
ax_zip.set_ylabel('Sale Price ($)')

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Correlation Analysis
# Pairwise correlations between the numeric property features, shown as an
# annotated heatmap.
numeric_cols = [
    'bedrooms',
    'bathrooms',
    'sqft',
    'lot_size',
    'year_built',
    'garage',
    'sale_price',
]
correlation_matrix = df_properties[numeric_cols].corr()

fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,  # centre the colormap at zero correlation
    square=True,
    linewidths=1,
    ax=ax_corr,
)
ax_corr.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Save Processed Property Data
# Write the frame to CSV (without the index column), then print a short summary.
df_properties.to_csv(
    "../data/processed/property_data/atlanta_properties.csv",
    index=False,
)

sale_dates_col = df_properties['sale_date']
earliest_sale = sale_dates_col.min()
latest_sale = sale_dates_col.max()

print(
    "Property data saved to ../data/processed/property_data/atlanta_properties.csv",
    f"  Total properties: {len(df_properties)}",
    f"  Date range: {earliest_sale} to {latest_sale}",
    sep="\n",
)