In [None]:
import folium
import pandas as pd
# from matplotlib import colormaps, colors

# Load the housing dataset that every later cell reads through `df`; the cells
# below use its 'condition', 'yr_built' and 'price' columns.
# NOTE(review): the relative path resolves against the kernel's working
# directory, and the file name suggests an already-cleaned EDA export - confirm.
df = pd.read_csv('eda_clean.csv')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Split the data into "old and in poor condition" (condition below 3 AND built
# before 1990) versus every row that fails at least one of those tests, then
# take each group's median price.
old_poor_condition_houses = df[(df['condition'] < 3) & (df['yr_built'] < 1990)]
other_houses = df[(df['condition'] >= 3) | (df['yr_built'] >= 1990)]
old_poor_median = old_poor_condition_houses['price'].median()
other_median = other_houses['price'].median()

fig, ax = plt.subplots(figsize=(10, 6))

# Draw the density curves first. A legend only lists artists that already
# exist on the axes when it is built, so plotting these after legend() (as the
# cell previously did) silently dropped both KDE entries from the legend.
sns.kdeplot(old_poor_condition_houses['price'], label='Poor Condition & Built < 1990',
            color='red', fill=True, alpha=0.4, ax=ax)
sns.kdeplot(other_houses['price'], label='Other Houses',
            color='blue', fill=True, alpha=0.4, ax=ax)

# Add vertical median lines, coloured to match their distribution
ax.axvline(old_poor_median, color='red', linestyle='--', linewidth=2,
           label=f'Old Poor Median: ${old_poor_median:,.0f}')
ax.axvline(other_median, color='blue', linestyle='--', linewidth=2,
           label=f'Other Median: ${other_median:,.0f}')

# Format x-axis to show full prices
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Labels and legend (set after plotting so they override seaborn's defaults)
ax.set_xlim(0, 1_500_000)
ax.set_xlabel('House Price')
ax.set_ylabel('Density')
ax.set_title('Price Distribution: Old Poor Condition Houses vs Other Houses')
ax.legend()

# Lay out and render last, once every artist is on the axes; show() also keeps
# the cell from echoing the Axes repr of a trailing seaborn call.
fig.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from scipy.stats import gaussian_kde

# Same two groups as the density plot: "old and in poor condition" (condition
# below 3 AND built before 1990) versus every row failing at least one test.
old_poor_condition_houses = df[(df['condition'] < 3) & (df['yr_built'] < 1990)]
other_houses = df[(df['condition'] >= 3) | (df['yr_built'] >= 1990)]

old_poor_median = old_poor_condition_houses['price'].median()
other_median = other_houses['price'].median()

# Extract clean arrays (gaussian_kde cannot handle NaN)
x1 = old_poor_condition_houses['price'].dropna().to_numpy()
x2 = other_houses['price'].dropna().to_numpy()

# Grid and bin width for "counts per $10k"
bin_width = 10_000
x_min, x_max = 0, 3_000_000
grid = np.arange(x_min, x_max + bin_width, bin_width)

# KDEs
kde1 = gaussian_kde(x1)
kde2 = gaussian_kde(x2)

# Convert density -> counts per bin_width: a density integrates to 1, so
# density * n * bin_width approximates how many houses fall in each
# bin_width-wide price band.
y1 = kde1(grid) * x1.size * bin_width
y2 = kde2(grid) * x2.size * bin_width

# One explicit colour per group. Without it, both median lines fall back to
# the same default line colour and cannot be told apart, and curves and fills
# only match through implicit colour cycling.
old_poor_color = 'red'
other_color = 'blue'

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(grid, y1, color=old_poor_color, label=f'Poor & <1990 (n={x1.size})')
ax.plot(grid, y2, color=other_color, label=f'Other (n={x2.size})')
ax.fill_between(grid, 0, y1, color=old_poor_color, alpha=0.3)
ax.fill_between(grid, 0, y2, color=other_color, alpha=0.3)

# Medians, drawn in the colour of the group they summarise
ax.axvline(old_poor_median, color=old_poor_color, linestyle='--', linewidth=2,
           label=f'Old Poor Median: ${old_poor_median:,.0f}')
ax.axvline(other_median, color=other_color, linestyle='--', linewidth=2,
           label=f'Other Median: ${other_median:,.0f}')

# Format axes: dollars on x, whole-number counts on y
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f'${x:,.0f}'))
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f'{int(round(y)):,}'))
ax.set_xlim(x_min, x_max)
ax.set_xlabel('House Price')
ax.set_ylabel(f'Number of houses per ${bin_width:,.0f}')
ax.set_title('Price distribution (scaled to counts)')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# "Old and in poor condition" (condition below 3 AND built before 1990) versus
# every row that fails at least one of those tests, plus each group's median.
old_poor_condition_houses = df[(df['condition'] < 3) & (df['yr_built'] < 1990)]
other_houses = df[(df['condition'] >= 3) | (df['yr_built'] >= 1990)]
old_poor_median = old_poor_condition_houses['price'].median()
other_median = other_houses['price'].median()

# Both histograms must share bin edges. With bins=50 alone, each call spans
# only its own data range, so the two groups get different bin widths and
# their bar heights (counts per bin) are not comparable.
price_range = (df['price'].min(), df['price'].max())

fig, ax = plt.subplots(figsize=(10, 6))

# Histograms with counts; log_scale=(False, True) keeps x linear and puts the
# count axis on a log scale.
sns.histplot(
    old_poor_condition_houses['price'],
    bins=50, binrange=price_range,
    color='red', alpha=0.3,
    label='Poor Condition & Built < 1990',
    stat='count',
    log_scale=(False, True),
    ax=ax,
)
sns.histplot(
    other_houses['price'],
    bins=50, binrange=price_range,
    color='blue', alpha=0.3,
    label='Other Houses',
    stat='count',
    log_scale=(False, True),
    ax=ax,
)

# Vertical median lines, labelled so the values show up in the legend
ax.axvline(old_poor_median, color='red', linestyle='--', linewidth=2,
           label=f'Old Poor Median: ${old_poor_median:,.0f}')
ax.axvline(other_median, color='blue', linestyle='--', linewidth=2,
           label=f'Other Median: ${other_median:,.0f}')

# Format x-axis to show dollars. No x-limit is applied in this cell, so the
# full price range stays visible.
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Labels and legend; the y label states the log scale so bar heights are not
# misread as linear.
ax.set_xlabel('House Price')
ax.set_ylabel('Number of Houses (log scale)')
ax.set_title('Price Distribution: Old Poor Condition Houses vs Other Houses')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# "Old and in poor condition" (condition below 3 AND built before 1990) versus
# every row that fails at least one of those tests, plus each group's median.
old_poor_condition_houses = df[(df['condition'] < 3) & (df['yr_built'] < 1990)]
other_houses = df[(df['condition'] >= 3) | (df['yr_built'] >= 1990)]
old_poor_median = old_poor_condition_houses['price'].median()
other_median = other_houses['price'].median()

# Both histograms must share bin edges. With bins=50 alone, each call spans
# only its own data range, so the two groups get different bin widths and
# their bar heights (counts per bin) are not comparable.
price_range = (df['price'].min(), df['price'].max())

fig, ax = plt.subplots(figsize=(10, 6))

# Histograms with counts; log_scale=(False, True) keeps x linear and puts the
# count axis on a log scale.
sns.histplot(
    old_poor_condition_houses['price'],
    bins=50, binrange=price_range,
    color='red', alpha=0.3,
    label='Poor Condition & Built < 1990',
    stat='count',
    log_scale=(False, True),
    ax=ax,
)
sns.histplot(
    other_houses['price'],
    bins=50, binrange=price_range,
    color='blue', alpha=0.3,
    label='Other Houses',
    stat='count',
    log_scale=(False, True),
    ax=ax,
)

# Median lines, labelled so the values show up in the legend
ax.axvline(old_poor_median, color='red', linestyle='--', linewidth=2,
           label=f'Old Poor Median: ${old_poor_median:,.0f}')
ax.axvline(other_median, color='blue', linestyle='--', linewidth=2,
           label=f'Other Median: ${other_median:,.0f}')

# Format x-axis as dollars
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Limit x-axis; bars for prices above the limit are computed but not shown
ax.set_xlim(0, 3_000_000)

ax.set_xlabel('House Price')
ax.set_ylabel('Number of Houses (log scale)')
ax.set_title('Price Distribution: Old Poor Condition Houses vs Other Houses')
ax.legend()
fig.tight_layout()
plt.show()