In [1]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split

# ------------------------------------------------------------------
# Load data from SQLite using JOIN (reuse STEP 3 logic)
# ------------------------------------------------------------------
# Builds one flat frame: coordinates from `block`, the numeric statistics
# from `block_housing_stats`, and the two lookup tables (`ocean_proximity`,
# `price_class`) resolved to their text columns via inner joins.
housing_query = """
    SELECT
        b.longitude,
        b.latitude,
        s.housing_median_age,
        s.total_rooms,
        s.total_bedrooms,
        s.population,
        s.households,
        s.median_income,
        op.name AS ocean_proximity,
        pc.label AS price_class
    FROM block b
    JOIN block_housing_stats s
        ON s.block_id = b.block_id
    JOIN ocean_proximity op
        ON op.ocean_proximity_id = b.ocean_proximity_id
    JOIN price_class pc
        ON pc.price_class_id = s.price_class_id
    """

conn = sqlite3.connect("../data/housing.db")
try:
    housing = pd.read_sql_query(housing_query, conn)
finally:
    # Release the database handle even when the query raises; otherwise the
    # open connection would linger in the kernel until it is restarted.
    conn.close()

print("Dataset shape:", housing.shape)
housing.head()


Dataset shape: (20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,price_class
0,-122.23,37.88,41.0,880,129.0,322,126,8.3252,NEAR BAY,HIGH
1,-122.22,37.86,21.0,7099,1106.0,2401,1138,8.3014,NEAR BAY,HIGH
2,-122.24,37.85,52.0,1467,190.0,496,177,7.2574,NEAR BAY,HIGH
3,-122.25,37.85,52.0,1274,235.0,558,219,5.6431,NEAR BAY,HIGH
4,-122.25,37.85,52.0,1627,280.0,565,259,3.8462,NEAR BAY,HIGH


In [2]:
# Fraction of rows falling in each price class across the whole dataset.
price_class_shares = housing["price_class"].value_counts(normalize=True)
price_class_shares


price_class
LOW       0.333527
HIGH      0.333333
MEDIUM    0.333140
Name: proportion, dtype: float64

In [3]:
# Separate the target column from the predictor columns.
y = housing["price_class"]
X = housing.drop(columns=["price_class"])

# Hold out 20% of the rows for testing. Stratifying on y asks the splitter to
# keep the class proportions close to equal in both parts, and the fixed
# random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

for caption, features in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(caption, features.shape)


Train shape: (16512, 9)
Test shape: (4128, 9)


In [4]:
# Print the class proportions of each label set, train first and then test,
# so the two distributions can be compared side by side.
for heading, labels in (
    ("Train distribution:", y_train),
    ("\nTest distribution:", y_test),
):
    print(heading)
    print(labels.value_counts(normalize=True))


Train distribution:
price_class
LOW       0.333515
HIGH      0.333333
MEDIUM    0.333152
Name: proportion, dtype: float64

Test distribution:
price_class
LOW       0.333576
HIGH      0.333333
MEDIUM    0.333091
Name: proportion, dtype: float64


In [5]:
# Sanity-check the split before anything downstream relies on it:
# every row of `housing` lands in exactly one part, and each feature frame
# is row-aligned with its label series.
assert len(X_train) + len(X_test) == len(housing), "split lost or duplicated rows"
assert X_train.index.equals(y_train.index), "train features and labels are misaligned"
assert X_test.index.equals(y_test.index), "test features and labels are misaligned"
assert X_train.index.intersection(X_test.index).empty, "train and test rows overlap"

X_train.shape, X_test.shape


((16512, 9), (4128, 9))

In [6]:
# Class proportions of the training labels, displayed as the cell result.
train_class_shares = y_train.value_counts(normalize=True)
train_class_shares


price_class
LOW       0.333515
HIGH      0.333333
MEDIUM    0.333152
Name: proportion, dtype: float64