In [2]:
import sqlite3
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report


In [3]:
DB_PATH = "../data/housing.db"

# Load the modelling table from the SQLite database.
# The connection is released in `finally` so that a failing query (missing
# table, wrong path, ...) cannot leave the database handle open.
conn = sqlite3.connect(DB_PATH)
try:
    # One row per block: coordinates come from `block`, the census statistics
    # from `block_housing_stats`, and the two lookup tables are resolved to
    # their text labels (`ocean_proximity`, `price_class`).
    housing = pd.read_sql_query(
        """
        SELECT
            b.longitude,
            b.latitude,
            s.housing_median_age,
            s.total_rooms,
            s.total_bedrooms,
            s.population,
            s.households,
            s.median_income,
            op.name AS ocean_proximity,
            pc.label AS price_class
        FROM block b
        JOIN block_housing_stats s
            ON s.block_id = b.block_id
        JOIN ocean_proximity op
            ON op.ocean_proximity_id = b.ocean_proximity_id
        JOIN price_class pc
            ON pc.price_class_id = s.price_class_id
        """,
        conn,
    )
finally:
    conn.close()

housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,price_class
0,-122.23,37.88,41.0,880,129.0,322,126,8.3252,NEAR BAY,HIGH
1,-122.22,37.86,21.0,7099,1106.0,2401,1138,8.3014,NEAR BAY,HIGH
2,-122.24,37.85,52.0,1467,190.0,496,177,7.2574,NEAR BAY,HIGH
3,-122.25,37.85,52.0,1274,235.0,558,219,5.6431,NEAR BAY,HIGH
4,-122.25,37.85,52.0,1627,280.0,565,259,3.8462,NEAR BAY,HIGH


In [4]:
# Separate the target label from the feature columns.
y = housing["price_class"]
X = housing.drop(columns=[y.name])

# Sanity check: preview of the features plus the class counts of the target.
(X.head(), y.value_counts())


(   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
 0    -122.23     37.88                41.0          880           129.0   
 1    -122.22     37.86                21.0         7099          1106.0   
 2    -122.24     37.85                52.0         1467           190.0   
 3    -122.25     37.85                52.0         1274           235.0   
 4    -122.25     37.85                52.0         1627           280.0   
 
    population  households  median_income ocean_proximity  
 0         322         126         8.3252        NEAR BAY  
 1        2401        1138         8.3014        NEAR BAY  
 2         496         177         7.2574        NEAR BAY  
 3         558         219         5.6431        NEAR BAY  
 4         565         259         3.8462        NEAR BAY  ,
 price_class
 LOW       6884
 HIGH      6880
 MEDIUM    6876
 Name: count, dtype: int64)

In [5]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits, and the seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# Show the class proportions of each split to confirm the stratification.
print("Train distribution:", y_train.value_counts(normalize=True), sep="\n")
print("\nTest distribution:", y_test.value_counts(normalize=True), sep="\n")


Train distribution:
price_class
LOW       0.333515
HIGH      0.333333
MEDIUM    0.333152
Name: proportion, dtype: float64

Test distribution:
price_class
LOW       0.333576
HIGH      0.333333
MEDIUM    0.333091
Name: proportion, dtype: float64


In [6]:
# Columns are routed to a preprocessing branch by name.
cat_features = ["ocean_proximity"]
num_features = [
    "longitude", "latitude", "housing_median_age", "total_rooms",
    "total_bedrooms", "population", "households", "median_income",
]

# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode. `handle_unknown="ignore"` keeps categories unseen during fit from
# raising at transform time.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Combine both branches into a single transformer shared by the pipelines below.
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
])


In [7]:
# Experiment 1: logistic regression on top of the shared preprocessing.
# max_iter is raised to 1000 to give the solver more iterations to converge.
logreg_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(n_jobs=-1, max_iter=1000)),
])


In [8]:
# Cross-validated macro-F1 (cv=3) computed on the training split only, so the
# test split stays untouched until the final check.
cv_f1 = cross_val_score(
    estimator=logreg_pipeline,
    X=X_train,
    y=y_train,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)

print("CV F1 scores:", cv_f1)
print("Mean CV F1:", cv_f1.mean())


CV F1 scores: [0.73097169 0.72664139 0.72840427]
Mean CV F1: 0.7286724507712856


In [9]:
# Fit on the training split, then score the held-out test split.
y_pred = logreg_pipeline.fit(X_train, y_train).predict(X_test)
test_f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")

print("Test F1-score:", test_f1)
print("\nClassification Report:\n")
print(classification_report(y_true=y_test, y_pred=y_pred))


Test F1-score: 0.7179625206645565

Classification Report:

              precision    recall  f1-score   support

        HIGH       0.77      0.77      0.77      1376
         LOW       0.76      0.79      0.78      1377
      MEDIUM       0.62      0.59      0.61      1375

    accuracy                           0.72      4128
   macro avg       0.72      0.72      0.72      4128
weighted avg       0.72      0.72      0.72      4128





In [10]:
## Experiment #2: Ridge Classifier

In [11]:
from sklearn.linear_model import RidgeClassifier


In [12]:
# Ridge classifier with default settings, reusing the shared preprocessing.
ridge_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", RidgeClassifier()),
])



In [13]:
# Cross-validated macro-F1 (cv=3) for the ridge pipeline on the training split.
cv_f1_ridge = cross_val_score(
    estimator=ridge_pipeline,
    X=X_train,
    y=y_train,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)

print("Ridge CV F1 scores:", cv_f1_ridge)
print("Mean Ridge CV F1:", cv_f1_ridge.mean())


Ridge CV F1 scores: [0.6838389  0.67151143 0.67673838]
Mean Ridge CV F1: 0.6773629008992551


In [14]:
# Fit the ridge pipeline on the training split and evaluate on the test split.
y_pred_ridge = ridge_pipeline.fit(X_train, y_train).predict(X_test)
test_f1_ridge = f1_score(y_true=y_test, y_pred=y_pred_ridge, average="macro")

print("Ridge Test F1-score:", test_f1_ridge)
print("\nClassification Report:\n")
print(classification_report(y_true=y_test, y_pred=y_pred_ridge))


Ridge Test F1-score: 0.6785025887598826

Classification Report:

              precision    recall  f1-score   support

        HIGH       0.71      0.82      0.76      1376
         LOW       0.74      0.75      0.74      1377
      MEDIUM       0.59      0.49      0.53      1375

    accuracy                           0.69      4128
   macro avg       0.68      0.69      0.68      4128
weighted avg       0.68      0.69      0.68      4128



In [15]:
## Experiment #3: Gradient Boosting Classifier (no PCA)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier


In [17]:
# Gradient boosting with default hyper-parameters; the seed keeps runs repeatable.
gb_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", GradientBoostingClassifier(random_state=42)),
])


In [18]:
# Cross-validated macro-F1 (cv=3) for gradient boosting on the training split.
cv_f1_gb = cross_val_score(
    estimator=gb_pipeline,
    X=X_train,
    y=y_train,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)

print("Gradient Boosting CV F1 scores:", cv_f1_gb)
print("Mean GB CV F1:", cv_f1_gb.mean())


Gradient Boosting CV F1 scores: [0.78491854 0.77707604 0.78046359]
Mean GB CV F1: 0.780819387262988


In [19]:
# Fit gradient boosting on the training split and evaluate on the test split.
y_pred_gb = gb_pipeline.fit(X_train, y_train).predict(X_test)
test_f1_gb = f1_score(y_true=y_test, y_pred=y_pred_gb, average="macro")

print("Gradient Boosting Test F1-score:", test_f1_gb)
print("\nClassification Report:\n")
print(classification_report(y_true=y_test, y_pred=y_pred_gb))


Gradient Boosting Test F1-score: 0.7814562958266755

Classification Report:

              precision    recall  f1-score   support

        HIGH       0.86      0.79      0.83      1376
         LOW       0.80      0.84      0.82      1377
      MEDIUM       0.68      0.71      0.70      1375

    accuracy                           0.78      4128
   macro avg       0.78      0.78      0.78      4128
weighted avg       0.78      0.78      0.78      4128



In [20]:
# Encode labels
from sklearn.preprocessing import LabelEncoder

# The encoder is fitted on the training labels only; the test labels are then
# converted with that same mapping. The encoded targets are used by the
# XGBoost cells further down.
label_encoder = LabelEncoder()

y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# A label's position in `classes_` is the integer it is encoded as.
print("Class mapping:")
for idx, cls in enumerate(label_encoder.classes_):
    print(cls, "->", idx)


Class mapping:
HIGH -> 0
LOW -> 1
MEDIUM -> 2


In [1]:
##Experiment #4 — XGBoost (NO PCA)
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

In [34]:
# XGBoost classifier behind the shared preprocessing; targets must be the
# integer-encoded labels (see the label-encoding cell).
xgb_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        (
            "classifier",
            XGBClassifier(
                objective="multi:softmax",
                num_class=3,
                eval_metric="mlogloss",
                # Histogram-based tree construction. The former
                # `predictor="cpu_predictor"` argument was removed: the
                # installed XGBoost reported it as unused
                # ('Parameters: { "predictor" } are not used.'), so it only
                # produced a warning on every fit.
                tree_method="hist",
                n_estimators=300,
                max_depth=6,
                learning_rate=0.1,
                # Row / column subsampling per tree.
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
            ),
        ),
    ]
)


In [35]:
from sklearn.model_selection import cross_val_score

# Cross-validated macro-F1 (cv=3) for XGBoost; note the integer-encoded
# training labels are used as the target here.
cv_f1_xgb = cross_val_score(
    estimator=xgb_pipeline,
    X=X_train,
    y=y_train_enc,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)

print("XGBoost CV F1 scores:", cv_f1_xgb)
print("Mean XGB CV F1:", cv_f1_xgb.mean())


XGBoost CV F1 scores: [0.82359756 0.81091066 0.81498311]
Mean XGB CV F1: 0.8164971095483096


In [36]:
from sklearn.metrics import f1_score, classification_report

# Fit on the integer-encoded training labels, then predict encoded classes
# for the test split.
y_pred_xgb_enc = xgb_pipeline.fit(X_train, y_train_enc).predict(X_test)

# Macro-F1 is computed in the encoded label space.
test_f1_xgb = f1_score(y_true=y_test_enc, y_pred=y_pred_xgb_enc, average="macro")
print("XGBoost Test F1-score:", test_f1_xgb)

# Convert the integer predictions back to the original string labels so the
# report rows are labelled with class names.
y_pred_xgb = label_encoder.inverse_transform(y_pred_xgb_enc)

print("\nClassification Report:\n")
print(classification_report(y_true=y_test, y_pred=y_pred_xgb))


Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Test F1-score: 0.823594438031543

Classification Report:

              precision    recall  f1-score   support

        HIGH       0.89      0.83      0.86      1376
         LOW       0.84      0.88      0.86      1377
      MEDIUM       0.74      0.76      0.75      1375

    accuracy                           0.82      4128
   macro avg       0.83      0.82      0.82      4128
weighted avg       0.83      0.82      0.82      4128



In [24]:
## Experiment #5: LightGBM Classifier (no PCA)

In [25]:
from sklearn.preprocessing import LabelEncoder

# Re-create the encoder, this time fitted on the complete label column, and
# encode every label with it. This rebinds `label_encoder` from the earlier cell.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

print(label_encoder.classes_)


['HIGH' 'LOW' 'MEDIUM']


In [26]:
from sklearn.model_selection import train_test_split

# Same 80/20 stratified split with the same seed as before, but carrying the
# integer-encoded target. This rebinds X_train / X_test.
X_train, X_test, y_encoded_train, y_encoded_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)


In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Column groups are derived from the dtypes of X here instead of being listed
# by name. This rebinds `num_features`, `cat_features` and `preprocessor`.
cat_features = X.select_dtypes(include=["object"]).columns
num_features = X.select_dtypes(include=["float64", "int64"]).columns

# Numeric branch: median imputation followed by standardisation.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: most-frequent imputation followed by one-hot encoding.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_features),
        ("cat", categorical_pipeline, cat_features),
    ]
)


In [28]:
from lightgbm import LGBMClassifier

# LightGBM multiclass classifier on top of the dtype-based preprocessor.
lgbm_classifier_params = dict(
    objective="multiclass",
    num_class=3,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    random_state=42,
)

lgbm_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", LGBMClassifier(**lgbm_classifier_params)),
    ]
)


In [29]:
from sklearn.model_selection import cross_val_score

# Cross-validated macro-F1 (cv=3) for LightGBM on the encoded training labels.
cv_f1_lgbm = cross_val_score(
    estimator=lgbm_pipeline,
    X=X_train,
    y=y_encoded_train,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)

print("LightGBM CV F1 scores:", cv_f1_lgbm)
print("Mean CV F1:", cv_f1_lgbm.mean())


LightGBM CV F1 scores: [0.82282669 0.81043979 0.81232553]
Mean CV F1: 0.815197337499351


In [30]:
### Train the final XGBoost model on the full training set


# The final model must be trained once using:

# - the best-performing algorithm

# - the full training data

In [31]:
# Refit the XGBoost pipeline on the whole training split (X_train and
# y_encoded_train come from the second, seed-identical split) and predict
# encoded classes for the test split in one chained call.
y_pred_xgb = xgb_pipeline.fit(X_train, y_encoded_train).predict(X_test)


In [32]:
## Final evaluation
from sklearn.metrics import classification_report, f1_score

# Macro-F1 of the refitted model, computed on the encoded test labels.
print(
    "FINAL XGBoost Test F1-score:",
    f1_score(y_true=y_encoded_test, y_pred=y_pred_xgb, average="macro"),
)

# `target_names` replaces the integer classes with the encoder's string
# labels in the report rows.
print(
    "\nClassification Report:",
    classification_report(
        y_true=y_encoded_test,
        y_pred=y_pred_xgb,
        target_names=label_encoder.classes_,
    ),
    sep="\n",
)


FINAL XGBoost Test F1-score: 0.823594438031543

Classification Report:
              precision    recall  f1-score   support

        HIGH       0.89      0.83      0.86      1376
         LOW       0.84      0.88      0.86      1377
      MEDIUM       0.74      0.76      0.75      1375

    accuracy                           0.82      4128
   macro avg       0.83      0.82      0.82      4128
weighted avg       0.83      0.82      0.82      4128



In [33]:
import joblib
from pathlib import Path

# Output directory for serialized artifacts. `parents=True` also creates any
# missing intermediate directories, so the cell does not fail when the parent
# directory does not exist yet; `exist_ok=True` makes re-running it harmless.
MODELS_DIR = Path("../models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Persist the fitted pipeline (preprocessing + classifier) as a single object.
joblib.dump(xgb_pipeline, MODELS_DIR / "final_xgb_classifier.pkl")

# Persist the label encoder too: the pipeline predicts integer classes and
# the encoder is what maps them back to the string labels.
joblib.dump(label_encoder, MODELS_DIR / "label_encoder.pkl")

print("✅ Final XGBoost model and label encoder saved!")


✅ Final XGBoost model and label encoder saved!
