In [1]:
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [1]:
import sqlite3
import pandas as pd

conn = sqlite3.connect("../data/diabetes.db")

df = pd.read_sql("""
SELECT 
    p.pregnancies,
    p.glucose,
    p.blood_pressure,
    p.skin_thickness,
    p.insulin,
    p.bmi,
    p.diabetes_pedigree,
    p.age,
    l.outcome
FROM patients p
JOIN labels l
ON p.patient_id = l.patient_id
""", conn)

conn.close()
df.head()


Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree,age,outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

X = df.drop("outcome", axis=1)
y = df["outcome"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA (keep 95% variance)
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("PCA components:", pca.n_components_)


PCA components: 7


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import joblib
import os

os.makedirs("../models", exist_ok=True)

models = {
    "logistic_regression": LogisticRegression(max_iter=1000),
    "random_forest": RandomForestClassifier(random_state=42),
    "svm": SVC(),
    "gradient_boosting": GradientBoostingClassifier(random_state=42)
}

for name, model in models.items():
    model.fit(X_train_pca, y_train)
    preds = model.predict(X_test_pca)
    f1 = f1_score(y_test, preds)

    joblib.dump(model, f"../models/{name}_pca_no_optuna.pkl")

    print(f"{name} PCA F1-score: {f1:.4f}")


logistic_regression PCA F1-score: 0.5253
random_forest PCA F1-score: 0.5741
svm PCA F1-score: 0.6061
gradient_boosting PCA F1-score: 0.5607


In [4]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import joblib

def objective_lr_pca(trial):
    C = trial.suggest_float("C", 0.01, 10, log=True)
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train_pca, y_train)
    preds = model.predict(X_test_pca)
    return f1_score(y_test, preds)

study_lr_pca = optuna.create_study(direction="maximize")
study_lr_pca.optimize(objective_lr_pca, n_trials=20)

best_lr_pca = LogisticRegression(**study_lr_pca.best_params, max_iter=1000)
best_lr_pca.fit(X_train_pca, y_train)

joblib.dump(best_lr_pca, "../models/logistic_regression_pca_optuna.pkl")
print("LR PCA Optuna F1:", study_lr_pca.best_value)


[I 2025-12-17 21:36:31,016] A new study created in memory with name: no-name-4306a209-fee8-4294-bec2-962b57126d12
[I 2025-12-17 21:36:31,021] Trial 0 finished with value: 0.5252525252525253 and parameters: {'C': 0.25344970647445136}. Best is trial 0 with value: 0.5252525252525253.
[I 2025-12-17 21:36:31,025] Trial 1 finished with value: 0.54 and parameters: {'C': 0.028216764248054117}. Best is trial 1 with value: 0.54.
[I 2025-12-17 21:36:31,031] Trial 2 finished with value: 0.5252525252525253 and parameters: {'C': 0.17257833705385445}. Best is trial 1 with value: 0.54.
[I 2025-12-17 21:36:31,038] Trial 3 finished with value: 0.5454545454545454 and parameters: {'C': 6.817457324863747}. Best is trial 3 with value: 0.5454545454545454.
[I 2025-12-17 21:36:31,043] Trial 4 finished with value: 0.5252525252525253 and parameters: {'C': 0.784241845270263}. Best is trial 3 with value: 0.5454545454545454.
[I 2025-12-17 21:36:31,047] Trial 5 finished with value: 0.5252525252525253 and parameters:

LR PCA Optuna F1: 0.574468085106383


In [5]:
from sklearn.ensemble import RandomForestClassifier

def objective_rf_pca(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 2, 20)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )
    model.fit(X_train_pca, y_train)
    preds = model.predict(X_test_pca)
    return f1_score(y_test, preds)

study_rf_pca = optuna.create_study(direction="maximize")
study_rf_pca.optimize(objective_rf_pca, n_trials=20)

best_rf_pca = RandomForestClassifier(**study_rf_pca.best_params, random_state=42)
best_rf_pca.fit(X_train_pca, y_train)

joblib.dump(best_rf_pca, "../models/random_forest_pca_optuna.pkl")
print("RF PCA Optuna F1:", study_rf_pca.best_value)


[I 2025-12-17 21:36:39,430] A new study created in memory with name: no-name-e739a9a8-6c2c-4505-b987-689a595abfd6
[I 2025-12-17 21:36:39,684] Trial 0 finished with value: 0.5688073394495413 and parameters: {'n_estimators': 170, 'max_depth': 11}. Best is trial 0 with value: 0.5688073394495413.
[I 2025-12-17 21:36:39,982] Trial 1 finished with value: 0.5607476635514018 and parameters: {'n_estimators': 206, 'max_depth': 14}. Best is trial 0 with value: 0.5688073394495413.
[I 2025-12-17 21:36:40,082] Trial 2 finished with value: 0.5925925925925926 and parameters: {'n_estimators': 70, 'max_depth': 16}. Best is trial 2 with value: 0.5925925925925926.
[I 2025-12-17 21:36:40,252] Trial 3 finished with value: 0.5607476635514018 and parameters: {'n_estimators': 133, 'max_depth': 11}. Best is trial 2 with value: 0.5925925925925926.
[I 2025-12-17 21:36:40,461] Trial 4 finished with value: 0.5794392523364486 and parameters: {'n_estimators': 156, 'max_depth': 20}. Best is trial 2 with value: 0.59259

RF PCA Optuna F1: 0.6


In [6]:
from sklearn.svm import SVC

def objective_svm_pca(trial):
    C = trial.suggest_float("C", 0.01, 10, log=True)
    gamma = trial.suggest_float("gamma", 0.0001, 1, log=True)

    model = SVC(C=C, gamma=gamma)
    model.fit(X_train_pca, y_train)
    preds = model.predict(X_test_pca)
    return f1_score(y_test, preds)

study_svm_pca = optuna.create_study(direction="maximize")
study_svm_pca.optimize(objective_svm_pca, n_trials=20)

best_svm_pca = SVC(**study_svm_pca.best_params)
best_svm_pca.fit(X_train_pca, y_train)

joblib.dump(best_svm_pca, "../models/svm_pca_optuna.pkl")
print("SVM PCA Optuna F1:", study_svm_pca.best_value)


[I 2025-12-17 21:36:51,084] A new study created in memory with name: no-name-49b404e4-00fa-4a65-9343-b3d952450ccb
[I 2025-12-17 21:36:51,098] Trial 0 finished with value: 0.5531914893617021 and parameters: {'C': 0.5435100919335675, 'gamma': 0.28666318608546726}. Best is trial 0 with value: 0.5531914893617021.
[I 2025-12-17 21:36:51,110] Trial 1 finished with value: 0.0 and parameters: {'C': 0.02588526566885614, 'gamma': 0.14532782448349807}. Best is trial 0 with value: 0.5531914893617021.
[I 2025-12-17 21:36:51,122] Trial 2 finished with value: 0.0 and parameters: {'C': 1.0033531007475904, 'gamma': 0.0001355448817862556}. Best is trial 0 with value: 0.5531914893617021.
[I 2025-12-17 21:36:51,134] Trial 3 finished with value: 0.0 and parameters: {'C': 0.06521335218236085, 'gamma': 0.0010364809848629752}. Best is trial 0 with value: 0.5531914893617021.
[I 2025-12-17 21:36:51,146] Trial 4 finished with value: 0.5416666666666666 and parameters: {'C': 5.189225730362656, 'gamma': 0.002384439

SVM PCA Optuna F1: 0.5806451612903226


In [7]:
from sklearn.ensemble import GradientBoostingClassifier

def objective_gb_pca(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)

    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=42
    )
    model.fit(X_train_pca, y_train)
    preds = model.predict(X_test_pca)
    return f1_score(y_test, preds)

study_gb_pca = optuna.create_study(direction="maximize")
study_gb_pca.optimize(objective_gb_pca, n_trials=20)

best_gb_pca = GradientBoostingClassifier(**study_gb_pca.best_params, random_state=42)
best_gb_pca.fit(X_train_pca, y_train)

joblib.dump(best_gb_pca, "../models/gradient_boosting_pca_optuna.pkl")
print("GB PCA Optuna F1:", study_gb_pca.best_value)


[I 2025-12-17 21:36:59,212] A new study created in memory with name: no-name-085c8d9b-8ddd-4ca0-9bb9-f533becbd2a7
[I 2025-12-17 21:36:59,421] Trial 0 finished with value: 0.5471698113207547 and parameters: {'n_estimators': 103, 'learning_rate': 0.2152914573415977}. Best is trial 0 with value: 0.5471698113207547.
[I 2025-12-17 21:36:59,629] Trial 1 finished with value: 0.5981308411214953 and parameters: {'n_estimators': 106, 'learning_rate': 0.2897647589119929}. Best is trial 1 with value: 0.5981308411214953.
[I 2025-12-17 21:36:59,985] Trial 2 finished with value: 0.5436893203883495 and parameters: {'n_estimators': 177, 'learning_rate': 0.2605995135713804}. Best is trial 1 with value: 0.5981308411214953.
[I 2025-12-17 21:37:00,248] Trial 3 finished with value: 0.5504587155963303 and parameters: {'n_estimators': 141, 'learning_rate': 0.025363875242219537}. Best is trial 1 with value: 0.5981308411214953.
[I 2025-12-17 21:37:00,379] Trial 4 finished with value: 0.5765765765765766 and para

GB PCA Optuna F1: 0.5981308411214953


In [2]:
base_folder = "/content/drive/MyDrive/Colab Notebooks/housing_fall2025"
%cd "{base_folder}"

/content/drive/MyDrive/Colab Notebooks/housing_fall2025


In [3]:
import sqlite3
import pandas as pd
conn = sqlite3.connect(f"{base_folder}/data/housing.db")
housing = pd.read_sql_query(
    """
    SELECT
        b.block_id,
        b.longitude,
        b.latitude,
        s.housing_median_age,
        s.total_rooms,
        s.total_bedrooms,
        s.population,
        s.households,
        s.median_income,
        s.median_house_value,
        op.name AS ocean_proximity
    FROM block AS b
    JOIN block_housing_stats AS s
        ON s.block_id = b.block_id
    JOIN ocean_proximity AS op
        ON op.ocean_proximity_id = b.ocean_proximity_id
    ORDER BY b.block_id
    """,
    conn,
)
conn.close()

housing.head()

Unnamed: 0,block_id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0,-122.23,37.88,41.0,880,129.0,322,126,8.3252,452600.0,NEAR BAY
1,1,-122.22,37.86,21.0,7099,1106.0,2401,1138,8.3014,358500.0,NEAR BAY
2,2,-122.24,37.85,52.0,1467,190.0,496,177,7.2574,352100.0,NEAR BAY
3,3,-122.25,37.85,52.0,1274,235.0,558,219,5.6431,341300.0,NEAR BAY
4,4,-122.25,37.85,52.0,1627,280.0,565,259,3.8462,342200.0,NEAR BAY


In [5]:
# =============================================================================
# ANALYZE HOUSING DATA FOR STREAMLIT APP
# Find min/max/median for numerical features and unique values for categorical features
# =============================================================================

import json

print("=" * 80)
print("ANALYZING HOUSING DATA FOR STREAMLIT APP")
print("=" * 80)

# Define the features we need for prediction
numerical_features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income'
]

categorical_features = ['ocean_proximity']

# Create schema dictionary
data_schema = {
    "numerical": {},
    "categorical": {}
}

# Analyze numerical features
print("\n" + "-" * 80)
print("NUMERICAL FEATURES")
print("-" * 80)
print(f"{'Feature':<25} {'Min':<15} {'Max':<15} {'Mean':<15} {'Median':<15}")
print("-" * 80)

for feature in numerical_features:
    min_val = float(housing[feature].min())
    max_val = float(housing[feature].max())
    mean_val = float(housing[feature].mean())
    median_val = float(housing[feature].median())

    data_schema["numerical"][feature] = {
        "min": min_val,
        "max": max_val,
        "mean": mean_val,
        "median": median_val
    }

    print(f"{feature:<25} {min_val:<15.2f} {max_val:<15.2f} {mean_val:<15.2f} {median_val:<15.2f}")

# Analyze categorical features
print("\n" + "-" * 80)
print("CATEGORICAL FEATURES")
print("-" * 80)

for feature in categorical_features:
    unique_values = housing[feature].unique().tolist()
    value_counts = housing[feature].value_counts().to_dict()

    data_schema["categorical"][feature] = {
        "unique_values": unique_values,
        "value_counts": value_counts
    }

    print(f"\n{feature}:")
    print(f"  Unique values: {unique_values}")
    print(f"  Value counts:")
    for value, count in value_counts.items():
        print(f"    {value}: {count} ({count/len(housing)*100:.1f}%)")

# Save schema to JSON file
output_file = f"{base_folder}/data/data_schema.json"
with open(output_file, 'w') as f:
    json.dump(data_schema, f, indent=2)

print("\n" + "=" * 80)
print(f"✓ Data schema saved to {output_file}")
print("=" * 80)

# Display the JSON structure
print("\n" + "-" * 80)
print("GENERATED SCHEMA (data_schema.json)")
print("-" * 80)
print(json.dumps(data_schema, indent=2))

print("\n" + "=" * 80)
print("DONE! Use data_schema.json in your Streamlit app")
print("=" * 80)

ANALYZING HOUSING DATA FOR STREAMLIT APP

--------------------------------------------------------------------------------
NUMERICAL FEATURES
--------------------------------------------------------------------------------
Feature                   Min             Max             Mean            Median         
--------------------------------------------------------------------------------
longitude                 -124.35         -114.31         -119.57         -118.49        
latitude                  32.54           41.95           35.63           34.26          
housing_median_age        1.00            52.00           28.64           29.00          
total_rooms               2.00            39320.00        2635.76         2127.00        
total_bedrooms            1.00            6445.00         537.87          435.00         
population                3.00            35682.00        1425.48         1166.00        
households                1.00            6082.00         499.54  