In [20]:
base_folder = "R:\\Downloads\\housing_app_fall25-main\\housing_app_fall25-main"
%cd "{base_folder}"

R:\Downloads\housing_app_fall25-main\housing_app_fall25-main


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [21]:
import sqlite3
import pandas as pd

conn = sqlite3.connect(f"{base_folder}/data/titanic.db")

titanic = pd.read_sql_query(
    """
    SELECT
        p.passenger_id,
        p.Pclass,
        p.Age,
        p.Fare,
        ps.SibSp,
        ps.Parch,
        ps.Survived,
        s.name AS sex
    FROM passenger AS p
    JOIN passenger_survival AS ps
        ON ps.passenger_id = p.passenger_id
    JOIN sex AS s
        ON s.sex_id = p.sex_id
    ORDER BY p.passenger_id
    """,
    conn,
)

conn.close()

titanic.head()


Unnamed: 0,passenger_id,Pclass,Age,Fare,SibSp,Parch,Survived,sex
0,0,3,22.0,7.25,1,0,0,male
1,1,1,38.0,71.2833,1,0,1,female
2,2,3,26.0,7.925,0,0,1,female
3,3,1,35.0,53.1,1,0,1,female
4,4,3,35.0,8.05,0,0,0,male


In [22]:
# =============================================================================
# ANALYZE TITANIC DATA FOR STREAMLIT APP (CLASSIFICATION)
# Find min/max/mean/median for numerical features
# Find unique values for categorical features
# =============================================================================

import json

print("=" * 80)
print("ANALYZING TITANIC DATA FOR STREAMLIT APP")
print("=" * 80)

# ------------------------------------------------------------------
# Define features used for prediction (MUST match training pipeline)
# ------------------------------------------------------------------

numerical_features = [
    "Pclass",
    "Age",
    "Fare",
    "SibSp",
    "Parch",
]

categorical_features = ["sex"]

# Create schema dictionary
data_schema = {
    "numerical": {},
    "categorical": {}
}

# ------------------------------------------------------------------
# Analyze numerical features
# ------------------------------------------------------------------

print("\n" + "-" * 80)
print("NUMERICAL FEATURES")
print("-" * 80)
print(f"{'Feature':<20} {'Min':<10} {'Max':<10} {'Mean':<10} {'Median':<10}")
print("-" * 80)

for feature in numerical_features:
    min_val = float(titanic[feature].min())
    max_val = float(titanic[feature].max())
    mean_val = float(titanic[feature].mean())
    median_val = float(titanic[feature].median())

    data_schema["numerical"][feature] = {
        "min": min_val,
        "max": max_val,
        "mean": mean_val,
        "median": median_val
    }

    print(
        f"{feature:<20} "
        f"{min_val:<10.2f} "
        f"{max_val:<10.2f} "
        f"{mean_val:<10.2f} "
        f"{median_val:<10.2f}"
    )

# ------------------------------------------------------------------
# Analyze categorical features
# ------------------------------------------------------------------

print("\n" + "-" * 80)
print("CATEGORICAL FEATURES")
print("-" * 80)

for feature in categorical_features:
    unique_values = sorted(titanic[feature].dropna().unique().tolist())
    value_counts = titanic[feature].value_counts().to_dict()

    data_schema["categorical"][feature] = {
        "unique_values": unique_values,
        "value_counts": value_counts
    }

    print(f"\n{feature}:")
    print(f"  Unique values: {unique_values}")
    print("  Value counts:")
    for value, count in value_counts.items():
        percent = count / len(titanic) * 100
        print(f"    {value}: {count} ({percent:.1f}%)")

# ------------------------------------------------------------------
# Save schema to JSON for Streamlit
# ------------------------------------------------------------------

output_file = f"{base_folder}/data/data_schema.json"
with open(output_file, "w") as f:
    json.dump(data_schema, f, indent=2)

print("\n" + "=" * 80)
print(f"✓ Data schema saved to {output_file}")
print("=" * 80)

# ------------------------------------------------------------------
# Display JSON structure
# ------------------------------------------------------------------

print("\n" + "-" * 80)
print("GENERATED SCHEMA (data_schema.json)")
print("-" * 80)
print(json.dumps(data_schema, indent=2))

print("\n" + "=" * 80)
print("DONE! Use data_schema.json in your Streamlit app")
print("=" * 80)


ANALYZING TITANIC DATA FOR STREAMLIT APP

--------------------------------------------------------------------------------
NUMERICAL FEATURES
--------------------------------------------------------------------------------
Feature              Min        Max        Mean       Median    
--------------------------------------------------------------------------------
Pclass               1.00       3.00       2.31       3.00      
Age                  0.42       80.00      29.70      28.00     
Fare                 0.00       512.33     32.20      14.45     
SibSp                0.00       8.00       0.52       0.00      
Parch                0.00       6.00       0.38       0.00      

--------------------------------------------------------------------------------
CATEGORICAL FEATURES
--------------------------------------------------------------------------------

sex:
  Unique values: ['female', 'male']
  Value counts:
    male: 577 (64.8%)
    female: 314 (35.2%)

✓ Data schema sav