In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
DATA_DIR = PROJECT_ROOT / "data" / "processed"
OUT_DIR = DATA_DIR

df = pd.read_parquet(DATA_DIR / "asthma_clean.parquet",  engine="fastparquet")  # from Problem 3/4

#### Define feature groups

The first step in feature manipulation is to make explicit and reproducible groups of features. This matters because:

- It keeps preprocessing consistent across notebooks and scripts.

- It makes feature engineering easier later (e.g., scaling continuous variables only, or encoding categorical variables).

In this dataset, I structured features into clear groups:

- Demographics: age, gender, ethnicity, education_level

- Lifestyle factors: bmi, smoking, physical_activity, diet_quality, sleep_quality

- Environmental exposures: pollution_exposure, pollen_exposure, dust_exposure

- Medical history: pet_allergy, family_history_asthma, history_of_allergies, eczema, hay_fever, gastroesophageal_reflux

- Lung function: lung_function_fev1, lung_function_fvc, fev1_fvc_pct

- Symptoms: wheezing, shortness_of_breath, chest_tightness, coughing, nighttime_symptoms, exercise_induced

- Target: diagnosis

This grouping is not just bookkeeping — it helps us test hypotheses. For example:

Are lifestyle factors predictive independently of medical history?

Are lung function measures redundant with symptom reports?

In [3]:
target = "diagnosis"

# Identify columns by pattern
symptoms = [
    "wheezing","shortness_of_breath","chest_tightness",
    "coughing","nighttime_symptoms","exercise_induced"
]
atopy = ["eczema","hay_fever","history_of_allergies","pet_allergy"]
exposures = ["pollution_exposure","pollen_exposure","dust_exposure"]
lifestyle = ["physical_activity","diet_quality","sleep_quality"]

# Ethnicity dummies
ethnicity_cols = [c for c in df.columns if c.startswith("ethnicity_")]

# Core numerics I keep 
core_numeric = [
    "age","bmi","lung_function_fev1","fev1_fvc_pct",
    *lifestyle, *exposures
]

binary_other = [
    "gender","smoking","family_history_asthma","gastroesophageal_reflux",
    *atopy, *symptoms
]

all_inputs = core_numeric + binary_other + ethnicity_cols
X = df[all_inputs].copy()
y = df[target].astype(int).copy()

print("X shape:", X.shape, "| y positive rate:", y.mean().round(4))

X shape: (2392, 28) | y positive rate: 0.0518


In [4]:
# Symptom burden (0–6)
X["symptom_count"] = df[symptoms].sum(axis=1).astype(float)

# Atopy score (0–4): allergic phenotype burden
X["atopy_score"] = df[atopy].sum(axis=1).astype(float)

# Lifestyle index (0–10): average of three 0–10 scores
X["lifestyle_index"] = df[lifestyle].mean(axis=1)

# Exposure index (0–10): average of three 0–10 exposure scores
X["exposure_index"] = df[exposures].mean(axis=1)

# Simple interaction hypothesis: airflow + symptoms
X["ratio_x_symptoms"] = df["fev1_fvc_pct"] * X["symptom_count"]

# Optional: nonlinearity hint for BMI (centered quadratic)
X["bmi_c"] = df["bmi"] - df["bmi"].mean()
X["bmi_c2"] = X["bmi_c"] ** 2

print("Engineered columns added:",
      ["symptom_count","atopy_score","lifestyle_index","exposure_index","ratio_x_symptoms","bmi_c","bmi_c2"])

Engineered columns added: ['symptom_count', 'atopy_score', 'lifestyle_index', 'exposure_index', 'ratio_x_symptoms', 'bmi_c', 'bmi_c2']


Why these?

symptom_count: overall symptom burden often tracks diagnosis/severity.

atopy_score: captures allergic phenotype breadth.

lifestyle/exposure indices: reduce dimensionality of related 0–10 scales.

ratio_x_symptoms: hypothesis that obstruction signal is stronger when symptoms co-occur.

bmi_c2: cheap nonlinearity; obesity/asthma links aren’t always linear.

In [5]:
# Cast booleans/ints to float; keep dummies as 0/1 floats (models-friendly)
X_numeric = X.copy()
for c in X_numeric.columns:
    if X_numeric[c].dtype == "boolean":
        X_numeric[c] = X_numeric[c].astype(float)
    elif str(X_numeric[c].dtype).startswith("int") or X_numeric[c].dtype == bool:
        X_numeric[c] = X_numeric[c].astype(float)

# Sanity checks
print("Any NaNs in X?", X_numeric.isna().any().any())
print("dtypes:", X_numeric.dtypes.value_counts().to_dict())

Any NaNs in X? False
dtypes: {dtype('float64'): 35}


In [6]:
# Remove near-zero variance columns first (e.g., constants)
vt = VarianceThreshold(threshold=1e-8)
_ = vt.fit(X_numeric)
low_var_cols = [col for col, keep in zip(X_numeric.columns, vt.get_support()) if not keep]
print("Low-variance dropped:", low_var_cols)
X_screen = X_numeric.drop(columns=low_var_cols)

# Mutual information (nonlinear-friendly univariate signal)
mi = mutual_info_classif(X_screen, y, discrete_features=False, random_state=42)
mi_rank = (pd.DataFrame({"feature": X_screen.columns, "mi": mi})
           .sort_values("mi", ascending=False))
mi_rank.head(15)

Low-variance dropped: []


Unnamed: 0,feature,mi
30,lifestyle_index,0.012516
16,history_of_allergies,0.01018
28,symptom_count,0.007339
12,family_history_asthma,0.006643
24,ethnicity_african_american,0.006058
17,pet_allergy,0.005472
33,bmi_c,0.005352
1,bmi,0.005352
8,pollen_exposure,0.00524
19,shortness_of_breath,0.004015


I screened features using variance and mutual information. All features carried at least some variance, and none had zero univariate signal. Because asthma is multifactorial, I kept all features for now and will rely on modeling to down-weight weak predictors.

In [7]:
# Identify columns to scale (intersection with what's in X_screen)
scale_cols = list(set(core_numeric + [
    "symptom_count","atopy_score","lifestyle_index",
    "exposure_index","ratio_x_symptoms","bmi_c","bmi_c2"
]) & set(X_screen.columns))

# Pass-through = everything not scaled
passthrough_cols = [c for c in X_screen.columns if c not in scale_cols]

preproc = ColumnTransformer(
    transformers=[
        ("scale", StandardScaler(with_mean=True, with_std=True), scale_cols),
        ("keep", "passthrough", passthrough_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# Fit/transform to get the final numeric matrix
X_final = preproc.fit_transform(X_screen)
final_cols = list(preproc.get_feature_names_out())
X_final = pd.DataFrame(X_final, columns=final_cols)

print("Final X shape:", X_final.shape)
print("Scaled columns (first 8):", scale_cols[:8])

# Quick sanity check: standardized columns should have ~0 mean and ~1 std
means = X_final[scale_cols].mean().round(3)
stds  = X_final[scale_cols].std(ddof=0).round(3)
print("Means (should be ~0):\n", means.head())
print("Stds  (should be ~1):\n", stds.head())

Final X shape: (2392, 35)
Scaled columns (first 8): ['bmi_c2', 'diet_quality', 'dust_exposure', 'pollen_exposure', 'ratio_x_symptoms', 'atopy_score', 'physical_activity', 'lifestyle_index']
Means (should be ~0):
 bmi_c2              0.0
diet_quality       -0.0
dust_exposure       0.0
pollen_exposure     0.0
ratio_x_symptoms    0.0
dtype: float64
Stds  (should be ~1):
 bmi_c2              1.0
diet_quality        1.0
dust_exposure       1.0
pollen_exposure     1.0
ratio_x_symptoms    1.0
dtype: float64


I standardized continuous features (mean = 0, std = 1) while keeping binary/dummies as-is. This avoids scale dominance in models like logistic regression, SVMs, or regularized classifiers.

In [8]:
X_tr, X_te, y_tr, y_te = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)
print("Train:", X_tr.shape, "Test:", X_te.shape, "| y+ rate:", y_tr.mean().round(4), y_te.mean().round(4))

Train: (1913, 35) Test: (479, 35) | y+ rate: 0.0518 0.0522


I used a stratified 80/20 split, ensuring the asthma positive rate (~5%) is preserved across both sets. This prevents sampling bias and gives a reliable basis for model evaluation.

In [9]:
X_final.to_parquet(OUT_DIR / "asthma_features.parquet", index=False)
y.to_frame("diagnosis").to_parquet(OUT_DIR / "asthma_target.parquet", index=False)
print("Saved:\n -", OUT_DIR / "asthma_features.parquet", "\n -", OUT_DIR / "asthma_target.parquet")

Saved:
 - C:\Data Science\07.Data-Science-Project-Architecture-Lab\data\processed\asthma_features.parquet 
 - C:\Data Science\07.Data-Science-Project-Architecture-Lab\data\processed\asthma_target.parquet


I saved:

X_final.parquet → Features (numeric, standardized, engineered).

y.parquet → Target.

This avoids re-running preprocessing every time and ensures reproducibility.

In this step, I transformed the cleaned dataset into a model-ready feature table. Features were grouped by clinical meaning, and several engineered variables (symptom burden, atopy score, lifestyle and exposure indices, BMI quadratic term, etc.) were added to capture plausible asthma-related mechanisms. All features were converted to numeric, screened for variance and univariate signal, and continuous variables were standardized to ensure comparability across scales.

Rather than discarding weak features at this stage, I kept them all, since asthma diagnosis is a multifactorial condition and interactions may reveal additional signal. Finally, I split the dataset into stratified train/test sets and saved the processed tables in a reproducible format.

The result is a rectangular, all-numeric dataset (35 features, 2392 patients) that is ready for downstream modeling.