In [3]:
import os
import sys

# Diagnostic cell: show the kernel's working directory and the first entry
# on the module search path.
print(f"CWD: {os.getcwd()}")
print(f"sys.path[0]: {sys.path[0]}")

CWD: /workspaces/COMP560FinalProject/notebooks
sys.path[0]: /workspaces/COMP560FinalProject/notebooks


In [4]:
import os
import sys

# The kernel's working directory is notebooks/, so the project root is its
# parent directory.
notebook_dir = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(notebook_dir, ".."))

# The membership guard keeps re-runs of this cell from appending duplicates.
already_on_path = PROJECT_ROOT in sys.path
if not already_on_path:
    sys.path.append(PROJECT_ROOT)

print("Added to sys.path:", PROJECT_ROOT)

Added to sys.path: /workspaces/COMP560FinalProject


In [6]:
from src.model import (
    FEATURE_COLUMNS,
    add_targets,
    load_raw_data,
    run_regression_models,
)

# Read the merged CSV and pass the result straight through add_targets.
df = add_targets(load_raw_data("../data/merged.csv"))

# Cell output: (number of rows, number of columns selected by FEATURE_COLUMNS).
# Indexing df with FEATURE_COLUMNS also fails loudly if a listed column is missing.
n_rows = len(df)
n_feature_cols = df[FEATURE_COLUMNS].shape[1]
n_rows, n_feature_cols


(1773, 29)

In [7]:
# Evaluate the two regression variants selected by name; ending the cell on
# the bare name displays the returned mapping of model name -> result.
# NOTE(review): the 'sgd' metrics recorded under this cell differ from the
# ones a later cell records for the same data file, while 'linear' matches
# exactly, so the sgd path looks unseeded. Confirm in src.model.
regression_models = ["linear", "sgd"]
results = run_regression_models(df, models=regression_models)
results


{'linear': RegressionResult(model_name='linear', mse=5.511008024907479e+22, r2=0.8622330063074354),
 'sgd': RegressionResult(model_name='sgd', mse=1.2975132914090717e+23, r2=0.6756409995672737)}

In [8]:
from src.model import make_regression_pipeline, train_test_split_all

# Refit a standalone "linear" pipeline so its fitted steps can be inspected.
# Only the first four values returned by train_test_split_all are used here.
X_train, X_test, y_train, y_test, _, _ = train_test_split_all(df)
pipe = make_regression_pipeline("linear")
pipe.fit(X_train, y_train)

# Fitted steps, looked up by the names the pipeline registers them under.
# `scaler` is not used further in this cell; it is kept available for inspection.
scaler = pipe.named_steps["scaler"]
model = pipe.named_steps["model"]

coef = model.coef_

# zip() stops at the shorter input, so a length mismatch between the fitted
# coefficients and FEATURE_COLUMNS would silently drop or mislabel features.
assert len(coef) == len(FEATURE_COLUMNS), (
    f"expected {len(FEATURE_COLUMNS)} coefficients, got {len(coef)}"
)

# Rank features by absolute coefficient, largest first.
# NOTE(review): the recorded coefficients are on the order of 1e13 and the
# three air-pollution features nearly cancel each other, which looks like
# collinearity. |coef| may not be a reliable importance ranking; confirm.
feature_importance = sorted(zip(FEATURE_COLUMNS, coef), key=lambda x: abs(x[1]), reverse=True)

shown = feature_importance[:15]

# Size the name column to the longest displayed name so that long feature
# names no longer push their coefficient out of alignment.
name_width = max((len(name) for name, _ in shown), default=0)

for name, c in shown:
    print(f"{name:{name_width}s} {c: .4f}")


Air pollution                       -36169851064163.9219
Household air pollution from solid fuels  19284564602657.8750
Outdoor air pollution                17950535286737.9648
Unsafe sanitation                   -5276394206241.1123
Unsafe water source                  4747439474937.9463
Low physical activity                3533302709492.0610
Diet low in whole grains            -3261528544314.4541
Population                           2073181437924.6194
Smoking                              1887928534795.6641
Secondhand smoke                    -1749962901558.0569
High systolic blood pressure        -1309237846172.6858
Low birth weight for gestation      -1183498490891.5784
Diet low in vegetables              -1063305427284.6097
Iron deficiency                      891441911823.5647
Low bone mineral density             890865374473.9675


In [10]:
import json
from src.model import (
    load_raw_data,
    add_targets,
    run_regression_models,
    run_classification_models,
    FEATURE_COLUMNS,
    make_regression_pipeline,
    train_test_split_all,
)

# Build and print a JSON summary of the dataset size, the model metrics and
# the largest linear-model coefficients. Every src.model name this cell calls
# is imported above, so the cell does not rely on an earlier cell's imports.
df = load_raw_data("../data/merged.csv")
df = add_targets(df)

# NOTE(review): the 'sgd' metrics recorded under this cell differ from the
# ones an earlier cell recorded for the same data file, while 'linear'
# matches exactly, so the sgd path looks unseeded. Confirm in src.model.
reg_results = run_regression_models(df)
clf_results = run_classification_models(df)

# Get top coefficients from one model for interpretability
X_train, X_test, y_train, y_test, _, _ = train_test_split_all(df)
pipe = make_regression_pipeline("linear")
pipe.fit(X_train, y_train)
model = pipe.named_steps["model"]

coef = model.coef_

# zip() stops at the shorter input, so a length mismatch between the fitted
# coefficients and FEATURE_COLUMNS would silently drop or mislabel features.
assert len(coef) == len(FEATURE_COLUMNS), (
    f"expected {len(FEATURE_COLUMNS)} coefficients, got {len(coef)}"
)

# Rank features by absolute coefficient, largest first.
feature_importance = sorted(
    zip(FEATURE_COLUMNS, coef),
    key=lambda x: abs(x[1]),
    reverse=True,
)

top_features = feature_importance[:10]

summary_data = {
    "n_rows": int(df.shape[0]),
    "n_features": len(FEATURE_COLUMNS),
    # vars() exposes each result object's attributes as a plain dict that
    # json can serialise.
    "regression_results": {k: vars(v) for k, v in reg_results.items()},
    "classification_results": {k: vars(v) for k, v in clf_results.items()},
    # float() converts each coefficient to a built-in float before dumping.
    "top_features": [
        {"name": name, "coef": float(c)} for name, c in top_features
    ],
}

print(json.dumps(summary_data, indent=2))

{
  "n_rows": 1773,
  "n_features": 29,
  "regression_results": {
    "linear": {
      "model_name": "linear",
      "mse": 5.511008024907479e+22,
      "r2": 0.8622330063074354
    },
    "sgd": {
      "model_name": "sgd",
      "mse": 1.4410562476904441e+23,
      "r2": 0.6397573981222189
    }
  },
  "classification_results": {
    "logreg_gd": {
      "model_name": "logreg_gd",
      "accuracy": 0.8845070422535212,
      "f1": 0.8857938718662952
    },
    "logreg_liblinear": {
      "model_name": "logreg_liblinear",
      "accuracy": 0.8704225352112676,
      "f1": 0.8808290155440415
    }
  },
  "top_features": [
    {
      "name": "Air pollution",
      "coef": -36169851064163.92
    },
    {
      "name": "Household air pollution from solid fuels",
      "coef": 19284564602657.875
    },
    {
      "name": "Outdoor air pollution",
      "coef": 17950535286737.965
    },
    {
      "name": "Unsafe sanitation",
      "coef": -5276394206241.112
    },
    {
      "name": "Uns