In [7]:
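# Install the libraries this notebook imports before running it:
#   pip install pandas statsmodels plotly ipywidgets
# On older classic-Notebook setups the widget extension may also need enabling:
#   jupyter nbextension enable --py widgetsnbextension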

In [12]:
# import libraries
import pandas as pd
import statsmodels.api as sm
from ipywidgets import interact, SelectMultiple, Dropdown
import plotly.express as px

In [13]:
# Column used as the dependent variable
target_var = "life_expectancy"

# Columns that may be chosen as independent variables
candidate_predictors = [
    "health_expenditure_usd",
    "physicians_per_1000",
    "nurses_per_1000",
    "smoking_prevalence",
    "diabetes_prevalence",
    "pollution_mortality_rate",
]

# Read the enriched dataset (path is relative to the working directory)
df = pd.read_csv("data/df_enriched.csv")

In [10]:
import pandas as pd
import statsmodels.api as sm
from ipywidgets import interact, SelectMultiple

# Inputs for the regression widget below. These repeat the earlier setup
# cell so that this cell can be run on its own.
target_var = "life_expectancy"

candidate_predictors = [
    "health_expenditure_usd",
    "physicians_per_1000",
    "nurses_per_1000",
    "smoking_prevalence",
    "diabetes_prevalence",
    "pollution_mortality_rate",
]

# Enriched dataset, read relative to the working directory
df = pd.read_csv("data/df_enriched.csv")

@interact(
    predictors=SelectMultiple(
        options=candidate_predictors,
        value=("health_expenditure_usd",),   # default selection
        description="Select predictors",
    )
)
def run_regression(predictors):
    """Fit an OLS regression of ``target_var`` on the selected predictors.

    Called by the ``SelectMultiple`` widget every time the selection changes.
    Rows with a missing value in the target or in any selected predictor are
    dropped before fitting, and an intercept is added to the design matrix.

    Args:
        predictors: Iterable of column names of ``df`` to use as independent
            variables (the widget passes a tuple).

    Returns:
        None. A short header and the statsmodels summary are printed; if the
        selection cannot be fitted, a warning is printed instead.
    """
    # Convert tuple -> list so pandas treats it as a list of column labels
    predictors = list(predictors)

    if not predictors:
        print("⚠ Please select at least ONE independent variable.")
        return

    # Build modeling DataFrame and drop rows with missing values
    df_model = df[[target_var] + predictors].dropna()

    if df_model.empty:
        print("⚠ No usable rows after filtering — try fewer predictors.")
        return

    # One slope per predictor plus the intercept. With no more rows than
    # parameters the fit has zero residual degrees of freedom, so R² and the
    # summary statistics would be meaningless.
    n_params = len(predictors) + 1
    if len(df_model) <= n_params:
        print(
            f"⚠ Only {len(df_model)} complete rows for {n_params} parameters "
            "— try fewer predictors."
        )
        return

    # Add intercept and fit OLS
    X = sm.add_constant(df_model[predictors])
    y = df_model[target_var]
    model = sm.OLS(y, X).fit()

    print("================================")
    print(f"Dependent variable: {target_var}")
    print(f"Predictors: {', '.join(predictors)}")
    print(f"Observations used: {len(df_model)}")
    print(f"R² = {model.rsquared:.3f}")
    print("================================")
    print(model.summary())


interactive(children=(SelectMultiple(description='Select predictors', index=(0,), options=('health_expenditure…

In [14]:
# Prerequisite: `df` and `target_var` must already exist, i.e. the setup
# cell has been run:
#   df = pd.read_csv("data/df_enriched.csv")
#   target_var = "life_expectancy"

# Columns offered as the X axis of the scatter plot
numeric_predictors = [
    # spending
    "health_expenditure_usd",
    # workforce
    "physicians_per_1000",
    "nurses_per_1000",
    # risk factors
    "smoking_prevalence",
    "diabetes_prevalence",
    "pollution_mortality_rate",
]

@interact(
    x_var=Dropdown(
        options=numeric_predictors,
        value="health_expenditure_usd",
        description="X variable",
    )
)
def interactive_scatter(x_var):
    """Scatter ``target_var`` against one predictor with an OLS trendline.

    Called by the ``Dropdown`` widget every time the selection changes.
    Health expenditure is drawn on a logarithmic X axis, and its trendline is
    fitted on log(x) so that the fitted line matches the axis it is drawn on.

    Args:
        x_var: Name of the column of ``df`` to put on the X axis.

    Returns:
        None. The figure is shown, or a warning is printed when no rows are
        left to plot.
    """
    # Spending is very skewed, so it gets a log scale
    use_log_x = x_var == "health_expenditure_usd"

    # Keep only rows where X, Y and the hover label are all present
    df_plot = df[[x_var, target_var, "location_key"]].dropna()
    if use_log_x:
        # Non-positive values cannot be placed on a log axis or log-fitted
        df_plot = df_plot[df_plot[x_var] > 0]

    if df_plot.empty:
        print(f"⚠ No usable rows for {x_var}.")
        return

    x_label = x_var.replace("_", " ").title()

    fig = px.scatter(
        df_plot,
        x=x_var,
        y=target_var,
        hover_name="location_key",
        trendline="ols",  # adds regression line
        # Fit on log(x) when the axis is logarithmic.
        # NOTE(review): trendline_options needs a plotly version that
        # supports it (5.2+ as far as I recall) — confirm the installed version.
        trendline_options={"log_x": True} if use_log_x else None,
        log_x=use_log_x,
        labels={
            x_var: x_label,
            target_var: "Life Expectancy (years)",
        },
        title=f"Life Expectancy vs {x_label}",
    )

    fig.show()


interactive(children=(Dropdown(description='X variable', options=('health_expenditure_usd', 'physicians_per_10…