In [None]:
import os
import sys
import subprocess

def is_colab():
    """Return True when the code is running inside Google Colab.

    Detection is based on the string form of the active IPython shell object
    containing ``'google.colab'``.

    ``get_ipython`` is only injected into the namespace by IPython. When this
    code runs outside an IPython session (plain ``python``, an exported
    script, a test runner) the name is undefined; that situation is reported
    as "not Colab" instead of raising ``NameError``.

    Returns:
        bool: True in Colab, False otherwise.
    """
    try:
        shell = get_ipython()
    except NameError:
        # No IPython shell at all, so this cannot be Colab.
        return False
    return 'google.colab' in str(shell)

# Colab bootstrap: make sure a checkout of the repo exists and is on sys.path
# before importing dat490. Outside Colab the package is assumed to be
# importable already, so only the import runs.
if is_colab():
    GIT_REPO_URL = 'https://github.com/sksizer/dat490.git'
    LOCAL_DIR = '/content/code/dat490'

    # Existing checkout -> pull; otherwise -> clone.
    if os.path.exists(LOCAL_DIR):
        print(f"Repo already exists at {LOCAL_DIR}, pulling latest changes...")
        git_command = ['git', '-C', LOCAL_DIR, 'pull']
    else:
        print(f"Cloning repo into {LOCAL_DIR}...")
        git_command = ['git', 'clone', GIT_REPO_URL, LOCAL_DIR]
    # check=True: a non-zero git exit status raises CalledProcessError.
    subprocess.run(git_command, check=True)

    # Put the checkout at the front of the import path, once.
    if LOCAL_DIR not in sys.path:
        sys.path.insert(0, LOCAL_DIR)
        print(f"Added {LOCAL_DIR} to sys.path")

# Import dat490 package (performed in both environments)
import dat490

# Load Data and Metadata
- The new BFRSS wrapper provides a single interface for accessing both the data and its metadata.
- It automatically handles file paths, lazy loading, and metadata parsing.
- It uses the `LLCP2023_desc_categorized.parquet` file, which contains both the raw codes and the `_DESC` columns.
- Metadata generation can optionally exclude the `_DESC` columns to avoid duplicating the information already held in `value_ranges`.

In [ ]:
# Load BFRSS data and metadata using the new wrapper
import pandas as pd
import logging
from dat490 import load_bfrss

# Notebook logging: INFO level, "HH:MM:SS - LEVEL - message" records
logging.basicConfig(
    handlers=[logging.StreamHandler()],  # Output to notebook cells
    datefmt='%H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used by the cells of this notebook
logger = logging.getLogger(__name__)

# In Colab the loader is pointed at the cloned repo; elsewhere root_dir is
# None so the loader uses its default search paths.
root_directory = '/content/code/dat490' if is_colab() else None

# One call loads everything. The DataFrame keeps its _DESC columns, while
# exclude_desc_columns=True leaves them out of metadata generation to avoid
# duplication.
logger.info("Initializing BFRSS data loader...")
bfrss = load_bfrss(exclude_desc_columns=True, root_dir=root_directory)

# Work on the DataFrame handed back by the wrapper (includes _DESC columns)
bfrss_raw_df = bfrss.cloneDF()
logger.info(f"DataFrame shape: {bfrss_raw_df.shape}")
desc_column_count = sum(1 for name in bfrss_raw_df.columns if name.endswith('_DESC'))
logger.info(f"_DESC columns: {desc_column_count}")
bfrss_raw_df.info()

In [None]:
# About the _DESC columns:
# LLCP2023_desc_categorized.parquet carries the raw numeric codes AND, next to
# them, _DESC columns holding the human-readable categorized versions.
# Those are the convenient ones for analysis and visualization.

# Collect every column whose name ends in _DESC
desc_columns = [name for name in bfrss_raw_df.columns if name.endswith('_DESC')]
print(f"Total _DESC columns in DataFrame: {len(desc_columns)}")
print(f"First 10 _DESC columns: {desc_columns[:10]}")

# Side-by-side example of a raw column and its _DESC twin (only when both exist)
age_pair = ['_AGEG5YR', '_AGEG5YR_DESC']
if all(name in bfrss_raw_df.columns for name in age_pair):
    print("\nExample: Age group comparison")
    comparison = bfrss_raw_df[age_pair].value_counts().head(10)
    print(comparison)

In [None]:
# Fetch the metadata dictionary from the BFRSS wrapper
logger.info("Loading metadata from BFRSS wrapper...")
bfrss_metadata = bfrss.cloneMetadata()

# Report the number of entries to both the log and the cell output
metadata_count_message = f"Total metadata entries: {len(bfrss_metadata)}"
logger.info(metadata_count_message)
print(metadata_count_message)

In [None]:
# How many column definitions were parsed
print(f"Parsed {len(bfrss_metadata)} column definitions from the codebook")

# (printed label, metadata attribute) pairs, in the order they are shown
sample_fields = [
    ("Label", "label"),
    ("Question", "question"),
    ("Column", "column"),
    ("Type", "type_of_variable"),
    ("Computed", "computed"),
    ("Section Name", "section_name"),
    ("Section Number", "section_number"),
    ("Question Number", "question_number"),
]

# Print those fields for the first five metadata entries
sample_keys = list(bfrss_metadata.keys())[:5]
for key in sample_keys:
    metadata = bfrss_metadata[key]
    print(f"\n{key}:")
    for field_label, attribute in sample_fields:
        print(f"  {field_label}: {getattr(metadata, attribute)}")

In [None]:
# Tour of the helper methods exposed by the BFRSS wrapper

# 1. Look up what a single coded value means for one column
state_1_desc = bfrss.lookup_value('_STATE', 1)
print(f"State code 1 = {state_1_desc}")

# 2. List the sections known to the wrapper
sections = bfrss.get_sections()
print(f"\nTotal sections: {len(sections)}")
print("First 5 sections:", sections[:5])

# 3. List the columns that belong to one section
calc_columns = bfrss.get_columns_by_section('Calculated Variables')
print(f"\nCalculated Variables section has {len(calc_columns)} columns")
print("First 5:", calc_columns[:5])

# 4. Search columns by keyword and show the label of the first three hits
diabetes_columns = bfrss.search_columns('diabetes')
print(f"\nColumns mentioning 'diabetes': {len(diabetes_columns)}")
for col in diabetes_columns[:3]:
    print(f"  {col}: {bfrss.get_column_info(col).label}")

# Metadata Documentation
Notes and examples of the metadata extraction:



In [None]:
# Metadata coverage of the DataFrame.
#
# The loader was called with exclude_desc_columns=True, so _DESC columns are
# deliberately absent from the metadata. They are therefore left out of both
# the coverage figure and the "missing" list instead of being reported as gaps.
raw_columns = [col for col in bfrss_raw_df.columns if not str(col).endswith('_DESC')]
covered_columns = [col for col in raw_columns if col in bfrss_metadata]
missing_metadata = [col for col in raw_columns if col not in bfrss_metadata]

print(f"Total columns in dataframe: {len(bfrss_raw_df.columns)}")
print(f"Total metadata parsed: {len(bfrss_metadata)}")

# Coverage = share of non-_DESC DataFrame columns that have a metadata entry.
# (The plain ratio len(metadata) / len(columns) is not a coverage measure: the
# two collections do not describe the same set of columns.)
coverage_pct = len(covered_columns) / len(raw_columns) * 100 if raw_columns else 0.0
print(f"Coverage: {coverage_pct:.1f}%")

# Check which (non-_DESC) columns don't have metadata
print(f"\nColumns without metadata: {len(missing_metadata)}")
if missing_metadata:
    print("First 10 missing:", missing_metadata[:10])
    # Only meaningful when something is actually missing.
    print("Note: There is data for these columns but no metadata is available, likely purged bc of policy changes.")

In [None]:
# Wrapper example: every column name filed under 'Calculated Variables'
calculated_columns = bfrss.get_columns_by_section('Calculated Variables')

# Count first, then (after a blank line) the full list of names
print(
    f"Columns in 'Calculated Variables' section: {len(calculated_columns)}",
    f"\nColumn names: {calculated_columns}",
    sep="\n",
)

Jaime - RQ3

In [None]:
#Arthritis EDA

# Metadata entry for the arthritis variable (_DRDXAR2)
arthritis_metadata = bfrss_metadata['_DRDXAR2']
for heading, attribute in (
    ("Column", "sas_variable_name"),
    ("Label", "label"),
    ("Question", "question"),
):
    print(f"{heading}: {getattr(arthritis_metadata, attribute)}")

# At most the first ten code -> description mappings
print("\nValue mappings:")
for value, description in list(arthritis_metadata.value_lookup.items())[:10]:
    print(f"  {value}: {description}")

# Ten most frequent codes in the data, each with its description
sample_values = bfrss_raw_df['_DRDXAR2'].value_counts().head(10)
print("\nArthritis category counts:")

for value, count in sample_values.items():
    # value_lookup is queried with an int key (None for a missing value)
    lookup_key = None if pd.isna(value) else int(value)
    description = arthritis_metadata.value_lookup.get(lookup_key, "Unknown")
    print(f"Code {int(value)}: {description} (Count: {count:,})")

In [None]:
# Variable names whose metadata section is exactly 'Chronic Health Conditions'
chronic_health_conditions_columns = []
for col, meta in bfrss_metadata.items():
    if meta.section_name == 'Chronic Health Conditions':
        chronic_health_conditions_columns.append(col)

print(f"Columns in 'Chronic Health Conditions' section: {len(chronic_health_conditions_columns)}")
print(f"\nColumn names: {chronic_health_conditions_columns}")

In [None]:
# Variable names whose metadata section name contains 'Demographics'
# (substring match, not an exact comparison)
demographics_columns = []
for col, meta in bfrss_metadata.items():
    if 'Demographics' in meta.section_name:
        demographics_columns.append(col)

print(f"Columns in 'Demographics' section: {len(demographics_columns)}")
print(f"\nColumn names: {demographics_columns}")

In [None]:
def _show_columns_containing(token):
    """Print the DataFrame columns whose name contains ``token``.

    When at least one column matches, the first rows of those columns are
    printed as well.

    Args:
        token: Substring to look for in the column names of ``bfrss_raw_df``.

    Returns:
        list: The matching column names (possibly empty).
    """
    matches = [name for name in bfrss_raw_df.columns if token in name]
    print(f"Columns containing '{token}':", matches)

    # Preview only when there is something to show
    if matches:
        print("\nSample values:")
        print(bfrss_raw_df[matches].head())
    return matches


# Same report for the two column-name fragments of interest
matching_columns = _show_columns_containing('FMONTH')
matching_columns = _show_columns_containing('DISPCODE')


In [None]:
# Demographics EDA

# Known issue: not every calculated variable seems to be mapped. An exact
# match on 'Demographics' gives only 13 columns, while a "contains
# 'Demographics'" match gives 344. The metadata of calculated variables such
# as _AGEG5YR and _MRACE1 still needs to be reachable.

# Keep string keys whose metadata has a section_name containing 'Demographics'
demographics_columns = []
for col, meta in bfrss_metadata.items():
    if not isinstance(col, str):
        continue
    if not hasattr(meta, 'section_name'):
        continue
    if 'Demographics' in meta.section_name:
        demographics_columns.append(col)

print(f"Columns in 'Demographics' section: {len(demographics_columns)}")
print(f"\nColumn names: {demographics_columns}")

# Name, label and question text of every selected variable
for var in demographics_columns:
    meta = bfrss_metadata[var]
    print(f"\n=== Column: {meta.sas_variable_name} ===")
    print(f"Label: {meta.label}")
    print(f"Question: {meta.question}")

# Narrow down to the demographic variables associated with socioeconomic status
demographics_columns = ['EDUCA', 'EMPLOY1', 'INCOME3']


def _to_lookup_key(raw_value):
    """Turn a value taken from the data into a key for ``value_lookup``.

    Missing values map to None; values ``int()`` accepts become ints; a value
    for which ``int()`` raises ValueError is returned unchanged.
    """
    if not pd.notna(raw_value):
        return None
    try:
        return int(raw_value)
    except ValueError:
        return raw_value


# For each variable: header, first value mappings, then the top category counts
for demographics_variable in demographics_columns:
    metadata = bfrss_metadata[demographics_variable]

    print(f"\n=== Column: {metadata.sas_variable_name} ===")
    print(f"Label: {metadata.label}")
    print(f"Question: {metadata.question}")

    # At most the first ten code -> description mappings
    print("\nValue mappings:")
    for value, description in list(metadata.value_lookup.items())[:10]:
        print(f"  {value}: {description}")

    # Ten most frequent codes found in the data
    sample_values = bfrss_raw_df[demographics_variable].value_counts().head(10)
    print(f"\n{metadata.sas_variable_name} category counts:")

    for value, count in sample_values.items():
        lookup_key = _to_lookup_key(value)
        description = metadata.value_lookup.get(lookup_key, "Unknown")
        print(f"  Code {value}: {description} (Count: {count:,})")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Demographics Bar Charts

# Variables to chart, one horizontal bar chart each
demographics_columns = ['MARITAL', 'EDUCA', 'EMPLOY1', 'INCOME3']

for demographics_variable in demographics_columns:
    metadata = bfrss_metadata[demographics_variable]
    col_values = bfrss_raw_df[demographics_variable].dropna()

    # Ten most frequent codes; a code without a mapping is shown as "Code <value>"
    value_counts = col_values.value_counts().head(10)
    labels = []
    for val in value_counts.index:
        labels.append(metadata.value_lookup.get(int(val), f"Code {val}"))

    # Draw on an explicit Axes instead of relying on the pyplot current figure
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=value_counts.values, y=labels, orient='h', palette='viridis', ax=ax)
    ax.set_title(f"Top {len(labels)} Categories for {metadata.sas_variable_name}\n{metadata.label}")
    ax.set_xlabel("Count")
    ax.set_ylabel("Category")
    plt.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.interpolate import make_interp_spline

# INCOME3 bar chart with the categories in a hand-defined logical order
metadata = bfrss_metadata['INCOME3']
income_series = bfrss_raw_df['INCOME3'].dropna()

# Codes 1..11 in ascending order, followed by the two codes (77, 99) that are
# drawn in gray and left out of the smoothed line
ordered_codes = list(range(1, 12)) + [77, 99]
ordered_labels = [metadata.value_lookup.get(code, f"Code {code}") for code in ordered_codes]

# Count per code in that order; a code absent from the data counts as 0
counts = income_series.value_counts()
ordered_counts = [counts.get(code, 0) for code in ordered_codes]

# "mako" colors for all but the last two codes, then two grays
main_palette = sns.color_palette("mako", len(ordered_codes) - 2)
palette = main_palette + [(0.6, 0.6, 0.6), (0.3, 0.3, 0.3)]

# Horizontal bars on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=ordered_counts, y=ordered_labels, orient='h', palette=palette, ax=ax)

# Data for the smoothed line: every bar except the last two.
# y_vals are the bar positions (0, 1, 2, ...), x_vals the bar lengths.
x_vals = np.array(ordered_counts[:-2])
y_vals = np.array(range(len(ordered_counts) - 2))

# Quadratic interpolating spline through the bar ends, drawn in black
if len(x_vals) > 3:
    spline = make_interp_spline(y_vals, x_vals, k=2)
    y_smooth = np.linspace(y_vals.min(), y_vals.max(), 200)
    x_smooth = spline(y_smooth)
    ax.plot(x_smooth, y_smooth, color='black', linewidth=2)

ax.set_title(f"Income Distribution ({metadata.sas_variable_name})\n{metadata.label}")
ax.set_xlabel("Count")
ax.set_ylabel("Income Range")
plt.tight_layout()
plt.show()


# Appendix

This section contains examples and documentation for working with the BFRSS data and metadata.

## Understanding the Value Ranges Feature

The metadata parser has been updated to use `value_ranges` instead of `value_lookup`. This provides more detailed information about each value range including counts.

### How It Works

Each `ColumnMetadata` object contains a `value_ranges` list with `ValueRange` objects that include:
- `start`: The starting value of the range
- `end`: The ending value of the range
- `description`: Human-readable description
- `count`: Number of occurrences in the data

The BFRSS wrapper provides convenient methods to work with these value ranges.

In [None]:
import statsmodels.formula.api as smf

# Demographic and health condition variables
demographics = ['EDUCA', 'EMPLOY1', 'INCOME3', '_RACE']
# NOTE(review): '_DRDXAR2' carries a leading underscore while 'MICHD' and
# 'ASTHMS1' do not. If the DataFrame names them '_MICHD' / '_ASTHMS1', these
# two are skipped below - confirm the names against bfrss_raw_df.columns.
conditions = ['MICHD', 'ASTHMS1', '_DRDXAR2']

# Raw code treated as "has the condition" when building the 0/1 outcome.
# NOTE(review): assumes code 1 means "yes" for every condition above, and that
# every other non-missing code (including any "don't know"/"refused" codes)
# may be counted as "no" - confirm against each column's value mappings.
POSITIVE_OUTCOME_CODE = 1

# Use the same filtered predictor list for the formula AND the row subset.
# Subsetting with the unfiltered list raises KeyError when a demographic
# column is absent from the DataFrame.
predictors = [d for d in demographics if d in bfrss_raw_df.columns]
unavailable_predictors = [d for d in demographics if d not in bfrss_raw_df.columns]
if unavailable_predictors:
    print(f"Predictors not found in DataFrame (ignored): {unavailable_predictors}")

# Loop through each condition and run a logistic regression on all demographics
for cond in conditions:
    if cond not in bfrss_raw_df.columns:
        # Report the skip instead of silently producing no output.
        print(f"Skipping {cond}: column not found in DataFrame")
        continue
    if not predictors:
        print(f"Skipping {cond}: no predictor columns available")
        continue

    # Subset to complete cases and treat every predictor as categorical
    model_data = bfrss_raw_df[[cond] + predictors].dropna().copy()
    for col in predictors:
        model_data[col] = model_data[col].astype('category')

    # logit requires a dependent variable in [0, 1]; raw coded columns
    # (e.g. 1/2) make the fit fail, so build an explicit 0/1 indicator.
    model_data['outcome'] = (model_data[cond] == POSITIVE_OUTCOME_CODE).astype(int)
    formula = "outcome ~ " + ' + '.join(predictors)

    # Fit logistic regression; a failed fit is reported and the loop continues
    try:
        model = smf.logit(formula=formula, data=model_data).fit(disp=False)
        print(f"\n=== Logistic Regression Results for {cond} ===")
        print(model.summary2().tables[1])
    except Exception as e:
        print(f"Could not fit model for {cond}: {e}")
