<a href="https://colab.research.google.com/github/sksizer/dat490/blob/main/BFRSS_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setup
- check env
- set and test paths for data

In [None]:
import os
from IPython import get_ipython
import urllib.request
import logging

# Use the root logger (getLogger() with no name) and let INFO-level messages through.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def is_colab() -> bool:
    """Return True when the active IPython shell's string form mentions 'google.colab'."""
    shell_description = str(get_ipython())
    return 'google.colab' in shell_description

def download_if_needed(url: str, filename: str) -> str:
    """Download *url* to *filename* unless the file already exists.

    The payload is written to a ``<filename>.part`` sibling first and renamed
    into place only after the transfer finishes, so an interrupted download is
    never mistaken for a valid cached file on the next run.

    Args:
        url: Location to fetch the file from.
        filename: Local path where the file should live.

    Returns:
        ``filename``, whether it was freshly downloaded or already present.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve``; the partial
        file is removed before the exception propagates.
    """
    if os.path.exists(filename):
        print(f"Using cached file: {filename}")
        return filename

    partial_path = f"{filename}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Rename only after a complete transfer so `filename` is never half-written.
        os.replace(partial_path, filename)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a stale partial file behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    print(f"Downloaded: {filename}")
    return filename

# Resolve where the data file and the codebook live
if not is_colab():
    # Local checkout: both files are expected under ../data
    BFRSS_DATA_PATH = '../data/LLCP2023.parquet'
    BFRSS_CODEBOOK_PATH = '../data/codebook_USCODE23_LLCP_021924.HTML'
else:
    # Colab: fetch each file into the working directory unless already present
    BFRSS_DATA_PATH = download_if_needed(
        "https://singular-eclair-6a5a16.netlify.app/LLCP2023.parquet",
        "LLCP2023.parquet",
    )
    BFRSS_CODEBOOK_PATH = download_if_needed(
        "https://singular-eclair-6a5a16.netlify.app/html/codebook_USCODE23_LLCP_021924.HTML",
        "codebook_USCODE23_LLCP_021924.HTML",
    )

# Fail fast if either file is missing
data_file_found = os.path.exists(BFRSS_DATA_PATH)
if not data_file_found:
    raise FileNotFoundError(f"Data path {BFRSS_DATA_PATH} does not exist")

codebook_file_found = os.path.exists(BFRSS_CODEBOOK_PATH)
if not codebook_file_found:
    raise FileNotFoundError(f"Codebook path {BFRSS_CODEBOOK_PATH} does not exist")

logger.info('Environment setup complete')


# Load Data and Metadata
- creates the starting DataFrame `bfrss_raw_df` from the BRFSS data
- extract metadata: parses [Codebook](https://github.com/sksizer/dat490/blob/main/data/codebook_USCODE23_LLCP_021924.HTML) into a dictionary that uses columns as keys:
  ```
  bfrss_metadata
  # to get the metadata for a column:
  bfrss_metadata['COLUMN1']

  # It also has a list of value definitions (code or code range plus description), such as:
  bfrss_metadata['COLUMN1'].value_lookup[0].description # will return something like 'Number of times worked out in last week'
  ```

In [None]:
import pandas as pd
# Load the survey data from the parquet file resolved during environment setup.
bfrss_raw_df = pd.read_parquet(BFRSS_DATA_PATH)
# Print the DataFrame summary produced by DataFrame.info().
bfrss_raw_df.info()

In [None]:
# Summary statistics for the _AGEG5YR column
bfrss_raw_df['_AGEG5YR'].describe()

In [None]:
import re
from pathlib import Path
from typing import Optional, Dict, List, Any, Union

import pandas as pd
from bs4 import PageElement, BeautifulSoup

from pydantic import BaseModel, Field


class ValueDef(BaseModel):
    """A codebook value entry described only by its text (base class for value definitions)."""

    # Text describing the value
    description: str
    # Defaults to False
    missing: bool = False


class ValueRange(ValueDef):
    """A value definition covering a numeric span; a single value has ``start == end``."""

    # First value of the span
    start: int
    # Last value of the span
    end: int
    # How many values fall in this range
    count: int


class ColumnStatistics(BaseModel):
    """Statistics shared by every column kind (base class)."""

    # Number of non-null values
    count: int
    # Number of null values
    null_count: int
    # Number of unique values
    unique_count: int | None = None


class NumericStatistics(ColumnStatistics):
    """Descriptive statistics for a numeric column; every field is optional."""

    # Mean value
    mean: float | None = None
    # Standard deviation
    std: float | None = None
    # Minimum value
    min: float | None = None
    # 25th percentile
    q25: float | None = None
    # Median value (50th percentile)
    median: float | None = None
    # 75th percentile
    q75: float | None = None
    # Maximum value
    max: float | None = None


class CategoricalStatistics(ColumnStatistics):
    """Frequency information for a categorical column."""

    # Count of each unique value
    value_counts: dict[str, int]
    # List of most common values with counts
    top_values: list[dict[str, Any]]


class ColumnMetadata(BaseModel):
    """
    Codebook metadata for one column of the BRFSS dataset.

    Holds the variable details, the associated question text, and the
    possible values parsed from the codebook.
    """

    # Whether this is a calculated/derived variable
    computed: bool
    # Human-readable label for the variable
    label: str
    # Original SAS variable name from dataset
    sas_variable_name: str
    # Name of the survey section
    section_name: str | None = None
    # Core section number
    section_number: int | None = None
    # Module number for optional modules
    module_number: int | None = None
    # Question number within section
    question_number: int | None = None
    # Column position in dataset (can be range like "1-2")
    column: str | None = None
    # "Num" or "Char"
    type_of_variable: str | None = None
    # Text before the actual question
    question_prologue: str | None = None
    # The actual question text from survey
    question: str | None = None
    # Possible values for this variable
    value_lookup: list[ValueDef | ValueRange]
    # HTML anchor name for linking to codebook
    html_name: str
    # Statistical information
    statistics: NumericStatistics | CategoricalStatistics | None = None


def _count_codebook_values(
    df: Optional[pd.DataFrame],
    column_name: Optional[str],
    start: int,
    end: int,
    *,
    is_range: bool,
) -> int:
    """Count rows of ``df[column_name]`` matching a codebook value; 0 when no data is available or counting fails."""
    if df is None or column_name is None or column_name not in df.columns:
        return 0

    try:
        series = df[column_name]
        if is_range:
            # Count values in the range (inclusive)
            return int(series.between(start, end, inclusive='both').sum())
        # Count occurrences of this specific value
        return int((series == start).sum())
    except Exception as e:
        # Best effort: a column that cannot be compared just reports a zero count.
        if is_range:
            print(f"Error calculating count for range {start}-{end} in column {column_name}: {e}")
        else:
            print(f"Error calculating count for value {start} in column {column_name}: {e}")
        return 0


def get_value_def(tr:PageElement, df: Optional[pd.DataFrame] = None, column_name: Optional[str] = None) -> ValueDef | ValueRange:
    """
    Extract value definition from a table row in the codebook.

    Parses a table row containing value codes and their descriptions. Handles both
    single values and ranges (e.g., "1-30"). If DataFrame and column name are provided,
    calculates the count of values in the range.

    Args:
        tr: BeautifulSoup PageElement representing a table row with value information
        df: Optional DataFrame containing the data
        column_name: Optional column name to calculate counts for

    Returns:
        Either a ValueDef (for non-numeric or unparseable values) or
        ValueRange (for single numbers or numeric ranges)

    Raises:
        IndexError: If the row has fewer than two ``td`` cells.
    """
    cells = tr.find_all('td')

    value_text = cells[0].text.strip()
    description = cells[1].text.strip()

    # Check if the value is actually a range such as "1 - 30" or "1-30"
    range_match = re.match(r'^(\d+)\s*[-–]\s*(\d+)$', value_text)
    if range_match:
        start = int(range_match.group(1))
        end = int(range_match.group(2))
        is_range = True
    else:
        # Only the integer conversion is guarded, so unrelated failures are not
        # silently turned into a plain ValueDef.
        try:
            start = end = int(value_text)
        except ValueError:
            return ValueDef(description=description)
        is_range = False

    return ValueRange(
        start=start,
        end=end,
        description=description,
        count=_count_codebook_values(df, column_name, start, end, is_range=is_range),
    )


def get_value_lookup(table:PageElement, df: Optional[pd.DataFrame] = None, column_name: Optional[str] = None) -> list[ValueDef]:
    """
    Collect the value definitions listed in one codebook table.

    Every ``tr`` inside the table's ``tbody`` is handed to ``get_value_def``
    together with ``df`` and ``column_name``, so counts are filled in for
    ValueRange entries when data is supplied.

    Args:
        table: BeautifulSoup PageElement for the table holding value codes and
              descriptions
        df: Optional DataFrame containing the data
        column_name: Optional column name to calculate counts for

    Returns:
        List of ValueDef/ValueRange objects, one per table row, in row order

    Expected table layout:
    <table>
    <tbody>
    <tr>
        <td>value</td> <!-- single int value, blank, or range like "1-30" -->
        <td>Value description</td>
    </tr>
    </tbody>
    </table>
    """
    body_rows = table.find('tbody').find_all('tr')
    return [get_value_def(row, df, column_name) for row in body_rows]


def _match_text(match: Optional[re.Match]) -> Optional[str]:
    """Return the stripped first capture group of *match*, or None when nothing matched."""
    return match.group(1).strip() if match else None


def _match_int(match: Optional[re.Match]) -> Optional[int]:
    """Return the first capture group of *match* as an int, or None when nothing matched."""
    return int(match.group(1)) if match else None


def _describe_value(value: Any, value_lookup: list) -> Optional[str]:
    """Return the description of the ValueRange containing *value*, or None if there is none."""
    if not isinstance(value, (int, float)) or pd.isna(value):
        return None

    # Only whole numbers can match a codebook code; 1.5 must not be truncated to 1.
    if isinstance(value, int):
        value_int = int(value)
    elif value.is_integer():
        value_int = int(value)
    else:
        return None

    for val_def in value_lookup:
        if isinstance(val_def, ValueRange) and val_def.start <= value_int <= val_def.end:
            return val_def.description
    return None


def _build_column_statistics(
    series: pd.Series,
    type_of_variable: Optional[str],
    value_lookup: list,
    sas_variable_name: str,
) -> Optional[Union[NumericStatistics, CategoricalStatistics]]:
    """Compute numeric statistics for "Num" numeric columns, categorical statistics otherwise.

    Returns None when the categorical calculation fails as well.
    """
    # Common statistics for all columns, converted to plain ints up front.
    non_null_count = int(series.count())
    null_count = int(series.isna().sum())
    unique_count = int(series.nunique())

    if type_of_variable == "Num" and pd.api.types.is_numeric_dtype(series):
        try:
            desc = series.describe()

            def stat(key: str) -> Optional[float]:
                return None if pd.isna(desc[key]) else float(desc[key])

            return NumericStatistics(
                count=non_null_count,
                null_count=null_count,
                unique_count=unique_count,
                mean=stat('mean'),
                std=stat('std'),
                min=stat('min'),
                q25=stat('25%'),
                median=stat('50%'),
                q75=stat('75%'),
                max=stat('max'),
            )
        except Exception as e:
            # Fall through and treat the column as categorical instead.
            print(f"Error calculating numeric stats for {sas_variable_name}: {e}")

    try:
        # Get value counts (limited to top 20 for brevity)
        value_counts = series.value_counts().head(20).to_dict()

        # Convert all keys to strings for JSON compatibility
        value_counts_str = {str(k): int(v) for k, v in value_counts.items()}

        # The loop variable is deliberately not called `count`, so it cannot
        # clobber the column-level non-null count passed to the model below.
        top_values = [
            {
                "value": str(value),
                "count": int(value_count),
                "description": _describe_value(value, value_lookup) or "Unknown",
            }
            for value, value_count in value_counts.items()
        ]

        return CategoricalStatistics(
            count=non_null_count,
            null_count=null_count,
            unique_count=unique_count,
            value_counts=value_counts_str,
            top_values=top_values,
        )
    except Exception as e:
        print(f"Error calculating categorical stats for {sas_variable_name}: {e}")
        return None


def parse_codebook_html(html_path: Path, df: Optional[pd.DataFrame] = None) -> Dict[str, ColumnMetadata]:
    """
    Parse the BRFSS codebook HTML file and extract column metadata.

    Each ``div.branch`` after the first is inspected for a
    "Procedure Report: Report" table whose header cell contains the
    ``Label:`` and ``SAS Variable Name:`` fields. Branches lacking any of
    these pieces are skipped. A branch that raises while being parsed is
    reported with ``print`` and skipped as well.

    Args:
        html_path: Path to the HTML codebook file
        df: Optional DataFrame containing BRFSS data for calculating statistics

    Returns:
        Dictionary mapping SAS variable names to ColumnMetadata objects
    """
    with open(html_path, 'r', encoding='windows-1252') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # The first "branch" div is the Codebook header table which we don't want
    branches = soup.find_all('div', class_='branch')[1:]

    metadata_dict: Dict[str, ColumnMetadata] = {}

    for branch in branches:
        # A branch without a named anchor cannot be linked back to the codebook; skip it.
        anchor = branch.find('a')
        html_name = anchor.get('name') if anchor is not None else None
        if html_name is None:
            continue
        print('html_name' + html_name)

        # Find the table with summary="Procedure Report: Report"
        table = branch.find('table', attrs={'summary': 'Procedure Report: Report'})
        if not table:
            continue

        thead = table.find('thead')
        if not thead:
            continue

        first_tr = thead.find('tr')
        if not first_tr:
            continue

        # Find td with metadata content - may not have all classes
        metadata_cell = None
        for td in first_tr.find_all('td'):
            text_clean = td.get_text().replace('\xa0', ' ')
            if 'Label:' in text_clean and 'SAS Variable Name:' in text_clean:
                metadata_cell = td
                break

        if not metadata_cell:
            continue

        try:
            # Extract fields using regex - handle non-breaking spaces
            cell_text = metadata_cell.get_text().replace('\xa0', ' ')

            label_match = re.search(r'Label:\s*(.+?)(?=Section\s*Name:|Core\s*Section\s*Number:|Module\s*Number:|$)', cell_text, re.DOTALL)
            section_name_match = re.search(r'Section\s*Name:\s*(.+?)(?=Core\s*Section\s*Number:|Section\s*Number:|Module\s*Number:|Question\s*Number:|$)', cell_text, re.DOTALL)
            # Handle both "Core Section Number" and "Section Number"
            section_number_match = re.search(r'(?:Core\s*)?Section\s*Number:\s*(\d+)', cell_text)
            # Handle "Module Number"
            module_number_match = re.search(r'Module\s*Number:\s*(\d+)', cell_text)
            question_number_match = re.search(r'Question\s*Number:\s*(\d+)', cell_text)
            column_match = re.search(r'Column:\s*(.+?)(?=Type\s*of\s*Variable:|$)', cell_text, re.DOTALL)
            type_match = re.search(r'Type\s*of\s*Variable:\s*(.+?)(?=SAS\s*Variable\s*Name:|$)', cell_text, re.DOTALL)
            sas_name_match = re.search(r'SAS\s*Variable\s*Name:\s*(.+?)(?=Question\s*Prologue:|Question:|$)', cell_text, re.DOTALL)
            prologue_match = re.search(r'Question\s*Prologue:\s*(.+?)(?=Question:|$)', cell_text, re.DOTALL)
            question_match = re.search(r'Question:\s*(.+?)$', cell_text, re.DOTALL)

            # Only require label and SAS variable name
            if not (label_match and sas_name_match):
                continue

            label = label_match.group(1).strip()
            sas_variable_name = sas_name_match.group(1).strip()
            section_name = _match_text(section_name_match)
            type_of_variable = _match_text(type_match)

            # A prologue that is empty after stripping carries no information.
            question_prologue = _match_text(prologue_match)
            if not question_prologue:
                question_prologue = None

            # Parsed once and reused for the statistics and the metadata:
            # every call rescans the DataFrame column to fill in counts.
            value_lookup = get_value_lookup(table, df, sas_variable_name)

            # Calculate statistics if DataFrame is provided and column exists
            statistics = None
            if df is not None and sas_variable_name in df.columns:
                statistics = _build_column_statistics(
                    df[sas_variable_name], type_of_variable, value_lookup, sas_variable_name
                )

            metadata_dict[sas_variable_name] = ColumnMetadata(
                label=label,
                sas_variable_name=sas_variable_name,
                section_name=section_name,
                section_number=_match_int(section_number_match),
                module_number=_match_int(module_number_match),
                question_number=_match_int(question_number_match),
                column=_match_text(column_match),
                type_of_variable=type_of_variable,
                question_prologue=question_prologue,
                question=_match_text(question_match),
                value_lookup=value_lookup,
                computed=section_name in ('Calculated Variables', 'Calculated Race Variables'),
                html_name=html_name,
                statistics=statistics,
            )

        except Exception as e:
            # Skip cells that don't parse correctly but show problems
            print(e)

    return metadata_dict

In [None]:
# Parse the codebook; passing the DataFrame lets the parser attach per-column statistics and value counts.
bfrss_metadata = parse_codebook_html(Path(BFRSS_CODEBOOK_PATH), bfrss_raw_df)

In [None]:
# Report how many column definitions the codebook yielded
print(f"Parsed {len(bfrss_metadata)} column definitions from the codebook")

# Preview the first five entries, one labelled field per line
sample_keys = list(bfrss_metadata.keys())[:5]
for key in sample_keys:
    metadata = bfrss_metadata[key]
    print(f"\n{key}:")
    preview_fields = (
        ("Label", metadata.label),
        ("Question", metadata.question),
        ("Column", metadata.column),
        ("Type", metadata.type_of_variable),
        ("Computed", metadata.computed),
        ("Section Name", metadata.section_name),
        ("Section Number", metadata.section_number),
        ("Question Number", metadata.question_number),
    )
    for caption, shown in preview_fields:
        print(f"  {caption}: {shown}")

# Metadata Documentation
Notes and examples of the metadata extraction:



In [None]:
# Compare the number of DataFrame columns with the number of parsed definitions
column_total = len(bfrss_raw_df.columns)
metadata_total = len(bfrss_metadata)
print(f"Total columns in dataframe: {column_total}")
print(f"Total metadata parsed: {metadata_total}")
print(f"Coverage: {metadata_total / column_total * 100:.1f}%")

# List the DataFrame columns the codebook has no entry for
missing_metadata = []
for col in bfrss_raw_df.columns:
    if col not in bfrss_metadata:
        missing_metadata.append(col)
print(f"\nColumns without metadata: {len(missing_metadata)}")
if missing_metadata:
    print("First 10 missing:", missing_metadata[:10])
print("Note: There is data for these columns but no metadata is available, likely purged bc of policy changes.")

## Understanding the Friendly Mapping Feature
(note I generated the following docs and examples with ChatGPT, but I've vetted all of it)

The metadata parser includes a powerful "friendly mapping" feature that translates numeric codes in the dataset to their human-readable descriptions. This is particularly useful for categorical variables where numeric codes represent specific responses.

### How It Works

Each `ColumnMetadata` object contains a `value_lookup` list of `ValueDef` / `ValueRange` objects that pair a numeric code (or range of codes) with its text description. This list is automatically extracted from the codebook HTML file during parsing.

#### Key Components:

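1. **`value_lookup` list**: Found in each `ColumnMetadata` object
   - `ValueRange` entries: numeric codes as `start`/`end` (equal for a single code), plus a `count` of matching rows
   - `ValueDef` entries: rows whose code is not numeric; both kinds carry a human-readable `description` (str)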

2. **Automatic extraction**: The `get_value_lookup()` function (defined earlier in this notebook) extracts these mappings from HTML tables in the codebook

### Example 1: Understanding what values mean for a specific column

In [None]:
# Example 1: Understanding what values mean for a specific column
# Let's look at the _STATE column which has distinct state codes

state_metadata = bfrss_metadata['_STATE']
print(f"Column: {state_metadata.sas_variable_name}")
print(f"Label: {state_metadata.label}")
print(f"Question: {state_metadata.question}")
print(f"\nSample of value mappings (first 10):")

# Show first 10 state mappings
for val_def in state_metadata.value_lookup[:10]:
    if isinstance(val_def, ValueRange):
        # A single code has start == end; show real ranges as "start-end".
        code_label = val_def.start if val_def.start == val_def.end else f"{val_def.start}-{val_def.end}"
        print(f"  {code_label}: {val_def.description}")
    else:
        print(f"  [Non-numeric]: {val_def.description}")

### Example 2: Translating values in your data

In [None]:
# Example 2: Translating values in your data
# Look up the description of the most frequent _STATE codes in the dataframe

# The ten most common state codes with their row counts
sample_values = bfrss_raw_df['_STATE'].value_counts().head(10)
print("Top 10 states by number of respondents:\n")

for value, count in sample_values.items():
    description = "Unknown"
    if not pd.isna(value):
        value_int = int(value)
        # First ValueRange containing the code wins; otherwise stay "Unknown"
        description = next(
            (
                entry.description
                for entry in state_metadata.value_lookup
                if isinstance(entry, ValueRange) and entry.start <= value_int <= entry.end
            ),
            "Unknown",
        )

    print(f"Code {int(value)}: {description} (Count: {count:,})")

### Example 3: Creating a mapping function for easy translation

In [None]:
# Example 3: Creating a mapping function for easy translation
def translate_column_values(df, column_name, metadata_dict):
    """
    Translate numeric codes to descriptions for a specific column.

    Null values become "Missing". Whole-number values are matched against the
    column's ValueRange entries (first match wins). Anything else, including
    non-integral numbers such as 1.5, becomes "Unknown code: <value>".

    Args:
        df: The dataframe containing the data
        column_name: Name of the column to translate
        metadata_dict: Dictionary of column metadata

    Returns:
        Pandas Series with translated values, or the untouched column when
        no metadata exists for it
    """
    if column_name not in metadata_dict:
        print(f"No metadata found for column: {column_name}")
        return df[column_name]

    # Collect the numeric ranges once instead of filtering value_lookup per row.
    code_ranges = [
        (val_def.start, val_def.end, val_def.description)
        for val_def in metadata_dict[column_name].value_lookup
        if isinstance(val_def, ValueRange)
    ]

    def translate(value):
        if pd.isna(value):
            return "Missing"

        # Only whole numbers are codebook codes; truncating 1.5 to 1 would mislabel it.
        is_whole_number = isinstance(value, int) or (
            isinstance(value, float) and value.is_integer()
        )
        if is_whole_number:
            code = int(value)
            for start, end, description in code_ranges:
                if start <= code <= end:
                    return description

        return f"Unknown code: {value}"

    return df[column_name].apply(translate)

# Example usage - add a STATE_NAME column holding the translated _STATE codes
bfrss_raw_df['STATE_NAME'] = translate_column_values(bfrss_raw_df, '_STATE', bfrss_metadata)

# Print the first ten rows of code and translation side by side
print("Sample of translated state values:")
state_preview = bfrss_raw_df[['_STATE', 'STATE_NAME']].head(10)
print(state_preview)

### Example 4: Working with columns that have ranges

In [None]:
# Example 4: Working with columns that have ranges
# POORHLTH includes a range value "1 - 30"
poorhlth_metadata = bfrss_metadata['POORHLTH']
print(f"Column: {poorhlth_metadata.sas_variable_name}")
print(f"Label: {poorhlth_metadata.label}")

# Print every value definition; ranges show both ends, single codes just one
print(f"\nValue mappings:")
for val_def in poorhlth_metadata.value_lookup:
    if not isinstance(val_def, ValueRange):
        print(f"  [Non-numeric]: {val_def.description}")
    elif val_def.start != val_def.end:
        print(f"  {val_def.start}-{val_def.end}: {val_def.description} (Count: {val_def.count})")
    else:
        print(f"  {val_def.start}: {val_def.description} (Count: {val_def.count})")

### Example 5: Batch translation of multiple columns

In [None]:
# Example 5: Batch translation of multiple columns
# This example shows how to efficiently translate multiple columns at once

def batch_translate_columns(df, column_list, metadata_dict):
    """
    Build description series for several coded columns.

    Columns absent from either the metadata or the dataframe are reported
    and left out of the result.

    Args:
        df: The dataframe containing the data
        column_list: List of column names to translate
        metadata_dict: Dictionary of column metadata

    Returns:
        Dictionary mapping "<column>_DESC" to the translated series
    """
    results = {}

    for name in column_list:
        if name not in metadata_dict or name not in df.columns:
            print(f"Skipped {name} (not found in metadata or dataframe)")
            continue

        results[f"{name}_DESC"] = translate_column_values(df, name, metadata_dict)
        print(f"Translated {name}")

    return results

# Translate several categorical columns in one go
columns_to_translate = ['_STATE', 'FMONTH', 'DISPCODE', 'SEX1']
translations = batch_translate_columns(bfrss_raw_df, columns_to_translate, bfrss_metadata)

# Attach each translated series to the dataframe under its *_DESC name
for col_name in translations:
    bfrss_raw_df[col_name] = translations[col_name]

# Print the first three original columns next to their translations
print("\nSample of translated data:")
original_cols = columns_to_translate[:3]  # Show first 3
desc_cols = [col + "_DESC" for col in original_cols]
print(bfrss_raw_df[original_cols + desc_cols].head())

### Example 6: Getting columns by Section Name

### Example 7: Using the new statistics feature

The updated parser now automatically calculates statistics for each column during parsing. This includes:
- For numeric columns: mean, std, min, max, quartiles
- For categorical columns: value counts and top values with descriptions

In [None]:
# Example 7: Accessing pre-calculated statistics
# Let's examine the statistics for a numeric column
ageg5yr_metadata = bfrss_metadata['_AGEG5YR']
print(f"Column: {ageg5yr_metadata.sas_variable_name}")
print(f"Label: {ageg5yr_metadata.label}")
print(f"Type: {ageg5yr_metadata.type_of_variable}")

if ageg5yr_metadata.statistics:
    stats = ageg5yr_metadata.statistics
    print(f"\nStatistics:")
    print(f"  Count: {stats.count:,}")
    print(f"  Null Count: {stats.null_count:,}")
    print(f"  Unique Values: {stats.unique_count}")

    if isinstance(stats, NumericStatistics):
        # mean/std are Optional; applying ":.2f" to None would raise TypeError.
        mean_text = f"{stats.mean:.2f}" if stats.mean is not None else "n/a"
        std_text = f"{stats.std:.2f}" if stats.std is not None else "n/a"
        print(f"  Mean: {mean_text}")
        print(f"  Std Dev: {std_text}")
        print(f"  Min: {stats.min}")
        print(f"  25th percentile: {stats.q25}")
        print(f"  Median: {stats.median}")
        print(f"  75th percentile: {stats.q75}")
        print(f"  Max: {stats.max}")

# Let's also look at a categorical column
state_stats = bfrss_metadata['_STATE'].statistics
if state_stats and hasattr(state_stats, 'top_values'):
    print(f"\n\nTop states by response count:")
    for item in state_stats.top_values[:5]:
        print(f"  {item['description']}: {item['count']:,} responses")

### Example 8: Value counts in ranges

In [None]:
# Example 8: The parser stores a row count on every value range
# Listing them shows how the responses are distributed

poorhlth_metadata = bfrss_metadata['POORHLTH']
print(f"Column: {poorhlth_metadata.sas_variable_name}")
print(f"Label: {poorhlth_metadata.label}")
print(f"\nValue distribution:")

# Keep only the numeric entries that actually occur in the data
populated_ranges = [
    entry
    for entry in poorhlth_metadata.value_lookup
    if isinstance(entry, ValueRange) and entry.count > 0
]
for val_def in populated_ranges:
    if val_def.start != val_def.end:
        print(f"  Range {val_def.start}-{val_def.end} ({val_def.description}): {val_def.count:,} responses")
    else:
        print(f"  Value {val_def.start} ({val_def.description}): {val_def.count:,} responses")

total_responses = sum(entry.count for entry in populated_ranges)
print(f"\nTotal responses captured in value ranges: {total_responses:,}")

# Cross-check against the column's overall statistics, when present
poorhlth_stats = poorhlth_metadata.statistics
if poorhlth_stats:
    print(f"Total non-null responses: {poorhlth_stats.count:,}")
    print(f"Null/missing responses: {poorhlth_stats.null_count:,}")

In [None]:
# Simple example: collect the names of all columns in the 'Calculated Variables' section
calculated_columns = []
for col, meta in bfrss_metadata.items():
    if meta.section_name == 'Calculated Variables':
        calculated_columns.append(col)

print(f"Columns in 'Calculated Variables' section: {len(calculated_columns)}")
print(f"\nColumn names: {calculated_columns}")

# Kelly Scratch

- making own copies of data for experimentation: k_df, k_metadata


In [None]:
import copy

k_df = bfrss_raw_df.copy()
# Deep copy: dict.copy() would share the ColumnMetadata objects with
# bfrss_metadata, so editing a scratch entry would also change the original.
k_metadata = copy.deepcopy(bfrss_metadata)

# Metadata Tests
##

# Turn each model into a plain field -> value dict so the frame gets one named
# column per metadata field instead of cells holding (field, value) tuples.
k_m_df = pd.DataFrame.from_dict(
    {name: dict(meta) for name, meta in k_metadata.items()},
    orient='index',
)
k_m_df.info()

# New Section