<a href="https://colab.research.google.com/github/sksizer/dat490/blob/main/BFRSS_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setup
- check env
- set and test paths for data

In [ ]:
import logging
import os
import sys

import pandas as pd
from IPython import get_ipython

# Raise the root logger's threshold to INFO, then keep a handle to it for this notebook.
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger()

def is_colab():
    """Return True when the active IPython shell's string form contains 'google.colab'."""
    shell_description = str(get_ipython())
    return 'google.colab' in shell_description

# Set up environment and paths.
# - Colab: clone the repository (once) and read the data files from the clone.
# - Local: import from the parent directory and read the data files from ../data.
# Either branch defines BFRSS_DATA_PATH, BFRSS_CODEBOOK_PATH and BFRSS_DESC_PATH.
if is_colab():
    print("Running in Google Colab")

    # Clone the repository if not already cloned
    if not os.path.exists('dat490'):
        import subprocess
        print("Cloning repository...")
        # check=True makes a failed clone raise instead of continuing silently
        subprocess.run(['git', 'clone', 'https://github.com/sksizer/dat490.git'], check=True)
        print("Repository cloned successfully")

    # Add the repository to Python path for imports
    sys.path.insert(0, '/content/dat490')

    # Set paths to use data from the cloned repository
    BFRSS_DATA_PATH = 'dat490/data/LLCP2023.parquet'
    BFRSS_CODEBOOK_PATH = 'dat490/data/codebook_USCODE23_LLCP_021924.HTML'
    BFRSS_DESC_PATH = 'dat490/data/LLCP2023_desc.parquet'  # Additional metadata file if needed
else:
    print("Running in local environment")

    # Add parent directory to path for dat490 module imports
    sys.path.insert(0, os.path.abspath('..'))

    # Use local data paths
    BFRSS_DATA_PATH = '../data/LLCP2023.parquet'
    BFRSS_CODEBOOK_PATH = '../data/codebook_USCODE23_LLCP_021924.HTML'
    BFRSS_DESC_PATH = '../data/LLCP2023_desc.parquet'  # Additional metadata file if needed

# Verify files exist (only the data file and the codebook are required;
# BFRSS_DESC_PATH is not checked here)
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print(f"\nData path: {BFRSS_DATA_PATH}")
print(f"Codebook path: {BFRSS_CODEBOOK_PATH}")

if not os.path.exists(BFRSS_DATA_PATH):
    raise FileNotFoundError(f"Data file not found at {BFRSS_DATA_PATH}")

if not os.path.exists(BFRSS_CODEBOOK_PATH):
    raise FileNotFoundError(f"Codebook file not found at {BFRSS_CODEBOOK_PATH}")

print("\nAll required files found!")
logger.info('Environment setup complete')

# Load Data and Metadata
- The new BFRSS wrapper provides a single interface to access both data and metadata
- It automatically handles file paths, lazy loading, and metadata parsing
- By default, we load the _DESC columns which contain the categorized version of the BRFSS data
- The metadata generation will skip _DESC columns to avoid duplication with value_ranges information

In [ ]:
# Load BFRSS data and metadata using the new wrapper
from dat490 import load_bfrss

# Single function call to load everything
# By default, this loads the _DESC columns in the DataFrame
# but excludes them from metadata generation to avoid duplication
# NOTE(review): the BFRSS_*_PATH variables defined during environment setup are
# not passed here; presumably load_bfrss resolves its own file paths - confirm.
bfrss = load_bfrss(exclude_desc_columns=True)

# Get a copy of the raw DataFrame (includes _DESC columns)
bfrss_raw_df = bfrss.cloneDF()
# Report the frame's shape and how many column names end in '_DESC'
print(f"DataFrame shape: {bfrss_raw_df.shape}")
print(f"_DESC columns: {len([col for col in bfrss_raw_df.columns if col.endswith('_DESC')])}")
# Print the DataFrame's info() summary
bfrss_raw_df.info()

In [None]:
# Summary statistics for the raw _AGEG5YR column (shown as the cell's output)
bfrss_raw_df['_AGEG5YR'].describe()

In [ ]:
# Pull the parser's classes into the notebook namespace for direct use
# (e.g. the isinstance checks against ValueRange in later cells).
# The import is best-effort: a failure is reported, not re-raised.
try:
    from dat490.parser import (
        CategoricalStatistics,
        ColumnMetadata,
        ColumnStatistics,
        NumericStatistics,
        ValueDef,
        ValueRange,
    )
except ImportError as e:
    print(f"Error importing parser classes: {e}")
else:
    print("Parser classes available for direct use")

In [ ]:
# Get the metadata dictionary from BFRSS wrapper
# (later cells index it by column name, e.g. bfrss_metadata['_STATE'])
bfrss_metadata = bfrss.cloneMetadata()

# The metadata is already parsed and ready to use; report how many entries it holds
print(f"Total metadata entries: {len(bfrss_metadata)}")

In [ ]:
# Demonstrate BFRSS wrapper utility methods
# (every call below is made on the `bfrss` wrapper object loaded earlier)

# 1. Direct value lookup: ask the wrapper what code 1 means in the _STATE column
state_1_desc = bfrss.lookup_value('_STATE', 1)
print(f"State code 1 = {state_1_desc}")

# 2. Get all sections (the result is sized with len() and sliced below)
sections = bfrss.get_sections()
print(f"\nTotal sections: {len(sections)}")
print("First 5 sections:", sections[:5])

# 3. Get columns by section, here the 'Calculated Variables' section
calc_columns = bfrss.get_columns_by_section('Calculated Variables')
print(f"\nCalculated Variables section has {len(calc_columns)} columns")
print("First 5:", calc_columns[:5])

# 4. Search for columns using the term 'diabetes'
diabetes_columns = bfrss.search_columns('diabetes')
print(f"\nColumns mentioning 'diabetes': {len(diabetes_columns)}")
# Show the label of the first three matches
for col in diabetes_columns[:3]:
    meta = bfrss.get_column_info(col)
    print(f"  {col}: {meta.label}")

In [ ]:
# Note about _DESC columns
# The LLCP2023.parquet file contains both raw numeric codes AND _DESC columns
# The _DESC columns have the human-readable categorized versions of the data
# These are particularly useful for analysis and visualization

# List all _DESC columns (selected purely by the '_DESC' name suffix)
desc_columns = [col for col in bfrss_raw_df.columns if col.endswith('_DESC')]
print(f"Total _DESC columns in DataFrame: {len(desc_columns)}")
print(f"First 10 _DESC columns: {desc_columns[:10]}")

# Example: Compare raw vs _DESC for a specific column
# (runs only when both the raw and the _DESC column are present)
if '_AGEG5YR' in bfrss_raw_df.columns and '_AGEG5YR_DESC' in bfrss_raw_df.columns:
    print("\nExample: Age group comparison")
    # Count each (raw code, description) pair and keep the first 10 rows of the result
    comparison = bfrss_raw_df[['_AGEG5YR', '_AGEG5YR_DESC']].value_counts().head(10)
    print(comparison)

In [None]:
# Display the number of columns parsed
print(f"Parsed {len(bfrss_metadata)} column definitions from the codebook")

# Show a sample of the metadata: the first five keys in iteration order
sample_keys = list(bfrss_metadata.keys())[:5]
for key in sample_keys:
    metadata = bfrss_metadata[key]
    # Print the descriptive attributes read from each metadata entry
    print(f"\n{key}:")
    print(f"  Label: {metadata.label}")
    print(f"  Question: {metadata.question}")
    print(f"  Column: {metadata.column}")
    print(f"  Type: {metadata.type_of_variable}")
    print(f"  Computed: {metadata.computed}")
    print(f"  Section Name: {metadata.section_name}")
    print(f"  Section Number: {metadata.section_number}")
    print(f"  Question Number: {metadata.question_number}")

# Metadata Documentation
Notes and examples of the metadata extraction:



In [None]:
# Compare the DataFrame's columns with the parsed metadata entries.
print(f"Total columns in dataframe: {len(bfrss_raw_df.columns)}")
print(f"Total metadata parsed: {len(bfrss_metadata)}")
# This is a ratio of the two counts, not of matched column names
print(f"Coverage: {len(bfrss_metadata) / len(bfrss_raw_df.columns) * 100:.1f}%")

# Check which columns don't have metadata
missing_metadata = [col for col in bfrss_raw_df.columns if col not in bfrss_metadata]
print(f"\nColumns without metadata: {len(missing_metadata)}")
if missing_metadata:
    print("First 10 missing:", missing_metadata[:10])
    # The note talks about "these columns", so it is only shown when some are missing
    print("Note: There is data for these columns but no metadata is available, likely purged bc of policy changes.")

## Using BFRSS Wrapper Utility Methods

The BFRSS wrapper provides several convenient methods for working with the data:

## Understanding the Friendly Mapping Feature
(Note: I generated the following docs and examples with ChatGPT, but I have vetted all of them.)

The metadata parser includes a powerful "friendly mapping" feature that translates numeric codes in the dataset to their human-readable descriptions. This is particularly useful for categorical variables where numeric codes represent specific responses.

### How It Works

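Each `ColumnMetadata` object contains a `value_lookup` dictionary that maps numeric values (or None) to their text descriptions. This mapping is automatically extracted from the codebook HTML file during parsing. Note that the code examples below read the mappings from each column's `value_ranges` list instead, whose `ValueRange` entries expose `start`, `end`, `description`, and `count`.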

#### Key Components:

1. **`value_lookup` dictionary**: Found in each `ColumnMetadata` object
   - Keys: Numeric codes (int) or None
   - Values: Human-readable descriptions (str)

2. **Automatic extraction**: The `get_value_lookup()` function in `parser.py` extracts these mappings from HTML tables in the codebook

### Example 1: Understanding what values mean for a specific column

In [ ]:
# Example 1: Understanding what values mean for a specific column
# Let's look at the _STATE column which has distinct state codes

state_metadata = bfrss_metadata['_STATE']
print(f"Column: {state_metadata.sas_variable_name}")
print(f"Label: {state_metadata.label}")
print(f"Question: {state_metadata.question}")
print("\nSample of value mappings (first 10):")

# Show first 10 state mappings using value_ranges
# (no index is needed, so iterate the slice directly)
for val_def in state_metadata.value_ranges[:10]:
    if isinstance(val_def, ValueRange):
        if val_def.start == val_def.end:
            # A range covering a single code is printed as that code alone
            print(f"  {val_def.start}: {val_def.description}")
        else:
            print(f"  {val_def.start}-{val_def.end}: {val_def.description}")
    else:
        # Entries that are not ValueRange instances carry no start/end to show
        print(f"  [Non-numeric]: {val_def.description}")

### Example 2: Translating values in your data

In [ ]:
# Example 2: Translating values in your data
# Let's translate some actual STATE values from the dataframe

# Get a sample of state values: the ten most frequent codes with their counts
sample_values = bfrss_raw_df['_STATE'].value_counts().head(10)
print("Top 10 states by number of respondents:\n")

for value, count in sample_values.items():
    # Get the description from value_ranges
    description = "Unknown"
    # Shown as-is when the value is missing, so the print below never calls
    # int() on NaN (which would raise ValueError)
    code = value
    if not pd.isna(value):
        value_int = int(value)
        code = value_int
        # First ValueRange whose inclusive [start, end] span contains the code wins
        for val_def in state_metadata.value_ranges:
            if isinstance(val_def, ValueRange) and val_def.start <= value_int <= val_def.end:
                description = val_def.description
                break

    print(f"Code {code}: {description} (Count: {count:,})")

### Example 3: Creating a mapping function for easy translation

In [ ]:
# Example 3: Creating a mapping function for easy translation
def translate_column_values(df, column_name, metadata_dict):
    """
    Map the codes in one column to their descriptions via value_ranges.

    Args:
        df: The dataframe containing the data
        column_name: Name of the column to translate
        metadata_dict: Dictionary of column metadata, keyed by column name

    Returns:
        Pandas Series of translated values. Missing values become "Missing"
        and codes matching no range become "Unknown code: <value>". When the
        column has no metadata entry, a message is printed and the original
        column is returned unchanged.
    """
    if column_name not in metadata_dict:
        print(f"No metadata found for column: {column_name}")
        return df[column_name]

    column_meta = metadata_dict[column_name]

    def describe(value):
        # Missing values get a fixed label
        if pd.isna(value):
            return "Missing"

        # Only plain int/float values are matched against the ranges
        if not isinstance(value, (int, float)):
            return f"Unknown code: {value}"

        code = int(value)
        for candidate in column_meta.value_ranges:
            if not isinstance(candidate, ValueRange):
                continue
            # Bounds are inclusive; the first matching range wins
            if candidate.start <= code <= candidate.end:
                return candidate.description

        return f"Unknown code: {value}"

    source_column = df[column_name]
    return source_column.apply(describe)

# Example usage - translate STATE codes
# This adds a new 'STATE_NAME' column to bfrss_raw_df itself
bfrss_raw_df['STATE_NAME'] = translate_column_values(bfrss_raw_df, '_STATE', bfrss_metadata)

# Show sample: raw code next to its translation for the first 10 rows
print("Sample of translated state values:")
print(bfrss_raw_df[['_STATE', 'STATE_NAME']].head(10))

### Example 4: Working with columns that have ranges

In [ ]:
# Example 4: Working with columns that have ranges
# POORHLTH is used here because it has a range value "1 - 30"
poorhlth_metadata = bfrss_metadata['POORHLTH']
print(f"Column: {poorhlth_metadata.sas_variable_name}")
print(f"Label: {poorhlth_metadata.label}")

# List every value mapping held in value_ranges
print(f"\nValue mappings:")
for val_def in poorhlth_metadata.value_ranges:
    # Entries that are not ValueRange instances are printed without bounds or count
    if not isinstance(val_def, ValueRange):
        print(f"  [Non-numeric]: {val_def.description}")
        continue

    if val_def.start != val_def.end:
        # A true range: show both bounds
        print(f"  {val_def.start}-{val_def.end}: {val_def.description} (Count: {val_def.count})")
    else:
        # A single code: show it once
        print(f"  {val_def.start}: {val_def.description} (Count: {val_def.count})")

### Example 5: Batch translation of multiple columns

In [None]:
# Example 5: Batch translation of multiple columns
# This example shows how to efficiently translate multiple columns at once

def batch_translate_columns(df, column_list, metadata_dict):
    """
    Translate several columns from numeric codes to descriptions.

    Each column is handled by translate_column_values; a column is skipped
    (with a printed message) unless it appears in both the metadata and the
    dataframe.

    Args:
        df: The dataframe containing the data
        column_list: List of column names to translate
        metadata_dict: Dictionary of column metadata

    Returns:
        Dictionary mapping "<column>_DESC" to the translated series
    """
    results = {}

    for col in column_list:
        # Skip anything lacking metadata or absent from the dataframe
        if col not in metadata_dict or col not in df.columns:
            print(f"Skipped {col} (not found in metadata or dataframe)")
            continue

        results[f"{col}_DESC"] = translate_column_values(df, col, metadata_dict)
        print(f"Translated {col}")

    return results

# Translate several categorical columns
columns_to_translate = ['_STATE', 'FMONTH', 'DISPCODE', 'SEX1']
translations = batch_translate_columns(bfrss_raw_df, columns_to_translate, bfrss_metadata)

# Add translations to dataframe
# NOTE(review): the keys are "<col>_DESC"; if the loaded frame already has columns
# with those names (e.g. '_STATE_DESC'), this assignment overwrites them - confirm intended.
for col_name, translated_series in translations.items():
    bfrss_raw_df[col_name] = translated_series

# Show sample of multiple translations
print("\nSample of translated data:")
original_cols = columns_to_translate[:3]  # Show first 3
desc_cols = [f"{col}_DESC" for col in original_cols]
# Raises KeyError if any of the first three columns was skipped during translation
print(bfrss_raw_df[original_cols + desc_cols].head())

### Example 6: Getting columns by Section Name

### Example 7: Using the new statistics feature

The updated parser now automatically calculates statistics for each column during parsing. This includes:
- For numeric columns: mean, std, min, max, quartiles
- For categorical columns: value counts and top values with descriptions

In [None]:
# Example 7: Accessing pre-calculated statistics
# Let's examine the statistics for a numeric column
ageg5yr_metadata = bfrss_metadata['_AGEG5YR']
print(f"Column: {ageg5yr_metadata.sas_variable_name}")
print(f"Label: {ageg5yr_metadata.label}")
print(f"Type: {ageg5yr_metadata.type_of_variable}")

# Only report statistics when the metadata entry carries a truthy statistics object
if ageg5yr_metadata.statistics:
    stats = ageg5yr_metadata.statistics
    print(f"\nStatistics:")
    print(f"  Count: {stats.count:,}")
    print(f"  Null Count: {stats.null_count:,}")
    print(f"  Unique Values: {stats.unique_count}")
    
    # The presence of a 'mean' attribute is used to detect numeric statistics
    if hasattr(stats, 'mean'):  # NumericStatistics
        print(f"  Mean: {stats.mean:.2f}")
        print(f"  Std Dev: {stats.std:.2f}")
        print(f"  Min: {stats.min}")
        print(f"  25th percentile: {stats.q25}")
        print(f"  Median: {stats.median}")
        print(f"  75th percentile: {stats.q75}")
        print(f"  Max: {stats.max}")

# Let's also look at a categorical column
state_stats = bfrss_metadata['_STATE'].statistics
# The presence of 'top_values' is used to detect categorical statistics
if state_stats and hasattr(state_stats, 'top_values'):
    print(f"\n\nTop states by response count:")
    # Each item is indexed by the 'description' and 'count' keys
    for item in state_stats.top_values[:5]:
        print(f"  {item['description']}: {item['count']:,} responses")

### Example 8: Value counts in ranges

In [ ]:
# Example 8: Per-range counts
# Each value range carries a count, which gives a quick view of the distribution

poorhlth_metadata = bfrss_metadata['POORHLTH']
print(f"Column: {poorhlth_metadata.sas_variable_name}")
print(f"Label: {poorhlth_metadata.label}")
print(f"\nValue distribution:")

total_responses = 0
for val_def in poorhlth_metadata.value_ranges:
    # Only ValueRange entries with a positive count are listed and summed
    if not (isinstance(val_def, ValueRange) and val_def.count > 0):
        continue

    total_responses += val_def.count
    if val_def.start != val_def.end:
        print(f"  Range {val_def.start}-{val_def.end} ({val_def.description}): {val_def.count:,} responses")
    else:
        print(f"  Value {val_def.start} ({val_def.description}): {val_def.count:,} responses")

print(f"\nTotal responses captured in value ranges: {total_responses:,}")

# Cross-check against the column's overall statistics, when present
if poorhlth_metadata.statistics:
    print(f"Total non-null responses: {poorhlth_metadata.statistics.count:,}")
    print(f"Null/missing responses: {poorhlth_metadata.statistics.null_count:,}")

In [None]:
# Collect the names of all metadata entries whose section is 'Calculated Variables'
calculated_columns = [
    column_name
    for column_name, column_meta in bfrss_metadata.items()
    if column_meta.section_name == 'Calculated Variables'
]

print(f"Columns in 'Calculated Variables' section: {len(calculated_columns)}")
print(f"\nColumn names: {calculated_columns}")

# Kelly Scratch

- making own copies of data for experimentation: k_df, k_metadata


In [None]:
# Private working copies for experimentation
k_df = bfrss_raw_df.copy()
# NOTE(review): presumably a shallow dict copy, so the metadata objects themselves
# are still shared with bfrss_metadata - confirm before mutating them.
k_metadata = bfrss_metadata.copy()

# Metadata Tests
# Build a DataFrame with one row per metadata key and print its info() summary

k_m_df = pd.DataFrame.from_dict(k_metadata, orient='index')
k_m_df.info()

# New Section