In [None]:
import pandas as pd

# Locations of the two parquet extracts: the survey data itself and a
# partially mapped variant of it.
raw_parquet_path = './data/LLCP2023.parquet'
partial_map_parquet_path = './data/LLCP2023_partialmap.parquet'

df = pd.read_parquet(raw_parquet_path)
df_mapped = pd.read_parquet(partial_map_parquet_path)

In [None]:
# Print the dataframe summary (columns, dtypes, non-null counts).
df.info()
# Preview up to the first 1000 rows as the cell's output.
df.head(n=1000)

In [None]:
# Preview the first rows of the partially mapped dataframe.
df_mapped.head()

# Friendly Mapping
This creates a lookup dictionary between a column and the metadata the codebook provides on it.

With this we can look up what a value at a given row/column (such as 1) 'means'.

My intent was to add additional metadata there to support further EDA and analysis.
```python
# Example
```
- This creates a lookup object between columns and 'friendly names'.

In [None]:
# Import the metadata parser
from pathlib import Path
from metadata.parser import parse_codebook_html

# Build the per-column metadata dictionary from the codebook HTML
codebook_path = Path('./data/codebook_USCODE23_LLCP_021924.HTML')
column_metadata = parse_codebook_html(codebook_path)

# Report how many column definitions came out of the parse
print(f"Parsed {len(column_metadata)} column definitions from the codebook")

# Print the main fields of the first five entries as a sanity check
for name in list(column_metadata)[:5]:
    entry = column_metadata[name]
    print(f"\n{name}:")
    print(f"  Label: {entry.label}")
    print(f"  Question: {entry.question}")
    print(f"  Column: {entry.column}")
    print(f"  Type: {entry.type_of_variable}")

# Examining Metadata
At this point we should have the metadata about columns extracted.

Right now it is a dictionary keyed by column name.

In [None]:
# Check how complete the metadata is.
# Coverage is the share of dataframe columns that have a codebook entry. A
# plain ratio of the two counts would be inflated by codebook variables that
# are absent from the dataframe (and could exceed 100%).
total_columns = len(df.columns)
missing_metadata = [col for col in df.columns if col not in column_metadata]
covered_columns = total_columns - len(missing_metadata)

print(f"Total columns in dataframe: {total_columns}")
print(f"Total metadata parsed: {len(column_metadata)}")
if total_columns:
    print(f"Coverage: {covered_columns / total_columns * 100:.1f}%")
else:
    # Avoid dividing by zero when the dataframe has no columns.
    print("Coverage: n/a (dataframe has no columns)")

# Check which columns don't have metadata
print(f"\nColumns without metadata: {len(missing_metadata)}")
if missing_metadata:
    print("First 10 missing:", missing_metadata[:10])

To be a bit more data-science oriented, we'll turn the dictionary into another dataframe:




# Understanding the Friendly Mapping Feature

The metadata parser includes a powerful "friendly mapping" feature that translates numeric codes in the dataset to their human-readable descriptions. This is particularly useful for categorical variables where numeric codes represent specific responses.

## How It Works

Each `ColumnMetadata` object contains a `value_lookup` dictionary that maps numeric values (or None) to their text descriptions. This mapping is automatically extracted from the codebook HTML file during parsing.

### Key Components:

1. **`value_lookup` dictionary**: Found in each `ColumnMetadata` object
   - Keys: Numeric codes (int) or None
   - Values: Human-readable descriptions (str)

2. **Automatic extraction**: The `get_value_lookup()` function in `parser.py` extracts these mappings from HTML tables in the codebook

## Example Usage

In [None]:
# Example 1: Understanding what values mean for a specific column
# Inspect the metadata entry for the _STATE column and list some of its codes

state_metadata = column_metadata['_STATE']
print(f"Column: {state_metadata.sas_variable_name}")
print(f"Label: {state_metadata.label}")
print(f"Question: {state_metadata.question}")
print("\nSample of value mappings (first 10):")
# Only the first 10 code -> description pairs are printed
first_mappings = list(state_metadata.value_lookup.items())[:10]
for value, description in first_mappings:
    print(f"  {value}: {description}")

In [None]:
# Example 2: Translating values in your data
# Let's translate some actual STATE values from the dataframe

# Ten most frequent state codes. value_counts() excludes NaN by default, so
# every index value here is a real code and can be converted with int().
sample_values = df['_STATE'].value_counts().head(10)
print("Top 10 states by number of respondents:\n")

for value, count in sample_values.items():
    # Convert once so the lookup key and the printed code always agree
    code = int(value)
    description = state_metadata.value_lookup.get(code, "Unknown")
    print(f"Code {code}: {description} (Count: {count:,})")

In [None]:
# Example 3: Creating a mapping function for easy translation
def translate_column_values(df, column_name, metadata_dict):
    """
    Translate numeric codes to descriptions for a specific column.

    Missing values (NaN/None) become "Missing". Values that are not whole
    numbers, cannot be converted to an integer, or have no entry in the
    column's ``value_lookup`` become "Unknown code: <value>".

    Args:
        df: The dataframe containing the data
        column_name: Name of the column to translate
        metadata_dict: Dictionary of column metadata; each entry must expose
            a ``value_lookup`` mapping of integer code -> description

    Returns:
        Pandas Series with translated values. If the column has no metadata,
        a message is printed and the original column is returned unchanged.
    """
    if column_name not in metadata_dict:
        print(f"No metadata found for column: {column_name}")
        return df[column_name]

    value_lookup = metadata_dict[column_name].value_lookup

    def translate(value):
        if pd.isna(value):
            return "Missing"
        unknown = f"Unknown code: {value}"
        try:
            code = int(value)
            # int() truncates, so 1.5 would otherwise be reported as code 1.
            is_whole = float(value) == code
        except (TypeError, ValueError, OverflowError):
            # Non-numeric or infinite values cannot be codes; report them
            # instead of aborting the whole translation.
            return unknown
        if not is_whole:
            return unknown
        return value_lookup.get(code, unknown)

    return df[column_name].apply(translate)

# Example usage - add a STATE_NAME column holding the translated _STATE codes
state_names = translate_column_values(df, '_STATE', column_metadata)
df['STATE_NAME'] = state_names

# Print the code next to its translation for the first 10 rows
print("Sample of translated state values:")
preview_columns = ['_STATE', 'STATE_NAME']
print(df[preview_columns].head(10))

In [None]:
# Example 4: Working with columns that have ranges
# Now let's test with POORHLTH which has a range value "1 - 30"

# Re-parse the metadata with the updated parser.
# A repeated `from ... import` reuses the module already cached in
# sys.modules, so edits made to parser.py after the first import would be
# ignored; reload the module explicitly to pick them up.
import importlib

import metadata.parser as codebook_parser

codebook_parser = importlib.reload(codebook_parser)
parse_codebook_html = codebook_parser.parse_codebook_html
column_metadata = parse_codebook_html(codebook_path)

poorhlth_metadata = column_metadata['POORHLTH']
print(f"Column: {poorhlth_metadata.sas_variable_name}")
print(f"Label: {poorhlth_metadata.label}")

# Check if the range was properly expanded
poorhlth_lookup = poorhlth_metadata.value_lookup
print(f"\nTotal value mappings: {len(poorhlth_lookup)}")
print("\nSample mappings:")
# Show some specific values to verify range expansion; report values with no
# mapping explicitly so a failed expansion is visible instead of skipped.
for value in [1, 15, 30, 77, 88, 99]:
    if value in poorhlth_lookup:
        print(f"  {value}: {poorhlth_lookup[value]}")
    else:
        print(f"  {value}: <no mapping found>")

In [None]:
# Example 5: Batch translation of multiple columns
# This example shows how to efficiently translate multiple columns at once

def batch_translate_columns(df, column_list, metadata_dict):
    """
    Translate several coded columns into their descriptions in one pass.

    A column is translated only when it is present both in ``metadata_dict``
    and in ``df``; a progress line is printed for every requested column.

    Args:
        df: The dataframe containing the data
        column_list: List of column names to translate
        metadata_dict: Dictionary of column metadata

    Returns:
        Dictionary mapping "<column>_DESC" to the translated series
    """
    results = {}

    for name in column_list:
        unavailable = name not in metadata_dict or name not in df.columns
        if unavailable:
            print(f"Skipped {name} (not found in metadata or dataframe)")
            continue

        results[f"{name}_DESC"] = translate_column_values(df, name, metadata_dict)
        print(f"Translated {name}")

    return results

# Translate several categorical columns
columns_to_translate = ['_STATE', 'FMONTH', 'DISPCODE', 'SEX1']
translations = batch_translate_columns(df, columns_to_translate, column_metadata)

# Add translations to dataframe
for col_name, translated_series in translations.items():
    df[col_name] = translated_series

# Show sample of multiple translations.
# Only preview columns that were actually translated: batch_translate_columns
# skips names missing from the metadata or dataframe, and selecting a
# non-existent "<col>_DESC" column would raise KeyError.
print("\nSample of translated data:")
original_cols = [
    col for col in columns_to_translate if f"{col}_DESC" in translations
][:3]  # Show first 3
desc_cols = [f"{col}_DESC" for col in original_cols]
print(df[original_cols + desc_cols].head())

## Advanced Usage Tips (NOTE: these tips were ChatGPT-generated)

### 1. Filtering Data by Descriptions
Once you have the friendly mappings, you can filter data using human-readable criteria:

```python
# Find all respondents from California
california_code = next(k for k, v in state_metadata.value_lookup.items() if 'California' in v)
ca_data = df[df['_STATE'] == california_code]
```

### 2. Creating Analysis-Ready DataFrames
The friendly mappings are especially useful when creating subsets for analysis:

```python
# Create a subset with translated categorical variables
analysis_df = df[['_STATE', 'SEX1', 'POORHLTH']].copy()
for col in analysis_df.columns:
    if col in column_metadata:
        analysis_df[f'{col}_desc'] = translate_column_values(analysis_df, col, column_metadata)
```

### 3. Handling Special Values
Many BRFSS variables use special codes like:
- 77: Don't know/Not sure
- 88: None (for days-based questions)
- 99: Refused

The value_lookup dictionary includes these, making it easy to identify and handle them appropriately in your analysis.

### 4. Integration with Existing Mapped Data
The `LLCP2023_partialmap.parquet` file already has some columns translated. You can use the metadata parser to:
- Verify existing mappings
- Add mappings for additional columns
- Create custom mapping schemes for specific analyses