In [16]:
import pandas as pd

df = pd.read_parquet('./data/LLCP2023.parquet')
df_mapped = pd.read_parquet('./data/LLCP2023_partialmap.parquet')

In [17]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433323 entries, 0 to 433322
Columns: 350 entries, _STATE to _DRNKDRV
dtypes: float64(345), object(5)
memory usage: 1.1+ GB


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4,_RFSEAT2,_RFSEAT3,_DRNKDRV
0,1.0,1.0,3012023,3,1,2023,1100.0,2023000001,2023000000.0,1.0,...,0.0,1.0,0.0,1.0,2.0,2.0,2.0,1.0,1.0,9.0
1,1.0,1.0,1062023,1,6,2023,1100.0,2023000002,2023000000.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,9.0
2,1.0,1.0,3082023,3,8,2023,1100.0,2023000003,2023000000.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,9.0
3,1.0,1.0,3062023,3,6,2023,1100.0,2023000004,2023000000.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,9.0
4,1.0,1.0,1062023,1,6,2023,1100.0,2023000005,2023000000.0,1.0,...,7.0,1.0,47.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0


In [18]:
df_mapped.head()

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4,_RFSEAT2,_RFSEAT3,_DRNKDRV,MAXVO21_vo2ml,FC601_mets
0,Alabama,January,3012023,3,1,2023,1100.0,2023000001,2023000000.0,1.0,...,0.0,Not a heavy drinker,No (65+ only),No,No,Always or Almost Always,Always,Don’t know/Refused/Missing,18.4,3.15
1,Alabama,January,1062023,1,6,2023,1100.0,2023000002,2023000000.0,1.0,...,0.0,Not a heavy drinker,Yes (flu shot in past year),"Yes (pneumonia vaccine, 65+ only)",No,Always or Almost Always,Always,Don’t know/Refused/Missing,18.03,3.09
2,Alabama,January,3082023,3,8,2023,1100.0,2023000003,2023000000.0,1.0,...,0.0,Not a heavy drinker,Yes (flu shot in past year),"Yes (pneumonia vaccine, 65+ only)",No,Always or Almost Always,Always,Don’t know/Refused/Missing,13.22,2.27
3,Alabama,January,3062023,3,6,2023,1100.0,2023000004,2023000000.0,1.0,...,0.0,Not a heavy drinker,Yes (flu shot in past year),"Yes (pneumonia vaccine, 65+ only)","Yes, tested for HIV",Always or Almost Always,Always,Don’t know/Refused/Missing,19.14,3.28
4,Alabama,January,1062023,1,6,2023,1100.0,2023000005,2023000000.0,1.0,...,47.0,Not a heavy drinker,No (65+ only),"Yes (pneumonia vaccine, 65+ only)",No,Always or Almost Always,Always,Yes,19.88,3.41


# Friendly Mapping
- create a lookup object between columns and 'friendly names'

In [19]:
# Import the metadata parser
from pathlib import Path
from metadata.parser import parse_codebook_html

# Parse the codebook HTML file
codebook_path = Path('./data/codebook_USCODE23_LLCP_021924.HTML')
column_metadata = parse_codebook_html(codebook_path)

# Display the number of columns parsed
print(f"Parsed {len(column_metadata)} column definitions from the codebook")

# Show a sample of the metadata
sample_keys = list(column_metadata.keys())[:5]
for key in sample_keys:
    metadata = column_metadata[key]
    print(f"\n{key}:")
    print(f"  Label: {metadata.label}")
    print(f"  Question: {metadata.question}")
    print(f"  Column: {metadata.column}")
    print(f"  Type: {metadata.type_of_variable}")

Parsed 344 column definitions from the codebook

_STATE:
  Label: State FIPS Code
  Question: State FIPS Code
  Column: 1-2
  Type: Num

FMONTH:
  Label: File Month
  Question: File Month
  Column: 17-18
  Type: Num

IDATE:
  Label: Interview Date
  Question: Interview Date
  Column: 19-26
  Type: Char

IMONTH:
  Label: Interview Month
  Question: Interview Month
  Column: 19-20
  Type: Char

IDAY:
  Label: Interview Day
  Question: Interview Day
  Column: 21-22
  Type: Char


# Examining Metadata
At this point we should have the metadata about columns extracted.

Right now it is a dictionary where column name is key.

In [20]:
# Check how complete the metadata is
print(f"Total columns in dataframe: {len(df.columns)}")
print(f"Total metadata parsed: {len(column_metadata)}")
print(f"Coverage: {len(column_metadata) / len(df.columns) * 100:.1f}%")

# Check which columns don't have metadata
missing_metadata = [col for col in df.columns if col not in column_metadata]
print(f"\nColumns without metadata: {len(missing_metadata)}")
if missing_metadata:
    print("First 10 missing:", missing_metadata[:10])

Total columns in dataframe: 350
Total metadata parsed: 344
Coverage: 98.3%

Columns without metadata: 7
First 10 missing: ['LNDSXBRT', 'CELSXBRT', 'BIRTHSEX', 'TRNSGNDR', 'USEMRJN4', 'RCSGEND1', 'RCSXBRTH']


To be a bit more data science oriented we'll turn the dictionary into another dataframe:

