In [None]:
import numpy as np
import pandas as pd
import reportlab
import seaborn as sns

from utils import *

In [None]:
# Raw survey extract; pandas infers gzip compression from the .gz suffix.
df_ori = pd.read_csv(filepath_or_buffer='LLCP2018.csv.gz')
# Independent working copy so that the raw frame itself is never modified.
df = df_ori.copy(deep=True)

In [None]:
# Cleaning INDORTAN - Times used an indoor tanning device
# No missing-value tally is made in this cell: counting NaNs on the untouched
# `df` copy before any step has run cannot describe the column "after
# cleaning", and the resulting string was never displayed.
# Codes 777 and 999 are non-answers (-> NaN); 888 means "never" (-> 0).
clean_steps_INDORTAN = [
    (lambda x: x == 777, lambda x: np.nan, '777 Dont know / Not sure -> NA'),
    (lambda x: x == 888, lambda x: 0, '888 Never -> 0'),
    (lambda x: x == 999, lambda x: np.nan, '999 Refused -> NA'),
]
clean_and_report(
    df_ori, 'INDORTAN', clean_steps_INDORTAN,  # (optional) col_type='Continuous'
)

In [None]:
# Cleaning NUMBURN3 - During the past 12 months, how many times have you had a sunburn?
# No direct ReportBuilder call or pre-clean missing count here: `fig_before`,
# `cleaner` and `fig_after` are never assigned in this cell or any earlier
# one, so that call raised NameError on a fresh kernel, and the NaN tally was
# taken before cleaning while claiming to be "after cleaning".
# NOTE(review): reporting is left to clean_and_report - confirm in utils that
# it produces the per-column report.
# Codes 777 and 999 are non-answers (-> NaN); 888 means "never" (-> 0).
clean_steps_NUMBURN3 = [
    (lambda x: x == 777, lambda x: np.nan, '777 Dont know / Not sure -> NA'),
    (lambda x: x == 888, lambda x: 0, '888 Never -> 0'),
    (lambda x: x == 999, lambda x: np.nan, '999 Refused -> NA'),
]
clean_and_report(
    df_ori, 'NUMBURN3', clean_steps_NUMBURN3,  # (optional) col_type='Continuous'
)

In [None]:
# Cleaning SUNPRTCT - how often the respondent protects themselves from the sun.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_SUNPRTCT = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'SUNPRTCT', clean_steps_SUNPRTCT)  # (optional) col_type='Categorical'

In [None]:
# Cleaning WKDAYOUT - time spent outside per summer weekday between 10am and 4pm.
# Codes 77 and 99 are non-answers, so both are replaced with NaN.
clean_steps_WKDAYOUT = [
    (
        lambda x: x == 77,
        lambda x: np.nan,
        '77 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 99,
        lambda x: np.nan,
        '99 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'WKDAYOUT', clean_steps_WKDAYOUT)  # (optional) col_type='Categorical'

In [None]:
# Cleaning WKENDOUT - time spent outside per summer weekend day between 10am and 4pm.
# Codes 77 and 99 are non-answers, so both are replaced with NaN.
clean_steps_WKENDOUT = [
    (
        lambda x: x == 77,
        lambda x: np.nan,
        '77 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 99,
        lambda x: np.nan,
        '99 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'WKENDOUT', clean_steps_WKENDOUT)  # (optional) col_type='Categorical'

In [None]:
# Cleaning LCSFIRST - How old when you first started smoking?
# No direct ReportBuilder call or pre-clean missing count here: `fig_before`,
# `cleaner` and `fig_after` are never assigned in this cell or any earlier
# one, so that call raised NameError on a fresh kernel, and the NaN tally was
# taken before cleaning while claiming to be "after cleaning".
# NOTE(review): reporting is left to clean_and_report - confirm in utils that
# it produces the per-column report.
# NOTE(review): 888 is recoded to 0, which reads as "age 0" in an age column;
# confirm that 0 rather than NA is the intended value.
clean_steps_LCSFIRST = [
    (lambda x: x == 777, lambda x: np.nan, '777 Dont know / Not sure -> NA'),
    (lambda x: x == 888, lambda x: 0, '888 Never -> 0'),
    (lambda x: x == 999, lambda x: np.nan, '999 Refused -> NA'),
]
clean_and_report(
    df_ori, 'LCSFIRST', clean_steps_LCSFIRST,  # (optional) col_type='Continuous'
)

In [None]:
# Cleaning LCSLAST - How old when you last smoked?
# No direct ReportBuilder call or pre-clean missing count here: `fig_before`,
# `cleaner` and `fig_after` are never assigned in this cell or any earlier
# one, so that call raised NameError on a fresh kernel, and the NaN tally was
# taken before cleaning while claiming to be "after cleaning".
# NOTE(review): reporting is left to clean_and_report - confirm in utils that
# it produces the per-column report.
# NOTE(review): 888 is recoded to 0, which reads as "age 0" in an age column;
# confirm that 0 rather than NA is the intended value.
clean_steps_LCSLAST = [
    (lambda x: x == 777, lambda x: np.nan, '777 Dont know / Not sure -> NA'),
    (lambda x: x == 888, lambda x: 0, '888 Never -> 0'),
    (lambda x: x == 999, lambda x: np.nan, '999 Refused -> NA'),
]
clean_and_report(
    df_ori, 'LCSLAST', clean_steps_LCSLAST,  # (optional) col_type='Continuous'
)

In [None]:
# Cleaning LCSNUMCG - On average, how many cigarettes do you smoke each day?
# No direct ReportBuilder call or pre-clean missing count here: `fig_before`,
# `cleaner` and `fig_after` are never assigned in this cell or any earlier
# one, so that call raised NameError on a fresh kernel, and the NaN tally was
# taken before cleaning while claiming to be "after cleaning".
# NOTE(review): reporting is left to clean_and_report - confirm in utils that
# it produces the per-column report.
# Codes 777 and 999 are non-answers (-> NaN); 888 means "never" (-> 0).
clean_steps_LCSNUMCG = [
    (lambda x: x == 777, lambda x: np.nan, '777 Dont know / Not sure -> NA'),
    (lambda x: x == 888, lambda x: 0, '888 Never -> 0'),
    (lambda x: x == 999, lambda x: np.nan, '999 Refused -> NA'),
]
clean_and_report(
    df_ori, 'LCSNUMCG', clean_steps_LCSNUMCG,  # (optional) col_type='Continuous'
)

In [None]:
# Cleaning LCSCTSCN - whether the respondent had a CT or CAT scan.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_LCSCTSCN = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'LCSCTSCN', clean_steps_LCSCTSCN)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CNCRDIFF - how many types of cancer.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CNCRDIFF = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CNCRDIFF', clean_steps_CNCRDIFF)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CNCRAGE - Age told had cancer
# No direct ReportBuilder call or pre-clean missing count here: `fig_before`,
# `cleaner` and `fig_after` are never assigned in this cell or any earlier
# one, so that call raised NameError on a fresh kernel, and the NaN tally was
# taken before cleaning while claiming to be "after cleaning".
# NOTE(review): reporting is left to clean_and_report - confirm in utils that
# it produces the per-column report.
# Codes 98 and 99 are non-answers, so both are replaced with NaN.
clean_steps_CNCRAGE = [
    (lambda x: x == 98, lambda x: np.nan, '98 Dont know / Not sure -> NA'),
    (lambda x: x == 99, lambda x: np.nan, '99 Refused -> NA'),
]
clean_and_report(
    df_ori, 'CNCRAGE', clean_steps_CNCRAGE,  # (optional) col_type='Continuous'
)

In [None]:
# Cleaning CNCRTYP1 - Type of Cancer
# Cancer type is a two-digit coded field: 7 and 9 are real cancer categories,
# while the non-answer codes are 77 (don't know) and 99 (refused). Matching
# on 7 / 9 would blank valid diagnoses and leave the real non-answers in.
# NOTE(review): codes taken from the BRFSS 2018 codebook - confirm against
# the codebook version used for this extract.
clean_steps_CNCRTYP1 = [
    (lambda x: x == 77, lambda x: np.nan, '77 Dont know / Not sure -> NA'),
    (lambda x: x == 99, lambda x: np.nan, '99 Refused -> NA'),
]
clean_and_report(
    df_ori, 'CNCRTYP1', clean_steps_CNCRTYP1, # (optional) col_type='Categorical'
)

In [None]:
# Cleaning CSRVTRT2 - currently receiving treatment for cancer.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CSRVTRT2 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CSRVTRT2', clean_steps_CSRVTRT2)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CSRVDOC1 - type of doctor providing the majority of the respondent's health care.
# Codes 77 and 99 are non-answers, so both are replaced with NaN.
clean_steps_CSRVDOC1 = [
    (
        lambda x: x == 77,
        lambda x: np.nan,
        '77 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 99,
        lambda x: np.nan,
        '99 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CSRVDOC1', clean_steps_CSRVDOC1)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CSRVSUM - whether any doctor, nurse, or other health professional
# ever gave a written summary of all the cancer treatments received.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CSRVSUM = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CSRVSUM', clean_steps_CSRVSUM)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CSRVRTRN - ever received instructions from a doctor for follow-up check-ups.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CSRVRTRN = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CSRVRTRN', clean_steps_CSRVRTRN)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CSRVINST - were these instructions written down or printed on paper?
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CSRVINST = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CSRVINST', clean_steps_CSRVINST)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CSRVINSR - did health insurance pay for all of the cancer treatment?
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CSRVINSR = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CSRVINSR', clean_steps_CSRVINSR)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CSRVCLIN - participated in a clinical trial as part of cancer treatment?
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CSRVCLIN = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CSRVCLIN', clean_steps_CSRVCLIN)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CSRVPAIN - currently has physical pain caused by cancer or cancer treatment?
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CSRVPAIN = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CSRVPAIN', clean_steps_CSRVPAIN)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CSRVCTL1 - is the respondent's pain currently under control?
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CSRVCTL1 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CSRVCTL1', clean_steps_CSRVCTL1)  # (optional) col_type='Categorical'

In [None]:
# Cleaning PCPSADE1 - which description best fits the decision to have the P.S.A. test done.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_PCPSADE1 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'PCPSADE1', clean_steps_PCPSADE1)  # (optional) col_type='Categorical'

In [None]:
# Cleaning PROFEXAM - ever had a breast physical exam by a doctor.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_PROFEXAM = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'PROFEXAM', clean_steps_PROFEXAM)  # (optional) col_type='Categorical'

In [None]:
# Cleaning LENGEXAM - how long since the last breast physical exam.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_LENGEXAM = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'LENGEXAM', clean_steps_LENGEXAM)  # (optional) col_type='Categorical'

In [None]:
# Cleaning HPVADVC2 - has the respondent ever had the HPV vaccination?
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_HPVADVC2 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'HPVADVC2', clean_steps_HPVADVC2)  # (optional) col_type='Categorical'

In [None]:
# Cleaning HPVADSHT - how many HPV shots the respondent received.
# Codes 77 and 99 are non-answers, so both are replaced with NaN.
clean_steps_HPVADSHT = [
    (
        lambda x: x == 77,
        lambda x: np.nan,
        '77 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 99,
        lambda x: np.nan,
        '99 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'HPVADSHT', clean_steps_HPVADSHT)  # (optional) col_type='Categorical'

In [None]:
# Cleaning TETANUS1 - received a tetanus shot since 2005?
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_TETANUS1 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'TETANUS1', clean_steps_TETANUS1)  # (optional) col_type='Categorical'

In [None]:
# Cleaning SHINGLE2 - has the respondent ever had the shingles or zoster vaccine?
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_SHINGLE2 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'SHINGLE2', clean_steps_SHINGLE2)  # (optional) col_type='Categorical'

In [None]:
# Cleaning SOMALE - sexual orientation: which option best represents how
# the respondent thinks of themselves.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_SOMALE = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'SOMALE', clean_steps_SOMALE)  # (optional) col_type='Categorical'

In [None]:
# Cleaning SOFEMALE - sexual orientation: which option best represents how
# the respondent thinks of themselves.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_SOFEMALE = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'SOFEMALE', clean_steps_SOFEMALE)  # (optional) col_type='Categorical'

In [None]:
# Cleaning TRNSGNDR - does the respondent consider themselves to be transgender?
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_TRNSGNDR = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'TRNSGNDR', clean_steps_TRNSGNDR)  # (optional) col_type='Categorical'

In [None]:
# Cleaning RCSGENDR - gender of child.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_RCSGENDR = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'RCSGENDR', clean_steps_RCSGENDR)  # (optional) col_type='Categorical'

In [None]:
# Cleaning RCSRLTN2 - relationship to child.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_RCSRLTN2 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'RCSRLTN2', clean_steps_RCSRLTN2)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CASTHDX2 - has a doctor, nurse or other health professional EVER
# said that the child has asthma.
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CASTHDX2 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CASTHDX2', clean_steps_CASTHDX2)  # (optional) col_type='Categorical'

In [None]:
# Cleaning CASTHNO2 - does the child still have asthma?
# Codes 7 and 9 are non-answers, so both are replaced with NaN.
clean_steps_CASTHNO2 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, 'CASTHNO2', clean_steps_CASTHNO2)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _CHISPNC - child Hispanic, Latino/a, or Spanish origin calculated variable.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_CHISPNC = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_CHISPNC', clean_steps_CHISPNC)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _CRACE1 - child race calculated variable.
# NOTE(review): the previous description here duplicated _CHISPNC's; the
# BRFSS codebook appears to describe _CRACE1 as child non-Hispanic race
# including multiracial - confirm the exact wording.
# Codes 77 and 99 are non-answers, so both are replaced with NaN.
clean_steps_CRACE1 = [
    (
        lambda x: x == 77,
        lambda x: np.nan,
        '77 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 99,
        lambda x: np.nan,
        '99 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_CRACE1', clean_steps_CRACE1)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _CPRACE - preferred child race categories.
# Codes 77 and 99 are non-answers, so both are replaced with NaN.
clean_steps_CPRACE = [
    (
        lambda x: x == 77,
        lambda x: np.nan,
        '77 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 99,
        lambda x: np.nan,
        '99 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_CPRACE', clean_steps_CPRACE)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _RFHLTH - adults with good or better health.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_RFHLTH = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_RFHLTH', clean_steps_RFHLTH)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _PHYS14D - computed physical health status.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_PHYS14D = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_PHYS14D', clean_steps_PHYS14D)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _MENT14D - computed mental health status.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_MENT14D = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_MENT14D', clean_steps_MENT14D)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _HCVU651 - respondents aged 18-64 with health care coverage.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_HCVU651 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_HCVU651', clean_steps_HCVU651)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _TOTINDA - leisure time physical activity calculated variable.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_TOTINDA = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_TOTINDA', clean_steps_TOTINDA)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _LTASTH1 - adults who have ever been told they have asthma.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_LTASTH1 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_LTASTH1', clean_steps_LTASTH1)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _CASTHM1 - adults who have been told they currently have asthma.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_CASTHM1 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_CASTHM1', clean_steps_CASTHM1)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _ASTHMS1 - computed asthma status.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_ASTHMS1 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_ASTHMS1', clean_steps_ASTHMS1)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _EXTETH3 - adults aged 18+ who have had permanent teeth extracted.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_EXTETH3 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_EXTETH3', clean_steps_EXTETH3)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _ALTETH3 - adults aged 65+ who have had all their natural teeth extracted.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_ALTETH3 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_ALTETH3', clean_steps_ALTETH3)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _DENVST3 - adults who have visited a dentist, dental hygienist or
# dental clinic within the past year.
# Codes 7 and 9 are treated as non-answers, so both are replaced with NaN.
clean_steps_DENVST3 = [
    (
        lambda x: x == 7,
        lambda x: np.nan,
        '7 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 9,
        lambda x: np.nan,
        '9 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_DENVST3', clean_steps_DENVST3)  # (optional) col_type='Categorical'

In [None]:
# Cleaning _PRACE1 - computed preferred race.
# Codes 77 and 99 are non-answers, so both are replaced with NaN.
clean_steps_PRACE1 = [
    (
        lambda x: x == 77,
        lambda x: np.nan,
        '77 Dont know / Not sure -> NA',
    ),
    (
        lambda x: x == 99,
        lambda x: np.nan,
        '99 Refused -> NA',
    ),
]
clean_and_report(df_ori, '_PRACE1', clean_steps_PRACE1)  # (optional) col_type='Categorical'

In [None]:
#Doesn't need cleaning

#QSTVER - Questionnaire Version Identifier
#QSTLANG - Language identifier
#_METSTAT - Metropolitan Status
#_URBSTAT - Urban/Rural Status
#MSCODE - Metropolitan Status Code
#_STSTR - Sample Design Stratification Variable
#_STRWT - Stratum weight
#_RAWRAKE - Raw weighting factor used in raking
#_WT2RAKE - Design weight used in raking
#_IMPRACE - Imputed race/ethnicity value
#_CLLCPWT - Final child weight: Land-line and Cell-Phone data  (Raking derived weight)
#_DUALUSE - Dual Phone Use Categories
#_DUALCOR - Dual phone use correction factor
#_LLCPWT2 - Truncated design weight used in adult combined land line and cell phone raking
#_LLCPWT - Final weight: Land-line and cell-phone data
#_MICHD - Ever had CHD or MI
#_DRDXAR1 - Respondents who have had a doctor diagnose them as having some form of arthritis