In [1]:
import pandas as pd
import os

### 1 - Load and concatenate all the CSV files
Columns that are empty (all NaN) across every file are dropped after concatenation.

In [2]:
# Read every CSV in `../data/interim`, in sorted filename order, into its own DataFrame.
# Each DataFrame gets a `file` column holding the name of the CSV it came from.
csv_path = os.path.join('..', 'data', 'interim')
csv_names = [name for name in sorted(os.listdir(csv_path)) if name.endswith('.csv')]

dfs = []
for f in csv_names:
    df = pd.read_csv(os.path.join(csv_path, f))
    df['file'] = f
    dfs.append(df)

In [3]:
# Print the (rows, columns) shape of every loaded DataFrame
dims = [df.shape for df in dfs]
for shape in dims:
    print(shape)


print()

# For every file after the first, print the columns that the first file has
# but that file lacks (one-directional difference, first file as reference)
columns = [df.columns for df in dfs]
reference_cols = set(columns[0])
diffs = [list(reference_cols - set(col)) for col in columns[1:]]
for missing_cols in diffs:
    print(missing_cols)

(35105, 408)
(34473, 413)
(33908, 413)
(33551, 404)
(34718, 429)
(32233, 447)

['PREGNANT', 'MDSP', 'GESTWK', 'ULTRASND', 'RACE', 'NOTOBAC', 'ETHNIC', 'BIOTER']
['PREGNANT', 'MDSP', 'GESTWK', 'ULTRASND', 'RACE', 'NOTOBAC', 'ETHNIC', 'PAYTYPE', 'BIOTER']
['TELEPHON', 'OTHPROC3', 'OTHPR22D', 'OTHPR13D', 'DIAGSCRN', 'TOTDIAG', 'BLANK', 'OTHPROC1R', 'OTHDIAG', 'ETHNIC', 'OTHPROC4R', 'OTHPR23D', 'NONMED', 'MDSP', 'ELECTROL', 'SPIRO', 'RADTHER', 'SCPWI13D', 'OTHPR43D', 'OTHPR32D', 'OTHPROC1', 'OTHPR12D', 'ADMITHOS', 'DIAGSC1R', 'OTHPR33D', 'SCPWI22D', 'NOTOBAC', 'DIGSC12D', 'OTHPROC', 'SCOPWI1R', 'REFERED', 'OTHPROC2R', 'SCPWI23D', 'DIAGSC2R', 'RACEFL', 'ORTHO', 'DIGSC22D', 'ULTRASND', 'RACE', 'OTHPROC4', 'PAYTYPE', 'SCOPPROC', 'HOSPICE', 'RETPRN', 'BIOTER', 'PREGNANT', 'SCOPEWI2', 'DMP', 'SCPWI12D', 'TOTNONMED', 'GESTWK', 'DIAGSC1', 'DIAGSC2', 'DIGSC13D', 'RACEETH', 'OTHPROC2', 'NOFU', 'DIGSC23D', 'OTHPR42D', 'SCOPEWI1', 'CASTAGE', 'OTHPROC3R', 'SCOPWI2R']
['TELEPHON', 'OTHPROC3', 'EMRNEWO'

In [4]:
# Stack the per-file DataFrames into one frame with a fresh 0..n-1 row index
df = pd.concat(dfs, ignore_index=True)
print('The shape of the concatenated DataFrame is:', df.shape, sep='\n')
print()

# Remove the columns that contain nothing but NaN
df = df.dropna(axis=1, how='all')
print('The shape of the DataFrame after dropping columns with all NaN values is:', df.shape, sep='\n')

The shape of the concatenated DataFrame is:
(203988, 544)

The shape of the DataFrame after dropping columns with all NaN values is:
(203988, 532)


### 2 - Check the distribution of NaN values

In [5]:
# NaN counts per column and per file:
#   1. count the NaN values of every column within each `file` group
#   2. keep only the columns that have at least one NaN in some file
#   3. transpose, so that rows are columns of `df` and columns are files
#   4. add a `sum` column (total NaN across files) and sort by it, largest first
nan_counts = (
    df.groupby('file')
    .apply(lambda group: group.isna().sum(), include_groups=False)
    .loc[:, lambda counts: (counts != 0).any(axis=0)]
    .T
    .assign(sum=lambda counts: counts.sum(axis=1))
    .sort_values(by='sum', ascending=False)
)
print(nan_counts)

file      opd2006.csv  opd2007.csv  opd2008.csv  opd2009.csv  opd2010.csv  \
COMSTAT8        33241        31864        31054        30433        31182   
CONTSUB8        33241        31864        31054        30433        31182   
PRESCR8         33241        31864        31054        30433        31182   
COMSTAT7        32465        30844        30050        29361        30000   
CONTSUB7        32465        30844        30050        29361        30000   
...               ...          ...          ...          ...          ...   
PAPUNSP             0            0            0            0            0   
URBANRUR            0            0            0            0            0   
CCS                 0            0            0            0            0   
PAPCONV             0            0            0            0            0   
PASTVIS          1664            0            0            0            0   

file      opd2011.csv     sum  
COMSTAT8        28841  186615  
CONTSUB8   

In [6]:
# Fraction of NaN values per column and per file, laid out like `nan_counts`.
# Note that the `sum` column holds the per-file fractions added up and divided by
# the number of files, i.e. their unweighted average.
nan_perc = (
    df.groupby('file')
    .apply(lambda group: group.isna().mean(), include_groups=False)
    .loc[:, lambda fractions: (fractions != 0).any(axis=0)]
    .T
    .assign(sum=lambda fractions: fractions.sum(axis=1) / len(dfs))
    .sort_values(by='sum', ascending=False)
)
print(nan_perc)

file      opd2006.csv  opd2007.csv  opd2008.csv  opd2009.csv  opd2010.csv  \
COMSTAT8     0.946902     0.924318     0.915831     0.907067     0.898151   
CONTSUB8     0.946902     0.924318     0.915831     0.907067     0.898151   
PRESCR8      0.946902     0.924318     0.915831     0.907067     0.898151   
CONTSUB7     0.924797     0.894729     0.886222     0.875115     0.864105   
COMSTAT7     0.924797     0.894729     0.886222     0.875115     0.864105   
...               ...          ...          ...          ...          ...   
PREGTEST     1.000000     0.000000     0.000000     0.000000     0.000000   
MHP          1.000000     0.000000     0.000000     0.000000     0.000000   
PAPCONV      0.000000     0.000000     0.000000     0.000000     0.000000   
OPDWT        0.993306     0.000000     0.000000     0.000000     0.000000   
PASTVIS      0.047401     0.000000     0.000000     0.000000     0.000000   

file      opd2011.csv       sum  
COMSTAT8     0.894766  0.914506  
CONTSUB

In [7]:
# Per-file NaN fractions, without the cross-file average stored in `sum`
per_file_nan = nan_perc.drop('sum', axis=1)

# Check the columns that have all NaN values (fraction == 1) in some file.
# `.copy()` makes the selection an independent frame, so adding a column to it
# below does not write into a slice of `nan_perc`.
all_nan_cols = per_file_nan[(per_file_nan == 1).any(axis=1)].copy()

# Count the number of files that have all NaN values in each column.
# Only fractions equal to 1 are counted; summing the fractions themselves would let
# a file that is e.g. 99% NaN inflate the "count" past the thresholds used below.
all_nan_cols['counts_of_files'] = (all_nan_cols == 1).sum(axis=1)
print(all_nan_cols.head(5))
print()
print('The number of columns that have all NaN values in some file is:')
print(len(all_nan_cols))
print()

# Check the columns that more than 4 files have all NaN values
more_than_4_nan_cols = all_nan_cols[all_nan_cols['counts_of_files'] > 4]
print(more_than_4_nan_cols.head(5))
print()
print('The number of columns that more than 4 files have all NaN values is:')
print(len(more_than_4_nan_cols))
print()

# Check the columns that more than 3 files have all NaN values
# (strictly greater than 3, matching the variable name and the printed message)
more_than_3_nan_cols = all_nan_cols[all_nan_cols['counts_of_files'] > 3]
print(more_than_3_nan_cols.head(5))
print()
print('The number of columns that more than 3 files have all NaN values is:')
print(len(more_than_3_nan_cols))
print()

file       opd2006.csv  opd2007.csv  opd2008.csv  opd2009.csv  opd2010.csv  \
BLANK2             1.0          1.0          1.0          1.0          0.0   
PAP                1.0          1.0          1.0          1.0          1.0   
EBILLANYO          1.0          1.0          1.0          1.0          1.0   
EMEDALGO           1.0          1.0          1.0          1.0          1.0   
ESETSO             1.0          1.0          1.0          1.0          1.0   

file       opd2011.csv  counts_of_files  
BLANK2             1.0              5.0  
PAP                0.0              5.0  
EBILLANYO          0.0              5.0  
EMEDALGO           0.0              5.0  
ESETSO             0.0              5.0  

The number of columns that have all NaN values in some file is:
202

file       opd2006.csv  opd2007.csv  opd2008.csv  opd2009.csv  opd2010.csv  \
BLANK2             1.0          1.0          1.0          1.0          0.0   
PAP                1.0          1.0          1.0     

In [8]:
# Compare the frame's shape before and after removing every column
# that is entirely NaN in at least one file (the index of `all_nan_cols`)
print('The shape of the DataFrame after dropping columns with all NaN values is:', df.shape, sep='\n')
print()

print('The shape of the DataFrame after dropping columns that have all NaN values in some file is:')
df_wo_allnan = df.drop(columns=all_nan_cols.index)
print(df_wo_allnan.shape)

The shape of the DataFrame after dropping columns with all NaN values is:
(203988, 532)

The shape of the DataFrame after dropping columns that have all NaN values in some file is:
(203988, 330)


In [9]:
# List the names of the columns that survive the all-NaN filter
print(df_wo_allnan.columns.tolist())

['VMONTH', 'VDAYR', 'AGE', 'SEX', 'USETOBAC', 'PAYPRIV', 'PAYMCARE', 'PAYMCAID', 'PAYWKCMP', 'PAYSELF', 'PAYNOCHG', 'PAYOTH', 'PAYDK', 'INJDET', 'INJURY', 'RFV1', 'RFV2', 'RFV3', 'RFV13D', 'RFV23D', 'RFV33D', 'PRIMCARE', 'REFER', 'SENBEFOR', 'PASTVIS', 'MAJOR', 'DIAG1', 'DIAG2', 'DIAG3', 'DIAG13D', 'DIAG23D', 'DIAG33D', 'PRDIAG1', 'PRDIAG2', 'PRDIAG3', 'ARTHRTIS', 'ASTHMA', 'CANCER', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'NOCHRON', 'TOTCHRON', 'HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS', 'BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS', 'ANYIMAGE', 'BONEDENS', 'MAMMO', 'MRI', 'XRAY', 'OTHIMAGE', 'CBC', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD', 'SIGCOLON', 'BIOPSY', 'CHLAMYD', 'HPVDNA', 'EKG', 'URINE', 'HTTAKE', 'WTTAKE', 'TEMPTAKE', 'BLODPRES', 'HEALTHED', 'ASTHMAED', 'DIETNUTR', 'EXERCISE', 'GRWTHDEV', 'INJPREV', 'STRESMGT', 'TOBACED', 'WTREDUC', 'OTHLTHED', 'TOTHLTED', 'CAM', 'DME', 'HOMEHLTH', 'PT', 'SPOCTHER

### 3 - Feature definition and selection
Not including "DRUG-RELATED INFO FOR MEDICATION #1-8" or "DIAGNOSIS RECODE"

In [61]:
# Candidate column names, grouped by topic. Each list is a plain list of column-name
# strings; later cells edit these lists in place (remove / append by exact name),
# so the spelling and order here matter.
# Names that are commented out inside a list are simply not part of that list.

# DATE OF VISIT
dateOfVisit = ['VMONTH', 'VYEAR', 'VDAYR', 'YEAR']

# Independent variables
# NOTE(review): the column-difference output earlier in this notebook lists the
# pregnancy gestation column as `GESTWK`, and the existence check further down reports
# `GESTWEEK` as missing from every file, so `GESTWEEK` is probably a misspelling.
# It is removed from `demographics` in section 4 by this exact string.
demographics = [
    'PATCODE',
    'AGE', 'SEX', 'PREGNANT', 'GESTWEEK', 'ETHNIC', 'RACE', 'USETOBAC', 'NOTOBAC'
]
payment = ['PAYPRIV', 'PAYMCARE', 'PAYMCAID', 'PAYWKCMP', 'PAYSELF', 'PAYNOCHG', 'PAYOTH', 'PAYDK', 'PAYTYPE']
visitReason = [
    'INJDET',
    'INJURY',
    'MAJOR', 'RFV1', 'RFV2', 'RFV3'
]
patientClinicHistory = ['SENBEFOR', 'PASTVIS']

# Supplementary Independent Variables
vitalSigns = ['HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS']
imputedFields = ['BDATEFL', 'SEXFL', 'ETHNICFL', 'RACEFL', 'SENBEFL', 'PASTFL']

# Not sure if Independent or Dependent variables
physicianDiagnoses = ['DIAG1', 'DIAG2', 'DIAG3']
differentialDiagnoses = ['PRDIAG1', 'PRDIAG2', 'PRDIAG3']
presentSymptomsStatus = [
    'ARTHRTIS', 'ASTHMA', 'CANCER', 'CASTAGE', 'CEBVD', 'CHF', 
    'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 
    'HTN', 'IHD', 'OBESITY', 'OSTPRSIS',
    'NOCHRON', 'TOTCHRON',
    'DMP'
]

# Dependent variables
# NOTE(review): the commented-out names in the lists below look like "any of" / total
# summary fields for their group — confirm against the survey documentation.
diagnosticScreeningServices = [
    #'DIAGSCRN', 'TOTDIAG',
    'BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS',
    #'ANYIMAGE',
    'BONEDENS', 'MAMMO', 'MRI', 'ULTRASND', 'XRAY', 'OTHIMAGE',
    'CBC', 'ELECTROL', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD',
    #'SCOPPROC', 'SIGCOLON',
    'SCOPEWI1', 'SCOPEWI2',
    'BIOPSY', 'CHLAMYD', 'PAPCONV', 'PAPLIQ', 'PAPUNSP', 'HPVDNA', 'EKG', 'SPIRO', 'URINE',
    'HTTAKE', 'WTTAKE', 'TEMPTAKE', 'BLODPRES',
    #'OTHDIAG',
    'DIAGSC1', 'DIAGSC2',
]
healthEducation = [
    #'HEALTHED', 'TOTHLTED',
    'ASTHMAED', 'DIETNUTR', 'EXERCISE', 'GRWTHDEV', 'INJPREV', 'STRESMGT', 'TOBACED', 'WTREDUC', 'OTHLTHED'
]
nonMedicationTreatments = [
    #'NONMED', 'TOTNONMED',
    'CAM', 'DME', 'HOMEHLTH', 'HOSPICE', 'PT', 'RADTHER', 'SPOCTHER', 'PSYCHOTH', 'OTHMNTL', 'EXCISION', 'ORTHO', 'WOUND',
    #'OTHPROC',
    'OTHPROC1', 'OTHPROC2', 'OTHPROC3', 'OTHPROC4'
]
medicationsAndImmunizations = [
    #'MED', 'NUMMED',
    'MED1', 'MED2', 'MED3', 'MED4', 'MED5', 'MED6', 'MED7', 'MED8',
    'NCMED1', 'NCMED2', 'NCMED3', 'NCMED4', 'NCMED5', 'NCMED6', 'NCMED7', 'NCMED8',
    'NUMNEW', 'NUMCONT'
]
providersSeen = [
    'NOPROVID', 'PHYS', 'PHYSASST', 'NPNMW', 'RNLPN', 'OTHPROV'
]
visitDisposition = [
    'NODISP', 'NOFU', 'RETPRN', 'REFOTHMD', 'RETAPPT', 'TELEPHON', 'REFERED', 'ADMITHOS', 'OTHDISP'
]

In [62]:
# Not sure to include or not
def _drug_info_columns(slot):
    """Return the 20 drug-related column names for medication slot `slot`.

    The names follow one fixed pattern per slot, in this order:
    `DRUGID{slot}`, `PRESCR{slot}`, `CONTSUB{slot}`, `COMSTAT{slot}`,
    then `RX{slot}CAT1`..`RX{slot}CAT4`,
    then `RX{slot}V{level}C1`..`RX{slot}V{level}C4` for level 1, 2 and 3.
    """
    names = [f'DRUGID{slot}', f'PRESCR{slot}', f'CONTSUB{slot}', f'COMSTAT{slot}']
    names += [f'RX{slot}CAT{cat}' for cat in range(1, 5)]
    names += [f'RX{slot}V{level}C{cat}' for level in range(1, 4) for cat in range(1, 5)]
    return names


# One list of column names per medication slot (1-8), generated from the shared
# pattern instead of eight hand-copied literals
drugInformation = [_drug_info_columns(slot) for slot in range(1, 9)]

# Keep the per-slot names available; each one is the same list object as the
# matching entry of `drugInformation`
(
    drugInformation1, drugInformation2, drugInformation3, drugInformation4,
    drugInformation5, drugInformation6, drugInformation7, drugInformation8,
) = drugInformation

# Recoded counterparts of the diagnosis and procedure columns
diagnosisRecode = [
    'DIAG1R', 'DIAG2R', 'DIAG3R',
    'SCOPWI1R', 'SCOPWI2R',
    'DIAGSC1R', 'DIAGSC2R',
    'OTHPROC1R', 'OTHPROC2R', 'OTHPROC3R', 'OTHPROC4R',
]

In [63]:
# Clean up the environment: delete every DataFrame bound to a global name.
# Names starting with `_` and the name `dfs` are left alone.
# The names are collected first (from a snapshot of the namespace) and deleted
# afterwards, so the namespace is never modified while it is being scanned and no
# loop variable is left holding a reference to a DataFrame.
frame_names = [
    name for name, value in list(globals().items())
    if isinstance(value, pd.DataFrame) and not name.startswith('_') and name != 'dfs'
]
for name in frame_names:
    del globals()[name]

In [64]:
# Map each group name to its list of column names.
# The values are the list objects defined above (not copies).
variables = {
    'dateOfVisit': dateOfVisit,
    'demographics': demographics,
    'payment': payment,
    'visitReason': visitReason,
    'patientClinicHistory': patientClinicHistory,
    'vitalSigns': vitalSigns,
    'imputedFields': imputedFields,
    'physicianDiagnoses': physicianDiagnoses,
    'differentialDiagnoses': differentialDiagnoses,
    'presentSymptomsStatus': presentSymptomsStatus,
    'diagnosticScreeningServices': diagnosticScreeningServices,
    'healthEducation': healthEducation,
    'nonMedicationTreatments': nonMedicationTreatments,
    'medicationsAndImmunizations': medicationsAndImmunizations,
    'providersSeen': providersSeen,
    'visitDisposition': visitDisposition,
    #'drugInformation': drugInformation,
    #'diagnosisRecode': diagnosisRecode

}

# For every group, report each (column, file) pair where the column is absent from
# that file's DataFrame in `dfs`. Groups that are complete in every file print nothing.
for group_name, group_cols in variables.items():
    absent = [
        (col, frame)
        for col in group_cols
        for frame in dfs
        if col not in frame.columns
    ]
    if not absent:
        continue

    print(f'For `{group_name}` variables:\n{group_cols}')
    for col, frame in absent:
        # The filename is read back from the frame's `file` column
        print(f'{col} is not in {frame.file.unique()[0]}')
    print()

For `dateOfVisit` variables:
['VMONTH', 'VYEAR', 'VDAYR', 'YEAR']
VYEAR is not in opd2010.csv
VYEAR is not in opd2011.csv

For `demographics` variables:
['PATCODE', 'AGE', 'SEX', 'PREGNANT', 'GESTWEEK', 'ETHNIC', 'RACE', 'USETOBAC', 'NOTOBAC']
PREGNANT is not in opd2007.csv
PREGNANT is not in opd2008.csv
PREGNANT is not in opd2009.csv
PREGNANT is not in opd2010.csv
PREGNANT is not in opd2011.csv
GESTWEEK is not in opd2006.csv
GESTWEEK is not in opd2007.csv
GESTWEEK is not in opd2008.csv
GESTWEEK is not in opd2009.csv
GESTWEEK is not in opd2010.csv
GESTWEEK is not in opd2011.csv
ETHNIC is not in opd2007.csv
ETHNIC is not in opd2008.csv
ETHNIC is not in opd2009.csv
ETHNIC is not in opd2010.csv
ETHNIC is not in opd2011.csv
RACE is not in opd2007.csv
RACE is not in opd2008.csv
RACE is not in opd2009.csv
RACE is not in opd2010.csv
RACE is not in opd2011.csv
NOTOBAC is not in opd2007.csv
NOTOBAC is not in opd2008.csv
NOTOBAC is not in opd2009.csv
NOTOBAC is not in opd2010.csv
NOTOBAC is not 

### 4 - Handling missing variables in the 2007-2011 dataframes

In [65]:
# Build `dfs_cleaned` from copies of the DataFrames in `dfs`, in the same order
dfs_cleaned = []
for raw_frame in dfs:
    dfs_cleaned.append(raw_frame.copy())

In [66]:
# From year 2010,
# The year of visit item (VYEAR) is no longer included on the public use file. Although the NHAMCS
# reporting periods will often begin in the last week of December and end in the last week of the
# following December, they are designed to yield statistics that are representative of the actual
# calendar year. The survey variable YEAR continues to be on the file and all visit dates may be
# assumed to reflect the calendar year. If more specific information is required, it is necessary to
# access the data through the NCHS Research Data Center.

# Wherever a DataFrame has no `VYEAR` column (years 2010 and 2011), create it from `YEAR`
for frame in dfs_cleaned:
    if 'VYEAR' in frame.columns:
        continue
    frame['VYEAR'] = frame['YEAR']

In [67]:
# From year 2007,
# Items "Is female patient pregnant, and, if so, specify gestation week." are deleted.

# Remove the `PREGNANT` and `GESTWEEK` columns from the `demographics` list.
# The list is filtered in place (same list object, so `variables` keeps seeing it),
# and re-running the cell is harmless, whereas `list.remove` would raise ValueError
# the second time.
demographics[:] = [col for col in demographics if col not in ('PREGNANT', 'GESTWEEK')]

In [68]:
# From year 2007,
# The high amounts of missing data are of concern both from a data collection standpoint as well as an
# analytic one. In order to highlight this problem for data users, an unimputed race variable has been added
# to each file, along with an unimputed ethnicity variable. Imputed race and ethnicity variables are included
# as usual.

# Remove `ETHNICFL` and `RACEFL` from the `imputedFields` list.
# Filtering in place keeps the same list object and lets the cell be re-run
# without the ValueError that `list.remove` would raise.
imputedFields[:] = [col for col in imputedFields if col not in ('ETHNICFL', 'RACEFL')]

# Replace the `RACE` and `ETHNIC` columns with the unimputed values from `RACEUN` and `ETHUN` columns from the year 2007
# (a DataFrame is touched only when it has a `RACEUN` column; `ETHUN` is assumed to accompany it)
for df in dfs_cleaned:
    if 'RACEUN' in df.columns:
        df['RACE'] = df['RACEUN']
        df['ETHNIC'] = df['ETHUN']

In [69]:
# From year 2007,
# Tobacco use – [USETOBAC] The format of this item was modified to three checkboxes:
# not current, current and unknown. In 2006, if “Not Current” had been checked, two
# additional items, “Never” and “Former” were asked. These were deleted for 2007.

# Remove the `NOTOBAC` column from the `demographics` list.
# Filtering in place keeps the same list object and makes the cell safe to re-run
# (`list.remove` would raise ValueError once the name is gone).
demographics[:] = [col for col in demographics if col != 'NOTOBAC']

In [70]:
# From year 2008,
# the variable PAYTYPE (Expected Primary Source of Payment for this Visit) has been renamed as
# PAYTYPER (Recoded Expected Primary Source of Payment for this Visit).
# This is intended to emphasize the fact that PAYTYPER is a recoded item which uses a hierarchy
# to assign a primary expected source of payment based on the collection of multiple expected
# sources of payment.

# Replace `PAYTYPE` with `PAYTYPER` in the `payment` list.
# Both steps are written so that re-running the cell leaves the list unchanged
# (no ValueError from `remove`, no duplicate `PAYTYPER`).
payment[:] = [col for col in payment if col != 'PAYTYPE']
if 'PAYTYPER' not in payment:
    payment.append('PAYTYPER')

# Rename `PAYTYPE` to `PAYTYPER` in the DataFrames for year 2006 and 2007
# (i.e. in every DataFrame that still has a `PAYTYPE` column)
for df in dfs_cleaned:
    if 'PAYTYPE' in df.columns:
        df.rename(columns={'PAYTYPE': 'PAYTYPER'}, inplace=True)

In [71]:
# From year 2009,
# The Services section reflects responses to both item 7, Diagnostic/Screening Services,
# and item 9, Non-Medication Treatment.

# `services` is a new list: diagnostic/screening columns followed by non-medication treatments
services = diagnosticScreeningServices + nonMedicationTreatments

# Drop the `nonMedicationTreatments` entry from `variables` ...
del variables['nonMedicationTreatments']

# ... and rebuild the dictionary with `diagnosticScreeningServices` replaced, at the
# same position, by a `services` entry; every other entry is carried over unchanged
variables = {
    ('services' if group_name == 'diagnosticScreeningServices' else group_name):
        (services if group_name == 'diagnosticScreeningServices' else group_cols)
    for group_name, group_cols in variables.items()
}

In [72]:
# From year 2007,
# The previous single ultrasound checkbox was replaced with two checkboxes,
# one for echocardiogram [ECHOCARD], the other for other ultrasound [OTHULTRA].

# Add `ECHOCARD` and `OTHULTRA` to the `services` list.
# Names already present are skipped, so re-running the cell cannot create duplicate
# entries (which would later select duplicate columns).
for new_col in ('ECHOCARD', 'OTHULTRA'):
    if new_col not in services:
        services.append(new_col)

# Rebuild `ULTRASND` in every DataFrame that has both new checkboxes:
#   - 'Yes' if either `ECHOCARD` or `OTHULTRA` is 'Yes'
#   - 'No' if both `ECHOCARD` and `OTHULTRA` are 'No'
#   - NaN if both `ECHOCARD` and `OTHULTRA` are NaN
#   - otherwise the first non-missing of (`ECHOCARD`, `OTHULTRA`)
# `combine_first` on its own only falls back to `OTHULTRA` where `ECHOCARD` is missing,
# so ECHOCARD='No' with OTHULTRA='Yes' would come out as 'No'; the mask corrects that.
# NOTE(review): assumes the checkboxes are coded with the literal string 'Yes' — confirm
# against the data; with any other coding the mask never fires and the result is the
# plain `combine_first` value.
for df in dfs_cleaned:
    if 'ECHOCARD' in df.columns and 'OTHULTRA' in df.columns:
        echo, other = df['ECHOCARD'], df['OTHULTRA']
        either_yes = echo.eq('Yes') | other.eq('Yes')
        df['ULTRASND'] = echo.combine_first(other).mask(either_yes, 'Yes')

In [73]:
# `SCOPEWI1` and `SCOPEWI2` in the year 2007 and 2008 represent two Scope Procedures written-in fields.
# `DIAGSC1` and `DIAGSC2` in the year 2007 and 2008 represent two other Diagnostic Screening Procedures written-in fields.
# `OTHPROC1` to `OTHPROC4` in the year 2007 and 2008 represent four other Surgical or Non-surgical Procedures written-in fields.
# A write-in box for site of biopsy was added to the existing checkbox for biopsy [BIOPSYWI] in the year 2007 and 2008.
# From year 2009,
# These eight fields are replaced by `PROC1` to `PROC9`,
# presenting a complete picture of the number and type of procedures reported at a visit.
# Up to 2 scope procedures, 1 biopsy site, 2 other diagnostic/screening tests/services,
# and up to 4 procedures in the non-medication treatment item could be coded
# for each outpatient department visit.

# Which legacy write-in column feeds which `PROC` column
proc_sources = {
    'PROC1': 'SCOPEWI1',
    'PROC2': 'SCOPEWI2',
    'PROC3': 'BIOPSYWI',
    'PROC4': 'DIAGSC1',
    'PROC5': 'DIAGSC2',
    'PROC6': 'OTHPROC1',
    'PROC7': 'OTHPROC2',
    'PROC8': 'OTHPROC3',
    'PROC9': 'OTHPROC4',
}

# Legacy names to take out of the `services` list
# (`BIOPSYWI` is not listed because this cell never removed it from `services`)
legacy_service_cols = (
    'SCOPEWI1', 'SCOPEWI2',
    'DIAGSC1', 'DIAGSC2',
    'OTHPROC1', 'OTHPROC2', 'OTHPROC3', 'OTHPROC4',
)

# Remove the legacy names from the `services` list and add `PROC1` to `PROC9`.
# The list is edited in place and names already present are skipped, so the cell
# gives the same list whether it is run once or several times.
services[:] = [col for col in services if col not in legacy_service_cols]
services.extend(col for col in proc_sources if col not in services)

# Renew the `services` list in the `variables` dictionary
variables['services'] = services

# Fill `PROC1` to `PROC9` in every DataFrame:
#   - if the legacy column exists, copy its values into the matching `PROC` column
#   - otherwise, if the DataFrame has no such `PROC` column at all, create it empty (None)
#   - otherwise leave the DataFrame's own `PROC` column untouched
# The last case matters for the files that already ship `PROC1`-`PROC9` (per the note
# above, 2009 onwards): unconditionally assigning None would wipe their procedure data.
# Each legacy column is handled on its own, so a file with only one column of a pair
# still gets that column copied.
for df in dfs_cleaned:
    for proc_col, legacy_col in proc_sources.items():
        if legacy_col in df.columns:
            df[proc_col] = df[legacy_col]
        elif proc_col not in df.columns:
            df[proc_col] = None

In [74]:
# In the year 2011,
# In Diagnostic/Screening Services, the checkbox “Pap test” replaces the 2010 checkboxes
# for “Pap test – conventional”, “Pap test – liquid-based”, and “Pap test – unspecified.”

# Any DataFrame carrying a `PAP` column (the 2011 file) has it renamed to `PAPUNSP`
for frame in dfs_cleaned:
    if 'PAP' not in frame.columns:
        continue
    frame.rename(columns={'PAP': 'PAPUNSP'}, inplace=True)

In [75]:
# From year 2009,
# In item 12, Visit Disposition, checkboxes for
# no follow-up planned, return if needed PRN, and telephone follow-up planned
# were removed.

# Remove `NOFU`, `RETPRN`, and `TELEPHON` from the `visitDisposition` list.
# Filtering in place keeps the same list object and makes the cell safe to re-run
# (`list.remove` would raise ValueError once the names are gone).
visitDisposition[:] = [
    col for col in visitDisposition if col not in ('NOFU', 'RETPRN', 'TELEPHON')
]

In [76]:
# From year 2009,
# In item 12, Visit Disposition, checkboxes used in 2008 for
# “refer to emergency department” and “admit to hospital”
# were combined into a single category for 2009: Refer to ER/Admit to hospital

# Replace `REFERED` and `ADMITHOS` with `ERADMHOS` in the `visitDisposition` list.
# Written so that re-running the cell leaves the list unchanged
# (no ValueError from `remove`, no duplicate `ERADMHOS`).
visitDisposition[:] = [col for col in visitDisposition if col not in ('REFERED', 'ADMITHOS')]
if 'ERADMHOS' not in visitDisposition:
    visitDisposition.append('ERADMHOS')

# Add `ERADMHOS` column to the DataFrames for year 2006 to 2008
# (every DataFrame that has both `REFERED` and `ADMITHOS`):
#   - 'Yes' if either source checkbox is 'Yes', since the combined category covers both
#   - otherwise the first non-missing of (`REFERED`, `ADMITHOS`)
# `combine_first` on its own would return 'No' for REFERED='No' / ADMITHOS='Yes'.
# NOTE(review): assumes the checkboxes are coded with the literal string 'Yes' — confirm
# against the data; with any other coding the mask never fires and the result is the
# plain `combine_first` value.
for df in dfs_cleaned:
    if 'REFERED' in df.columns and 'ADMITHOS' in df.columns:
        referred, admitted = df['REFERED'], df['ADMITHOS']
        either_yes = referred.eq('Yes') | admitted.eq('Yes')
        df['ERADMHOS'] = referred.combine_first(admitted).mask(either_yes, 'Yes')

In [77]:
# Verify the cleaning so far: for every group in `variables`, report each
# (column, file) pair where the column is absent from that file's DataFrame in
# `dfs_cleaned`. Groups that are complete in every file print nothing.

for group_name, group_cols in variables.items():
    absent = [
        (col, frame)
        for col in group_cols
        for frame in dfs_cleaned
        if col not in frame.columns
    ]
    if not absent:
        continue

    print(f'For `{group_name}` variables:\n{group_cols}')
    for col, frame in absent:
        # The filename is read back from the frame's `file` column
        print(f'{col} is not in {frame.file.unique()[0]}')
    print()

For `presentSymptomsStatus` variables:
['ARTHRTIS', 'ASTHMA', 'CANCER', 'CASTAGE', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'NOCHRON', 'TOTCHRON', 'DMP']
CASTAGE is not in opd2009.csv
CASTAGE is not in opd2011.csv
DMP is not in opd2009.csv
DMP is not in opd2010.csv
DMP is not in opd2011.csv

For `services` variables:
['BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS', 'BONEDENS', 'MAMMO', 'MRI', 'ULTRASND', 'XRAY', 'OTHIMAGE', 'CBC', 'ELECTROL', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD', 'BIOPSY', 'CHLAMYD', 'PAPCONV', 'PAPLIQ', 'PAPUNSP', 'HPVDNA', 'EKG', 'SPIRO', 'URINE', 'HTTAKE', 'WTTAKE', 'TEMPTAKE', 'BLODPRES', 'CAM', 'DME', 'HOMEHLTH', 'HOSPICE', 'PT', 'RADTHER', 'SPOCTHER', 'PSYCHOTH', 'OTHMNTL', 'EXCISION', 'ORTHO', 'WOUND', 'ECHOCARD', 'OTHULTRA', 'PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5', 'PROC6', 'PROC7', 'PROC8', 'PROC9']
ELECTROL is not in opd2009.csv
ELECTROL is not in opd2010.csv
ELECTROL is not in opd201

In [78]:
# For year 2009 and 2011,
# “Regardless of the diagnoses written in 5a, does the patient now have:”,
# the sub-item for cancer stage was removed.

# We would want to keep the `CASTAGE` variable for the moment.

In [79]:
# From year 2009,
# “Status of patient enrollment in a disease management program
# for any of the conditions marked in 5b” was removed.

# We would want to keep the `DMP` variable for the moment.

In [80]:
# From year 2009,
# In Diagnostic /Screening Services, checkboxes for PET scan, electrolytes, and
# spirometry/pulmonary function test were removed.

# We would want to keep the `ELECTROL` and `SPIRO` variables for the moment.

In [81]:
# From year 2009,
# In Non-medication Treatment, checkboxes for
# orthopedic care, hospice care, and radiation therapy were removed.
# Radiation therapy was added back to the form in 2010

# We would want to keep the `ORTHO`, `HOSPICE`, and `RADTHER` variables for the moment.

In [82]:
# Show every variable group with its current column list, one blank line after each
print('The updated list of variables is:')
print()
for group_name, group_cols in variables.items():
    print(f'{group_name}: {group_cols}', end='\n\n')

print('-' * 80)

# Total number of column names across all groups
print('The number of variables in total is:')
print(sum(map(len, variables.values())))

The updated list of variables is:

dateOfVisit: ['VMONTH', 'VYEAR', 'VDAYR', 'YEAR']

demographics: ['PATCODE', 'AGE', 'SEX', 'ETHNIC', 'RACE', 'USETOBAC']

payment: ['PAYPRIV', 'PAYMCARE', 'PAYMCAID', 'PAYWKCMP', 'PAYSELF', 'PAYNOCHG', 'PAYOTH', 'PAYDK', 'PAYTYPER']

visitReason: ['INJDET', 'INJURY', 'MAJOR', 'RFV1', 'RFV2', 'RFV3']

patientClinicHistory: ['SENBEFOR', 'PASTVIS']

vitalSigns: ['HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS']

imputedFields: ['BDATEFL', 'SEXFL', 'SENBEFL', 'PASTFL']

physicianDiagnoses: ['DIAG1', 'DIAG2', 'DIAG3']

differentialDiagnoses: ['PRDIAG1', 'PRDIAG2', 'PRDIAG3']

presentSymptomsStatus: ['ARTHRTIS', 'ASTHMA', 'CANCER', 'CASTAGE', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'NOCHRON', 'TOTCHRON', 'DMP']

services: ['BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS', 'BONEDENS', 'MAMMO', 'MRI', 'ULTRASND', 'XRAY', 'OTHIMAGE', 'CBC', 'ELECTROL', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD',

### 5 - Add new variables in the 2007-2011 dataframes to the lists of variables
The initial lists of variables were created from the 2006 data and its documentation.

There are some new fields added to the survey and data since 2007.

In [83]:
# From year 2007,

# In Diagnostic/Screening Services – New checkboxes were added for
# CT scan [CATSCAN], PET scan [PETSCAN], and pregnancy test [PREGTEST]

# PET scan [PETSCAN] is removed again from year 2009 to 2011, so it is not added to the `services` list.

# Add `CATSCAN` and `PREGTEST` to the `services` list.
# Names already present are skipped, so re-running the cell cannot create duplicates.
for new_col in ('CATSCAN', 'PREGTEST'):
    if new_col not in services:
        services.append(new_col)


# In Providers – A checkbox was added for mental health provider [MHP].

# Add `MHP` to the `providersSeen` list (once)
if 'MHP' not in providersSeen:
    providersSeen.append('MHP')

In [84]:
# From year 2009,

# In item 7, Diagnostic /Screening Services, there are new checkboxes for:
# Foot examination, Retinal examination, HIV test

# In item 9, Non-medication Treatment, there are new checkboxes for:
# Cast, Splint or wrap

# Add `FOOT`, `RETINAL`, `HIVTEST`, `CAST` and `SPLINT` to the `services` list, in that order.
# Names already present are skipped, so re-running the cell cannot create duplicates.
for new_col in ('FOOT', 'RETINAL', 'HIVTEST', 'CAST', 'SPLINT'):
    if new_col not in services:
        services.append(new_col)

In [85]:
# Verify again after adding the newer variables: for every group in `variables`,
# report each (column, file) pair where the column is absent from that file's
# DataFrame in `dfs_cleaned`. Groups that are complete in every file print nothing.

for group_name, group_cols in variables.items():
    absent = [
        (col, frame)
        for col in group_cols
        for frame in dfs_cleaned
        if col not in frame.columns
    ]
    if not absent:
        continue

    print(f'For `{group_name}` variables:\n{group_cols}')
    for col, frame in absent:
        # The filename is read back from the frame's `file` column
        print(f'{col} is not in {frame.file.unique()[0]}')
    print()

For `presentSymptomsStatus` variables:
['ARTHRTIS', 'ASTHMA', 'CANCER', 'CASTAGE', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'NOCHRON', 'TOTCHRON', 'DMP']
CASTAGE is not in opd2009.csv
CASTAGE is not in opd2011.csv
DMP is not in opd2009.csv
DMP is not in opd2010.csv
DMP is not in opd2011.csv

For `services` variables:
['BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS', 'BONEDENS', 'MAMMO', 'MRI', 'ULTRASND', 'XRAY', 'OTHIMAGE', 'CBC', 'ELECTROL', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD', 'BIOPSY', 'CHLAMYD', 'PAPCONV', 'PAPLIQ', 'PAPUNSP', 'HPVDNA', 'EKG', 'SPIRO', 'URINE', 'HTTAKE', 'WTTAKE', 'TEMPTAKE', 'BLODPRES', 'CAM', 'DME', 'HOMEHLTH', 'HOSPICE', 'PT', 'RADTHER', 'SPOCTHER', 'PSYCHOTH', 'OTHMNTL', 'EXCISION', 'ORTHO', 'WOUND', 'ECHOCARD', 'OTHULTRA', 'PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5', 'PROC6', 'PROC7', 'PROC8', 'PROC9', 'CATSCAN', 'PREGTEST', 'FOOT', 'RETINAL', 'HIVTEST', 'CAST', 'SPLINT']
ELECTROL is not 

In [86]:
# Show every variable group with its current column list, one blank line after each
print('The updated list of variables is:')
print()
for group_name, group_cols in variables.items():
    print(f'{group_name}: {group_cols}', end='\n\n')

print('-' * 80)

# Total number of column names across all groups
print('The number of variables in total is:')
print(sum(map(len, variables.values())))

The updated list of variables is:

dateOfVisit: ['VMONTH', 'VYEAR', 'VDAYR', 'YEAR']

demographics: ['PATCODE', 'AGE', 'SEX', 'ETHNIC', 'RACE', 'USETOBAC']

payment: ['PAYPRIV', 'PAYMCARE', 'PAYMCAID', 'PAYWKCMP', 'PAYSELF', 'PAYNOCHG', 'PAYOTH', 'PAYDK', 'PAYTYPER']

visitReason: ['INJDET', 'INJURY', 'MAJOR', 'RFV1', 'RFV2', 'RFV3']

patientClinicHistory: ['SENBEFOR', 'PASTVIS']

vitalSigns: ['HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS']

imputedFields: ['BDATEFL', 'SEXFL', 'SENBEFL', 'PASTFL']

physicianDiagnoses: ['DIAG1', 'DIAG2', 'DIAG3']

differentialDiagnoses: ['PRDIAG1', 'PRDIAG2', 'PRDIAG3']

presentSymptomsStatus: ['ARTHRTIS', 'ASTHMA', 'CANCER', 'CASTAGE', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'NOCHRON', 'TOTCHRON', 'DMP']

services: ['BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS', 'BONEDENS', 'MAMMO', 'MRI', 'ULTRASND', 'XRAY', 'OTHIMAGE', 'CBC', 'ELECTROL', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD',

### 6 - Merge the dataframes with the selected variables of interest and clean up the values

In [89]:
# Flatten the variable groups into one list of column names, in dictionary order
selected_vars = [col for group_cols in variables.values() for col in group_cols]

# Stack the cleaned DataFrames (fresh row index), then keep only the selected columns
df = pd.concat(dfs_cleaned, ignore_index=True)
df = df.loc[:, selected_vars]

# Report the shape of the merged DataFrame
print('The shape of the merged DataFrame is:', df.shape, sep='\n')

The shape of the merged DataFrame is:
(203988, 161)


In [100]:
# For every column, count how many distinct Python types its values have;
# a column with more than one type mixes representations (e.g. str and float)
types_per_column = df.apply(lambda column: column.map(type).nunique())
inconsistent_cols = df.columns[types_per_column > 1]
print('The columns that the type of values are not consistent in the merged DataFrame are:')
print(inconsistent_cols)

The columns that the type of values are not consistent in the merged DataFrame are:
Index(['PASTVIS', 'BMI', 'CASTAGE', 'DMP', 'ELECTROL', 'PAPCONV', 'PAPLIQ',
       'SPIRO', 'HOSPICE', 'RADTHER', 'ORTHO', 'ECHOCARD', 'OTHULTRA', 'PROC1',
       'PROC2', 'PROC3', 'PROC4', 'PROC5', 'PROC6', 'PROC7', 'PROC8', 'PROC9',
       'CATSCAN', 'PREGTEST', 'FOOT', 'RETINAL', 'HIVTEST', 'CAST', 'SPLINT',
       'MHP'],
      dtype='object')


In [101]:
# Frequency of each distinct `PASTVIS` value (NaN excluded), most common first
df['PASTVIS'].value_counts()

PASTVIS
Not applicable    39270
1.0               27620
2.0               21376
3.0               16680
4.0               12262
                  ...  
82.0                  6
84.0                  5
85.0                  3
91.0                  3
88.0                  2
Name: count, Length: 107, dtype: int64

In [98]:
# Check the value counts of each column in the merged DataFrame
#value_counts = df.apply(lambda x: x.value_counts(dropna=False, sort=False)).T
#print(value_counts)