In [1]:
import pandas as pd
import os

### 1 - Load and concatenate all the CSV files
The empty columns will be dropped

In [2]:
# Load all CSV files from `../data/interim` into a list of DataFrames
# Add a new column to each DataFrame with the name of the file
csv_path = os.path.join('..', 'data', 'interim')
dfs = []

for f in sorted(os.listdir(csv_path)):
    if f.endswith('.csv'):
        df = pd.read_csv(os.path.join(csv_path, f))
        df['file'] = f
        dfs.append(df)

In [3]:
# Check the dimension of each DataFrame
dims = [df.shape for df in dfs]

for i in range(len(dims)):
    print(dims[i])


print()

# Check the columns of each DataFrame and check the differences
columns = [df.columns for df in dfs]
diffs = [list(set(columns[0]) - set(col)) for col in columns[1:]]

for i in range(len(diffs)):
    print(diffs[i])

(35105, 408)
(34473, 413)
(33908, 413)
(33551, 404)
(34718, 429)
(32233, 447)

['NOTOBAC', 'PREGNANT', 'GESTWK', 'RACE', 'ULTRASND', 'MDSP', 'ETHNIC', 'BIOTER']
['NOTOBAC', 'PREGNANT', 'PAYTYPE', 'GESTWK', 'RACE', 'ULTRASND', 'MDSP', 'ETHNIC', 'BIOTER']
['SCOPEWI2', 'NOTOBAC', 'OTHPR12D', 'NOFU', 'ELECTROL', 'OTHPROC1', 'TOTNONMED', 'HOSPICE', 'OTHPROC4', 'SCPWI23D', 'REFERED', 'DIAGSCRN', 'PREGNANT', 'OTHPROC1R', 'OTHPR33D', 'RACEFL', 'OTHPR23D', 'RADTHER', 'NONMED', 'SCOPEWI1', 'SCOPWI1R', 'DIGSC12D', 'BIOTER', 'OTHPR43D', 'DIAGSC1', 'SCOPPROC', 'OTHDIAG', 'OTHPROC3R', 'DIGSC13D', 'DIGSC23D', 'DIAGSC2', 'OTHPROC2', 'OTHPR22D', 'DIAGSC2R', 'OTHPR32D', 'SCPWI13D', 'OTHPR42D', 'OTHPROC2R', 'BLANK', 'ETHNIC', 'ORTHO', 'CASTAGE', 'DMP', 'OTHPROC4R', 'DIGSC22D', 'OTHPR13D', 'GESTWK', 'MDSP', 'RETPRN', 'ADMITHOS', 'RACEETH', 'SPIRO', 'OTHPROC', 'PAYTYPE', 'RACE', 'DIAGSC1R', 'ULTRASND', 'SCPWI12D', 'SCOPWI2R', 'SCPWI22D', 'OTHPROC3', 'TOTDIAG', 'TELEPHON']
['SCOPEWI2', 'NOTOBAC', 'OTHPR12D'

In [4]:
# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)
print('The shape of the concatenated DataFrame is:')
print(df.shape)
print()

# Drop the columns with all NaN values
print('The shape of the DataFrame after dropping columns with all NaN values is:')
df = df.dropna(axis=1, how='all')
print(df.shape)

The shape of the concatenated DataFrame is:
(203988, 544)

The shape of the DataFrame after dropping columns with all NaN values is:
(203988, 532)


### 2 - Check the distribution of NaN values

In [5]:
# Group by the `file` column and count the number of NaN values in each group
# Drop the columns with zero NaN values
# Transpose the resulting DataFrame and add a sum column
# Sort the DataFrame by the sum column
nan_counts = df.groupby('file').apply(lambda x: x.isna().sum(), include_groups=False)
nan_counts = nan_counts.loc[:, (nan_counts != 0).any(axis=0)]
nan_counts = nan_counts.T
nan_counts['sum'] = nan_counts.sum(axis=1)
nan_counts = nan_counts.sort_values(by='sum', ascending=False)
print(nan_counts)

file      opd2006.csv  opd2007.csv  opd2008.csv  opd2009.csv  opd2010.csv  \
COMSTAT8        33241        31864        31054        30433        31182   
CONTSUB8        33241        31864        31054        30433        31182   
PRESCR8         33241        31864        31054        30433        31182   
COMSTAT7        32465        30844        30050        29361        30000   
CONTSUB7        32465        30844        30050        29361        30000   
...               ...          ...          ...          ...          ...   
PAPUNSP             0            0            0            0            0   
URBANRUR            0            0            0            0            0   
CCS                 0            0            0            0            0   
PAPCONV             0            0            0            0            0   
PASTVIS          1664            0            0            0            0   

file      opd2011.csv     sum  
COMSTAT8        28841  186615  
CONTSUB8   

In [6]:
# Check the percentage of NaN values in each column in each group
nan_perc = df.groupby('file').apply(lambda x: x.isna().mean(), include_groups=False)
nan_perc = nan_perc.loc[:, (nan_perc != 0).any(axis=0)]
nan_perc = nan_perc.T
nan_perc['sum'] = nan_perc.sum(axis=1) / len(dfs)
nan_perc = nan_perc.sort_values(by='sum', ascending=False)
print(nan_perc)

file      opd2006.csv  opd2007.csv  opd2008.csv  opd2009.csv  opd2010.csv  \
COMSTAT8     0.946902     0.924318     0.915831     0.907067     0.898151   
CONTSUB8     0.946902     0.924318     0.915831     0.907067     0.898151   
PRESCR8      0.946902     0.924318     0.915831     0.907067     0.898151   
CONTSUB7     0.924797     0.894729     0.886222     0.875115     0.864105   
COMSTAT7     0.924797     0.894729     0.886222     0.875115     0.864105   
...               ...          ...          ...          ...          ...   
PREGTEST     1.000000     0.000000     0.000000     0.000000     0.000000   
MHP          1.000000     0.000000     0.000000     0.000000     0.000000   
PAPCONV      0.000000     0.000000     0.000000     0.000000     0.000000   
OPDWT        0.993306     0.000000     0.000000     0.000000     0.000000   
PASTVIS      0.047401     0.000000     0.000000     0.000000     0.000000   

file      opd2011.csv       sum  
COMSTAT8     0.894766  0.914506  
CONTSUB

In [7]:
# Check the columns that have all NaN values in some file 
all_nan_cols = nan_perc.drop('sum', axis=1)[(nan_perc == 1).any(axis=1)]

# Count the number of files that have all NaN values in each column
all_nan_cols['counts_of_files'] = all_nan_cols.sum(axis=1)
print(all_nan_cols.head(5))
print()
print('The number of columns that have all NaN values in some file is:')
print(len(all_nan_cols))
print()

# Check the columns that more than 4 files have all NaN values
more_than_4_nan_cols = all_nan_cols[all_nan_cols['counts_of_files'] > 4]
print(more_than_4_nan_cols.head(5))
print()
print('The number of columns that more than 4 files have all NaN values is:')
print(len(more_than_4_nan_cols))
print()

# Check the columns that more than 3 files have all NaN values
more_than_3_nan_cols = all_nan_cols[all_nan_cols['counts_of_files'] == 3]
print(more_than_3_nan_cols.head(5))
print()
print('The number of columns that more than 3 files have all NaN values is:')
print(len(more_than_3_nan_cols))
print()

file       opd2006.csv  opd2007.csv  opd2008.csv  opd2009.csv  opd2010.csv  \
BLANK2             1.0          1.0          1.0          1.0          0.0   
PAP                1.0          1.0          1.0          1.0          1.0   
EBILLANYO          1.0          1.0          1.0          1.0          1.0   
EMEDALGO           1.0          1.0          1.0          1.0          1.0   
ESETSO             1.0          1.0          1.0          1.0          1.0   

file       opd2011.csv  counts_of_files  
BLANK2             1.0              5.0  
PAP                0.0              5.0  
EBILLANYO          0.0              5.0  
EMEDALGO           0.0              5.0  
ESETSO             0.0              5.0  

The number of columns that have all NaN values in some file is:
202

file       opd2006.csv  opd2007.csv  opd2008.csv  opd2009.csv  opd2010.csv  \
BLANK2             1.0          1.0          1.0          1.0          0.0   
PAP                1.0          1.0          1.0     

In [8]:
# Drop the columns that have all NaN values in some file
print('The shape of the DataFrame after dropping columns with all NaN values is:')
print(df.shape)
print()

print('The shape of the DataFrame after dropping columns that have all NaN values in some file is:')
df_wo_allnan = df.drop(all_nan_cols.index, axis=1)
print(df_wo_allnan.shape)

The shape of the DataFrame after dropping columns with all NaN values is:
(203988, 532)

The shape of the DataFrame after dropping columns that have all NaN values in some file is:
(203988, 330)


In [9]:
print(df_wo_allnan.columns.to_list())

['VMONTH', 'VDAYR', 'AGE', 'SEX', 'USETOBAC', 'PAYPRIV', 'PAYMCARE', 'PAYMCAID', 'PAYWKCMP', 'PAYSELF', 'PAYNOCHG', 'PAYOTH', 'PAYDK', 'INJDET', 'INJURY', 'RFV1', 'RFV2', 'RFV3', 'RFV13D', 'RFV23D', 'RFV33D', 'PRIMCARE', 'REFER', 'SENBEFOR', 'PASTVIS', 'MAJOR', 'DIAG1', 'DIAG2', 'DIAG3', 'DIAG13D', 'DIAG23D', 'DIAG33D', 'PRDIAG1', 'PRDIAG2', 'PRDIAG3', 'ARTHRTIS', 'ASTHMA', 'CANCER', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'NOCHRON', 'TOTCHRON', 'HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS', 'BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS', 'ANYIMAGE', 'BONEDENS', 'MAMMO', 'MRI', 'XRAY', 'OTHIMAGE', 'CBC', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD', 'SIGCOLON', 'BIOPSY', 'CHLAMYD', 'HPVDNA', 'EKG', 'URINE', 'HTTAKE', 'WTTAKE', 'TEMPTAKE', 'BLODPRES', 'HEALTHED', 'ASTHMAED', 'DIETNUTR', 'EXERCISE', 'GRWTHDEV', 'INJPREV', 'STRESMGT', 'TOBACED', 'WTREDUC', 'OTHLTHED', 'TOTHLTED', 'CAM', 'DME', 'HOMEHLTH', 'PT', 'SPOCTHER

### 3 - Feature definition and selection
Not including "DRUG-RELATED INFO FOR MEDICATION #1-8" or "DIAGNOSIS RECODE"

In [31]:
# DATE OF VISIT
dateOfVisit = ['VMONTH', 'VYEAR', 'VDAYR', 'YEAR']

# Indepedent variables
demographics = [
    'PATCODE',
    'AGE', 'SEX', 'PREGNANT', 'GESTWEEK', 'ETHNIC', 'RACE', 'USETOBAC', 'NOTOBAC'
]
demographics_supp = ['RACER', 'RACEETH', 'AGEDAYS', 'AGER']
payment = ['PAYPRIV', 'PAYMCARE', 'PAYMCAID', 'PAYWKCMP', 'PAYSELF', 'PAYNOCHG', 'PAYOTH', 'PAYUNK', 'PAYTYPE']
visitReason = [
    'INJDET',
    'INJURY',
    'MAJOR', 'RFV1', 'RFV2', 'RFV3'
]
patientClinicHistory = ['SENBEFOR', 'PASTVIS']

# Supplementary Independent Variables
vitalSigns = ['HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS']
imputedFields = ['BDATEFL', 'SEXFL', 'ETHNICFL', 'RACEFL', 'SENBEFL', 'PASTFL']

# Not sure if Independent or Dependent variables
physicianDiagnoses = ['DIAG1', 'DIAG2', 'DIAG3']
differentialDiagnoses = ['PRDIAG1', 'PRDIAG2', 'PRDIAG3']
presentSymptomsStatus = [
    'ARTHRTIS', 'ASTHMA', 'CANCER', 'CASTAGE', 'CEBVD', 'CHF', 
    'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 
    'HTN', 'IHD', 'OBESITY', 'OSTPRSIS',
    'NOCHRON', 'TOTCHRON',
    'DMP'
]

# Dependent variables
diagnosticScreeningServices = [
    'DIAGSCRN', 'TOTDIAG',
    'BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS',
    'ANYIMAGE', 'BONEDENS', 'MAMMO', 'MRI', 'ULTRASND', 'XRAY', 'OTHIMAGE',
    'CBC', 'ELECTROL', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD',
    'SCOPPROC', 'SIGCOLON', 'SCOPEWI1', 'SCOPEWI2',
    'BIOPSY', 'CHLAMYD', 'PAPCONV', 'PAPLIQ', 'PAPUNSP', 'HPVDNA', 'EKG', 'SPIRO', 'URINE',
    'HTTAKE', 'WTTAKE', 'TEMPTAKE', 'BLODPRES',
    'OTHDIAG', 'DIAGSC1', 'DIAGSC2',
]
healthEducation = [
    'HLTHED', 'TOTHLTED',
    'ASTHMAED', 'DIETNUTR', 'EXERCISE', 'GRWTHDEV', 'INJPREV', 'STRESMGT', 'TOBACED', 'WTREDUC', 'OTHLTHED'
]
nonMedicationTreatments = [
    'NONMED', 'TOTNONMED',
    'CAM', 'DME', 'HOMEHLTH', 'HOSPICE', 'PT', 'RADTHER', 'SPOCTHER', 'PSYCHOTH', 'OTHMNTL', 'EXCISION', 'ORTHO', 'WOUND',
    'OTHPROC', 'OTHPROC1', 'OTHPROC2', 'OTHPROC3', 'OTHPROC4'
]
medicationsAndImmunizations = [
    'MED', 'NUMMED',
    'MED1', 'MED2', 'MED3', 'MED4', 'MED5', 'MED6', 'MED7', 'MED8',
    'NCMED1', 'NCMED2', 'NCMED3', 'NCMED4', 'NCMED5', 'NCMED6', 'NCMED7', 'NCMED8',
    'NUMNEW', 'NUMCONT'
]
providersSeen = [
    'NOPROVID', 'PHYS', 'PHYSASST', 'NPNMW', 'RNLPN', 'OTHPROV'
]
visitDisposition = [
    'NODISP', 'NOFU', 'RETPRN', 'REFOTHMD', 'RETAPPT', 'TELEPHON', 'REFERED', 'ADMITHOS', 'OTHDISP'
]

In [11]:
# Not sure to include or not
drugInformation1 = [
    'DRUGID1',
    'PRESCR1', 'CONTSUB1', 'COMSTAT1',
    'RX1CAT1', 'RX1CAT2', 'RX1CAT3', 'RX1CAT4',
    'RX1V1C1', 'RX1V1C2', 'RX1V1C3', 'RX1V1C4',
    'RX1V2C1', 'RX1V2C2', 'RX1V2C3', 'RX1V2C4',
    'RX1V3C1', 'RX1V3C2', 'RX1V3C3', 'RX1V3C4',
]
drugInformation2 = [
    'DRUGID2',
    'PRESCR2', 'CONTSUB2', 'COMSTAT2',
    'RX2CAT1', 'RX2CAT2', 'RX2CAT3', 'RX2CAT4',
    'RX2V1C1', 'RX2V1C2', 'RX2V1C3', 'RX2V1C4',
    'RX2V2C1', 'RX2V2C2', 'RX2V2C3', 'RX2V2C4',
    'RX2V3C1', 'RX2V3C2', 'RX2V3C3', 'RX2V3C4',
]
drugInformation3 = [
    'DRUGID3',
    'PRESCR3', 'CONTSUB3', 'COMSTAT3',
    'RX3CAT1', 'RX3CAT2', 'RX3CAT3', 'RX3CAT4',
    'RX3V1C1', 'RX3V1C2', 'RX3V1C3', 'RX3V1C4',
    'RX3V2C1', 'RX3V2C2', 'RX3V2C3', 'RX3V2C4',
    'RX3V3C1', 'RX3V3C2', 'RX3V3C3', 'RX3V3C4',
]
drugInformation4 = [
    'DRUGID4',
    'PRESCR4', 'CONTSUB4', 'COMSTAT4',
    'RX4CAT1', 'RX4CAT2', 'RX4CAT3', 'RX4CAT4',
    'RX4V1C1', 'RX4V1C2', 'RX4V1C3', 'RX4V1C4',
    'RX4V2C1', 'RX4V2C2', 'RX4V2C3', 'RX4V2C4',
    'RX4V3C1', 'RX4V3C2', 'RX4V3C3', 'RX4V3C4',
]
drugInformation5 = [
    'DRUGID5',
    'PRESCR5', 'CONTSUB5', 'COMSTAT5',
    'RX5CAT1', 'RX5CAT2', 'RX5CAT3', 'RX5CAT4',
    'RX5V1C1', 'RX5V1C2', 'RX5V1C3', 'RX5V1C4',
    'RX5V2C1', 'RX5V2C2', 'RX5V2C3', 'RX5V2C4',
    'RX5V3C1', 'RX5V3C2', 'RX5V3C3', 'RX5V3C4',
]
drugInformation6 = [
    'DRUGID6',
    'PRESCR6', 'CONTSUB6', 'COMSTAT6',
    'RX6CAT1', 'RX6CAT2', 'RX6CAT3', 'RX6CAT4',
    'RX6V1C1', 'RX6V1C2', 'RX6V1C3', 'RX6V1C4',
    'RX6V2C1', 'RX6V2C2', 'RX6V2C3', 'RX6V2C4',
    'RX6V3C1', 'RX6V3C2', 'RX6V3C3', 'RX6V3C4',
]
drugInformation7 = [
    'DRUGID7',
    'PRESCR7', 'CONTSUB7', 'COMSTAT7',
    'RX7CAT1', 'RX7CAT2', 'RX7CAT3', 'RX7CAT4',
    'RX7V1C1', 'RX7V1C2', 'RX7V1C3', 'RX7V1C4',
    'RX7V2C1', 'RX7V2C2', 'RX7V2C3', 'RX7V2C4',
    'RX7V3C1', 'RX7V3C2', 'RX7V3C3', 'RX7V3C4',
]
drugInformation8 = [
    'DRUGID8',
    'PRESCR8', 'CONTSUB8', 'COMSTAT8',
    'RX8CAT1', 'RX8CAT2', 'RX8CAT3', 'RX8CAT4',
    'RX8V1C1', 'RX8V1C2', 'RX8V1C3', 'RX8V1C4',
    'RX8V2C1', 'RX8V2C2', 'RX8V2C3', 'RX8V2C4',
    'RX8V3C1', 'RX8V3C2', 'RX8V3C3', 'RX8V3C4',
]
drugInformation = [
    drugInformation1, drugInformation2, drugInformation3, drugInformation4,
    drugInformation5, drugInformation6, drugInformation7, drugInformation8
]

diagnosisRecode = [
    'DIAG1R', 'DIAG2R', 'DIAG3R',
    'SCOPWI1R', 'SCOPWI2R',
    'DIAGSC1R', 'DIAGSC2R',
    'OTHPROC1R', 'OTHPROC2R', 'OTHPROC3R', 'OTHPROC4R',
]

In [12]:
# Get a list of all DataFrames in the current environment
# and clean up the environment
for obj in dir():
    if isinstance(eval(obj), pd.DataFrame) and not obj.startswith('_') and obj != 'dfs':
        del globals()[obj]

In [29]:
# Set up a dictionary of variables
variables = {
    'dateOfVisit': dateOfVisit,
    'demographics': demographics,
    'demographics_supp': demographics_supp,
    'payment': payment,
    'visitReason': visitReason,
    'patientClinicHistory': patientClinicHistory,
    'vitalSigns': vitalSigns,
    'imputedFields': imputedFields,
    'physicianDiagnoses': physicianDiagnoses,
    'differentialDiagnoses': differentialDiagnoses,
    'presentSymptomsStatus': presentSymptomsStatus,
    'diagnosticScreeningServices': diagnosticScreeningServices,
    'healthEducation': healthEducation,
    'nonMedicationTreatments': nonMedicationTreatments,
    'medicationsAndImmunizations': medicationsAndImmunizations,
    'providersSeen': providersSeen,
    'visitDisposition': visitDisposition,
    #'drugInformation': drugInformation,
    #'diagnosisRecode': diagnosisRecode

}

# Check the existence of all variables among the DataFrames in `dfs`
for var_name, var in variables.items():
    # Continue if all variables are in all the DataFrames
    if all(all(v in df.columns for v in var) for df in dfs): continue

    # Print the DataFrames that do not have all the variables
    print(f'For `{var_name}` variables:\n{var}')
    for df in dfs:
        for v in var:
            if v not in df.columns:
                print(f'{v} is not in {df.file.unique()[0]}')
                break
    print()

For `dateOfVisit` variables:
['VMONTH', 'VYEAR', 'VDAYR']
VYEAR is not in opd2010.csv
VYEAR is not in opd2011.csv

For `demographics` variables:
['PATCODE', 'AGE', 'SEX', 'PREGNANT', 'GESTWEEK', 'ETHNIC', 'RACE', 'USETOBAC', 'NOTOBAC']
GESTWEEK is not in opd2006.csv
PREGNANT is not in opd2007.csv
PREGNANT is not in opd2008.csv
PREGNANT is not in opd2009.csv
PREGNANT is not in opd2010.csv
PREGNANT is not in opd2011.csv

For `demographics_supp` variables:
['RACER', 'RACEETH', 'AGEDAYS', 'AGER']
RACEETH is not in opd2009.csv
RACEETH is not in opd2010.csv
RACEETH is not in opd2011.csv

For `payment` variables:
['PAYPRIV', 'PAYMCARE', 'PAYMCAID', 'PAYWKCMP', 'PAYSELF', 'PAYNOCHG', 'PAYOTH', 'PAYUNK', 'PAYTYPE']
PAYUNK is not in opd2006.csv
PAYUNK is not in opd2007.csv
PAYUNK is not in opd2008.csv
PAYUNK is not in opd2009.csv
PAYUNK is not in opd2010.csv
PAYUNK is not in opd2011.csv

For `imputedFields` variables:
['BDATEFL', 'SEXFL', 'ETHNICFL', 'RACEFL', 'SENBEFL', 'PASTFL']
RACEFL is not 

### 4 - Handling missing variables in the dataframes

In [None]:
# From year 2010,
# The year of visit item (VYEAR) is no longer included on the public use file. Although the NHAMCS
# reporting periods will often begin in the last week of December and end in the last week of the
# following December, they are designed to yield statistics that are representative of the actual
# calendar year. The survey variable YEAR continues to be on the file and all visit dates may be
# assumed to reflect the calendar year. If more specific information is required, it is necessary to
# access the data through the NCHS Research Data Center.

# Impute the `VYEAR` column with the year from the `YEAR` column for the years 2010 and 2011
for df in dfs:
    df['VYEAR'] = df['VYEAR'].fillna(df['YEAR'])

In [None]:
# From year 2007,
# Items "Is female patient pregnant, and, if so, specify gestation week." are deleted.

# Remove the `PREGNANT` and `GESTWEEK` columns from the `demographics` list
demographics.remove('PREGNANT')
demographics.remove('GESTWEEK')