In [2]:
import pandas as pd
import glob, os

In [None]:
## Read in Data

In [3]:
def read_combine_raw_files(path, file_str = '*.XPT'):
    files = glob.glob(os.path.join(path, file_str))

    # Print the files 
    display(files)

    # Combining all the files into a DataFrame
    df_files = [pd.read_sas(file) for file in files]
    combined_df = pd.concat(df_files)
    
    return combined_df

In [7]:
def select_df_col(df, sel_col):
    "return df with only selected col"
    col_in_df = [col for col in sel_col if col in df.columns]
    
    display(col_in_df)
    if len(col_in_df) < len(sel_col):
        display("warnings: not all selected col in df")
        
    df_select = df.copy()
    
    return df_select[col_in_df] 

### 1. Demographics 

In [4]:
demo_path = r'nhanes_rawdata/DEMO/'

demo_combined = read_combine_raw_files(path = demo_path)

['nhanes_rawdata/DEMO/DEMO_G.XPT',
 'nhanes_rawdata/DEMO/DEMO_H.XPT',
 'nhanes_rawdata/DEMO/DEMO_I.XPT',
 'nhanes_rawdata/DEMO/DEMO_J.XPT']

In [5]:
demo_combined.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGY,...,DMDHRGND,DMDHRAGE,DMDHRBR4,DMDHREDU,DMDHRMAR,DMDHSEDU,DMDHRAGZ,DMDHREDZ,DMDHRMAZ,DMDHSEDZ
0,62161.0,7.0,2.0,1.0,22.0,,3.0,3.0,2.0,,...,2.0,50.0,1.0,5.0,1.0,5.0,,,,
1,62162.0,7.0,2.0,2.0,3.0,,1.0,1.0,1.0,3.0,...,2.0,24.0,1.0,3.0,6.0,,,,,
2,62163.0,7.0,2.0,1.0,14.0,,5.0,6.0,2.0,14.0,...,1.0,42.0,1.0,5.0,1.0,4.0,,,,
3,62164.0,7.0,2.0,2.0,44.0,,3.0,3.0,1.0,,...,1.0,52.0,1.0,4.0,1.0,4.0,,,,
4,62165.0,7.0,2.0,2.0,14.0,,4.0,4.0,2.0,14.0,...,2.0,33.0,2.0,2.0,77.0,,,,,


In [6]:
demo_cols_keep  = ['DMDCITZN', # Citizenship status
                   'DMDEDUC2', # Education level - Adults 20+
                   'DMDHHSIZ', # Total number of people in the Household
                   'DMDMARTL', # Marital status
                   'DMQMILIZ', # Served active duty in US Armed Forces
                   'INDHHIN2', # Annual household income
                   'RIAGENDR', # gender
                   'RIDAGEYR', # Age in years at screening
                   'RIDEXPRG', # pregnacy status
                   'RIDRETH1' # Race/Hispanic origin
                  ]

In [8]:
df_demo = select_df_col(demo_combined, demo_cols_keep)

['DMDCITZN',
 'DMDEDUC2',
 'DMDHHSIZ',
 'DMDMARTL',
 'DMQMILIZ',
 'INDHHIN2',
 'RIAGENDR',
 'RIDAGEYR',
 'RIDEXPRG',
 'RIDRETH1']

In [9]:
df_demo.head()

Unnamed: 0,DMDCITZN,DMDEDUC2,DMDHHSIZ,DMDMARTL,DMQMILIZ,INDHHIN2,RIAGENDR,RIDAGEYR,RIDEXPRG,RIDRETH1
0,1.0,3.0,5.0,5.0,2.0,14.0,1.0,22.0,,3.0
1,1.0,,6.0,,,4.0,2.0,3.0,,1.0
2,1.0,,5.0,,,15.0,1.0,14.0,,5.0
3,1.0,4.0,5.0,1.0,1.0,8.0,2.0,44.0,2.0,3.0
4,1.0,,5.0,,,4.0,2.0,14.0,,4.0


### 2. Physical Measure

In [13]:
bp_path = r'nhanes_rawdata/BODY/BloodPressure/'

bp_combined = read_combine_raw_files(path = bp_path)

['nhanes_rawdata/BODY/BloodPressure/BPX_H.XPT',
 'nhanes_rawdata/BODY/BloodPressure/BPX_I.XPT',
 'nhanes_rawdata/BODY/BloodPressure/BPX_J.XPT',
 'nhanes_rawdata/BODY/BloodPressure/BPX_G.XPT']

In [14]:
bm_path = r'nhanes_rawdata/BODY/BodyMeasures/'

bm_combined = read_combine_raw_files(path = bm_path)

['nhanes_rawdata/BODY/BodyMeasures/BMX_G.XPT',
 'nhanes_rawdata/BODY/BodyMeasures/BMX_J.XPT',
 'nhanes_rawdata/BODY/BodyMeasures/BMX_I.XPT',
 'nhanes_rawdata/BODY/BodyMeasures/BMX_H.XPT']