In [1]:
import requests  
from bs4 import BeautifulSoup 
import os

import pandas as pd

In [2]:
# Only the survey cycles and variables listed below are retrieved from the
# NHANES repository site.

# Cycle begin year -> cycle letter.
# NOTE(review): presumably the suffix NHANES appends to that cycle's file
# names (e.g. DEMO_E); the letters are not referenced in the cells shown here.
year_letter = {
    2007: "E",
    2009: "F",
    2011: "G",
    2013: "H",
    2015: "I",
    2017: "J",
}

# Data-file name prefix (file name without the trailing "_<suffix>") -> columns
# to keep from that file. Columns missing from a given cycle's file are skipped
# when the files are read.
repo_vars = {
    # Demographic questionnaire ('DMDBORN4' is deliberately left out for now)
    "DEMO": ["RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDHREDU", "DMDHREDZ", "INDHHIN2"],
    # Alcohol questionnaire
    "ALQ": ["ALQ150", "ALQ151"],
    # Blood pressure questionnaire
    "BPQ": ["BPQ020", "BPQ080"],
    # Cardio health questionnaire
    "CDQ": ["CDQ001", "CDQ010"],
    # Diabetes questionnaire
    "DIQ": ["DIQ010", "DID040"],
    # Diet questionnaire
    "DBQ": ["DBQ197", "DBD895", "DBD900", "DBD905", "DBD910"],
    # Kidneys questionnaire
    "KIQ_U": ["KIQ022", "KIQ026", "KIQ005"],
    # Occupation
    "OCQ": ["OCD150", "OCQ180"],
    # Oral health questionnaire
    "OHQ": ["OHQ011", "OHQ845"],
    # Physical activity questionnaire
    "PAQ": ["PAQ605", "PAQ620", "PAQ635", "PAQ650", "PAQ665"],
    # Sleep questionnaire
    "SLQ": ["SLD010H", "SLD012"],
    # Smoking questionnaire
    "SMQ": ["SMQ020", "SMQ040"],
    # Weight history
    "WHQ": ["WHD140"],
    # Body measurements
    "BMX": ["BMXWT", "BMXBMI", "BMXWAIST", "BMXARML", "BMXARMC", "BMXLEG"],
    # Blood pressure measurements
    "BPX": ["BPXPLS", "BPXSY1", "BPXDI1", "BPXSY2", "BPXDI2", "BPXSY3", "BPXDI3"],
    # Glucose measurement
    "GLU": ["LBXGLU"],
    # Fasting time
    "FASTQX": ["PHAFSTHR"],
    # HDL cholesterol
    "HDL": ["LBDHDD"],
    # LDL cholesterol + triglycerides
    "TRIGLY": ["LBXTR", "LBDLDL"],
    # Total cholesterol
    "TCHOL": ["LBXTC"],
}


In [3]:
# First we collect, for every survey cycle, the links to all XPT data files
# listed on the NHANES data page of each survey component.
# Result: data_links is a list with one entry per cycle (in year_letter order);
# each entry is the list of absolute file URLs found for that cycle.

url = "https://wwwn.cdc.gov"
arch_url = "https://wwwn.cdc.gov/nchs/nhanes/Search/DataPage.aspx?"
survey_type = ['Demographics','Questionnaire','Examination','Laboratory']

data_links = []
for year in year_letter:
    year_links = []
    for survey in survey_type:
        archive_url = f'{arch_url}Component={survey}&CycleBeginYear={year}'

        # Bound the wait and fail loudly on an HTTP error; otherwise an error
        # page would simply yield zero links and the cycle would go missing.
        r = requests.get(archive_url, timeout=60)
        r.raise_for_status()

        soup = BeautifulSoup(r.content, 'html.parser')

        # href=True skips anchors without an href attribute (indexing them
        # would raise KeyError). The extension test is case-insensitive so
        # both ".XPT" and ".xpt" links are picked up.
        year_links += [
            url + anchor['href']
            for anchor in soup.find_all('a', href=True)
            if anchor['href'].lower().endswith('.xpt')
        ]
    data_links.append(year_links)
   

In [4]:
# Read the data links and retrieve the desired variables from the desired files.
# For every cycle, each file whose name prefix is a repo_vars key is downloaded,
# reduced to the requested columns and joined on the SEQN index; the per-cycle
# frames are then stacked row-wise into df.
year_frames = []
for year_links in data_links:
    # Reset per cycle so a cycle with no matching files can never reuse
    # (and thereby duplicate) the previous cycle's frame.
    df_year = None
    for link in year_links:
        file = link.split('/')[-1]
        # Drop last _ and all following characters to get repo_vars key value
        key = file[:file.rfind('_')]
        if key not in repo_vars:
            continue
        df_temp = pd.read_sas(link, index='SEQN')
        # Keep the requested columns that exist in this file, in the order
        # declared in repo_vars (a set intersection has no stable order, so the
        # column order of the result could change between runs).
        cols = [col for col in repo_vars[key] if col in df_temp.columns]
        df_temp = df_temp[cols]
        # Left join: rows are defined by the first matching file of the cycle.
        df_year = df_temp if df_year is None else df_year.join(df_temp)
    if df_year is not None:
        year_frames.append(df_year)

if not year_frames:
    raise ValueError("No data files matching repo_vars were found in data_links")

# Concatenate once instead of growing the frame inside the loop.
df = pd.concat(year_frames)
        

In [5]:
# Persist the assembled DataFrame as a pickle file, at a path relative to the
# current working directory.
pd.to_pickle(df, "raw_data.pkl")