In [None]:
import pandas as pd
import os

In [None]:
# Ensure the data/raw output directory exists before anything is written to it.
# makedirs also creates the intermediate 'data' directory when it is missing, and
# exist_ok=True makes this cell safe to re-run once the directories already exist.
os.makedirs('data/raw', exist_ok=True)

In [None]:
# Survey cycle -> suffix letter that is substituted into that cycle's file names
# when the download URLs are built (e.g. 'G' gives DEMO_G.XPT for 2011-2012).
year_data_mapping = dict([
    ('2011-2012', 'G'),
    ('2013-2014', 'H'),
])

In [None]:
# URL pattern for a single XPT file. The slots are filled, in order, with the
# survey cycle (e.g. '2011-2012'), the file prefix and the cycle's suffix letter.
NHANES_URL_TEMPLATE = 'https://wwwn.cdc.gov/Nchs/Nhanes/{}/{}_{}.XPT'

# Dataset name (the key used inside data_sources) -> file prefix on the server.
# Insertion order is the order in which the files are requested.
NHANES_FILE_PREFIXES = {
    'demographic': 'DEMO',
    'bp_cholesterol': 'BPQ',
    'cardiovascular': 'CDQ',
    'cognitive': 'CFQ',
    'diabetes': 'DIQ',
    'diet_nutrition': 'DBQ',
    'housing': 'HOQ',
    'income': 'INQ',
    'mental_health': 'DPQ',
    'physical_activity': 'PAQ',
    'sleep_disorder': 'SLQ',
    'cigarette_use': 'SMQ',
    # Added data sources
    'alcohol_use': 'ALQ',
    'medical_conditions': 'MCQ',
    'access_to_care': 'HUQ',
    'food_security': 'FSQ',
    'immunization': 'IMQ',
    # At some point we might need to remove one of the weight files (putting too
    # much emphasis on one's weight can decrease the accuracy of the model).
    'weight_history': 'WHQ',
    'weight_history_youth': 'WHQMEC',
    'drug_use': 'DUQ',
    # Occupation affects income, and we already included income. This data includes
    # interesting info like smoke exposure, which could relate to cognitive decline.
    'occupation': 'OCQ',
    # Need to refer to the first paper to check whether prescription medication
    # is a good feature to include.
    'prescription_medication': 'RXQ_RX',
}

# Download every dataset for every survey cycle over the network.
# Resulting shape: {survey cycle: {dataset name: object returned by pd.read_sas}}.
# Building the URL from one template keeps all requests consistent and makes
# adding a dataset a one-line change in NHANES_FILE_PREFIXES.
data_sources = {
    year: {
        name: pd.read_sas(NHANES_URL_TEMPLATE.format(year, prefix, letter))
        for name, prefix in NHANES_FILE_PREFIXES.items()
    }
    for year, letter in year_data_mapping.items()
}

In [None]:
# Keep track of columns that are consistent across years.
# One (initially empty) entry is created per dataset name. The names are taken
# from the first loaded survey cycle instead of a hard-coded year, so this cell
# keeps working when the set of cycles in data_sources changes; with no cycles
# loaded the tracker is simply empty.
column_tracker = {
    file: []
    for file in next(iter(data_sources.values()), {})
}

In [None]:
# Reduce column_tracker to the columns every survey cycle has for each dataset.
# The first cycle seeds the list; each later cycle keeps only the columns that
# were already tracked, in that later cycle's column order.
#
# The first cycle is detected by position rather than by "the tracked list is
# empty": an intersection that legitimately becomes empty must stay empty instead
# of being re-seeded with the next cycle's full column list. Seeding by position
# also makes re-running this cell produce the same result.
for year_index, (year, data) in enumerate(data_sources.items()):
    for file in column_tracker:
        # Plain list so the tracker holds the same type after every cycle.
        current_columns = list(data[file].columns)

        if year_index == 0:
            column_tracker[file] = current_columns
        else:
            # Set gives constant-time membership checks for the filter below.
            already_shared = set(column_tracker[file])
            column_tracker[file] = [
                col
                for col in current_columns
                if col in already_shared
            ]

In [None]:
# Write every dataset of every survey cycle to data/raw as '<cycle>_<dataset>.csv',
# keeping only the columns listed for that dataset in column_tracker and leaving
# the row index out of the file.
for year, data in data_sources.items():
    for file, df in data.items():
        shared_columns = column_tracker[file]
        output_path = 'data/raw/{}_{}.csv'.format(year, file)
        df[shared_columns].to_csv(output_path, index=False)