In [5]:
pip install requests
pip install tqdm

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Could not fetch URL https://pypi.tuna.tsinghua.edu.cn/simple/requests/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.tuna.tsinghua.edu.cn', port=443): Max retries exceeded with url: /simple/requests/ (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:1149)'))) - skipping
Could not fetch URL https://pypi.tuna.tsinghua.edu.cn/simple/pip/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.tuna.tsinghua.edu.cn', port=443): Max retries exceeded with url: /simple/pip/ (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:1149)'))) - skipping
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement requests (from versions: none)
ERROR: No matching distribution found for requests


In [11]:
# os.chdir(r'E:\LZ\25014\00_rawdata')

In [1]:
import os
import requests
from tqdm import tqdm
import time

In [2]:
def get_data_desc(data_code):
    """Return the description string registered for a data file code.

    Everything from the first underscore onward (a suffix such as ``_D`` or
    ``_E``) is ignored for the lookup. When the resulting prefix is not
    registered, the ``data_code`` argument is returned unchanged, suffix
    included.
    """
    descriptions = dict(
        ACQ="全部状况问卷身体测量数据",
        ALQ="饮酒估计数据",
        AUQ="听力测量数据",
        BPQ="血压调查问卷数据",
        CBQ="心血管健康问卷数据",
        CDQ="心血管疾病问卷数据",
        COT="血清中可替宁数据",
        DEQ="抑郁症筛查问卷数据",
        DIQ="糖尿病问卷数据",
        DLQ="残疾问卷数据",
        ECQ="环境暴露问卷数据",
        FSQ="食品安全问卷数据",
        GLU="血糖数据",
        HDL="高密度脂蛋白胆固醇数据",
        HIQ="健康保险问卷数据",
        HOQ="住房特征问卷数据",
        HSQ="健康状况问卷数据",
        MCQ="医疗状况问卷数据",
        PAQ="体力活动问卷数据",
        SMQ="吸烟问卷数据",
        DEMO="人口统计学数据",
        BPX="血压测量数据",
    )

    # Strip the suffix (e.g. _D, _E) so the lookup uses the bare code.
    if '_' in data_code:
        lookup_key = data_code.split('_')[0]
    else:
        lookup_key = data_code

    if lookup_key in descriptions:
        return descriptions[lookup_key]
    return data_code

def download_file(url, local_filename):
    """Stream ``url`` to ``local_filename``, retrying on request errors.

    The payload is written to a sibling ``<local_filename>.part`` file and is
    renamed onto ``local_filename`` only after the whole response body has
    been written. An interrupted transfer therefore never leaves a truncated
    file under the final name, which matters because callers that skip
    already-existing files would otherwise treat it as a finished download.

    Args:
        url: Address of the file to fetch.
        local_filename: Destination path; its directory must already exist.

    Returns:
        True once the file is fully written, False when every one of the
        three attempts failed with a ``requests`` exception.

    Raises:
        OSError: If the local file cannot be written, renamed or cleaned up.
    """
    max_retries = 3
    partial_filename = f"{local_filename}.part"

    for attempt in range(1, max_retries + 1):
        try:
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                # Content-Length may be absent; fall back to 0 for the progress bar.
                total_size = int(r.headers.get('content-length', 0))

                with open(partial_filename, 'wb') as f:
                    with tqdm(total=total_size, unit='B', unit_scale=True, desc=os.path.basename(local_filename)) as pbar:
                        for chunk in r.iter_content(chunk_size=8192):
                            if chunk:
                                pbar.update(f.write(chunk))

            # Publish under the final name only after the full body arrived.
            os.replace(partial_filename, local_filename)
            return True
        except requests.exceptions.RequestException as e:
            print(f"下载失败 {url}, 尝试次数 {attempt}/{max_retries}")
            if attempt == max_retries:
                print(f"下载失败: {e}")
                return False
        finally:
            # Remove whatever a failed attempt left behind; after a successful
            # rename the partial file no longer exists and this is a no-op.
            if os.path.exists(partial_filename):
                os.remove(partial_filename)

        # Brief pause before the next attempt.
        time.sleep(2)

    return False

def get_cycle_letter(year):
    """Look up the letter assigned to a survey-cycle label.

    Returns the letter ("A" through "L") for one of the labels listed below,
    or an empty string for any other label.
    """
    cycle_labels = (
        "1999-2000",
        "2001-2002",
        "2003-2004",
        "2005-2006",
        "2007-2008",
        "2009-2010",
        "2011-2012",
        "2013-2014",
        "2015-2016",
        "2017-2018",
        "2019-2020",
        "August 2021-August 2023",
    )
    # Letters are assigned alphabetically in the same order as the labels.
    letter_by_cycle = dict(zip(cycle_labels, "ABCDEFGHIJKL"))

    try:
        return letter_by_cycle[year]
    except KeyError:
        return ""

def download_nhanes_data(years, data_codes, output_dir):
    """
    Download NHANES XPT data files into one sub-directory per survey cycle.

    Parameters:
    years (list): Survey-cycle labels, e.g. ["2005-2006", "2007-2008"]. Labels
        that get_cycle_letter does not recognise are reported and skipped.
    data_codes (list or dict): Either a list of data codes requested for every
        cycle, or a dict mapping a cycle label to the list of codes wanted for
        that cycle (cycles missing from the dict get no files).
    output_dir (str): Root output directory; created if it does not exist.

    Files already present on disk are skipped. A failure on one data code is
    printed and does not stop the remaining downloads.
    """
    per_cycle_codes = isinstance(data_codes, dict)
    shared_codes = None if per_cycle_codes else list(data_codes)

    os.makedirs(output_dir, exist_ok=True)

    # Resolve every cycle up front so the progress total only counts files
    # that will actually be attempted.
    planned = []
    for year in years:
        cycle_letter = get_cycle_letter(year)
        if not cycle_letter:
            print(f"警告: 未找到年份 {year} 对应的周期字母")
            continue
        codes = list(data_codes.get(year, [])) if per_cycle_codes else shared_codes
        if codes:
            planned.append((year, cycle_letter, codes))

    total_files = sum(len(codes) for _, _, codes in planned)
    current_file = 0

    for year, cycle_letter, codes in planned:
        year_dir = os.path.join(output_dir, year)
        os.makedirs(year_dir, exist_ok=True)

        # First year of the cycle, used in the URL path. Taking the last
        # whitespace-separated token of the part before '-' turns both
        # "2005-2006" and "August 2021-August 2023" into a bare year.
        start_year = year.split('-')[0].split()[-1]

        for data_code in codes:
            current_file += 1
            try:
                # Base data code with any cycle suffix removed.
                base_code = data_code.split('_')[0]

                # 1999-2000 files carry no cycle suffix.
                if year == "1999-2000":
                    file_code = base_code
                else:
                    file_code = f"{base_code}_{cycle_letter}"

                xpt_url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{start_year}/DataFiles/{file_code}.xpt"
                local_filename = os.path.join(year_dir, f"{year}_{file_code}.xpt")

                print(f"\n[{current_file}/{total_files}] 正在下载: {xpt_url}")
                if os.path.exists(local_filename):
                    print(f"文件已存在，跳过: {local_filename}")
                    continue

                # Throttle only real requests; skipped files cost nothing.
                time.sleep(1)
                if download_file(xpt_url, local_filename):
                    print(f"下载完成: {local_filename}")
                else:
                    print(f"下载失败: {local_filename}")

            except Exception as e:
                # Best effort: report the problem and carry on with the next code.
                print(f"处理数据代码 {data_code} 时发生错误: {e}")




In [3]:
# Survey cycles to download; each label must be one that get_cycle_letter recognises.
years = ["2005-2006","2007-2008","2009-2010", "2011-2012", "2013-2014", "2015-2016", "2017-2018"]
# data_codes1 = ["DEMO", "CBC", "BIOPRO", "ALQ", "SMQ", "MCQ", "BPQ", "DIQ", "DLQ", "GHB", "GLU", "BPX"]
# Data codes requested for every cycle in `years`.
data_codes1 = ["DR1TOT","DR2TOT"]

In [20]:
def get_data_code(year):
    """Return the list of data codes to request for a survey-cycle label.

    Cycles from 1999-2000 through 2009-2010 map to ["CRP"], 2015-2016 and
    2017-2018 map to ["HSCRP"], and every other label maps to an empty list.
    """
    crp_cycles = (
        "1999-2000", "2001-2002", "2003-2004",
        "2005-2006", "2007-2008", "2009-2010",
    )
    hscrp_cycles = ("2015-2016", "2017-2018")

    if year in crp_cycles:
        return ["CRP"]
    if year in hscrp_cycles:
        return ["HSCRP"]
    # Anything else (2011-2012 and 2013-2014 included) has no code.
    return []

# Map each survey cycle in `years` to the code list get_data_code returns for it
# (an empty list for cycles it has no code for).
data_codes2 = {year: get_data_code(year) for year in years}

In [4]:
# Download the data_codes1 files for every cycle in `years` into the raw-data
# directory; files that already exist there are skipped.
download_nhanes_data(years,data_codes1,'E:/LZ/25014/00_rawdata')
# download_nhanes_data(years,data_codes2,'E:/LZ/25014/00_rawdata')



[1/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2005/DataFiles/DR1TOT_D.xpt


2005-2006_DR1TOT_D.xpt: 100%|██████████| 12.8M/12.8M [00:03<00:00, 3.63MB/s]


下载完成: E:/LZ/25014/00_rawdata\2005-2006\2005-2006_DR1TOT_D.xpt

[2/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2005/DataFiles/DR2TOT_D.xpt


2005-2006_DR2TOT_D.xpt: 6.46MB [00:03, 2.02MB/s]


下载完成: E:/LZ/25014/00_rawdata\2005-2006\2005-2006_DR2TOT_D.xpt

[3/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2007/DataFiles/DR1TOT_E.xpt


2007-2008_DR1TOT_E.xpt: 100%|██████████| 12.8M/12.8M [00:03<00:00, 3.75MB/s]


下载完成: E:/LZ/25014/00_rawdata\2007-2008\2007-2008_DR1TOT_E.xpt

[4/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2007/DataFiles/DR2TOT_E.xpt


2007-2008_DR2TOT_E.xpt: 6.49MB [00:02, 2.85MB/s]


下载完成: E:/LZ/25014/00_rawdata\2007-2008\2007-2008_DR2TOT_E.xpt

[5/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2009/DataFiles/DR1TOT_F.xpt


2009-2010_DR1TOT_F.xpt: 100%|██████████| 13.6M/13.6M [00:04<00:00, 2.98MB/s]


下载完成: E:/LZ/25014/00_rawdata\2009-2010\2009-2010_DR1TOT_F.xpt

[6/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2009/DataFiles/DR2TOT_F.xpt


2009-2010_DR2TOT_F.xpt: 6.82MB [00:02, 2.82MB/s]


下载完成: E:/LZ/25014/00_rawdata\2009-2010\2009-2010_DR2TOT_F.xpt

[7/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2011/DataFiles/DR1TOT_G.xpt


2011-2012_DR1TOT_G.xpt: 100%|██████████| 12.4M/12.4M [00:05<00:00, 2.44MB/s]


下载完成: E:/LZ/25014/00_rawdata\2011-2012\2011-2012_DR1TOT_G.xpt

[8/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2011/DataFiles/DR2TOT_G.xpt


2011-2012_DR2TOT_G.xpt: 6.21MB [00:02, 2.68MB/s]


下载完成: E:/LZ/25014/00_rawdata\2011-2012\2011-2012_DR2TOT_G.xpt

[9/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/DR1TOT_H.xpt


2013-2014_DR1TOT_H.xpt: 100%|██████████| 13.2M/13.2M [00:04<00:00, 2.80MB/s]


下载完成: E:/LZ/25014/00_rawdata\2013-2014\2013-2014_DR1TOT_H.xpt

[10/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/DR2TOT_H.xpt


2013-2014_DR2TOT_H.xpt: 6.69MB [00:02, 2.79MB/s]


下载完成: E:/LZ/25014/00_rawdata\2013-2014\2013-2014_DR2TOT_H.xpt

[11/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/DR1TOT_I.xpt


2015-2016_DR1TOT_I.xpt: 100%|██████████| 12.9M/12.9M [00:03<00:00, 3.52MB/s]


下载完成: E:/LZ/25014/00_rawdata\2015-2016\2015-2016_DR1TOT_I.xpt

[12/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/DR2TOT_I.xpt


2015-2016_DR2TOT_I.xpt: 6.50MB [00:02, 2.74MB/s]


下载完成: E:/LZ/25014/00_rawdata\2015-2016\2015-2016_DR2TOT_I.xpt

[13/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DR1TOT_J.xpt


2017-2018_DR1TOT_J.xpt: 100%|██████████| 11.7M/11.7M [00:03<00:00, 3.73MB/s]


下载完成: E:/LZ/25014/00_rawdata\2017-2018\2017-2018_DR1TOT_J.xpt

[14/14] 正在下载: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DR2TOT_J.xpt


2017-2018_DR2TOT_J.xpt: 5.93MB [00:02, 2.26MB/s]

下载完成: E:/LZ/25014/00_rawdata\2017-2018\2017-2018_DR2TOT_J.xpt





In [None]:
# Recursively collect every XPT file below a directory.
def find_xpt_files(directory):
    """Return the paths of all XPT files found under ``directory``.

    The extension check is case-insensitive, so ``.xpt`` and ``.XPT`` files
    are both returned. Directories and files are visited in sorted order so
    the result is the same on every run and platform.

    Args:
        directory: Root directory to search recursively.

    Returns:
        A list of paths (``directory`` joined with the relative location of
        each match); empty when nothing matches or the directory is missing.
    """
    xpt_files = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a fixed order.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith('.xpt'):
                xpt_files.append(os.path.join(root, file))
    return xpt_files

# Example usage: list every XPT file under the raw-data directory.
directory = 'E:/LZ/25014/00_rawdata'
xpt_files = find_xpt_files(directory)


