# 01 - Data Acquisition

This notebook handles downloading and caching all required datasets for the pain-Trump correlation analysis.

## Data Sources

### Electoral Data
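- County presidential returns (2016, 2020, 2024) from MIT Election Lab. Note: the download cell in Section 2 currently has URLs for 2016 and 2020 only; 2024 still needs to be added.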

### Pain/Distress Proxies
- CDC WONDER mortality data (overdose, suicide rates)
- CDC opioid dispensing rates
- CDC PLACES (frequent physical distress, arthritis)
- USALEEP life expectancy estimates
- County Health Rankings indicators

### Contextual/Control Variables
- USDA Rural-Urban Continuum Codes
- USDA County Typology Codes
- Census ACS 5-year estimates
- SSA OASDI/SSI disability data

In [None]:
import os
import sys
import pandas as pd
import geopandas as gpd
import requests
import zipfile
import json
from pathlib import Path
from datetime import datetime
import logging
from tqdm.notebook import tqdm
from dotenv import load_dotenv

# Resolve the project layout relative to the parent of the current working directory
project_root = Path.cwd().parent
data_raw, data_processed, data_external = (
    project_root / 'data' / subdir
    for subdir in ('raw', 'processed', 'external')
)

# Pull variables from a .env file into the environment (CENSUS_API_KEY is read later)
load_dotenv()

# Configure logging once; every cell below reports through `logger`
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

logger.info(f"Data will be saved to: {data_raw}")

## 1. County Boundaries (Shapefile)

In [None]:
def download_county_boundaries():
    """Download (if needed), extract and load US county boundaries.

    The TIGER/Line 2023 county archive is cached as ``counties.zip`` under
    ``data_raw / 'shapefiles'``. The archive is streamed to a temporary
    ``.part`` file and renamed only after the transfer completes, so an
    interrupted download is never mistaken for a valid cached archive.
    Extraction is re-run whenever the shapefile itself is missing, even if
    the archive is already cached.

    Returns:
        The county layer as returned by ``gpd.read_file`` for
        ``tl_2023_us_county.shp``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection failures or timeouts.
        zipfile.BadZipFile: if the cached archive is not a valid zip file.
    """
    url = "https://www2.census.gov/geo/tiger/TIGER2023/COUNTY/tl_2023_us_county.zip"
    output_dir = data_raw / 'shapefiles'
    output_dir.mkdir(parents=True, exist_ok=True)

    zip_path = output_dir / 'counties.zip'
    shp_path = output_dir / 'tl_2023_us_county.shp'

    downloaded = False
    if not zip_path.exists():
        logger.info("Downloading county boundaries...")
        # Write to a sibling temp file first; only a complete download gets the final name
        tmp_path = zip_path.with_name(zip_path.name + '.part')
        # (connect, read) timeouts keep a stalled server from hanging the notebook;
        # the context manager releases the streamed connection
        with requests.get(url, stream=True, timeout=(10, 120)) as response:
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in tqdm(response.iter_content(chunk_size=8192)):
                    f.write(chunk)
        tmp_path.replace(zip_path)
        downloaded = True

    if downloaded or not shp_path.exists():
        # Also covers a cached archive whose contents were never extracted (or were deleted)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
        if downloaded:
            logger.info("County boundaries downloaded and extracted")
        else:
            logger.info("County boundaries extracted from cached archive")
    else:
        logger.info("County boundaries already exist")

    # Load and preview
    counties = gpd.read_file(shp_path)
    logger.info(f"Loaded {len(counties)} counties")
    return counties

counties_gdf = download_county_boundaries()
counties_gdf.head()

## 2. Electoral Data

In [None]:
def download_election_data():
    """Download (if needed) and load county-level presidential election results.

    One CSV per election year is cached under ``data_raw / 'elections'`` as
    ``county_presidential_<year>.csv``. Each file is written to a temporary
    ``.part`` file and renamed after the request succeeds, so a failed or
    interrupted download cannot leave a truncated CSV in the cache.

    Returns:
        dict: year string (``'2016'``, ``'2020'``) -> ``pandas.DataFrame``
        loaded from the cached CSV.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection failures or timeouts.
    """

    # MIT Election Lab data URLs (these are examples - verify current URLs)
    election_urls = {
        '2016': 'https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/VOQCHQ/HEIJCQ',
        '2020': 'https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/VOQCHQ/NJTQLE'
    }

    election_dir = data_raw / 'elections'
    election_dir.mkdir(parents=True, exist_ok=True)

    election_data = {}

    for year, url in election_urls.items():
        file_path = election_dir / f'county_presidential_{year}.csv'

        if not file_path.exists():
            logger.info(f"Downloading {year} election data...")
            tmp_path = file_path.with_name(file_path.name + '.part')
            # (connect, read) timeouts: requests waits indefinitely by default
            with requests.get(url, timeout=(10, 120)) as response:
                response.raise_for_status()
                tmp_path.write_bytes(response.content)
            # Publish under the final name only once the full body is on disk
            tmp_path.replace(file_path)
            logger.info(f"{year} election data downloaded")

        # Load data
        # NOTE(review): columns are parsed with pandas' default dtypes; if the file
        # carries FIPS codes they may lose leading zeros - confirm before merging.
        df = pd.read_csv(file_path)
        election_data[year] = df
        logger.info(f"{year}: {len(df)} records")

    return election_data

# Note: You'll need to verify and update these URLs
# election_data = download_election_data()

## 3. CDC WONDER Mortality Data

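Note: CDC WONDER requires interactive querying, so nothing is downloaded in this section. The cell below only writes the query parameters to `query_instructions.txt` in `data/raw/cdc_wonder/`; run each query on the CDC WONDER site and save the exports there manually.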

In [None]:
def prepare_cdc_wonder_queries():
    """Define the CDC WONDER queries and write manual-download instructions.

    Nothing is downloaded. The query parameters (cause, year ranges,
    grouping and measures) are written as plain text to
    ``query_instructions.txt`` under ``data_raw / 'cdc_wonder'`` so the
    exports can be produced by hand on the CDC WONDER site.

    Returns:
        dict: query name -> parameter dict with keys ``cause``, ``years``,
        ``group_by`` and ``measures``.
    """

    queries = {
        'overdose': {
            'cause': 'Drug poisonings (overdose) Unintentional (X40-X44)',
            'years': ['2013-2016', '2017-2020'],
            'group_by': ['County', 'Year'],
            'measures': ['Deaths', 'Population', 'Age-Adjusted Rate']
        },
        'suicide': {
            'cause': 'Intentional self-harm (suicide) (X60-X84, Y87.0)',
            'years': ['2013-2016', '2017-2020'],
            'group_by': ['County', 'Year'],
            'measures': ['Deaths', 'Population', 'Age-Adjusted Rate']
        },
        'despair': {
            'cause': 'Deaths of Despair (drug, alcohol, suicide)',
            'years': ['2013-2016', '2017-2020'],
            'group_by': ['County', 'Year'],
            'measures': ['Deaths', 'Population', 'Age-Adjusted Rate']
        }
    }

    # Save query instructions
    wonder_dir = data_raw / 'cdc_wonder'
    wonder_dir.mkdir(parents=True, exist_ok=True)

    instructions = [
        "CDC WONDER Query Instructions:",
        "1. Go to https://wonder.cdc.gov/mcd.html",
        "2. Use the following parameters for each query:",
        ""
    ]

    for name, params in queries.items():
        # "{year_range}" is left as a literal placeholder for the person saving the export
        save_target = wonder_dir / f'{name}_{{year_range}}.txt'
        instructions.append(f"Query: {name}")
        instructions.append(f"  - Cause: {params['cause']}")
        instructions.append(f"  - Years: {params['years']}")
        instructions.append(f"  - Group by: {params['group_by']}")
        # The measures were defined per query but previously never reached the file
        instructions.append(f"  - Measures: {params['measures']}")
        instructions.append("  - Export as: Tab-delimited")
        instructions.append(f"  - Save to: {save_target}")
        instructions.append("")

    # Explicit encoding so the file does not depend on the platform default
    with open(wonder_dir / 'query_instructions.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(instructions))

    logger.info(f"CDC WONDER query instructions saved to {wonder_dir / 'query_instructions.txt'}")
    return queries

cdc_queries = prepare_cdc_wonder_queries()

## 4. CDC PLACES Data

In [None]:
def download_cdc_places():
    """Download (if needed) and load CDC PLACES county-level health data.

    The CSV is cached as ``places_county_2023.csv`` under
    ``data_raw / 'cdc_places'``. It is streamed to a temporary ``.part``
    file and renamed only after the transfer completes, so an interrupted
    download cannot be mistaken for a valid cached file.

    No filtering is applied here: the full table is returned as loaded.

    Returns:
        pandas.DataFrame: every row and column of the cached CSV.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection failures or timeouts.
    """

    # PLACES data URL (2023 release)
    url = "https://data.cdc.gov/api/views/swc5-untb/rows.csv?accessType=DOWNLOAD"

    places_dir = data_raw / 'cdc_places'
    places_dir.mkdir(parents=True, exist_ok=True)
    file_path = places_dir / 'places_county_2023.csv'

    if not file_path.exists():
        logger.info("Downloading CDC PLACES data...")
        tmp_path = file_path.with_name(file_path.name + '.part')
        # (connect, read) timeouts keep a stalled server from hanging the notebook;
        # the context manager releases the streamed connection
        with requests.get(url, stream=True, timeout=(10, 120)) as response:
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in tqdm(response.iter_content(chunk_size=8192)):
                    f.write(chunk)
        # Publish under the final name only once the full body is on disk
        tmp_path.replace(file_path)
        logger.info("CDC PLACES data downloaded")
    else:
        logger.info("CDC PLACES data already exists")

    df = pd.read_csv(file_path, low_memory=False)

    # Pain-related measures of interest for downstream filtering (not applied here):
    #   ARTHRITIS  - Arthritis among adults
    #   PHLTH      - Physical health not good for >=14 days
    #   DISABILITY - Any disability
    #   DEPRESSION - Depression
    # NOTE(review): which column holds these IDs is not shown in this notebook -
    # confirm against the downloaded file before filtering.

    logger.info(f"Loaded {len(df)} records with {df.columns.tolist()[:5]}... columns")
    return df

# places_df = download_cdc_places()

## 5. Opioid Dispensing Rates

In [None]:
def download_opioid_dispensing():
    """Write manual-download instructions for CDC opioid dispensing rates.

    Nothing is downloaded. A ``download_instructions.txt`` file is written
    under ``data_raw / 'opioids'`` listing, for each year 2015-2021, the file
    to fetch and the path to save it to. Returns ``None``.
    """
    target_dir = data_raw / 'opioids'
    target_dir.mkdir(parents=True, exist_ok=True)

    header = [
        "CDC Opioid Dispensing Rate Data:",
        "Download from: https://www.cdc.gov/overdose-prevention/data-research/facts-stats/opioid-dispensing-rate-maps.html",
        "",
        "Files to download:",
    ]

    # Years available: 2006-2021; only 2015 through 2021 are listed
    per_year = []
    for yr in range(2015, 2022):
        per_year.extend((
            f"  - {yr} County Opioid Dispensing Rates",
            f"    Save to: {target_dir / f'opioid_dispensing_{yr}.xlsx'}",
        ))

    (target_dir / 'download_instructions.txt').write_text('\n'.join(header + per_year))

    logger.info(f"Opioid data download instructions saved to {target_dir}")

download_opioid_dispensing()

## 6. USDA Rural-Urban Continuum Codes

In [None]:
def download_rucc_codes():
    """Download (if needed) and load the USDA Rural-Urban Continuum Codes.

    The workbook is cached as ``rucc_2023.xlsx`` under ``data_raw / 'usda'``.
    It is written to a temporary ``.part`` file and renamed after the request
    succeeds, so a failed download cannot leave a truncated workbook in the
    cache.

    Returns:
        pandas.DataFrame: the workbook as read by ``pd.read_excel`` with
        default arguments.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection failures or timeouts.
    """

    url = "https://www.ers.usda.gov/webdocs/DataFiles/53251/ruralurbancodes2023.xlsx?v=4833.5"

    usda_dir = data_raw / 'usda'
    usda_dir.mkdir(parents=True, exist_ok=True)
    file_path = usda_dir / 'rucc_2023.xlsx'

    if not file_path.exists():
        logger.info("Downloading RUCC codes...")
        tmp_path = file_path.with_name(file_path.name + '.part')
        # (connect, read) timeouts: requests waits indefinitely by default
        with requests.get(url, timeout=(10, 120)) as response:
            response.raise_for_status()
            tmp_path.write_bytes(response.content)
        # Publish under the final name only once the full body is on disk
        tmp_path.replace(file_path)
        logger.info("RUCC codes downloaded")
    else:
        logger.info("RUCC codes already exist")

    # Load and preview
    df = pd.read_excel(file_path)
    logger.info(f"Loaded {len(df)} counties with RUCC codes")
    return df

# rucc_df = download_rucc_codes()

## 7. Census ACS Data

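This cell defines the American Community Survey (ACS) 5-year estimate variables to request from the Census API and checks that a `CENSUS_API_KEY` is configured. It does not fetch any data yet.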

In [None]:
def setup_census_api():
    """Persist the ACS variable map and check for a Census API key.

    Writes the code-to-name mapping to ``acs_variables.json`` under
    ``data_raw / 'census'`` and logs a warning when ``CENSUS_API_KEY`` is not
    set in the environment. No Census API request is made here.

    Returns:
        dict: ACS variable code -> descriptive column name.
    """
    # The same six 65+ age bands are listed for each sex (B01001_020E-025E, 044E-049E)
    senior_bands = ('65_66', '67_69', '70_74', '75_79', '80_84', '85_plus')

    # Variables to fetch from ACS 5-year estimates (insertion order is the JSON order)
    acs_variables = {
        'B01003_001E': 'total_population',
        'B25077_001E': 'median_home_value',
        'B19013_001E': 'median_household_income',
        'B15003_022E': 'bachelors_degree',
        'B15003_023E': 'masters_degree',
        'B15003_024E': 'professional_degree',
        'B15003_025E': 'doctorate_degree',
        **{
            f'B01001_{20 + offset:03d}E': f'male_{band}'
            for offset, band in enumerate(senior_bands)
        },
        **{
            f'B01001_{44 + offset:03d}E': f'female_{band}'
            for offset, band in enumerate(senior_bands)
        },
        'B02001_002E': 'white_alone',
        'B02001_003E': 'black_alone',
        'B03002_012E': 'hispanic_latino',
        'B23025_005E': 'unemployed',
        'B23025_002E': 'labor_force',
    }

    out_dir = data_raw / 'census'
    out_dir.mkdir(parents=True, exist_ok=True)

    # Save variable definitions
    (out_dir / 'acs_variables.json').write_text(json.dumps(acs_variables, indent=2))

    logger.info(f"Census ACS variable definitions saved to {out_dir}")

    # A Census API key is needed; get one at https://api.census.gov/data/key_signup.html
    if os.getenv('CENSUS_API_KEY'):
        return acs_variables

    logger.warning("No Census API key found. Add CENSUS_API_KEY to your .env file")
    logger.info("Get a key at: https://api.census.gov/data/key_signup.html")
    return acs_variables

acs_vars = setup_census_api()

## 8. Data Download Summary

In [None]:
def create_download_summary():
    """Record every data source with its acquisition mode and print an overview.

    The summary is written to ``data_acquisition_status.json`` in
    ``project_root`` and a one-line-per-source status table is printed.
    Statuses are the fixed values listed below; no files are inspected.

    Returns:
        dict: ``{'timestamp': <ISO string>, 'data_sources': {...}}`` where each
        source entry has ``source``, ``url``, ``status`` and ``path`` keys.
    """
    stamp = datetime.now().isoformat()

    # (key, source, url, status, sub-directory of data_raw)
    catalog = [
        ('county_boundaries', 'Census TIGER',
         'https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html',
         'automated', 'shapefiles'),
        ('elections', 'MIT Election Lab',
         'https://electionlab.mit.edu/data',
         'manual_required', 'elections'),
        ('cdc_wonder', 'CDC WONDER',
         'https://wonder.cdc.gov/',
         'manual_required', 'cdc_wonder'),
        ('cdc_places', 'CDC PLACES',
         'https://www.cdc.gov/places/',
         'automated', 'cdc_places'),
        ('opioid_dispensing', 'CDC Opioid Data',
         'https://www.cdc.gov/overdose-prevention/',
         'manual_required', 'opioids'),
        ('rucc', 'USDA ERS',
         'https://www.ers.usda.gov/',
         'automated', 'usda'),
        ('census_acs', 'Census ACS',
         'https://www.census.gov/programs-surveys/acs',
         'api_required', 'census'),
    ]

    data_sources = {
        key: {
            'source': provider,
            'url': link,
            'status': mode,
            'path': str(data_raw / subdir),
        }
        for key, provider, link, mode, subdir in catalog
    }
    summary = {'timestamp': stamp, 'data_sources': data_sources}

    (project_root / 'data_acquisition_status.json').write_text(
        json.dumps(summary, indent=2)
    )

    logger.info("Data acquisition summary created")

    # Print status: a check mark for automated sources, a warning sign for the rest
    rule = "=" * 50
    print("\n" + rule)
    print("DATA ACQUISITION STATUS")
    print(rule)
    for name, entry in data_sources.items():
        mode = entry['status']
        marker = "⚠️" if mode != 'automated' else "✅"
        print(f"{marker} {name:20} - {mode}")
    print(rule)

    return summary

summary = create_download_summary()