# Notebook Setup

In [None]:
# this retrieves my username(s) and api key(s) another user will need to retrieve their own
%run project_api_keys.ipynb
kaggle_username = os.environ.get("KAGGLE_USERNAME")
kaggle_api_key = os.environ.get("KAGGLE_API_KEY")
bea_api_key = os.environ.get("BEA_API_KEY")

In [None]:
# Confirm the key was loaded without echoing the secret into the saved notebook output.
print("BEA API key loaded:", bool(bea_api_key))

## Installs

In [None]:
# installs - assumes only jupyter-lab has been installed
!pip3 install -qU pandas kaggle

## Imports

In [None]:
# imports
import os
import time
import datetime as dt
import requests
import pandas as pd
import zipfile
import re
from pathlib import Path
from typing import List
import json

# Project Name: Economic Resilience After Natural Disasters
 - Student Name: <b>Robert Williams</b>
 - UT EID: <b>rgw65</b>
 - Course: <b>Case Studies in Machine Learning AI391M (54340)</b>
 - Term: <b>Fall 2025</b>

I would like to take a moment to acknowledge <b>[Aurélien Géron](https://www.oreilly.com/pub/au/7106)</b>, author of <b>[Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow](https://www.oreilly.com/library/view/hands-on-machine-learning/9781098125967/)</b>.
The structure of this machine learning project is based upon his Machine Learning Project Checklist (Appendix A). It has been an invaluable resource.

## 1. Frame the Problem and Look at the Big Picture

## 2. Get the Data

In [None]:
# set data directories
# Relative folder that receives the raw downloads; the FEMA config cell creates it
# with os.makedirs and builds its output file names underneath it.
OUT_DIR = "data/raw/"

### FEMA Disaster Declarations Summaries (OpenFEMA)

In [None]:
# config for FEMA
BASE_URL = "https://www.fema.gov/api/open/v2/DisasterDeclarationsSummaries" # OpenFEMA v2 endpoint
PAGE_SIZE = 1000 # OpenFEMA returns up to 1000 per page
SLEEP_SEC = 0.2  # pause between page requests, in seconds
os.makedirs(OUT_DIR, exist_ok=True)

# UTC timestamp embedded in the output file names so each pull is kept separately.
# dt.timezone.utc is used rather than dt.UTC so this also runs on Python < 3.11.
timestamp = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
OUT_PARQUET = os.path.join(OUT_DIR, f"disaster_declarations_summaries_{timestamp}.parquet")
OUT_CSV = os.path.join(OUT_DIR, f"disaster_declarations_summaries_{timestamp}.csv")

# Optional {field: value} equality filters; set to None to pull everything.
FILTERS = None

In [None]:
# function to create filter string
def build_filter_str(filters: dict | None) -> str | None:
    if not filters:
        return None
    parts = []
    for k, v in filters.items():
        # escape single quotes in value
        v_escaped = str(v).replace("'", "''")
        parts.append(f"{k} eq '{v_escaped}'")
    return " and ".join(parts)

In [None]:
# function to fetch all pages
def fetch_openfema_all(base_url: str, page_size: int = 1000, filters: dict | None = None, sleep_sec: float = 0.2) -> pd.DataFrame:
    session = requests.Session()
    records = []
    skip = 0

    params = {
        "$top": page_size,
        # You can also use $select to reduce columns if needed, e.g. "$select": "disasterNumber,state,incidentType,..." 
    }
    filt = build_filter_str(filters)
    if filt:
        params["$filter"] = filt

    while True:
        params["$skip"] = skip
        r = session.get(base_url, params=params, timeout=60)
        r.raise_for_status()
        data = r.json()

        # OpenFEMA returns a "DisasterDeclarationsSummaries" array in v2
        # If structure changes, print(data.keys()) to inspect.
        page = data.get("DisasterDeclarationsSummaries", [])
        if not page:
            break

        records.extend(page)
        skip += page_size
        time.sleep(sleep_sec)

    return pd.DataFrame.from_records(records)

In [None]:
# pull every FEMA disaster declaration summary into a dataframe
df_femaddc = fetch_openfema_all(
    base_url=BASE_URL,
    page_size=PAGE_SIZE,
    filters=FILTERS,
    sleep_sec=SLEEP_SEC,
)

In [None]:
# save locally: prefer Parquet, fall back to CSV when Parquet cannot be written
wrote = None
try:
    df_femaddc.to_parquet(OUT_PARQUET, index=False)
    wrote = OUT_PARQUET
except Exception as e:
    # Usually this means pyarrow/fastparquet isn't installed. Report the reason
    # rather than hiding it, then fall back to CSV so the pull is not lost.
    print(f"Parquet write failed ({type(e).__name__}: {e}); falling back to CSV.")
    df_femaddc.to_csv(OUT_CSV, index=False)
    wrote = OUT_CSV

print(f"Fetched {len(df_femaddc):,} rows. Saved to: {wrote}")

### NOAA Billion-Dollar Weather and Climate Disasters (NCEI) via Kaggle Mirror

In [None]:
# kaggle config
KAGGLE_URL = "https://www.kaggle.com/datasets/landfallmotto/billiondollar-weather-and-climate-disasters/data"
BASE_NAME = "noaa_billion_dollar_disasters"  # file stem for saved outputs

# Keep OUT_DIR if an earlier cell already defined it; otherwise use the default.
OUT_DIR = globals().get("OUT_DIR", "data/raw/")
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
# function to authenticate Kaggle (from env vars)
def init_kaggle_api(kaggle_username: str | None = None, kaggle_api_key: str | None = None):
    user = kaggle_username or os.environ.get("KAGGLE_USERNAME")
    # Kaggle lib reads KAGGLE_KEY (preferred) but many users set KAGGLE_API_KEY; accept either.
    key  = kaggle_api_key or os.environ.get("KAGGLE_KEY") or os.environ.get("KAGGLE_API_KEY")
    if not user or not key:
        raise SystemExit("Missing Kaggle credentials. Ensure KAGGLE_USERNAME and KAGGLE_KEY (or KAGGLE_API_KEY) are set.")
    os.environ["KAGGLE_USERNAME"] = user
    os.environ["KAGGLE_KEY"] = key
    try:
        from kaggle.api.kaggle_api_extended import KaggleApi
    except Exception as e:
        raise SystemExit(f"Kaggle package not installed or not importable. Install with `pip install kaggle`. Details: {e}")
    api = KaggleApi()
    api.authenticate()
    print(f"Kaggle API authenticated as: {user}")
    return api

In [None]:
# build the authenticated Kaggle client from the credentials in the environment
kaggle_api = init_kaggle_api(
    kaggle_username=os.environ.get("KAGGLE_USERNAME"),
    kaggle_api_key=os.environ.get("KAGGLE_KEY") or os.environ.get("KAGGLE_API_KEY"),
)

In [None]:
# function to parse dataset reference from URL
def get_kaggle_dataset_ref(url: str) -> str:
    """Extract the ``owner/dataset`` reference from a Kaggle dataset URL.

    Args:
        url: A URL containing ``/datasets/<owner>/<dataset>``; trailing path
            segments (such as ``/data``), query strings and fragments are ignored.

    Returns:
        The ``<owner>/<dataset>`` part of the URL.

    Raises:
        ValueError: if the URL does not contain a ``/datasets/<owner>/<dataset>`` path.
    """
    # '?' and '#' are excluded along with '/' so a query string or fragment that
    # directly follows the dataset name is not captured as part of the reference.
    m = re.search(r"/datasets/([^/?#]+/[^/?#]+)", url)
    if not m:
        raise ValueError(f"Could not parse Kaggle dataset reference from URL: {url}")
    return m.group(1)

In [None]:
# derive the owner/dataset reference from the configured Kaggle URL
dataset_ref = get_kaggle_dataset_ref(url=KAGGLE_URL)
print(f"Dataset ref: {dataset_ref}")

In [None]:
# function to download and unzip kaggle data
def kaggle_download_dataset(kaggle_api, dataset_ref: str, base_dir: str, subfolder: str = "noaa_billion_dollar_disasters", force: bool = False) -> List[Path]:
    """Fetch a Kaggle dataset into ``base_dir/subfolder`` and list the resulting files.

    Args:
        kaggle_api: Client object exposing ``dataset_download_files``.
        dataset_ref: Dataset reference passed through to the client.
        base_dir: Parent directory for the download.
        subfolder: Dedicated folder created under ``base_dir``.
        force: When False and the folder already holds files, the download is
            skipped and the existing files are returned.

    Returns:
        Paths of all files found under the target folder (searched recursively).

    Raises:
        RuntimeError: if the folder contains no files after the download.
    """
    target_dir = Path(base_dir) / subfolder
    target_dir.mkdir(parents=True, exist_ok=True)

    def _files_on_disk() -> List[Path]:
        # Recursive listing of regular files only (directories are skipped).
        return [entry for entry in target_dir.rglob("*") if entry.is_file()]

    cached = _files_on_disk()
    if cached and not force:
        print(f"Files already exist under {target_dir}. Skipping download (use force=True to re-download).")
        return cached

    print(f"Downloading '{dataset_ref}' to {target_dir} ...")
    kaggle_api.dataset_download_files(dataset_ref, path=str(target_dir), unzip=True, quiet=False)

    extracted = _files_on_disk()
    if not extracted:
        raise RuntimeError(f"No files found after download for {dataset_ref} in {target_dir}")

    # Print a short preview: at most the first ten paths in sorted order.
    total = len(extracted)
    preview_limit = 10
    print(f"Download complete. Found {total} files in {target_dir}.")
    for shown in sorted(extracted)[:preview_limit]:
        print(" -", shown.relative_to(target_dir))
    if total > preview_limit:
        print(f" ... (+{total - preview_limit} more)")

    return extracted

In [None]:
# fetch the NOAA dataset into its own folder (force=True re-downloads every run)
downloaded_files = kaggle_download_dataset(
    kaggle_api,
    dataset_ref,
    base_dir="data/raw",
    subfolder="noaa_billion_dollar_disasters",
    force=True,
)

In [None]:
# function to creat dataframe
NOAA_DIR = Path("data/raw/noaa_billion_dollar_disasters")  # matches your unique subfolder
BASE_NAME = "noaa_billion_dollar_disasters"

def select_best_csv(root_dir: Path) -> Path:
    csvs = list(root_dir.rglob("*.csv"))
    if not csvs:
        raise FileNotFoundError(f"No CSV files found under {root_dir.resolve()}")
    patterns = ["event", "disaster", "billion"]
    def score(p: Path):
        name = p.name.lower()
        hits = sum(bool(re.search(pat, name)) for pat in patterns)
        return (hits, p.stat().st_size)
    return sorted(csvs, key=score, reverse=True)[0]

In [None]:
# choose which CSV in the NOAA folder to read
selected_csv = select_best_csv(NOAA_DIR)
print(f"Selected CSV: {selected_csv}")

# read it into the NOAA dataframe and report its size
df_noaabdd = pd.read_csv(selected_csv, low_memory=False)
print(f"Loaded df_noaabdd: {len(df_noaabdd):,} rows × {len(df_noaabdd.columns)} columns")

In [None]:
# save standardized copies in the same NOAA folder:
# one timestamped file per run plus a "latest" file that is overwritten each run
# dt.timezone.utc is used rather than dt.UTC so this also runs on Python < 3.11.
ts = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
out_parquet_ts     = NOAA_DIR / f"{BASE_NAME}_{ts}.parquet"
out_csv_ts         = NOAA_DIR / f"{BASE_NAME}_{ts}.csv"
out_parquet_latest = NOAA_DIR / f"{BASE_NAME}_latest.parquet"
out_csv_latest     = NOAA_DIR / f"{BASE_NAME}_latest.csv"

try:
    df_noaabdd.to_parquet(out_parquet_ts, index=False)
    df_noaabdd.to_parquet(out_parquet_latest, index=False)
    print("Saved timestamped file to:", out_parquet_ts)
    print("Saved latest file to     :", out_parquet_latest)
except Exception as e:
    # Usually a missing Parquet engine; report the reason rather than hiding it,
    # then fall back to CSV so the copies are still written.
    print(f"Parquet write failed ({type(e).__name__}: {e}); falling back to CSV.")
    df_noaabdd.to_csv(out_csv_ts, index=False)
    df_noaabdd.to_csv(out_csv_latest, index=False)
    print("Saved timestamped file to:", out_csv_ts)
    print("Saved latest file to     :", out_csv_latest)

### BEA Regional GDP by County

In [None]:
# Endpoint that bea_get sends every request to.
BEA_BASE_URL = "https://apps.bea.gov/api/data"

def init_bea_api():
    """
    Look up the BEA API key in the environment and return it.

    BEA_API_KEY is checked first, then BEA_KEY; the first non-empty value wins.
    Raises SystemExit when neither variable holds a value.
    """
    api_key = None
    for var_name in ("BEA_API_KEY", "BEA_KEY"):
        api_key = os.environ.get(var_name)
        if api_key:
            break

    if not api_key:
        raise SystemExit(
            "Missing BEA API key. Set BEA_API_KEY (or BEA_KEY) in your environment "
            "before continuing."
        )

    # Only the presence of the key is reported; the key itself is never printed.
    print("BEA API key found.")
    return api_key

def bea_get(params: dict, timeout: int = 60) -> dict:
    """
    Thin wrapper around BEA's API.
    Adds required fields, handles errors, and returns parsed JSON.

    Args:
        params: Request parameters. Must contain "UserID", "method" and
            "datasetname"; "ResultFormat" defaults to "JSON". All other entries
            are passed through unchanged. The dict is not modified.
        timeout: Request timeout in seconds.

    Returns:
        The ``BEAAPI.Results`` section of the response.

    Raises:
        KeyError: if a required parameter is missing from ``params``.
        requests.HTTPError: if the HTTP status indicates failure.
        RuntimeError: if the response lacks ``BEAAPI.Results`` or reports an error.
    """
    # Work on a copy: popping from the caller's dict would strip the required
    # fields and make a second call with the same dict fail with KeyError.
    remaining = dict(params)

    # required fields (case-insensitive on BEA side, but we'll keep consistent)
    base = {
        "UserID": remaining.pop("UserID"),
        "method": remaining.pop("method"),
        "datasetname": remaining.pop("datasetname"),
        "ResultFormat": remaining.pop("ResultFormat", "JSON"),
    }
    query = {**base, **remaining}

    r = requests.get(BEA_BASE_URL, params=query, timeout=timeout)
    r.raise_for_status()
    data = r.json()

    # basic sanity checks BEA-style
    if "BEAAPI" not in data or "Results" not in data["BEAAPI"]:
        raise RuntimeError(f"Unexpected BEA response structure: {list(data.keys())}")
    results = data["BEAAPI"]["Results"]

    # NOTE(review): BEA appears to report request errors in-band (HTTP 200) as an
    # "Error" object inside Results — confirm against the BEA API user guide.
    if isinstance(results, dict) and "Error" in results:
        raise RuntimeError(f"BEA API returned an error: {results['Error']}")
    return results

# Initialize (reads key from your env)
# init_bea_api raises SystemExit when neither BEA_API_KEY nor BEA_KEY is set,
# so this assignment only completes when a key is available.
BEA_API_KEY = init_bea_api()

### BLS Local Area Unemployment Statistics (LAUS)

### U.S. Census American Community Survey (ACS) 5-Year Estimates

### USDA ERS County Typology Codes

### Census County Business Patterns (CBP)

### HUD Aggregated USPS Vacancies Dataset

### National Risk Index (FEMA)

## 3. Explore the Data

## 4. Prepare the Data

## 5. Shortlist Promising Models

## 6. Fine-Tune the System

## 7. Present your Solution

## 8. Launch!