# National Endowment for the Humanities

Downloads grant data from the NEH open data portal and converts it into our award format.

The per-decade CSV files are listed at: https://apps.neh.gov/open/data/

In [11]:
import requests
import io
import pandas as pd

from oic_scrape.validation import validate_all
from oic_scrape.items import AwardItem
from attrs import asdict
import datetime

In [12]:
# Notebook Parameters
# DECADES: comma-separated decade start years to download, or "all"
# (checked by validate_decades before any download happens).
DECADES = "2000, 2010, 2020"
# OUTPUT_LOCATION: path the converted awards are written to.
OUTPUT_LOCATION = "data/neh.gov_grants.jsonl"
# OUTPUT_FORMAT: "json", or "jsonl"/"jsonlines" to write line-delimited JSON.
OUTPUT_FORMAT = "jsonl"

In [13]:
def validate_decades(decades):
    """
    Validate and normalize the decades requested for NEH data file downloads.

    Args:
        decades (str): Either a comma-separated list of decade start years
                       (e.g. "1960, 1970") or the string "all"
                       (case-insensitive, surrounding whitespace ignored).

    Returns:
        list: The requested decades as strings, in the order given. For "all",
              every decade from 1960 through the current decade.

    Raises:
        ValueError: If any entry is not a decade start year between 1960 and
                    the current decade.

    Example:
        >>> validate_decades("1960, 1970, 1980")
        ['1960', '1970', '1980']
    """
    current_year = datetime.datetime.now().year
    current_decade = current_year - (current_year % 10)
    valid_decades = [str(year) for year in range(1960, current_decade + 1, 10)]

    # Strip before comparing so " all " is treated like the individual
    # entries below, which are also stripped.
    if decades.strip().lower() == "all":
        return valid_decades

    sanitized_decades = []
    for decade in decades.split(","):
        decade = decade.strip()
        if decade not in valid_decades:
            # Name the offending entry so a typo in a long list is easy to spot.
            raise ValueError(
                f"Invalid decade {decade!r}. "
                f"Decade should be between 1960 and {current_decade} (the start of the decade for {current_year})."
            )
        sanitized_decades.append(decade)

    return sanitized_decades


def validate_output_format(format):
    """
    Check whether an output file format name is supported.

    The comparison is case-insensitive.

    Args:
        format (str): The output format name to check.

    Returns:
        bool: True for "json", "jsonl" or "jsonlines"; False for anything else.
    """
    supported_formats = ("json", "jsonl", "jsonlines")
    return format.lower() in supported_formats


# Reject unsupported formats up front, then record whether the output should
# be written line-delimited ("jsonl"/"jsonlines") or as a single JSON document.
if not validate_output_format(OUTPUT_FORMAT):
    raise ValueError("Output format should be either 'json' or 'jsonl'/'jsonlines'.")

output_format_lines = OUTPUT_FORMAT.lower() in ("jsonl", "jsonlines")

In [14]:
# Funder identity attached to every AwardItem built from the NEH data.
FUNDER_NAME = "National Endowment for the Humanities"
# ROR identifier URL for the funder.
FUNDER_ROR_ID = "https://ror.org/02vdm1p28"

In [15]:
# URL template for the per-decade CSV files; "{}" is filled with a decade
# start year, e.g. 2000 -> .../NEH_Grants2000s.csv
baseurl = "https://securegrants.neh.gov/open/data/NEH_Grants{}s.csv"

In [16]:
# Download one CSV per requested decade and stack them into a single frame.
dfs = []
for decade in validate_decades(DECADES):
    url = baseurl.format(decade)
    # NOTE(review): verify=False disables TLS certificate verification, so the
    # download is not protected against tampering. Presumably this works around
    # a certificate problem on the server - confirm and remove if possible.
    # The timeout keeps a stalled connection from hanging the run forever.
    r = requests.get(url, verify=False, timeout=60)
    # Fail fast on an HTTP error instead of parsing an error page as CSV.
    r.raise_for_status()
    df = pd.read_csv(io.StringIO(r.text))
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # formatted string is the same UTC wall-clock value as before.
    timestamp_str = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    df["_crawled_at"] = timestamp_str
    dfs.append(df)
# Each CSV frame carries its own 0-based index; renumber so that row labels
# are unique across decades.
all_grants = pd.concat(dfs, ignore_index=True)



In [17]:
# Format times
# Parse the grant period columns from "MM/DD/YYYY HH:MM:SS AM/PM" strings into
# datetimes. This conversion used to be repeated in two identical consecutive
# cells; one pass is enough.
for date_column in ("BeginGrant", "EndGrant"):
    all_grants[date_column] = pd.to_datetime(
        all_grants[date_column], format="%m/%d/%Y %I:%M:%S %p"
    )

In [19]:
def month_diff(end_date: pd.Timestamp, start_date: pd.Timestamp) -> int:
    """
    Return the number of months from start_date to end_date.

    The result is the calendar-month difference (years * 12 + months). When the
    period starts on the first day of a month and ends on the last day of a
    month, the final month is counted as well, so the result is one higher.

    Parameters:
    end_date (pandas.Timestamp): The later timestamp.
    start_date (pandas.Timestamp): The earlier timestamp.

    Returns:
    int: The difference in months between the two timestamps.
    """
    year_gap = end_date.year - start_date.year
    month_gap = end_date.month - start_date.month
    calendar_months = 12 * year_gap + month_gap

    if start_date.day != 1:
        return calendar_months

    last_day_of_end_month = pd.Timestamp(
        end_date.year, end_date.month, 1
    ).days_in_month
    if end_date.day != last_day_of_end_month:
        return calendar_months

    # Full first-to-last-day span: count the closing month too.
    return calendar_months + 1

In [20]:
# Convert every NEH grant row into an AwardItem dict. A row that cannot be
# converted is reported and skipped so one bad record does not abort the run.
ioi_grants = []
for ix, grant in all_grants.iterrows():
    try:
        # Handle null values for string fields
        pi_name = grant["Participants"] if pd.notna(grant["Participants"]) else None
        project_title = grant["ProjectTitle"] if pd.notna(grant["ProjectTitle"]) else ""
        project_desc = grant["ProjectDesc"] if pd.notna(grant["ProjectDesc"]) else ""
        program = grant["Program"] if pd.notna(grant["Program"]) else ""
        division = grant["Division"] if pd.notna(grant["Division"]) else ""

        # Build the location from whichever address parts are present.
        # Missing CSV cells arrive as NaN, which is truthy, so a plain
        # filter(None, ...) lets them through and ", ".join() then raises,
        # which dropped every grant with an incomplete address.
        location_parts = [
            str(grant[field])
            for field in ("InstCity", "InstState", "InstPostalCode", "InstCountry")
            if pd.notna(grant[field]) and str(grant[field])
        ]

        # Get start and end dates
        grant_start_date = grant["BeginGrant"]
        grant_end_date = grant["EndGrant"]

        # Create AwardItem instance
        award = AwardItem(
            _crawled_at=datetime.datetime.strptime(
                grant["_crawled_at"], "%Y-%m-%d %H:%M:%S"
            ),
            source="https://apps.neh.gov/open/data/",
            grant_id=f"neh::{grant['AppNumber']}",
            funder_org_name=FUNDER_NAME,
            funder_org_ror_id=FUNDER_ROR_ID,
            recipient_org_name=grant["Institution"],
            recipient_org_location=", ".join(location_parts),
            pi_name=pi_name,
            grant_year=int(grant_start_date.year),
            grant_duration=f"{month_diff(grant_end_date, grant_start_date)} months",
            grant_start_date=grant_start_date.date(),
            grant_end_date=grant_end_date.date(),
            award_amount=float(grant["ApprovedOutright"]),
            award_currency="USD",
            award_amount_usd=float(grant["ApprovedOutright"]),
            grant_description=f"{project_title} > {project_desc}",
            program_of_funder=f"{program} > {division}",
            raw_source_data=str(grant.to_dict()),
            _award_schema_version="0.1.1",
        )
        ioi_grants.append(asdict(award))
    except Exception as e:
        # Best-effort conversion: report the failing row and keep going.
        print(f"Error processing grant {ix}: {str(e)}")


# Validate all awards before writing
# Only the validation call sits inside the try, so a failure while writing the
# file is no longer reported as "Validation failed" and silently swallowed.
try:
    print(f"Validating {len(ioi_grants)} awards...")
    validate_all(ioi_grants)
except Exception as e:
    # Nothing is written when validation fails.
    print("Validation failed:")
    print(str(e))
else:
    print("All validations passed!")

    # Write to file; errors here propagate with their real traceback.
    grants_df = pd.DataFrame(ioi_grants)
    grants_df.to_json(OUTPUT_LOCATION, orient="records", lines=output_format_lines)
    print(f"Successfully wrote {len(ioi_grants)} awards to {OUTPUT_LOCATION}")

Validating 21422 awards...
All validations passed!
Successfully wrote 21422 awards to data/neh.gov_grants.jsonl
