# National Science Foundation

[Bulk grant downloads](https://www.nsf.gov/awardsearch/download.jsp) are available from the NSF's site. The download page shows the date and time the awards data was last updated. Also available: the [XML schema](https://www.nsf.gov/awardsearch/resources/Award.xsd) for the award files.

```yaml
parameter_type: 
    fiscal_year:
        - October 1 - September 30
    earliest: FY-1959
```

```python

In [37]:
import requests
from requests_cache import CachedSession
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from datetime import datetime, timedelta
import xmltodict
from bs4 import BeautifulSoup
import pytz
import zipfile
from io import BytesIO
from typing import Optional, Dict

In [6]:
# Notebook parameters.
START_FISCAL_YEAR = 2020  # 1959
# Read the clock once so the year and the month come from the same instant.
_today = datetime.now()
# A fiscal year runs October 1 - September 30, so October through December
# already belong to the fiscal year named after the following calendar year.
END_FISCAL_YEAR = _today.year if _today.month < 10 else _today.year + 1
USE_CACHE = True

In [8]:
# parameter validation and cleanup
# Validated values are collected in `p`; invalid input always surfaces as a
# ValueError with a short message naming the offending parameter.
p = {}

try:
    p['start_fy'] = int(START_FISCAL_YEAR)
except (TypeError, ValueError) as err:
    # int() raises TypeError for None/containers and ValueError for bad strings.
    raise ValueError("Invalid start year") from err

try:
    p['end_fy'] = int(END_FISCAL_YEAR)
except (TypeError, ValueError) as err:
    raise ValueError("Invalid end year") from err

if p['start_fy'] > p['end_fy']:
    raise ValueError("Start year must be less than or equal to end year")

# bool() never raises ValueError and treats any non-empty string (including
# "False") as True, so check the type explicitly instead of coercing.
if not isinstance(USE_CACHE, bool):
    raise ValueError("Invalid use_cache value. Must be boolean")
p['use_cache'] = USE_CACHE

In [14]:
## HTTP Configuration

# Shared retry policy: up to 5 connection retries with a 1.5 backoff factor.
retry = Retry(connect=5, backoff_factor=1.5)
adapter = HTTPAdapter(max_retries=retry)

if not p['use_cache']:
    session = requests.Session()
else:
    # Cache for development use only
    session = CachedSession(
        "cache.sqlite",
        backend="sqlite",
        allowable_methods=('GET', 'POST'),
        allowable_codes=(200, 404),
        expire_after=timedelta(days=7),
    )

# Apply the same adapter to both plain and TLS connections.
for scheme in ("http://", "https://"):
    session.mount(scheme, adapter)


In [17]:
def get_most_recent_update_timestamp() -> datetime:
    """
    Scrape the NSF bulk-download page for the "Awards last updated on" timestamp.

    The page is fetched with the module-level `session`, and the timestamp is
    read from the second child paragraph of the `#downloadText` element.

    Returns:
        A timezone-aware datetime localized to America/New_York.

    Raises:
        ValueError: if the paragraph or the "Awards last updated on: " marker
            is missing, or if the timestamp is not in
            `MM/DD/YYYY HH:MM:SS` format.
    """
    url = "https://www.nsf.gov/awardsearch/download.jsp"
    marker = "Awards last updated on: "
    not_found_message = f"Couldn't find the timestamp of the most recent update of NSF awards data at {url}"

    r = session.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    date_paragraph = soup.select_one("#downloadText > p:nth-child(2)")
    if date_paragraph is None:
        raise ValueError(not_found_message)

    # partition() reports a missing marker explicitly, so a changed page
    # layout raises the same ValueError instead of a bare IndexError.
    _, found, remainder = date_paragraph.text.partition(marker)
    if not found:
        raise ValueError(not_found_message)

    # Keep only the rest of the marker's line; strip stray whitespace or "\r"
    # that would otherwise make strptime fail.
    datetime_string = remainder.split("\n")[0].strip()

    # Parse the datetime string into a naive datetime object
    datetime_object = datetime.strptime(datetime_string, "%m/%d/%Y %H:%M:%S")

    # The page gives no UTC offset; the value is treated as America/New_York
    # local time.
    eastern = pytz.timezone("America/New_York")
    return eastern.localize(datetime_object)

def get_fy_data(fy: int) -> dict:
    """
    Build the bulk-download URL for fiscal year `fy`.

    NOTE(review): unfinished stub. The URL is only assigned; no request is
    made and the function implicitly returns None despite the `-> dict`
    annotation.
    """
    url = f"https://www.nsf.gov/awardsearch/download?DownloadFileName={fy}&All=true"

datetime.datetime(2024, 2, 22, 23, 34, 52, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>)

In [26]:
# ad-hoc data builder
# Request the bulk-download archive for a single, hard-coded fiscal year.

fy = 2020
url = f"https://www.nsf.gov/awardsearch/download?DownloadFileName={fy}&All=true"
r = session.get(url)

In [30]:
# Validate the download response, then parse and print the first XML member
# of the returned zip archive.
if r.status_code != 200:
    raise ValueError(f"Failed to download NSF award data for fiscal year {fy}")

# .get() avoids a KeyError when the header is absent; the media type is
# compared without parameters (e.g. "; charset=...") and case-insensitively.
content_type = r.headers.get("Content-Type")
media_type = (content_type or "").split(";")[0].strip().lower()
if media_type != 'application/zip':
    raise ValueError(f"Unexpected content type for NSF award data for fiscal year {fy}. Recieved `{content_type}`, expected `application/zip`")

# The archive is backed by an in-memory buffer; `zip_file` is left open so it
# stays usable after this cell.
zip_file = zipfile.ZipFile(BytesIO(r.content))

for filename in zip_file.namelist():
    if filename.endswith(".xml"):
        # The context manager closes the member even if reading fails.
        with zip_file.open(filename) as xml_file:
            xml_data = xml_file.read()
        award_data = xmltodict.parse(xml_data)
        print(award_data)
        # Only the first XML file is inspected.
        break
    

{'rootTag': {'Award': {'AwardTitle': 'Collaborative Research: Excellence in Research: Impact of Gbx2 on neural crest cells during neuronal, craniofacial and cardiovascular development', 'AGENCY': 'NSF', 'AwardEffectiveDate': '07/01/2020', 'AwardExpirationDate': '06/30/2024', 'AwardTotalIntnAmount': '229338.00', 'AwardAmount': '345460', 'AwardInstrument': {'Value': 'Standard Grant'}, 'Organization': {'Code': '08090100', 'Directorate': {'Abbreviation': 'BIO', 'LongName': 'Direct For Biological Sciences'}, 'Division': {'Abbreviation': 'IOS', 'LongName': 'Division Of Integrative Organismal Systems'}}, 'ProgramOfficer': {'SignBlockName': 'Philip Becraft', 'PO_EMAI': 'pbecraft@nsf.gov', 'PO_PHON': '7032920000'}, 'AbstractNarration': "Head and heart development are closely intertwined during embryonic development in vertebrates. They share molecular regulatory mechanism as well as some progenitor cell populations. Both head and heart, are made from a multitude of cells, which include mesoderm

In [34]:
def format_award_data(award_xml: str, crawl_timestamp:Optional[datetime]=None) -> dict:
    """
    Produces an oic_item compliant object from the award data in the given XML text.

    NOTE(review): work in progress. The XML is parsed with xmltodict and an
    empty `results` dict is created, but nothing is populated or returned yet,
    and `crawl_timestamp` is not used.
    """
    award_data = xmltodict.parse(award_xml)
    results = {}

'b\'<?xml version="1.0" encoding="UTF-8"?>\\n<rootTag>\\n<Award>\\n<AwardTitle><![CDATA[Collaborative Research: Excellence in Research: Impact of Gbx2 on neural crest cells during neuronal, craniofacial and cardiovascular development]]></AwardTitle>\\n<AGENCY>NSF</AGENCY>\\n<AwardEffectiveDate>07/01/2020</AwardEffectiveDate>\\n<AwardExpirationDate>06/30/2024</AwardExpirationDate>\\n<AwardTotalIntnAmount>229338.00</AwardTotalIntnAmount>\\n<AwardAmount>345460</AwardAmount>\\n<AwardInstrument>\\n<Value>Standard Grant</Value>\\n</AwardInstrument>\\n<Organization>\\n<Code>08090100</Code>\\n<Directorate>\\n<Abbreviation>BIO</Abbreviation>\\n<LongName>Direct For Biological Sciences</LongName>\\n</Directorate>\\n<Division>\\n<Abbreviation>IOS</Abbreviation>\\n<LongName>Division Of Integrative Organismal Systems</LongName>\\n</Division>\\n</Organization>\\n<ProgramOfficer>\\n<SignBlockName>Philip Becraft</SignBlockName>\\n<PO_EMAI>pbecraft@nsf.gov</PO_EMAI>\\n<PO_PHON>7032920000</PO_PHON>\\n</P

In [36]:
# NOTE(review): `created_at` is presumably the timestamp requests_cache attaches
# to responses from a CachedSession; a plain requests.Response may not have
# this attribute when USE_CACHE is False. Confirm.
r.created_at

datetime.datetime(2024, 2, 28, 17, 26, 2, 946103)