In [7]:
import requests
import pandas as pd
from datetime import date, timedelta
from urllib.parse import quote_plus

### Configuration

In [8]:
# Organizations whose awards we want to find. Each name is passed as a search
# term to the funder APIs by the fetch helpers.
ORG_CANDIDATES = [
    "Corewell Health",
    "Spectrum Health",
    "Beaumont Health",
]

# Columns you asked for (final, normalized)
OUT_COLS = [
    "id","agency","awardeeName","awardeeCity","awardeeStateCode",
    "piFirstName","piLastName","title","date","startDate","expDate",
    "fundsObligatedAmt","abstractText"
]

# How far back the search window reaches, in whole years.
LOOKBACK_YEARS = 20

# Search window: the last LOOKBACK_YEARS years (inclusive), ending today.
TODAY = date.today()
try:
    START = TODAY.replace(year=TODAY.year - LOOKBACK_YEARS)
except ValueError:
    # Today is Feb 29 and the target year has no Feb 29; fall back to Feb 28.
    START = TODAY.replace(year=TODAY.year - LOOKBACK_YEARS, day=28)

### NSF Data Extraction

In [23]:
def fetch_nsf_awards(org_names, start_date, end_date, session=None, filter_awardee=False):
    """
    Fetch awards from the NSF awards API for each organization name.

    One paged query is issued per entry of ``org_names``; every returned award
    becomes one row, tagged with the search term that produced it in the
    ``orgCandidate`` column.

    Parameters
    ----------
    org_names : iterable of str
        Search terms sent as the ``awardeeName`` query parameter.
    start_date, end_date : date-like
        Bounds for the award start date; formatted as ``MM/DD/YYYY`` and sent
        as ``startDateStart`` / ``startDateEnd``.
    session : object with a ``get(url, params=..., timeout=...)`` method, optional
        E.g. a ``requests.Session``. Defaults to the ``requests`` module.
    filter_awardee : bool, default False
        When True, keep only awards whose ``awardeeName`` contains the search
        term (case-insensitive). The API's name match is loose: this notebook's
        own output shows awardees such as "Oregon Health & Science University"
        returned for "Corewell Health". The default keeps every returned award.

    Returns
    -------
    pandas.DataFrame
        Always has the same 14 columns (the award fields plus ``orgCandidate``),
        even when no award was found.

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status()`` on a non-2xx response.
    """
    base = "https://api.nsf.gov/services/v1/awards.json"
    columns = [
        "id", "agency", "awardeeName", "awardeeCity", "awardeeStateCode",
        "piFirstName", "piLastName", "title", "date", "startDate", "expDate",
        "fundsObligatedAmt", "abstractText", "orgCandidate",
    ]
    http = session if session is not None else requests
    date_from = start_date.strftime("%m/%d/%Y")
    date_to = end_date.strftime("%m/%d/%Y")
    rpp = 25  # results requested per page
    # pdPIName is requested only as a fallback source for the PI name.
    print_fields = ",".join([
        "id", "agency", "awardeeName", "awardeeCity", "awardeeStateCode",
        "pdPIName", "piFirstName", "piLastName", "title", "date", "startDate", "expDate",
        "fundsObligatedAmt", "abstractText",
    ])

    rows = []
    for org in org_names:
        offset = 1  # paging starts at offset 1
        while True:
            params = {
                "awardeeName": org,
                "startDateStart": date_from,
                "startDateEnd": date_to,
                "rpp": rpp,
                "offset": offset,
                "printFields": print_fields,
            }
            resp = http.get(base, params=params, timeout=60)
            resp.raise_for_status()
            data = resp.json()
            awards = (data.get("response") or {}).get("award") or []
            if not awards:
                break

            for a in awards:
                if filter_awardee and org.casefold() not in (a.get("awardeeName") or "").casefold():
                    continue

                pi_first = a.get("piFirstName")
                pi_last = a.get("piLastName")
                # Fall back to splitting the combined name when either part is missing.
                if (not pi_first or not pi_last) and a.get("pdPIName"):
                    parts = a["pdPIName"].split()
                    if len(parts) >= 2:
                        pi_first = pi_first or parts[0]
                        pi_last = pi_last or " ".join(parts[1:])

                rows.append({
                    "id": str(a.get("id") or ""),
                    "agency": a.get("agency") or "NSF",
                    "awardeeName": a.get("awardeeName") or org,
                    "awardeeCity": a.get("awardeeCity"),
                    "awardeeStateCode": a.get("awardeeStateCode"),
                    "piFirstName": pi_first,
                    "piLastName": pi_last,
                    "title": a.get("title"),
                    "date": a.get("date"),
                    "startDate": a.get("startDate"),
                    "expDate": a.get("expDate"),
                    "fundsObligatedAmt": a.get("fundsObligatedAmt"),
                    "abstractText": a.get("abstractText"),
                    "orgCandidate": org,
                })

            # A short page means this was the last one for this organization.
            if len(awards) < rpp:
                break
            offset += rpp

    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)


In [25]:
# NSF awards for every candidate organization over the configured window.
nsf_df_org = fetch_nsf_awards(
    org_names=ORG_CANDIDATES,
    start_date=START,
    end_date=TODAY,
)

In [26]:
# Display the fetched NSF awards as the cell's output.
nsf_df_org

Unnamed: 0,id,agency,awardeeName,awardeeCity,awardeeStateCode,piFirstName,piLastName,title,date,startDate,expDate,fundsObligatedAmt,abstractText,orgCandidate
0,2432754,NSF,XN HEALTH INC.,NEWARK,NJ,Elizabeth,Jaworski,STTR Phase I: Tracheal Phrenic Nerve Stimulat...,07/11/2025,07/15/2025,06/30/2026,274841,The broader/commercial impact of this Small Bu...,Corewell Health
1,2505338,NSF,Oregon Health & Science University,PORTLAND,OR,Paul,Tratnyek,Electrochemical Characterization of Redox Proc...,05/22/2025,07/01/2025,06/30/2028,410000,With support from the Environmental Chemical S...,Corewell Health
2,2451412,NSF,"ABSTRACTIVE HEALTH, INC.",NEW YORK,NY,Vince,Hartman,SBIR Phase II : A tool to automate a narrative...,07/01/2025,07/01/2025,06/30/2027,1250000,The broader impact/commercial potential of thi...,Corewell Health
3,2419342,NSF,"INSU HEALTH DESIGN, INC.",MAYAGUEZ,PR,Mason,Lucich,SBIR Phase II: Insu Health Design: Temperature...,05/15/2025,05/15/2025,04/30/2027,998527,This Small Business Innovation Research (SBIR)...,Corewell Health
4,2444410,NSF,University of North Texas Health Science Cente...,FORT WORTH,TX,Austin,Reynolds,Collaborative Research: Genomic and Isotopic A...,04/07/2025,04/15/2025,03/31/2028,63318,Studies of human population genetics can revea...,Corewell Health
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1510,0525026,NSF,University of Texas Health Science Center San ...,SAN ANTONIO,TX,DAVID,KATERNDAHL,Dynamic Patterns of Husband-to-Wife Abuse,12/12/2005,12/15/2005,11/30/2007,124214,This exploratory research study will evaluate ...,Beaumont Health
1511,0534797,NSF,Oregon Health & Science University,PORTLAND,OR,Paul,Gorman,Collaborative Research: Supporting Rapid Trans...,10/19/2005,11/01/2005,10/31/2009,192037,This is a project to understand essential issu...,Beaumont Health
1512,0635493,NSF,Oregon Health & Science University,PORTLAND,OR,Bradley,Tebo,Bacterial Manganese (II) Oxidation in the Guay...,06/16/2006,10/01/2005,03/31/2008,185289,ABSTRACT OCE- 0352081 Hydrothermal vents a...,Beaumont Health
1513,0513492,NSF,Oregon Health & Science University,PORTLAND,OR,James,Pankow,Thermodynamic Modeling of Atmospheric Organic ...,09/20/2005,10/01/2005,04/30/2009,424999,This project involves the further development ...,Beaumont Health


### NIH Data Extraction

In [5]:
def fetch_nih_awards(org_names, start_date, end_date, session=None):
    """
    Fetch awards from NIH RePORTER v2 Projects API for given organizations and date range.
    Normalizes to OUT_COLS schema.

    Parameters
    ----------
    org_names : iterable of str
        Organization names; each is queried separately via ``org_names``.
    start_date, end_date : datetime.date
        Bounds of the Notice of Award date filter, sent as ``YYYY-MM-DD``.
    session : object with a ``post(url, json=..., timeout=...)`` method, optional
        E.g. a ``requests.Session``. Defaults to the ``requests`` module.

    Returns
    -------
    pandas.DataFrame
        One row per returned project, with exactly the OUT_COLS columns.

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status()`` on a non-2xx response.

    Notes
    -----
    The API rejects offsets above 14,999. When the first page of a query
    reports more matches than can be paged through, the date window is split
    in half and each half is fetched separately.
    """
    import warnings

    url = "https://api.reporter.nih.gov/v2/projects/search"
    page_size = 500  # RePORTER allows large page sizes
    max_offset = 14_999  # larger offsets are rejected with HTTP 400
    # Number of records one query can page through before hitting max_offset.
    max_reachable = (max_offset // page_size + 1) * page_size
    http = session if session is not None else requests
    include_fields = [
        "ApplId",
        "Organization",            # includes name, city, state, etc.
        "PrincipalInvestigators",  # includes first_name, last_name
        "ProjectTitle",
        "AwardNoticeDate",         # maps to our 'date'
        "ProjectStartDate",
        "ProjectEndDate",
        "AwardAmount",             # we'll map to fundsObligatedAmt
        "AbstractText",
    ]

    def fetch_window(org, window_start, window_end):
        """Return the raw result dicts for one organization and one date window."""
        collected = []
        offset = 0
        while True:
            payload = {
                "criteria": {
                    "org_names": [org],
                    # Use Notice of Award date to approximate "date"; you can
                    # switch to project_start_date if preferred
                    "award_notice_date": {
                        "from_date": window_start.strftime("%Y-%m-%d"),
                        "to_date": window_end.strftime("%Y-%m-%d"),
                    },
                },
                "include_fields": include_fields,
                "offset": offset,
                "limit": page_size,
            }
            resp = http.post(url, json=payload, timeout=90)
            resp.raise_for_status()
            data = resp.json()

            results = data.get("results") or []
            total = (data.get("meta") or {}).get("total") or 0

            # Too many matches to page through: split the window and recurse.
            # NOTE(review): assumes from_date/to_date are both inclusive, so the
            # two halves neither overlap nor leave a gap - confirm against the API docs.
            if (offset == 0 and total > max_reachable
                    and (window_end - window_start) >= timedelta(days=1)):
                midpoint = window_start + (window_end - window_start) // 2
                return (fetch_window(org, window_start, midpoint)
                        + fetch_window(org, midpoint + timedelta(days=1), window_end))

            if not results:
                break
            collected.extend(results)

            offset += len(results)
            # When the total is unknown, keep paging until an empty page comes back.
            if total and offset >= total:
                break
            if offset > max_offset:
                warnings.warn(
                    f"NIH RePORTER offset limit reached for {org!r}; results are truncated."
                )
                break
        return collected

    rows = []
    # RePORTER supports posting arrays of org names and a date range filter.
    # We'll query per-org to keep results tidy and easy to troubleshoot.
    for org in org_names:
        for r in fetch_window(org, start_date, end_date):
            org_obj = r.get("organization") or {}
            pis = r.get("principal_investigators") or []
            # Take the first PI if present
            pi_first = pi_last = None
            if pis:
                pi_first = pis[0].get("first_name")
                pi_last = pis[0].get("last_name")

            rows.append({
                "id": str(r.get("appl_id") or ""),
                "agency": "NIH",
                "awardeeName": org_obj.get("org_name") or org,
                "awardeeCity": org_obj.get("org_city"),
                "awardeeStateCode": org_obj.get("org_state"),
                "piFirstName": pi_first,
                "piLastName": pi_last,
                "title": r.get("project_title"),
                # Dates are kept as returned, e.g. "2025-08-11T00:00:00".
                "date": r.get("award_notice_date"),
                "startDate": r.get("project_start_date"),
                "expDate": r.get("project_end_date"),
                "fundsObligatedAmt": r.get("award_amount"),  # normalized name
                "abstractText": r.get("abstract_text"),
            })

    df = pd.DataFrame(rows, columns=OUT_COLS)
    return df

In [9]:
# Keep the NIH result in a variable (as is done for nsf_df_org) so it can be
# inspected or saved later without repeating the API calls.
nih_df_org = fetch_nih_awards(ORG_CANDIDATES, START, TODAY)
nih_df_org

Unnamed: 0,id,agency,awardeeName,awardeeCity,awardeeStateCode,piFirstName,piLastName,title,date,startDate,expDate,fundsObligatedAmt,abstractText
0,11337409,NIH,COREWELL HEALTH,Grand Rapids,MI,KATHLEEN,BUTLER,Cancer Research Consortium of West Michigan (C...,2025-08-11T00:00:00,2014-08-08T00:00:00,2026-07-31T00:00:00,2724475.0,CRCWM Project Summary/Abstract\nThe Cancer Res...
1,10892231,NIH,COREWELL HEALTH,Grand Rapids,MI,KATHLEEN,BUTLER,Cancer Research Consortium of West Michigan (C...,2024-07-29T00:00:00,2014-08-08T00:00:00,2026-07-31T00:00:00,2153395.0,The Cancer Research Consortium of West Michiga...
2,10675484,NIH,COREWELL HEALTH,Grand Rapids,MI,KATHLEEN,BUTLER,Cancer Research Consortium of West Michigan (C...,2023-07-20T00:00:00,2014-08-08T00:00:00,2025-07-31T00:00:00,2113756.0,PROJECT SUMMARY\nThe Cancer Research Consortiu...
3,10460164,NIH,COREWELL HEALTH,Grand Rapids,MI,KATHLEEN,BUTLER,Cancer Research Consortium of West Michigan (C...,2022-08-01T00:00:00,2014-08-08T00:00:00,2025-07-31T00:00:00,2190347.0,PROJECT SUMMARY\nThe Cancer Research Consortiu...
4,10226962,NIH,COREWELL HEALTH,Grand Rapids,MI,KATHLEEN,BUTLER,Cancer Research Consortium of West Michigan (C...,2021-07-30T00:00:00,2014-08-08T00:00:00,2025-07-31T00:00:00,2184338.0,PROJECT SUMMARY\nThe Cancer Research Consortiu...
5,9991757,NIH,COREWELL HEALTH,Grand Rapids,MI,KATHLEEN,BUTLER,Cancer Research Consortium of West Michigan (C...,2020-07-31T00:00:00,2014-08-08T00:00:00,2025-07-31T00:00:00,2302236.0,PROJECT SUMMARY\nThe Cancer Research Consortiu...
6,9771133,NIH,COREWELL HEALTH,Grand Rapids,MI,KATHLEEN,BUTLER,Cancer Research Consortium of West Michigan (C...,2019-08-08T00:00:00,2014-08-08T00:00:00,2025-07-31T00:00:00,2372116.0,PROJECT SUMMARY\nThe Cancer Research Consortiu...
7,9535231,NIH,COREWELL HEALTH,Grand Rapids,MI,KATHLEEN,BUTLER,Cancer Research Consortium of West Michigan (C...,2018-07-23T00:00:00,2014-08-01T00:00:00,2019-07-31T00:00:00,2751806.0,DESCRIPTION (provided by applicant): The Cance...
8,9330824,NIH,COREWELL HEALTH,Grand Rapids,MI,KATHLEEN,BUTLER,Cancer Research Consortium of West Michigan (C...,2017-07-27T00:00:00,2014-08-01T00:00:00,2019-07-31T00:00:00,2341117.0,DESCRIPTION (provided by applicant): The Cance...
9,9130139,NIH,COREWELL HEALTH,Grand Rapids,MI,KATHLEEN,BUTLER,Cancer Research Consortium of West Michigan (C...,2016-08-11T00:00:00,2014-08-01T00:00:00,2019-07-31T00:00:00,1580000.0,DESCRIPTION (provided by applicant): The Cance...


In [None]:
# Persist the NSF results to CSV at a path relative to the current working directory.
# NOTE(review): no index= argument is passed, so pandas' default index handling
# applies - confirm whether the row index is wanted in the file.
nsf_df_org.to_csv("../Data/NSF_data_20years.csv")

In [30]:
# Sanity check: (rows, columns) of the NSF frame.
nsf_df_org.shape

(1515, 14)

In [None]:

    "project_start_date",
    "project_end_date",
    "abstract_text",
]

def fetch(query):
    """Fetch NIH awards for a given query dict"""
    all_rows = []
    offset = 0
    while True:
        body = {
            "criteria": query,
            "include_fields": FIELDS,
            "offset": offset,
            "limit": 500,  # max NIH allows
        }
        r = requests.post(BASE_URL, json=body, timeout=60)
        r.raise_for_status()
        data = r.json()
        if "results" not in data or not data["results"]:
            break
        all_rows.extend(data["results"])
        offset += 500
        if offset >= data.get("meta", {}).get("total", 0):
            break
    return all_rows

all_rows = []

# 1) Exact org match
for org in ORG_CANDIDATES:
    all_rows += fetch({
        "org_name": org,
        "project_start_date": {"from_date": start_date, "to_date": end_date}
    })

# 2) Keyword fallback search
for org in ORG_CANDIDATES:
    all_rows += fetch({
        "text_search": {"search_text": org},
        "project_start_date": {"from_date": start_date, "to_date": end_date}
    })

# 3) Michigan sweep (state filter)
all_rows += fetch({
    "org_state": "MI",
    "text_search": {"search_text": "Beaumont OR Spectrum OR Corewell"},
    "project_start_date": {"from_date": start_date, "to_date": end_date}
})

df = pd.DataFrame(all_rows).drop_duplicates(subset=["project_num"]) if all_rows else pd.DataFrame()

print(f"Total NIH awards found: {len(df)}")
if df.empty:
    print("No NIH awards matched these org names (last 20 years)import requests, datetime, pandas as pd

BASE_URL = "https://api.reporter.nih.gov/v2/projects/search"

today = datetime.date.today()
start_year = today.year - 20
end_year = today.year

ORG_CANDIDATES = [
    "Corewell Health",
    "Beaumont Health",
    "William Beaumont Hospital",
    "Beaumont Hospital",
    "Spectrum Health",
    "Helen DeVos Children's Hospital",
    "Butterworth Hospital",
    "Lakeland Hospital",
    "Blodgett Hospital",
]

FIELDS = [
    "project_num",
    "project_title",
    "org_name",
    "org_city",
    "org_state",
    "principal_investigators",
    "award_amount",
    "fy",
    "project_start_date",
    "project_end_date",
    "abstract_text",
]

def fetch(query):
    rows = []
    offset = 0
    while True:
        body = {
            "criteria": query,
            "include_fields": FIELDS,
            "offset": offset,
            "limit": 500,
        }
        r = requests.post(BASE_URL, json=body, timeout=60)
        r.raise_for_status()
        data = r.json()
        if "results" not in data or not data["results"]:
            break
        rows.extend(data["results"])
        offset += 500
        if offset >= data.get("meta", {}).get("total", 0):
            break
    return rows

all_rows = []

# Loop over years to avoid offset > 14,999
for year in range(start_year, end_year + 1):
    for org in ORG_CANDIDATES:
        all_rows += fetch({
            "org_name": org,
            "fiscal_years": [year]
        })
        # Keyword fallback
        all_rows += fetch({
            "text_search": {"search_text": org},
            "fiscal_years": [year]
        })
    # Michigan sweep
    all_rows += fetch({
        "org_state": "MI",
        "text_search": {"search_text": "Beaumont OR Spectrum OR Corewell"},
        "fiscal_years": [year]
    })

df = pd.DataFrame(all_rows).drop_duplicates(subset=["project_num"]) if all_rows else pd.DataFrame()

print(f"Total NIH awards found: {len(df)}")
if not df.empty:
    cols = [c for c in ["project_num","project_title","org_name","org_city","org_state","fy","project_start_date"] if c in df.columns]
    print(df[cols].head(25).to_string(index=False))
.")
else:
    cols = [c for c in ["project_num","project_title","org_name","org_city","org_state","fy","project_start_date"] if c in df.columns]
    print(df[cols].head(25).to_string(index=False))


HTTPError: 400 Client Error: System doesn't support offset value greater than 14,999. Please narrow down your search criteria. for url: https://api.reporter.nih.gov/v2/projects/search