# National Institutes of Health

Data from the National Institutes of Health via their [RePORTER](https://projectreporter.nih.gov/reporter.cfm) system.

Data is obtained via their RePORTER [API](https://api.reporter.nih.gov/). The dataset can be refreshed from this API on a weekly basis (Monday mornings around 10am EST) to retrieve the latest data. When weekly updates are applied following the initial data backfill, the dataset can be kept extremely up-to-date.

[Data Dictionary](https://api.reporter.nih.gov/documents/Data%20Elements%20for%20RePORTER%20Project%20API_V2.pdf)

## Setup

In [49]:
import datetime
import json
import time
import warnings
from datetime import timedelta
from typing import List, Dict, Any, Optional

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests_cache import CachedSession
from urllib3.util.retry import Retry

#Test if we are in a notebook, load right tqdm
try:
    get_ipython() #type: ignore
    from tqdm.notebook import tqdm
except NameError:
    from tqdm import tqdm

### Parameters

In [50]:
# Notebook parameters. All values are strings; they are validated and
# converted in the "Parameter Clean-Up" cell.
#
# minimum start date is 2011-01-01 when using date_added api field
# NOTE(review): the clean-up cell only rejects dates before 2009-01-01 --
# confirm which lower bound is the intended one.
START_DATE = "2013-01-01"
END_DATE = datetime.datetime.now().strftime("%Y-%m-%d")
OUTPUT_LOCATION = "data/nih.gov_grants.jsonl"

## NEVER COMMIT WITH THIS SET TO TRUE
# Caching is for local development only, so the committed default is off.
USE_CACHE = "False"

#### Parameter Clean-Up

In [51]:
# Validated, typed copies of the notebook parameters:
#   p['start_date'] / p['end_date'] -> datetime.date
#   p['use_cache']                  -> bool
p = {}
# Validate and convert START_DATE
try:
    p['start_date'] = datetime.datetime.strptime(START_DATE, "%Y-%m-%d").date()
except ValueError as err:
    # Chain the original error so the offending value stays visible.
    raise ValueError(
        "Invalid START_DATE format. Please use the format 'YYYY-MM-DD'."
    ) from err
if p['start_date'] < datetime.date(2009, 1, 1):
    # 2009-01-01 itself passes the check, hence "on or after".
    raise ValueError(
        "START_DATE is too early. Please use a date on or after 2009-01-01."
    )

# Validate and convert END_DATE
try:
    p['end_date'] = datetime.datetime.strptime(END_DATE, "%Y-%m-%d").date()
except ValueError as err:
    raise ValueError(
        "Invalid END_DATE format. Please use the format 'YYYY-MM-DD'."
    ) from err

# Ensure END DATE > START_DATE
if p['end_date'] <= p['start_date']:
    raise ValueError("END_DATE must be greater than START_DATE.")


# Validate and convert USE_CACHE
# str() lets a real boolean (True/False) be accepted as well as the documented
# 'True'/'False' strings instead of failing on a missing .lower().
use_cache_text = str(USE_CACHE).strip().lower()
if use_cache_text not in ("true", "false"):
    raise ValueError("Invalid USE_CACHE value. Please use 'True' or 'False'.")
p['use_cache'] = use_cache_text == "true"

In [52]:
## HTTP Configuration

# Retry policy shared by both URL schemes: up to five connection retries
# with a backoff factor of 1.5.
retry = Retry(connect=5, backoff_factor=1.5)
adapter = HTTPAdapter(max_retries=retry)

# Cache for development use only: a SQLite-backed session whose entries
# expire after one day. Otherwise a plain requests session is used.
session = (
    CachedSession("cache.sqlite", backend="sqlite", expire_after=timedelta(days=1))
    if p['use_cache']
    else requests.Session()
)

for scheme in ("http://", "https://"):
    session.mount(scheme, adapter)

### NIH API

In [53]:
class NihApi:
    """Small client for the NIH RePORTER v2 project-search endpoint.

    Requests are POSTed to ``API_URL`` through an HTTP session: the one passed
    to the constructor, or the notebook-level ``session`` object when none is
    passed. When ``follow_rate_limit`` is true, consecutive requests are spaced
    at least ``MIN_REQUEST_INTERVAL`` apart.
    """

    API_URL: str = "https://api.reporter.nih.gov/v2/projects/search"
    # Naive UTC time recorded right after the latest request; None before the first.
    last_request_timestamp: Optional[datetime.datetime] = None
    follow_rate_limit: bool = True
    # Minimum spacing between two requests when follow_rate_limit is enabled.
    MIN_REQUEST_INTERVAL: timedelta = timedelta(seconds=1)
    # Injected HTTP session; None means "use the module-level `session`".
    _session: Optional[Any] = None

    def __init__(self, follow_rate_limit: bool = True, session: Optional[Any] = None):
        """
        Args:
            follow_rate_limit (bool): Whether to pause between requests.
            session (optional): Object with a requests-style ``post`` method.
                Defaults to None, which uses the module-level ``session``.
        """
        self.follow_rate_limit = follow_rate_limit
        self._session = session

    def _wait_for_rate_limit(self) -> None:
        """Sleep until MIN_REQUEST_INTERVAL has passed since the last request."""
        if not (self.follow_rate_limit and self.last_request_timestamp):
            return
        elapsed = datetime.datetime.utcnow() - self.last_request_timestamp
        remaining = (self.MIN_REQUEST_INTERVAL - elapsed).total_seconds()
        if remaining > 0:
            # Sleeping instead of spinning keeps the CPU idle while waiting.
            time.sleep(remaining)

    def projects_search(
        self, payload: Dict[Any, Any], offset: Optional[int] = None, limit: Optional[int] = None
    ) -> Dict[Any, Any]:
        """
        Calls the NIH RePORTER API to get project data.

        The ``payload`` passed in is never modified; ``offset`` and ``limit``
        are merged into a copy, so the same payload can be reused across calls.

        Args:
            payload (Dict[Any, Any]): Allows for manual specification of the complete search payload.
                Documentation at https://api.reporter.nih.gov/?urls.primaryName=V2.0
            offset (int, optional): The offset for the API request. Defaults to None (0). Maximum is 14,999.
            limit (int, optional): The maximum number of results to return. Min = 1, Max = 500
                Defaults to 50 (when not specified), but is more efficient at 500.

        Returns:
            Dict[Any, Any]: The response from the API request.

        Raises:
            ValueError: If offset/limit are given both in the payload and as
                arguments, or if the API request fails or returns an error.
        """

        # Check for overlaps between the payload dictionary and convenience parameters.
        if offset is not None and "offset" in payload:
            raise ValueError(
                "offset is set in both payload and as a convenience parameter"
            )
        if limit is not None and "limit" in payload:
            raise ValueError(
                "limit is set in both payload and as a convenience parameter"
            )

        # Shallow copy: only top-level keys are added, and the caller's dict
        # must stay untouched so it can be sent again with other paging values.
        request_payload = dict(payload)
        if offset is not None:
            request_payload["offset"] = offset
        if limit is not None:
            request_payload["limit"] = limit

        headers = {"Content-Type": "application/json"}
        http = self._session if self._session is not None else session

        self._wait_for_rate_limit()
        response = http.post(
            self.API_URL, data=json.dumps(request_payload), headers=headers
        )
        self.last_request_timestamp = datetime.datetime.utcnow()

        if response.status_code != 200:
            raise ValueError(
                f"API request failed with status code {response.status_code}. Error message: {response.text}"
            )

        response_json = response.json()
        if "error" in response_json:
            raise ValueError(
                f"API request returned an error: {response_json['error']}."
            )

        return response_json

    def _fetch_day(
        self,
        date: datetime.date,
        datefield: str = "date_added",
        add_timestamp_field: bool = False,
        added_timestamp_field_name: str = "_record_crawled_at",
        excessive_results_strategy="error",
        progress_bar: bool = False,
    ) -> tuple:
        """Fetch one day of projects; return ``(projects, total_reported_by_api)``.

        Takes the same arguments as ``get_projects_by_day``. The second element
        lets callers see how many rows were left behind when a day was truncated.
        """
        # Reject a bad strategy before any request is made, not only on the
        # rare days that exceed the limit.
        if excessive_results_strategy not in ("error", "warn", "ignore"):
            raise ValueError(
                f"Invalid excessive_results_strategy value: {excessive_results_strategy}."
            )

        projects: List[Dict[Any, Any]] = []
        result_page_length = 500
        max_results = 14999  # NIH RePORTER API constraint
        # The highest allowed offset is 14,999, so at most 15,000 rows are reachable.
        max_retrievable = max_results + 1
        sort_order = "asc"

        payload = {
            "criteria": {
                datefield: {
                    "from_date": date.strftime("%Y-%m-%d"),
                    "to_date": (date + timedelta(days=1)).strftime("%Y-%m-%d"),
                }
            },
            "sort_field": datefield,
            "sort_order": sort_order,
        }

        # Get the metadata for the day's results
        # Used for determining excessive results and optional tqdm progress bar
        info_response = self.projects_search(payload, limit=1)
        total_results = info_response["meta"]["total"]

        expected_results = total_results
        if total_results > max_results:
            message = f"Number of results ({total_results}) exceeds the maximum ({max_results}) on {date}."
            if excessive_results_strategy == "error":
                raise ValueError(message)
            if excessive_results_strategy == "warn":
                warnings.warn(message)
            # 'warn' and 'ignore' continue with whatever part is reachable.
            expected_results = min(total_results, max_retrievable)

        pbar = None
        if progress_bar and expected_results > 0:
            pbar = tqdm(
                total=expected_results, desc=f"Retrieving NIH projects for {date}"
            )

        try:
            offset = 0
            while len(projects) < expected_results:
                response = self.projects_search(
                    payload, offset=offset, limit=result_page_length
                )
                response_ts = datetime.datetime.utcnow()
                response_projects = response["results"]

                if not response_projects:
                    # An empty page would never advance the offset; stop and
                    # let the count check below report the shortfall.
                    break

                if add_timestamp_field:
                    response_projects = [
                        {**project, added_timestamp_field_name: response_ts}
                        for project in response_projects
                    ]

                n_results = len(response_projects)
                projects.extend(response_projects)
                offset += n_results
                if pbar is not None:
                    pbar.update(n_results)
        finally:
            # Close the bar even when a request fails part-way through.
            if pbar is not None:
                pbar.close()

        if len(projects) != expected_results:
            raise ValueError(
                f"Number of projects retrieved ({len(projects)}) does not match the total number of projects ({expected_results}) for {date}."
            )

        return projects, total_results

    def get_projects_by_day(
        self,
        date: datetime.date,
        datefield: str = "date_added",
        add_timestamp_field: bool = False,
        added_timestamp_field_name: str = "_record_crawled_at",
        excessive_results_strategy="error",
        progress_bar: bool = False,
    ) -> List[Dict[Any, Any]]:
        """
        Retrieves projects added on a specific day.

        Args:
            date (datetime.date): The date to retrieve projects for.
            datefield (str, optional): The date field to search on. Defaults to "date_added".
            add_timestamp_field (bool): When true, will add a field to each record with the timestamp of the crawl. Field is named with `added_timestamp_field_name`
            added_timestamp_field_name (str):  When add_timestamp_field == True, this is the name of the new field added to each project
            excessive_results_strategy (str, optional): The strategy to use when the number of results exceeds the maximum. Defaults to 'error'. Options are 'error', 'warn', and 'ignore'. 'warn' and 'ignore' will retrieve up to 15000 results but will not be able to retrieve results beyond that limit.
            progress_bar (bool, optional): Whether to display a progress bar. Defaults to False.

        Returns:
            List[Dict[Any, Any]]: The list of projects added on the specified day.

        Raises:
            ValueError: If the strategy is invalid, if the day exceeds the
                maximum under the 'error' strategy, if a request fails, or if
                the number of rows retrieved differs from the number expected.
        """
        projects, _ = self._fetch_day(
            date,
            datefield=datefield,
            add_timestamp_field=add_timestamp_field,
            added_timestamp_field_name=added_timestamp_field_name,
            excessive_results_strategy=excessive_results_strategy,
            progress_bar=progress_bar,
        )
        return projects

    def count_projects_added_in_date_range(
        self, start_date: datetime.date, end_date: datetime.date
    ) -> int:
        """
        Counts the number of projects added in a date range.

        Args:
            start_date (datetime.date): The date to start counting projects for.
            end_date (datetime.date): The date to stop counting projects for.

        Returns:
            int: The number of projects added in the specified date range.
        """

        payload = {
            "criteria": {
                "date_added": {
                    "from_date": start_date.strftime("%Y-%m-%d"),
                    "to_date": end_date.strftime("%Y-%m-%d"),
                }
            },
            "sort_field": "date_added",
            "sort_order": "asc",
        }

        # Only the metadata is needed, so a single row is requested.
        response = self.projects_search(payload, limit=1)
        return response["meta"]["total"]

    def get_projects_added_in_date_range(
        self,
        start_date: datetime.date,
        end_date: datetime.date,
        progress_bar=False,
        daily_progress_bar=False,
    ) -> List[Dict[Any, Any]]:
        """
        Retrieves projects added in a date range, one day at a time.

        Days from ``start_date`` up to, but not including, ``end_date`` are
        crawled with the 'warn' strategy, and every record gets the default
        crawl-timestamp field.

        Args:
            start_date (datetime.date): The date to start retrieving projects for.
            end_date (datetime.date): The date to stop retrieving projects for.
            progress_bar (bool, optional): Whether to display a progress bar for the whole range. Defaults to False.
            daily_progress_bar (bool, optional): Whether to display a progress bar for each day. Defaults to False.

        Returns:
            List[Dict[Any, Any]]: The list of projects added in the date range.

        Raises:
            ValueError: If the rows retrieved, plus the rows skipped on
                truncated days, do not add up to the total for the range.
        """

        projects: List[Dict[Any, Any]] = []
        total_projects = self.count_projects_added_in_date_range(start_date, end_date)
        # Rows reported by the API that could not be fetched on truncated days.
        unretrievable = 0

        pbar = None
        if progress_bar:
            pbar = tqdm(
                total=total_projects,
                desc=f"Retrieving Projects for {start_date} to {end_date}",
            )

        try:
            for day_index in range((end_date - start_date).days):
                date = start_date + timedelta(days=day_index)
                projects_on_date, reported_total = self._fetch_day(
                    date,
                    add_timestamp_field=True,
                    progress_bar=daily_progress_bar,
                    excessive_results_strategy="warn",
                )
                unretrievable += reported_total - len(projects_on_date)
                projects.extend(projects_on_date)
                if pbar is not None:
                    pbar.update(len(projects_on_date))
        finally:
            if pbar is not None:
                pbar.close()

        # Truncated days were already warned about; anything else is an error.
        if len(projects) + unretrievable != total_projects:
            raise ValueError(
                f"Number of projects retrieved ({len(projects)}) does not match the total number of projects ({total_projects}) for the date range {start_date} to {end_date}."
            )
        return projects

### Additional NIH Data
We need a mapping from the NIH agencies' (institutes' and centers') codes to their corresponding ROR IDs.

In [54]:
FUNDER_NAME = "National Institutes of Health"
FUNDER_ROR_ID = "https://ror.org/01cwqze88"

# Column order of every row in _IC_ROWS; also the key order of each record.
_IC_FIELDS = ("acronym", "full_name", "org_code", "payload_criteria_value", "ror_id")

# One row per NIH institute/center, in the order the records are published.
_IC_ROWS = [
    ("CC", "Clinical Center", "CC", "CLC", "https://ror.org/04vfsmv21"),
    ("CSR", "Center for Scientific Review", "RG", "CSR", "https://ror.org/04r5s4b52"),
    ("CIT", "Center for Information Technology", "CIT", "CIT", "https://ror.org/03jh5a977"),
    ("FIC", "John E. Fogarty International Center", "TW", "FIC", "https://ror.org/02xey9a22"),
    (
        "NCATS",
        "National Center for Advancing Translational Sciences (NCATS)",
        "TR",
        "NCATS",
        "https://ror.org/04pw6fb54",
    ),
    (
        "NCCIH",
        "National Center for Complementary and Integrative Health",
        "AT",
        "NCCIH",
        "https://ror.org/00190t495",
    ),
    ("NCI", "National Cancer Institute", "CA", "NCI", "https://ror.org/040gcmg81"),
    (
        "NCRR",
        "National Center for Research Resources (dissolved 12/2011)",
        "RR",
        "NCRR",
        "https://ror.org/01cwqze88",
    ),
    ("NEI", "National Eye Institute", "EY", "NEI", "https://ror.org/03wkg3b53"),
    ("NHGRI", "National Human Genome Research Institute", "HG", "NHGRI", "https://ror.org/00baak391"),
    ("NHLBI", "National Heart, Lung, and Blood Institute", "HL", "NHLBI", "https://ror.org/012pb6c26"),
    ("NIA", "National Institute on Aging", "AG", "NIA", "https://ror.org/049v75w11"),
    (
        "NIAAA",
        "National Institute on Alcohol Abuse and Alcoholism",
        "AA",
        "NIAAA",
        "https://ror.org/02jzrsm59",
    ),
    (
        "NIAID",
        "National Institute of Allergy and Infectious Diseases",
        "AI",
        "NIAID",
        "https://ror.org/043z4tv69",
    ),
    (
        "NIAMS",
        "National Institute of Arthritis and Musculoskeletal and Skin Diseases",
        "AR",
        "NIAMS",
        "https://ror.org/006zn3t30",
    ),
    (
        "NIBIB",
        "National Institute of Biomedical Imaging and Bioengineering",
        "EB",
        "NIBIB",
        "https://ror.org/00372qc85",
    ),
    (
        "NICHD",
        "Eunice Kennedy Shriver National Institute of Child Health and Human Development",
        "HD",
        "NICHD",
        "https://ror.org/04byxyr05",
    ),
    ("NIDA", "National Institute on Drug Abuse", "DA", "NIDA", "https://ror.org/00fq5cm18"),
    (
        "NIDCD",
        "National Institute on Deafness and Other Communication Disorders",
        "DC",
        "NIDCD",
        "https://ror.org/04mhx6838",
    ),
    (
        "NIDCR",
        "National Institute of Dental and Craniofacial Research",
        "DE",
        "NIDCR",
        "https://ror.org/004a2wv92",
    ),
    (
        "NIDDK",
        "National Institute of Diabetes and Digestive and Kidney Diseases",
        "DK",
        "NIDDK",
        "https://ror.org/00adh9b73",
    ),
    (
        "NIEHS",
        "National Institute of Environmental Health Sciences",
        "ES",
        "NIEHS",
        "https://ror.org/00j4k1h63",
    ),
    ("NIGMS", "National Institute of General Medical Sciences", "GM", "NIGMS", "https://ror.org/04q48ey07"),
    ("NIMH", "National Institute of Mental Health", "MH", "NIMH", "https://ror.org/04xeg9z08"),
    (
        "NIMHD",
        "National Institute on Minority Health and Health Disparities",
        "MD",
        "NIMHD",
        "https://ror.org/0493hgw16",
    ),
    (
        "NINDS",
        "National Institute of Neurological Disorders and Stroke",
        "NS",
        "NINDS",
        "https://ror.org/01s5ya894",
    ),
    ("NINR", "National Institute of Nursing Research", "NR", "NINR", "https://ror.org/01y3zfr79"),
    ("NLM", "National Library of Medicine", "LM", "NLM", "https://ror.org/0060t0j89"),
    ("OD", "Office of the Director", "OD", "OD", "https://ror.org/00fj8a872"),
]

# Expand each row into a record dict keyed by _IC_FIELDS.
NIH_IC_AGENCY = [dict(zip(_IC_FIELDS, row)) for row in _IC_ROWS]

# Index of the same record objects by their org_code.
lookup_agency = {}
for agency in NIH_IC_AGENCY:
    lookup_agency[agency["org_code"]] = agency

## Get and Clean Data

In [55]:
# Crawl every project added between the validated start and end dates,
# showing one progress bar for the whole range and one per day.
nih = NihApi()
projects = nih.get_projects_added_in_date_range(
    start_date=p['start_date'],
    end_date=p['end_date'],
    progress_bar=True,
    daily_progress_bar=True,
)

Retrieving Projects for 2013-01-01 to 2024-02-27:   0%|          | 0/908860 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-01-06:   0%|          | 0/2610 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-01-13:   0%|          | 0/268 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-01-19:   0%|          | 0/361 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-01-26:   0%|          | 0/177 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-02-02:   0%|          | 0/2834 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-02-10:   0%|          | 0/358 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-02-17:   0%|          | 0/358 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-02-24:   0%|          | 0/136 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-03-03:   0%|          | 0/3094 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-03-10:   0%|          | 0/380 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-03-17:   0%|          | 0/322 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-03-24:   0%|          | 0/209 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-03-31:   0%|          | 0/279 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-04-07:   0%|          | 0/4549 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-04-14:   0%|          | 0/677 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-04-21:   0%|          | 0/495 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-04-28:   0%|          | 0/305 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-05-05:   0%|          | 0/3136 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-05-12:   0%|          | 0/807 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-05-19:   0%|          | 0/1189 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-05-26:   0%|          | 0/543 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-06-02:   0%|          | 0/3291 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-06-09:   0%|          | 0/490 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-06-16:   0%|          | 0/1336 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-06-23:   0%|          | 0/666 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-06-30:   0%|          | 0/296 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-07-07:   0%|          | 0/8128 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-07-14:   0%|          | 0/1608 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-07-21:   0%|          | 0/1234 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-07-28:   0%|          | 0/668 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-08-04:   0%|          | 0/6013 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-08-11:   0%|          | 0/426 [00:00<?, ?it/s]

Retrieving NIH projects for 2013-08-15:   0%|          | 0/11946 [00:00<?, ?it/s]

In [None]:
# Flatten the nested project records into columns, at most two levels deep.
projects_df = pd.json_normalize(data=projects, max_level=2)

# The raw list is not used again, so drop the reference to it.
del projects

#### Set types for columns

In [None]:
# Columns holding dates, to be parsed into pandas datetimes.
date_columns = [
    "project_start_date",
    "project_end_date",
    "award_notice_date",
    "budget_start",
    "budget_end",
    "date_added",
]

# Convert all of them in one column-wise pass.
projects_df[date_columns] = projects_df[date_columns].apply(pd.to_datetime)

### Data Cleaning Functions

In [None]:
def get_ror_id(code):
    """
    Look up the ROR ID registered for an NIH agency code.

    Args:
        code (str): Agency code, matched against the keys of `lookup_agency`.

    Returns:
        str or None: The matching ROR ID, or None when the code is unknown.
    """
    agency = lookup_agency.get(code)
    if agency is None:
        return None
    return agency["ror_id"]


def _format_zipcode(zipcode):
    """Return a 9-digit numeric zip as 'zip5-zip4'; anything else unchanged."""
    if zipcode.isnumeric() and len(zipcode) == 9:
        return f"{zipcode[:5]}-{zipcode[5:]}"
    return zipcode


def build_location(row):
    """
    Builds a "city, zipcode, country" location string from an organization row.

    Parts that are missing or blank are left out, so the result never has
    leading, trailing or doubled separators.

    Args:
        row (pandas.Series): The row containing organization information. Any
            mapping with a ``get`` method works. The fields read are
            ``organization.org_city`` (falling back to ``organization.city``),
            ``organization.org_zipcode``, and ``organization.org_country``
            (falling back to ``organization.country``).

    Returns:
        str: The location string; empty when no part is available.
    """

    def first_present(*keys):
        # `get` tolerates columns that are absent from the row altogether.
        for key in keys:
            value = row.get(key)
            if pd.notnull(value):
                return str(value).strip()
        return ""

    zipcode = row.get("organization.org_zipcode")
    # str() guards against zip codes that arrive as numbers rather than text.
    zipcode = _format_zipcode(str(zipcode).strip()) if pd.notnull(zipcode) else ""

    parts = [
        first_present("organization.org_city", "organization.city"),
        zipcode,
        first_present("organization.org_country", "organization.country"),
    ]
    return ", ".join(part for part in parts if part)


def extract_format_pi_names(pi_list) -> str:
    """
    Joins the full names of principal investigators with " | ".

    Args:
        pi_list: Iterable of dicts that may carry a "full_name" entry. None or
            a float (how pandas represents a missing cell) counts as no list.

    Returns:
        str: The names in their original order; an empty string when the list
        is missing or empty. Entries without a name are skipped.
    """
    if pi_list is None or isinstance(pi_list, float):
        return ""
    names = (pi.get("full_name") for pi in pi_list)
    return " | ".join(name for name in names if name)


### Format data for export

In [None]:
# Start from an empty frame; the export columns are added to it in the next cell.
export_df = pd.DataFrame()

In [None]:
# Map the normalized NIH project columns onto the export schema.
export_df["grant_id"] = "nih::" + projects_df["appl_id"].astype(str)
# Reuse the shared funder constant instead of repeating the literal.
export_df["funder_name"] = (
    FUNDER_NAME + ": " + projects_df["agency_ic_admin.name"].astype(str)
)
export_df["funder_ror_id"] = projects_df["agency_ic_admin.code"].apply(get_ror_id)
export_df["recipient_org_name"] = projects_df["organization.org_name"]
export_df["recipient_location"] = projects_df.apply(build_location, axis=1)

# Grant timing is taken from the budget period columns.
export_df["grant_year"] = projects_df["budget_start"].dt.year
export_df["grant_duration"] = (
    projects_df["budget_end"].dt.date - projects_df["budget_start"].dt.date
)
export_df["grant_start_date"] = projects_df["budget_start"].dt.date
export_df["grant_end_date"] = projects_df["budget_end"].dt.date

export_df["award_amount"] = projects_df["award_amount"]
export_df["award_currency"] = "USD"
export_df["award_amount_usd"] = projects_df["award_amount"]
export_df["source"] = "NIH RePORTER API"
export_df["source_url"] = "https://api.reporter.nih.gov/?urls.primaryName=V2.0"
export_df["grant_title"] = projects_df["project_title"]
# Concatenating with a missing value yields a missing result, so each text
# column is filled with "" first. Otherwise a project lacking just one of
# the three texts would lose its whole description.
export_df["grant_description"] = (
    "PROJECT TITLE: "
    + projects_df["project_title"].fillna("")
    + "\n\n\n PROJECT ABSTRACT:\n"
    + projects_df["abstract_text"].fillna("")
    + "\n\n\n"
    + "PUBLIC HEALTH RELEVANCE STATEMENT: \n"
    + projects_df["phr_text"].fillna("")
)
export_df["grant_category"] = projects_df["funding_mechanism"]

## Notify the user if the record represents a subproject
# NOTE(review): the text below mentions "raw_source_data" while the column
# written further down is named "raw_export_data" -- confirm the intended name.
subproject_comment = "Grant record is for a subproject. Value reflected here is value of the subproject only. Parent grant has cumulative value of funding of all subprojects. If summed, this value may be counted twice if using the overall dataset. Use the project_id in the NIH's raw_source_data if you would like to identify the parent project."

# Rows without a subproject_id keep their missing value; all others get the comment.
export_df['comment'] = projects_df['subproject_id'].where(projects_df['subproject_id'].isna(), subproject_comment)

export_df["_crawled_at"] = projects_df["_record_crawled_at"]
# Keep the complete source record (minus the crawl timestamp) next to the mapped fields.
export_df["raw_export_data"] = projects_df.drop("_record_crawled_at", axis=1).to_dict(
    orient="records"
)

In [None]:
# Write the export as JSON Lines (one record per line) with ISO-formatted dates.
export_df.to_json(
    path_or_buf=OUTPUT_LOCATION,
    orient="records",
    lines=True,
    date_format="iso",
)

In [None]:
# Split the file so it will fit
! split -C 95M --numeric-suffixes --additional-suffix .jsonl {OUTPUT_LOCATION} {OUTPUT_LOCATION.split('.jsonl')[0]}_part_

# Remove the original file so oversize file doesn't get committed
! rm {OUTPUT_LOCATION}

In [None]:
# List every file that shares the export's base name, i.e. the split parts.
! ls {OUTPUT_LOCATION.split('.jsonl')[0]}*

data/nih.gov_grants_part_00.jsonl  data/nih.gov_grants_part_14.jsonl
data/nih.gov_grants_part_01.jsonl  data/nih.gov_grants_part_15.jsonl
data/nih.gov_grants_part_02.jsonl  data/nih.gov_grants_part_16.jsonl
data/nih.gov_grants_part_03.jsonl  data/nih.gov_grants_part_17.jsonl
data/nih.gov_grants_part_04.jsonl  data/nih.gov_grants_part_18.jsonl
data/nih.gov_grants_part_05.jsonl  data/nih.gov_grants_part_19.jsonl
data/nih.gov_grants_part_06.jsonl  data/nih.gov_grants_part_20.jsonl
data/nih.gov_grants_part_07.jsonl  data/nih.gov_grants_part_21.jsonl
data/nih.gov_grants_part_08.jsonl  data/nih.gov_grants_part_22.jsonl
data/nih.gov_grants_part_09.jsonl  data/nih.gov_grants_part_23.jsonl
data/nih.gov_grants_part_10.jsonl  data/nih.gov_grants_part_24.jsonl
data/nih.gov_grants_part_11.jsonl  data/nih.gov_grants_part_25.jsonl
data/nih.gov_grants_part_12.jsonl  data/nih.gov_grants_part_26.jsonl
data/nih.gov_grants_part_13.jsonl
