# National Institutes of Health

Data from the National Institutes of Health via their [RePORTER](https://projectreporter.nih.gov/reporter.cfm) system.

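Data for FY 1985–FY 2022 is obtained from their batch download service, [ExPORTER](https://reporter.nih.gov/exporter). The federal fiscal year runs from October 1 to September 30 and is numbered by the calendar year in which it ends, so FY 2022 covers October 1, 2021 through September 30, 2022. Data for later fiscal years is obtained via the RePORTER [API](https://api.reporter.nih.gov/).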

In [28]:
import pandas as pd
import datetime
from typing import List
pd.set_option('display.max_columns', None)

In [3]:
# fy_years selects which fiscal years are loaded. Accepted forms:
#   - a single year:                         "2019"
#   - an inclusive range of years:           "2018-2023"
#   - a range ending at the current FY:      "2018 - now"
#   - a comma-separated list of years:       "2018, 2019, 2020"
#   - the value 'now' for the current fiscal year only
#   - the value 'ALL' for all valid fiscal years
fy_years = "2010 - now"

# NOTE(review): use_batch is not read by any cell visible in this part of the
# notebook — confirm whether it is still needed or should gate the batch download.
use_batch = True


In [5]:

def validate_and_transform_fy_years(fy_years: str) -> List[int]:
    """
    Validates and transforms fiscal year input into a list of integers representing the fiscal years.

    Keywords are matched case-insensitively and surrounding whitespace is ignored
    everywhere, so ' now ', 'all' and '2018 - NOW' are all accepted.

    Args:
        fy_years (str): The fiscal year input. It can be one of the following:
            - 'ALL': Returns a list of all valid fiscal years from 1985 to the current fiscal year.
            - 'now': Returns a list containing only the current fiscal year.
            - 'year': Returns a list containing that single fiscal year.
            - 'start_year-end_year': Returns a list of fiscal years from start_year to end_year (inclusive).
              Either bound can be 'now' to represent the current fiscal year.
            - 'year1,year2,...': Returns a list of specific fiscal years provided as comma-separated values,
              in the order given. An entry can be 'now'.

    Returns:
        List[int]: A list of integers representing the fiscal years.

    Raises:
        ValueError: If the input cannot be parsed, is outside the valid range of fiscal years,
            or is a range whose start year is after its end year.

    """
    # Read the clock once so year and month cannot disagree across a month boundary.
    today = datetime.datetime.now()
    min_year = 1985
    # Next fiscal year starts in October, so from October on the current FY is calendar year + 1.
    max_year = today.year + 1 if today.month >= 10 else today.year

    def _to_year(token: str) -> int:
        """Convert one token (an integer year or 'now') into a year number; no range check."""
        token = token.strip()
        if token.lower() == 'now':
            return max_year
        try:
            return int(token)
        except ValueError:
            raise ValueError(
                f"Invalid year {token!r}. Expected an integer year or 'now'."
            ) from None

    spec = fy_years.strip()

    if spec.lower() == 'all':
        return list(range(min_year, max_year + 1))

    if '-' in spec:
        bounds = spec.split('-')
        if len(bounds) != 2:
            raise ValueError(
                f"Invalid year range {spec!r}. Expected the form 'start_year-end_year'."
            )
        start_year, end_year = (_to_year(bound) for bound in bounds)
        if start_year < min_year or end_year > max_year:
            raise ValueError(f"Invalid year range. Valid range is {min_year}-{max_year}.")
        # A reversed range would otherwise yield an empty list and fail much later.
        if start_year > end_year:
            raise ValueError(
                f"Invalid year range. Start year {start_year} is after end year {end_year}."
            )
        return list(range(start_year, end_year + 1))

    # A single year (or 'now') is simply a one-element comma-separated list.
    years = [_to_year(token) for token in spec.split(',')]
    if any(year < min_year or year > max_year for year in years):
        raise ValueError(f"Invalid year. Valid range is {min_year}-{max_year}.")
    return years

# Expand the fy_years parameter string into the concrete list of fiscal years
# (raises ValueError here if the parameter is malformed or out of range).
FY_YEARS = validate_and_transform_fy_years(fy_years)

In [6]:
# Inclusive span of fiscal years that are served by the ExPORTER batch files.
min_batch_year, max_batch_year = 1985, 2022
batch_elligible_years = [*range(min_batch_year, max_batch_year + 1)]


In [7]:
# Partition the requested fiscal years by source: years covered by the batch
# files go to batch_years, every other requested year goes to api_years.
batch_years = list(filter(lambda fy: fy in batch_elligible_years, FY_YEARS))
api_years = list(filter(lambda fy: fy not in batch_elligible_years, FY_YEARS))


In [10]:
# List the project archives present in the local batch data directory.
!ls ../analysis/nih/data/batch/

RePORTER_PRJ_C_FY2007.zip  RePORTER_PRJ_C_FY2013.zip  RePORTER_PRJ_C_FY2019.zip
RePORTER_PRJ_C_FY2008.zip  RePORTER_PRJ_C_FY2014.zip  RePORTER_PRJ_C_FY2020.zip
RePORTER_PRJ_C_FY2009.zip  RePORTER_PRJ_C_FY2015.zip  RePORTER_PRJ_C_FY2021.zip
RePORTER_PRJ_C_FY2010.zip  RePORTER_PRJ_C_FY2016.zip  RePORTER_PRJ_C_FY2022.zip
RePORTER_PRJ_C_FY2011.zip  RePORTER_PRJ_C_FY2017.zip
RePORTER_PRJ_C_FY2012.zip  RePORTER_PRJ_C_FY2018.zip


In [17]:
# Trial read of a single locally stored archive (FY2011); malformed rows are
# reported as warnings instead of aborting the read.
df = pd.read_csv(
    "../analysis/nih/data/batch/RePORTER_PRJ_C_FY2011.zip",
    encoding='latin1',
    compression="zip",
    low_memory=False,
    on_bad_lines='warn',
)

In [16]:
import chardet

ModuleNotFoundError: No module named 'chardet'

In [19]:
# URL template for the yearly project archive; '{}' is filled with the fiscal year.
nih_batch_projects_csvzip_url = 'https://reporter.nih.gov/exporter/projects/download/{}'

# Collect one frame per requested batch year. The list gets its own name so that
# batch_dfs only ever refers to the combined DataFrame.
yearly_batch_frames = []

for year in batch_years:
    try:
        df = pd.read_csv(
            nih_batch_projects_csvzip_url.format(year),
            compression='zip',
            low_memory=False,
            encoding='latin1',
            on_bad_lines='warn',  # report malformed rows instead of aborting the download
        )
        yearly_batch_frames.append(df)
    except Exception as e:
        # Re-raise with the failing year in the message, keeping the original cause chained.
        raise ValueError(f"Error reading batch file for year {year}: {e}") from e

# pd.concat raises on an empty list, which happens when no requested year is
# batch-eligible; fall back to an empty frame in that case.
if yearly_batch_frames:
    batch_dfs = pd.concat(yearly_batch_frames, ignore_index=True)
else:
    batch_dfs = pd.DataFrame()


  df = pd.read_csv(nih_batch_projects_csvzip_url.format(year), compression='zip', low_memory=False, encoding='latin1', on_bad_lines='warn')

  df = pd.read_csv(nih_batch_projects_csvzip_url.format(year), compression='zip', low_memory=False, encoding='latin1', on_bad_lines='warn')


In [35]:
import pandas as pd
# Rows whose APPLICATION_ID occurs more than once, kept aside for inspection.
# keep=False flags every member of a repeated-ID group, i.e. exactly the groups
# that drop_duplicates collapses to a single row below.
duplicate_id_projects = batch_dfs[batch_dfs.duplicated(subset='APPLICATION_ID', keep=False)]

# Keep the first row per APPLICATION_ID. Rebinding (instead of inplace=True) plus
# .copy() gives an independent frame, so the column assignments below are safe.
batch_dfs = batch_dfs.drop_duplicates(subset='APPLICATION_ID').copy()

# Format date columns; values that cannot be parsed become NaT (errors='coerce').
date_columns = ['AWARD_NOTICE_DATE', 'BUDGET_START', 'BUDGET_END', 'PROJECT_START', 'PROJECT_END']
for column in date_columns:
    batch_dfs[column] = pd.to_datetime(batch_dfs[column], format='mixed', errors='coerce')

# NOTE(review): the frame displayed by this cell has both FUNDING_ICs and FUNDING_Ics
# columns, and both FOA_NUMBER and OPPORTUNITY NUMBER — presumably the header changed
# between fiscal-year files; confirm whether these pairs should be coalesced.

# Display the modified dataframe
batch_dfs



Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,CORE_PROJECT_NUM,ED_INST_TYPE,FOA_NUMBER,FULL_PROJECT_NUM,FUNDING_ICs,FUNDING_MECHANISM,FY,IC_NAME,NIH_SPENDING_CATS,ORG_CITY,ORG_COUNTRY,ORG_DEPT,ORG_DISTRICT,ORG_DUNS,ORG_FIPS,ORG_NAME,ORG_STATE,ORG_ZIPCODE,PHR,PI_IDS,PI_NAMEs,PROGRAM_OFFICER_NAME,PROJECT_START,PROJECT_END,PROJECT_TERMS,PROJECT_TITLE,SERIAL_NUMBER,STUDY_SECTION,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,TOTAL_COST,TOTAL_COST_SUB_PROJECT,FUNDING_Ics,DIRECT_COST_AMT,INDIRECT_COST_AMT,ORG_IPF_CODE,OPPORTUNITY NUMBER
0,7000731,C06,RR,1.0,Y,2009-10-20,2009-10-20,2013-09-30,702.0,C06RR020081,,PAR-04-122,1C06RR020081-01A1,NCRR:8000000\,Construction,2010,NATIONAL CENTER FOR RESEARCH RESOURCES,,COLUMBUS,UNITED STATES,,3.0,832127323,US,OHIO STATE UNIVERSITY,OH,432101016,,2074044;,"GREVER, MICHAEL R;","MCCULLOUGH, WILLIE",2009-10-20,2013-09-30,Extramural Activities;research facility,"PAR04-122, Extramural Research Facilities Impr...",20081,ZRR1,Special Emphasis Panel,,A1,1.0,8000000.0,,,,,,
1,6826709,C06,RR,1.0,Y,2009-10-20,2009-10-20,2014-10-19,702.0,C06RR020088,SCHOOLS OF VETERINARY MEDICINE,RFA-RR-03-011,1C06RR020088-01,NCRR:3920956\,Construction,2010,NATIONAL CENTER FOR RESEARCH RESOURCES,HIV/AIDS,COLUMBUS,UNITED STATES,NONE,3.0,832127323,US,OHIO STATE UNIVERSITY,OH,432101016,,1893210;,"OGLESBEE, MICHAEL J;","LIN, TI",2009-10-20,2014-10-19,Infectious Diseases Research,FACILITIES IMPROVEMENT FOR INFECTIOUS DISEASE ...,20088,ZRR1,Special Emphasis Panel,,,1.0,3920956.0,,,,,,
2,7000889,C06,RR,1.0,Y,2009-12-09,2009-12-10,2014-06-30,702.0,C06RR020096,ORGANIZED RESEARCH UNITS,PAR-04-122,1C06RR020096-01A1,NCRR:8000000\,Construction,2010,NATIONAL CENTER FOR RESEARCH RESOURCES,HIV/AIDS,LINCOLN,UNITED STATES,NONE,1.0,555456995,US,UNIVERSITY OF NEBRASKA LINCOLN,NE,685830861,,9902047;,"PAUL, PREM SAGAR;","MCCULLOUGH, WILLIE",2009-12-10,2014-06-30,Extramural Activities;Extramural Research Faci...,Extramural Research Facilities Construction,20096,STRB,Scientific and Technical Review Board on Biome...,,A1,1.0,8000000.0,,,,,,
3,7000560,C06,RR,1.0,Y,2009-10-20,2009-10-20,2014-10-19,702.0,C06RR020132,,PAR-04-122,1C06RR020132-01A1,NCRR:4675896\,Construction,2010,NATIONAL CENTER FOR RESEARCH RESOURCES,,BUFFALO,UNITED STATES,,26.0,824771034,US,ROSWELL PARK CANCER INSTITUTE CORP,NY,142630001,,6895846;,"LEE, KELVIN P;","LIN, TI",2009-10-20,2014-10-19,Extramural Research Facilities Construction Pr...,Extramural Research Facility Construction Proj...,20132,STRB,Scientific and Technical Review Board on Biome...,,A1,1.0,4675896.0,,,,,,
4,7000669,C06,RR,1.0,Y,2009-12-09,2009-12-10,2015-06-30,702.0,C06RR020533,ORGANIZED RESEARCH UNITS,PAR-04-122,1C06RR020533-01A1,NCRR:3978104\,Construction,2010,NATIONAL CENTER FOR RESEARCH RESOURCES,,BOISE,UNITED STATES,NONE,2.0,72995848,US,BOISE STATE UNIVERSITY,ID,837250001,,9837979;,"RUDIN, MARK JOSEPH;","MCCULLOUGH, WILLIE",2009-12-10,2015-06-30,Extramural Activities;Extramural Research Faci...,"PAR04-122, Extramural Research Facilities Cons...",20533,STRB,Scientific and Technical Review Board on Biome...,,A1,1.0,3978104.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038042,10909470,R01,DC,7.0,N,2023-08-30,2023-08-16,2024-04-30,173.0,R01DC016269,SCH ALLIED HEALTH PROFESSIONS,,7R01DC016269-06,NIDCD:450474\,NON-SBIR/STTR RPGS,2022,NATIONAL INSTITUTE ON DEAFNESS AND OTHER COMMU...,,SALT LAKE CITY,UNITED STATES,PSYCHOLOGY,1.0,009095365,US,UNIVERSITY OF UTAH,UT,841129049,PUBLIC HEALTH RELEVANCE STATEMENT One in 8 peo...,11711764 (contact),"TANNER, KRISTINE MARIE (contact)","SHEKIM, LANA O",2018-05-14,2024-04-30,,Pathophysiology of Voice Disorders due to Comb...,16269,ZRG1,Special Emphasis Panel[ZRG1-BBBP-B(03)M],,,6.0,450474.0,,,303035.0,147439.0,514002.0,PA-16-160
1038043,10910432,R01,HL,7.0,N,2023-08-25,2023-09-01,2024-01-31,837.0,R01HL115195,SCHOOLS OF MEDICINE,,7R01HL115195-11,NHLBI:698308\,NON-SBIR/STTR RPGS,2022,"NATIONAL HEART, LUNG, AND BLOOD INSTITUTE",,TUCSON,UNITED STATES,INTERNAL MEDICINE/MEDICINE,7.0,806345617,US,UNIVERSITY OF ARIZONA,AZ,857210158,Lay abstract Hypertension is one of the most c...,9640105 (contact),"QIU, HONGYU (contact)","CATANIA, SELEN MURATOGLU",2023-08-21,2024-01-31,,Intrinsic stiffness of aortic vascular smooth ...,115195,HM,Hypertension and Microcirculation Study Sectio...,,,11.0,698308.0,,,527713.0,170595.0,490201.0,PA-21-268
1038044,10910856,U01,GH,6.0,N,2023-08-11,2022-09-30,2024-09-29,326.0,U01GH002243,,,6U01GH002243-05M005,CGH:1276000\NCEZID:260000\NCIRD:6800\,NON-SBIR/STTR RPGS,2022,Center for Global Health,,GUATEMALA,GUATEMALA,,,846107027,GT,UNIVERSITY OF THE VALLEY OF GUATEMALA,,01015,PROJECT NARRATIVE This Guatemala-based proposa...,6864071 (contact),"CORDON-ROSALES, CELIA (contact)",,2018-09-30,2024-09-29,,"GH18-004, Surveillance and research for the in...",2243,ZGH1,ZGH1-HMS(01),,M005,5.0,1542800.0,,,,,10009062.0,RFA-GH-18-004
1038045,10910857,U01,CK,6.0,N,2023-06-12,2023-05-01,2024-04-30,860.0,U01CK000643,SCHOOLS OF MEDICINE,,6U01CK000643-02M001,NCEZID:368449\,NON-SBIR/STTR RPGS,2022,National Center for Emerging and Zoonotic Infe...,,LOS ANGELES,UNITED STATES,INTERNAL MEDICINE/MEDICINE,36.0,092530369,US,UNIVERSITY OF CALIFORNIA LOS ANGELES,CA,900952000,,8573421 (contact),"TALAN, DAVID ANDREW (contact)",,2022-05-01,2027-04-30,,"RFA-CK-22-003, Emerging Infections Sentinel Ne...",643,ZCK1,ZCK1-GCA(48),,M001,2.0,368449.0,,,,,577505.0,RFA-CK-22-003


In [36]:
import requests
import json


In [53]:
# URL that the demo search request is POSTed to.
demo_json_url="https://api.reporter.nih.gov/v2/projects/search"

# Hand-written JSON request body for the demo search: restricts results to fiscal
# years 2023 and 2024, names the fields to include in each result, starts at
# offset 0 and sorts by project_start_date in descending order.
# NOTE(review): no "limit" key is sent, so the page size is whatever the API
# defaults to — confirm before using this payload for full pagination.
demo_json_payload  = """
    {
     "criteria":
     {
       "fiscal_years":[2023,2024]
     },
     "include_fields": [
        "ApplId","SubprojectId","FiscalYear","Organization", "ProjectNum","OrgCountry",
        "ProjectNumSplit","ContactPiName","AllText","FullStudySection",
        "ProjectStartDate","ProjectEndDate"
     ],
     "offset":0,
     "sort_field":"project_start_date",
     "sort_order":"desc"
 }
"""

In [54]:
# Send the demo query. The payload is already a JSON string, so it is passed as the
# raw body with an explicit JSON content type.
r = requests.post(
    demo_json_url,
    data=demo_json_payload,
    headers={'Content-Type': 'application/json'},
    timeout=60,  # without a timeout requests waits indefinitely on an unresponsive server
)
# Fail here with the HTTP status rather than later while decoding an error body.
r.raise_for_status()

In [55]:
# Decode the response body as JSON and display it (a 'meta' section plus a 'results' list).
r.json()

{'meta': {'search_id': 'iXgrfUv0hE-keQWXXsSagw',
  'total': 96173,
  'offset': 0,
  'limit': 50,
  'sort_field': 'project_start_date',
  'sort_order': 'desc',
  'sorted_by_relevance': False,
  'properties': {'URL': 'https:/reporter.nih.gov/search/iXgrfUv0hE-keQWXXsSagw/projects'}},
 'results': [{'appl_id': 10825849,
   'subproject_id': None,
   'fiscal_year': 2023,
   'project_num': '1F32AA031422-01',
   'organization': {'org_name': 'UNIV OF NORTH CAROLINA CHAPEL HILL',
    'city': None,
    'country': None,
    'org_city': 'CHAPEL HILL',
    'org_country': 'UNITED STATES',
    'org_state': 'NC',
    'org_state_name': None,
    'dept_type': 'PSYCHOLOGY',
    'fips_country_code': None,
    'org_duns': ['608195277'],
    'org_ueis': ['D3LHU66KBLD5'],
    'primary_duns': '608195277',
    'primary_uei': 'D3LHU66KBLD5',
    'org_fips': 'US',
    'org_ipf_code': '578206',
    'org_zipcode': '275995023',
    'external_org_id': 578206},
   'project_num_split': {'appl_type_code': '1',
    'acti

In [56]:
# 96173 is the 'total' and 50 the 'limit' reported in the response meta above, so
# 96173/50 is the number of pages; dividing by 60 twice converts a per-page count
# into hours — presumably assuming one request per second (TODO confirm the rate limit).
96173/50/60/60

0.5342944444444445