<a href="https://colab.research.google.com/github/scameronp/nih-reporter/blob/main/NIH_RePORTER_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Context

This project explores gender disparities in science using [NIH RePORTER](https://exporter.nih.gov/ExPORTER_Catalog.aspx?sid=1&index=0) project data for fiscal years 1985 to 2020.

It is supported by the [Canada Research Chair on the Transformations of Scholarly Communication](https://crctcs.openum.ca/en) (Prof. Vincent Larivière).

# Data preprocessing

## Step 1: download CSV project files

In [7]:
from urllib.request import urlretrieve
import zipfile
import tempfile


# Directory that receives the content extracted from every archive.
LOCAL_SOURCE_DIR = '/content/downloads'
# Parent directory of the scratch folders used to hold the downloaded zip files.
TMP_DIR = tempfile.gettempdir()
BASE_URL = 'https://exporter.nih.gov/CSVs/final'


# One archive per fiscal year, FY1985 through FY2020 (the range end is exclusive).
for file_name in [f'RePORTER_PRJ_C_FY{year}.zip' for year in range(1985, 2021)]:
    url = f'{BASE_URL}/{file_name}'

    # Each archive is fetched into its own throw-away directory, which is
    # deleted as soon as the `with` block exits. This keeps at most one zip on
    # disk at a time and leaves nothing behind in TMP_DIR.
    with tempfile.TemporaryDirectory(dir=TMP_DIR) as scratch_dir:
        zip_path = f'{scratch_dir}/{file_name}'
        urlretrieve(url, zip_path)

        # Extract the archive content into the shared download directory.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(LOCAL_SOURCE_DIR)

## Step 2: header standardization and reordering

In [8]:
# Column-name constants. They are declared in the same order as the `headers`
# list below, which is the column order of the merged output file.

# Application / award columns.
APPLICATION_ID = 'APPLICATION_ID'
ACTIVITY = 'ACTIVITY'
ADMINISTERING_IC = 'ADMINISTERING_IC'
APPLICATION_TYPE = 'APPLICATION_TYPE'
ARRA_FUNDED = 'ARRA_FUNDED'
AWARD_NOTICE_DATE = 'AWARD_NOTICE_DATE'
BUDGET_START = 'BUDGET_START'
BUDGET_END = 'BUDGET_END'
CFDA_CODE = 'CFDA_CODE'
CORE_PROJECT_NUM = 'CORE_PROJECT_NUM'
ED_INST_TYPE = 'ED_INST_TYPE'
FOA_NUMBER = 'FOA_NUMBER'
FULL_PROJECT_NUM = 'FULL_PROJECT_NUM'
FUNDING_ICS = 'FUNDING_ICS'
FUNDING_MECHANISM = 'FUNDING_MECHANISM'
FY = 'FY'
IC_NAME = 'IC_NAME'
NIH_SPENDING_CATS = 'NIH_SPENDING_CATS'

# ORG_* columns.
ORG_CITY = 'ORG_CITY'
ORG_COUNTRY = 'ORG_COUNTRY'
ORG_DEPT = 'ORG_DEPT'
ORG_DISTRICT = 'ORG_DISTRICT'
ORG_DUNS = 'ORG_DUNS'
ORG_FIPS = 'ORG_FIPS'
ORG_IPF_CODE = 'ORG_IPF_CODE'
ORG_NAME = 'ORG_NAME'
ORG_STATE = 'ORG_STATE'
ORG_ZIPCODE = 'ORG_ZIPCODE'

# PI_* and PROJECT_* columns, plus the remaining descriptive columns.
PHR = 'PHR'
PI_IDS = 'PI_IDS'
PI_NAMES = 'PI_NAMES'
PROGRAM_OFFICER_NAME = 'PROGRAM_OFFICER_NAME'
PROJECT_START = 'PROJECT_START'
PROJECT_END = 'PROJECT_END'
PROJECT_TERMS = 'PROJECT_TERMS'
PROJECT_TITLE = 'PROJECT_TITLE'
SERIAL_NUMBER = 'SERIAL_NUMBER'
STUDY_SECTION = 'STUDY_SECTION'
STUDY_SECTION_NAME = 'STUDY_SECTION_NAME'
SUBPROJECT_ID = 'SUBPROJECT_ID'
SUFFIX = 'SUFFIX'
SUPPORT_YEAR = 'SUPPORT_YEAR'

# Cost columns (placed last in the output).
DIRECT_COST_AMT = 'DIRECT_COST_AMT'
INDIRECT_COST_AMT = 'INDIRECT_COST_AMT'
TOTAL_COST = 'TOTAL_COST'
TOTAL_COST_SUB_PROJECT = 'TOTAL_COST_SUB_PROJECT'

# Ordered list of output columns; used as `fieldnames` when writing the merged file.
headers = [
    APPLICATION_ID,
    ACTIVITY,
    ADMINISTERING_IC,
    APPLICATION_TYPE,
    ARRA_FUNDED,
    AWARD_NOTICE_DATE,
    BUDGET_START,
    BUDGET_END,
    CFDA_CODE,
    CORE_PROJECT_NUM,
    ED_INST_TYPE,
    FOA_NUMBER,
    FULL_PROJECT_NUM,
    FUNDING_ICS,
    FUNDING_MECHANISM,
    FY,
    IC_NAME,
    NIH_SPENDING_CATS,
    ORG_CITY,
    ORG_COUNTRY,
    ORG_DEPT,
    ORG_DISTRICT,
    ORG_DUNS,
    ORG_FIPS,
    ORG_IPF_CODE,
    ORG_NAME,
    ORG_STATE,
    ORG_ZIPCODE,
    PHR,
    PI_IDS,
    PI_NAMES,
    PROGRAM_OFFICER_NAME,
    PROJECT_START,
    PROJECT_END,
    PROJECT_TERMS,
    PROJECT_TITLE,
    SERIAL_NUMBER,
    STUDY_SECTION,
    STUDY_SECTION_NAME,
    SUBPROJECT_ID,
    SUFFIX,
    SUPPORT_YEAR,
    DIRECT_COST_AMT,
    INDIRECT_COST_AMT,
    TOTAL_COST,
    TOTAL_COST_SUB_PROJECT,
]

## Step 3: merging the CSV files

In [11]:
import csv
import os
import codecs


# Directory whose entries are listed and read as the CSV files to merge.
SOURCE_DIR = '/content/downloads'
# Path of the merged, tab-separated output file.
TARGET_FILE = '/content/output.tsv'


def is_integer(string):
    """Return True when ``int(string)`` succeeds, False otherwise.

    Args:
        string: The value to test, normally the text of a CSV field. ``None``
            (e.g. a field missing from a short row) is accepted and yields
            False instead of raising.

    Returns:
        bool: True if the value can be converted with ``int()``.
    """
    try:
        int(string)
    except (TypeError, ValueError):
        # ValueError: text that is not an integer literal.
        # TypeError: a non-convertible object such as None.
        return False
    return True


def is_corrupted(dict_row):
    """Tell whether a row read from a source CSV file must be discarded.

    A row is rejected when it holds a ``None`` key (the key under which
    ``csv.DictReader`` stores surplus fields by default) or when its
    APPLICATION_ID value is not an integer.

    Args:
        dict_row: Mapping of column name to value for one CSV row.

    Returns:
        bool: True if the row should be skipped.
    """
    # Both checks are always evaluated, so a row lacking the APPLICATION_ID
    # key raises KeyError regardless of the other check.
    has_invalid_application_id = not is_integer(dict_row[APPLICATION_ID])
    has_none_key = None in dict_row
    return has_none_key or has_invalid_application_id


# Merge every file found in SOURCE_DIR into one tab-separated file whose
# columns follow the order of `headers`.
with codecs.open(TARGET_FILE, 'w', encoding='utf-8') as output_file:
    writer = csv.DictWriter(output_file, fieldnames=headers, dialect=csv.excel_tab)
    writer.writeheader()

    # Sorted so that the files are processed in a deterministic order.
    csv_files = sorted(os.listdir(SOURCE_DIR))

    for csv_file_name in csv_files:
        # Progress trace: one line per source file.
        print(csv_file_name)
        csv_path = SOURCE_DIR + '/' + csv_file_name

        with codecs.open(csv_path, 'r', encoding='ISO-8859-1') as csv_file_descriptor:
            reader = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
            for index, dict_row in enumerate(reader):
                if is_corrupted(dict_row):
                    # Report the rejected row (file, row index, raw ID) and skip it.
                    print(csv_file_name, index, dict_row[APPLICATION_ID])
                    continue

                # Upper-case the column names and drop line feeds from the values.
                upper_dict = {}
                for key, value in dict_row.items():
                    try:
                        upper_dict[key.upper()] = value.replace("\n", "")
                    except Exception as e:
                        # Best effort: log the problem and leave this column
                        # out of the row instead of aborting the merge.
                        print(e)
                        print(dict_row)
                        print(csv_file_name)
                writer.writerow(upper_dict)

RePORTER_PRJ_C_FY1985.csv
RePORTER_PRJ_C_FY1986.csv
RePORTER_PRJ_C_FY1987.csv
RePORTER_PRJ_C_FY1988.csv
RePORTER_PRJ_C_FY1989.csv
RePORTER_PRJ_C_FY1990.csv
RePORTER_PRJ_C_FY1991.csv
RePORTER_PRJ_C_FY1992.csv
RePORTER_PRJ_C_FY1993.csv
RePORTER_PRJ_C_FY1994.csv
RePORTER_PRJ_C_FY1995.csv
RePORTER_PRJ_C_FY1996.csv
RePORTER_PRJ_C_FY1997.csv
RePORTER_PRJ_C_FY1998.csv
RePORTER_PRJ_C_FY1999.csv
RePORTER_PRJ_C_FY2000.csv
RePORTER_PRJ_C_FY2001.csv
RePORTER_PRJ_C_FY2002.csv
RePORTER_PRJ_C_FY2003.csv
RePORTER_PRJ_C_FY2004.csv
RePORTER_PRJ_C_FY2005.csv
RePORTER_PRJ_C_FY2006.csv
RePORTER_PRJ_C_FY2007.csv
RePORTER_PRJ_C_FY2008.csv
RePORTER_PRJ_C_FY2009.csv
RePORTER_PRJ_C_FY2010.csv
RePORTER_PRJ_C_FY2011.csv
RePORTER_PRJ_C_FY2012.csv
RePORTER_PRJ_C_FY2013.csv
RePORTER_PRJ_C_FY2014.csv
RePORTER_PRJ_C_FY2015.csv
RePORTER_PRJ_C_FY2016_new.csv
RePORTER_PRJ_C_FY2017_new.csv
RePORTER_PRJ_C_FY2017_new.csv 46320 9350897
RePORTER_PRJ_C_FY2018_new.csv
RePORTER_PRJ_C_FY2018_new.csv 55939 9473820
RePORTER_PRJ_C_F

## Step 4: contact PI first-name extraction

In [13]:
import csv
import codecs
import itertools
import re


# Tab-separated input file read by this step.
SOURCE_FILE = '/content/output.tsv'
# Tab-separated output file holding one contact-PI record per input row.
TARGET_FILE = '/content/pis.tsv'
# Text encoding used for both the input and the output file.
ENCODING = 'utf-8'


def string_to_list(string):
    """Split a ';'-separated field into a list of cleaned, non-empty entries.

    Trailing semicolons and spaces are removed from the whole string first.
    Each entry then loses its surrounding commas and spaces and every double
    quote it contains. Entries that end up empty are dropped.

    Args:
        string: Raw field text, e.g. the content of PI_IDS or PI_NAMES.

    Returns:
        list: The cleaned entries, in their original order.
    """
    raw_chunks = string.rstrip('; ').split(';')
    cleaned_chunks = (chunk.strip(', ').replace('"', '') for chunk in raw_chunks)
    return [chunk for chunk in cleaned_chunks if chunk != '']

def filter_contact_pi(pi_ids_or_names_list):
    """Return the entries that contain the '(contact)' marker.

    Args:
        pi_ids_or_names_list: List of PI identifiers or PI names.

    Returns:
        list: The marked entries, in their original order.
    """
    return [entry for entry in pi_ids_or_names_list if '(contact)' in entry]

def extract_contact_pi(pi_ids_or_names_list):
    """Pick the contact PI from a list of PI identifiers or names.

    Args:
        pi_ids_or_names_list: List of PI identifiers or PI names.

    Returns:
        tuple: ``(value, status)`` where ``value`` is the chosen entry (None
        for an empty list) and ``status`` names the rule that selected it.
    """
    entry_count = len(pi_ids_or_names_list)
    if entry_count == 0:
        return None, "no_value"
    if entry_count == 1:
        return pi_ids_or_names_list[0], "single_value"

    # Several entries: prefer the ones explicitly flagged as contact.
    flagged_entries = filter_contact_pi(pi_ids_or_names_list)
    flagged_count = len(flagged_entries)
    if flagged_count == 0:
        # Nothing flagged: fall back to the first entry of the full list.
        return pi_ids_or_names_list[0], "multiple_values_but_no_explicit_contact_first_chosen"

    if flagged_count == 1:
        status = "multiple_values_and_single_explicit_contact"
    else:
        status = "multiple_values_and_multiple_explicit_contacts_first_chosen"
    return flagged_entries[0], status

def extract_pi_first_name(full_name):
    """Extract the first name(s) from a 'LAST, FIRST ...' formatted name.

    The text between the first and the second comma is used. Dots are turned
    into spaces, the '(contact)' marker is removed, and only space-separated
    tokens longer than one character are kept.

    Args:
        full_name: Full PI name, or None.

    Returns:
        None when ``full_name`` is None or holds no comma; otherwise the kept
        tokens joined by single spaces (possibly an empty string).
    """
    if full_name is None:
        return None

    comma_parts = full_name.split(',')
    if len(comma_parts) <= 1:
        return None

    given_names = comma_parts[1].replace('.', ' ').replace('(contact)', '')
    # Splitting on single spaces yields empty tokens for repeated or outer
    # spaces; the length filter discards them along with one-letter tokens.
    kept_tokens = [token for token in given_names.split(' ') if len(token) > 1]
    return ' '.join(kept_tokens)
     
    
        
# For every row of SOURCE_FILE, pick the contact PI (identifier and name),
# derive the first name, and write one record to TARGET_FILE as
# tab-separated values.
with codecs.open(TARGET_FILE, 'w', encoding=ENCODING) as output_file:
    writer = csv.DictWriter(output_file, fieldnames=[
        'APPLICATION_ID',
        'PI_ID',
        'PI_NAME',
        'PI_FIRST_NAME',
        'NB_PI_IDS',
        'NB_PI_NAMES',
        'CONTACT_PI_ID_STATUS',
        'CONTACT_PI_NAME_STATUS',
        'RAW_PI_IDS',
        'RAW_PI_NAMES'
    ], dialect=csv.excel_tab)
    writer.writeheader()

    with codecs.open(SOURCE_FILE, 'r', encoding=ENCODING) as input_file:
        reader = csv.DictReader(input_file, dialect=csv.excel_tab)

        for input_row in reader:

            pi_ids_list = string_to_list(input_row['PI_IDS'])
            pi_names_list = string_to_list(input_row['PI_NAMES'])

            # Per-row defaults: if the extraction below fails, the row is
            # written with these values rather than with whatever the
            # previous row left in the variables (or a NameError on the
            # first row).
            contact_pi_id, pi_id_status = None, 'extraction_error'
            contact_pi_name, pi_name_status = None, 'extraction_error'

            try:
                contact_pi_id, pi_id_status = extract_contact_pi(pi_ids_list)
                contact_pi_name, pi_name_status = extract_contact_pi(pi_names_list)
            except Exception as e:
                # Best effort: dump the offending row and keep processing.
                print('APPLICATION_ID', input_row['APPLICATION_ID'])
                print('PI_IDS', input_row['PI_IDS'])
                print('PI_NAMES', input_row['PI_NAMES'])
                print('+++')
                print(pi_ids_list)
                print(pi_names_list)
                print(e)
                print('----------------')

            dict_row = {
                'APPLICATION_ID': input_row['APPLICATION_ID'],
                'PI_ID': contact_pi_id,
                'PI_NAME': contact_pi_name,
                'PI_FIRST_NAME': extract_pi_first_name(contact_pi_name),
                'NB_PI_IDS': len(pi_ids_list),
                'NB_PI_NAMES': len(pi_names_list),
                'CONTACT_PI_ID_STATUS': pi_id_status,
                'CONTACT_PI_NAME_STATUS': pi_name_status,
                # The untouched source fields are kept for later inspection.
                'RAW_PI_IDS': input_row['PI_IDS'],
                'RAW_PI_NAMES': input_row['PI_NAMES']
            }
            writer.writerow(dict_row)

## Step 5: assign gender to first names

In [14]:
# to do

# Data analysis