<a href="https://colab.research.google.com/github/scameronp/nih-reporter/blob/main/NIH_RePORTER_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Context

This project aims at exploring gender disparities in science based on [NIH RePORTER](https://exporter.nih.gov/ExPORTER_Catalog.aspx?sid=1&index=0) project data from 1985 to 2020.

It is supported by the [Canada Research Chair on the Transformations of Scholarly Communication](https://crctcs.openum.ca/en) (Prof. Vincent Larivière).

# Imports

In [1]:
from urllib.request import urlretrieve
import zipfile
import tempfile
import csv
import os
import codecs
import pandas as pd
import itertools
import re

# Headers

In [2]:
# original headers for output.csv file (from RePORTER downloaded csv files)

ACTIVITY = 'ACTIVITY'
ADMINISTERING_IC = 'ADMINISTERING_IC'
APPLICATION_ID = 'APPLICATION_ID'
APPLICATION_TYPE = 'APPLICATION_TYPE'
ARRA_FUNDED = 'ARRA_FUNDED'
AWARD_NOTICE_DATE = 'AWARD_NOTICE_DATE'
BUDGET_END = 'BUDGET_END'
BUDGET_START = 'BUDGET_START'
CFDA_CODE = 'CFDA_CODE'
CORE_PROJECT_NUM = 'CORE_PROJECT_NUM'
DIRECT_COST_AMT = 'DIRECT_COST_AMT'
ED_INST_TYPE = 'ED_INST_TYPE'
FOA_NUMBER = 'FOA_NUMBER'
FULL_PROJECT_NUM = 'FULL_PROJECT_NUM'
FUNDING_ICS = 'FUNDING_ICS'
FUNDING_MECHANISM = 'FUNDING_MECHANISM'
FY = 'FY'
IC_NAME = 'IC_NAME'
INDIRECT_COST_AMT = 'INDIRECT_COST_AMT'
NIH_SPENDING_CATS = 'NIH_SPENDING_CATS'
ORG_CITY = 'ORG_CITY'
ORG_COUNTRY = 'ORG_COUNTRY'
ORG_DEPT = 'ORG_DEPT'
ORG_DISTRICT = 'ORG_DISTRICT'
ORG_DUNS = 'ORG_DUNS'
ORG_FIPS = 'ORG_FIPS'
ORG_IPF_CODE = 'ORG_IPF_CODE'
ORG_NAME = 'ORG_NAME'
ORG_STATE = 'ORG_STATE'
ORG_ZIPCODE = 'ORG_ZIPCODE'
PHR ='PHR'
PI_IDS = 'PI_IDS'
PI_NAMES = 'PI_NAMES'
PROGRAM_OFFICER_NAME = 'PROGRAM_OFFICER_NAME'
PROJECT_END = 'PROJECT_END'
PROJECT_START = 'PROJECT_START'
PROJECT_TERMS = 'PROJECT_TERMS'
PROJECT_TITLE = 'PROJECT_TITLE'
SERIAL_NUMBER = 'SERIAL_NUMBER'
STUDY_SECTION = 'STUDY_SECTION'
STUDY_SECTION_NAME = 'STUDY_SECTION_NAME'
SUBPROJECT_ID = 'SUBPROJECT_ID'
SUFFIX = 'SUFFIX'
SUPPORT_YEAR = 'SUPPORT_YEAR'
TOTAL_COST = 'TOTAL_COST'
TOTAL_COST_SUB_PROJECT = 'TOTAL_COST_SUB_PROJECT'


# new headers for enhanced-output.csv file (added for the project)

NB_PI_IDS = 'NB_PI_IDS'
NB_PI_NAMES = 'NB_PI_NAMES'
CONTACT_PI_ID = 'CONTACT_PI_ID'
CONTACT_PI_NAME = 'CONTACT_PI_NAME'
CONTACT_PI_ID_STATUS = 'CONTACT_PI_ID_STATUS'
CONTACT_PI_NAME_STATUS = 'CONTACT_PI_NAME_STATUS'
CONTACT_PI_FIRST_NAME = 'CONTACT_PI_FIRST_NAME'
CONTACT_PI_GENDER = 'CONTACT_PI_GENDER'

# Data preprocessing

## Step 1: download CSV project files

In [3]:
LOCAL_SOURCE_DIR = '/content/downloads'
# used to download zip files
TMP_DIR = tempfile.gettempdir()
BASE_URL = 'https://exporter.nih.gov/CSVs/final'


for file_name in [f'RePORTER_PRJ_C_FY{year}.zip' for year in range(1985, 2021)]:
    url = f'{BASE_URL}/{file_name}'

    # download file in local
    zip_path = f'{TMP_DIR}/{file_name}'
    urlretrieve(url, zip_path)

    # unzip
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_SOURCE_DIR)

## Step 2: headers reordering

In [4]:
# original headers for output.csv file (from RePORTER downloaded csv files)

headers = [
    APPLICATION_ID,
    ACTIVITY,
    ADMINISTERING_IC,
    APPLICATION_TYPE,
    ARRA_FUNDED,
    AWARD_NOTICE_DATE,
    BUDGET_START,
    BUDGET_END,
    CFDA_CODE,
    CORE_PROJECT_NUM,
    ED_INST_TYPE,
    FOA_NUMBER,
    FULL_PROJECT_NUM,
    FUNDING_ICS,
    FUNDING_MECHANISM,
    FY,
    IC_NAME,
    NIH_SPENDING_CATS,
    ORG_CITY,
    ORG_COUNTRY,
    ORG_DEPT,
    ORG_DISTRICT,
    ORG_DUNS,
    ORG_FIPS,
    ORG_IPF_CODE,
    ORG_NAME,
    ORG_STATE,
    ORG_ZIPCODE,
    PHR,
    PI_IDS,
    PI_NAMES,
    PROGRAM_OFFICER_NAME,
    PROJECT_START,
    PROJECT_END,
    PROJECT_TERMS,
    PROJECT_TITLE,
    SERIAL_NUMBER,
    STUDY_SECTION,
    STUDY_SECTION_NAME,
    SUBPROJECT_ID,
    SUFFIX,
    SUPPORT_YEAR,
    DIRECT_COST_AMT,
    INDIRECT_COST_AMT,
    TOTAL_COST,
    TOTAL_COST_SUB_PROJECT
                 ]

## Step 3: CSV files merge

In [5]:
SOURCE_DIR = '/content/downloads'
TARGET_FILE = '/content/output.csv'


def is_integer(string):
    try:
        int(string)
        return True
    except ValueError as a:
        return False


def is_corrupted(dict_row):
    empty_column = None in dict_row
    is_application_id_not_integer = not is_integer(dict_row[APPLICATION_ID])   
    return empty_column or is_application_id_not_integer


with codecs.open(TARGET_FILE, 'w', encoding='utf-8') as output_file:
    writer = csv.DictWriter(output_file, fieldnames=headers, dialect=csv.excel)
    writer.writeheader()

    csv_files = sorted(os.listdir(SOURCE_DIR))

    for csv_file_name in csv_files:
        print(csv_file_name)

        with codecs.open(SOURCE_DIR + '/' + csv_file_name, 'r', encoding='ISO-8859-1') as csv_file_descriptor:
            reader = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
            for index, dict_row in enumerate(reader):
                if is_corrupted(dict_row):
                    print(csv_file_name, index, dict_row[APPLICATION_ID])
                else:
                    upper_dict = dict()
                    for key, value in dict_row.items():
                        try:
                            # letter case uniformization
                            upper_dict[key.upper()] = value.replace("\n", "")
                        except Exception as e:
                            print(e)
                            print(dict_row)
                            print(csv_file_name)
                            #raise e
                    writer.writerow(upper_dict)

RePORTER_PRJ_C_FY1985.csv
RePORTER_PRJ_C_FY1986.csv
RePORTER_PRJ_C_FY1987.csv
RePORTER_PRJ_C_FY1988.csv
RePORTER_PRJ_C_FY1989.csv
RePORTER_PRJ_C_FY1990.csv
RePORTER_PRJ_C_FY1991.csv
RePORTER_PRJ_C_FY1992.csv
RePORTER_PRJ_C_FY1993.csv
RePORTER_PRJ_C_FY1994.csv
RePORTER_PRJ_C_FY1995.csv
RePORTER_PRJ_C_FY1996.csv
RePORTER_PRJ_C_FY1997.csv
RePORTER_PRJ_C_FY1998.csv
RePORTER_PRJ_C_FY1999.csv
RePORTER_PRJ_C_FY2000.csv
RePORTER_PRJ_C_FY2001.csv
RePORTER_PRJ_C_FY2002.csv
RePORTER_PRJ_C_FY2003.csv
RePORTER_PRJ_C_FY2004.csv
RePORTER_PRJ_C_FY2005.csv
RePORTER_PRJ_C_FY2006.csv
RePORTER_PRJ_C_FY2007.csv
RePORTER_PRJ_C_FY2008.csv
RePORTER_PRJ_C_FY2009.csv
RePORTER_PRJ_C_FY2010.csv
RePORTER_PRJ_C_FY2011.csv
RePORTER_PRJ_C_FY2012.csv
RePORTER_PRJ_C_FY2013.csv
RePORTER_PRJ_C_FY2014.csv
RePORTER_PRJ_C_FY2015.csv
RePORTER_PRJ_C_FY2016_new.csv
RePORTER_PRJ_C_FY2017_new.csv
RePORTER_PRJ_C_FY2017_new.csv 46320 9350897
RePORTER_PRJ_C_FY2018_new.csv
RePORTER_PRJ_C_FY2018_new.csv 55939 9473820
RePORTER_PRJ_C_F

## Step 4: contact PIs first name extraction & gender assignment

In [9]:
SOURCE_FILE = '/content/output.csv'
TARGET_FILE = '/content/enhanced-output.csv'
ENCODING = 'utf-8'


def string_to_list(string):
    pi_list = []
    for string_part in string.rstrip('; ').split(';'):
        string_part = string_part.strip(', ').replace('"', '')
        if string_part != '':
            pi_list.append(string_part)
    return pi_list


def filter_contact_pi(pi_ids_or_names_list):
    filtered_list = []
    for item in pi_ids_or_names_list: 
         if '(contact)' in item:
            filtered_list.append(item)
    return filtered_list


def extract_contact_pi(pi_ids_or_names_list):
    if len(pi_ids_or_names_list) == 0:
        return None, "no_value"
    elif len(pi_ids_or_names_list) == 1:
        return pi_ids_or_names_list[0], "single_value"
    else:              
        filtered_pi_ids_or_names = filter_contact_pi(pi_ids_or_names_list)
        if len(filtered_pi_ids_or_names) == 0:
            return pi_ids_or_names_list[0], "multiple_values_but_no_explicit_contact_first_chosen"
        elif len(filtered_pi_ids_or_names) == 1:
            return filtered_pi_ids_or_names[0], "multiple_values_and_single_explicit_contact"
        else:
            return filtered_pi_ids_or_names[0], "multiple_values_and_multiple_explicit_contacts_first_chosen"


def normalize_first_name(first_name):
    first_name = re.sub('[\.\-\;\,]', '', first_name)
    first_name = re.sub(' +', ' ', first_name)
    first_name_parts = first_name.strip(' ').split(' ')
    l = []
    for part in first_name_parts:
        if len(part) > 1:
            l.append(part)
    return ' '.join(l).upper()


def extract_pi_first_name(full_name):
    if full_name is None:
        return None
    else:
        full_name_list = full_name.split(',')
        if len(full_name_list) <= 1:
            return None
        else:
            return normalize_first_name(full_name_list[1].replace('(contact)', ''))

In [14]:
# upload and clean gender_US_names.csv list

with codecs.open('/content/gender_US_names.csv', 'r', encoding='UTF-8-sig') as csv_file_descriptor:
    gender_dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')

    gender_dict = {}

    for item in gender_dataset:
        normalized_first_name = normalize_first_name(item['Name'])
        gender_dict[normalized_first_name] = item['Gender']

In [15]:
# enhance output.csv file with contact PIs extraction and gender assignment
       
with codecs.open(TARGET_FILE, 'w', encoding=ENCODING) as output_file:
    writer = csv.DictWriter(output_file, fieldnames=[
        APPLICATION_ID,
        ACTIVITY,
        ADMINISTERING_IC,
        APPLICATION_TYPE,
        ARRA_FUNDED,
        AWARD_NOTICE_DATE,
        BUDGET_START,
        BUDGET_END,
        CFDA_CODE,
        CORE_PROJECT_NUM,
        ED_INST_TYPE,
        FOA_NUMBER,
        FULL_PROJECT_NUM,
        FUNDING_ICS,
        FUNDING_MECHANISM,
        FY,
        IC_NAME,
        NIH_SPENDING_CATS,
        ORG_CITY,
        ORG_COUNTRY,
        ORG_DEPT,
        ORG_DISTRICT,
        ORG_DUNS,
        ORG_FIPS,
        ORG_IPF_CODE,
        ORG_NAME,
        ORG_STATE,
        ORG_ZIPCODE,
        PHR,
        PI_IDS,
        PI_NAMES,

        NB_PI_IDS,
        NB_PI_NAMES,
        CONTACT_PI_ID, 
        CONTACT_PI_NAME, 
        CONTACT_PI_ID_STATUS, 
        CONTACT_PI_NAME_STATUS,
        CONTACT_PI_FIRST_NAME,
        CONTACT_PI_GENDER,

        PROGRAM_OFFICER_NAME,
        PROJECT_START,
        PROJECT_END,
        PROJECT_TERMS,
        PROJECT_TITLE,
        SERIAL_NUMBER,
        STUDY_SECTION,
        STUDY_SECTION_NAME,
        SUBPROJECT_ID,
        SUFFIX,
        SUPPORT_YEAR,
        DIRECT_COST_AMT,
        INDIRECT_COST_AMT,
        TOTAL_COST,
        TOTAL_COST_SUB_PROJECT
    ], dialect=csv.excel)
    writer.writeheader()


    with codecs.open(SOURCE_FILE, 'r', encoding=ENCODING) as input_file:
        reader = csv.DictReader(input_file, dialect=csv.excel)

        for index, input_row in enumerate(reader):

            pi_ids_list = string_to_list(input_row[PI_IDS])
            pi_names_list = string_to_list(input_row[PI_NAMES])

            try:
                contact_pi_id, pi_id_status = extract_contact_pi(pi_ids_list)
                contact_pi_name, pi_name_status = extract_contact_pi(pi_names_list)
            except Exception as e:
                print('APPLICATION_ID', input_row[APPLICATION_ID])
                print('PI_IDS', input_row[PI_IDS])
                print('PI_NAMES', input_row[PI_NAMES])
                print('+++')
                print(pi_ids_list)
                print(pi_names_list)
                print(e)
                print('----------------')

            contact_pi_first_name = extract_pi_first_name(contact_pi_name)

            dict_row = {
                APPLICATION_ID: input_row[APPLICATION_ID],
                ACTIVITY: input_row[ACTIVITY],
                ADMINISTERING_IC: input_row[ADMINISTERING_IC],
                APPLICATION_TYPE: input_row[APPLICATION_TYPE],
                ARRA_FUNDED: input_row[ARRA_FUNDED],
                AWARD_NOTICE_DATE: input_row[AWARD_NOTICE_DATE],
                BUDGET_START: input_row[BUDGET_START],
                BUDGET_END: input_row[BUDGET_END],
                CFDA_CODE: input_row[CFDA_CODE],
                CORE_PROJECT_NUM: input_row[CORE_PROJECT_NUM],
                ED_INST_TYPE: input_row[ED_INST_TYPE],
                FOA_NUMBER: input_row[FOA_NUMBER],
                FULL_PROJECT_NUM: input_row[FULL_PROJECT_NUM],
                FUNDING_ICS: input_row[FUNDING_ICS],
                FUNDING_MECHANISM: input_row[FUNDING_MECHANISM],
                FY: input_row[FY],
                IC_NAME: input_row[IC_NAME],
                NIH_SPENDING_CATS: input_row[NIH_SPENDING_CATS],
                ORG_CITY: input_row[ORG_CITY],
                ORG_COUNTRY: input_row[ORG_COUNTRY],
                ORG_DEPT: input_row[ORG_DEPT],
                ORG_DISTRICT: input_row[ORG_DISTRICT],
                ORG_DUNS: input_row[ORG_DUNS],
                ORG_FIPS: input_row[ORG_FIPS],
                ORG_IPF_CODE: input_row[ORG_IPF_CODE],
                ORG_NAME: input_row[ORG_NAME],
                ORG_STATE: input_row[ORG_STATE],
                ORG_ZIPCODE: input_row[ORG_ZIPCODE],
                PHR: input_row[PHR],
                PI_IDS: input_row[PI_IDS],
                PI_NAMES: input_row[PI_NAMES],

                NB_PI_IDS: len(pi_ids_list),
                NB_PI_NAMES: len(pi_names_list),
                CONTACT_PI_ID: contact_pi_id, 
                CONTACT_PI_NAME: contact_pi_name, 
                CONTACT_PI_ID_STATUS: pi_id_status, 
                CONTACT_PI_NAME_STATUS: pi_name_status,
                CONTACT_PI_FIRST_NAME: contact_pi_first_name,
                CONTACT_PI_GENDER: gender_dict.get(contact_pi_first_name),

                PROGRAM_OFFICER_NAME: input_row[PROGRAM_OFFICER_NAME],
                PROJECT_START: input_row[PROJECT_START],
                PROJECT_END: input_row[PROJECT_END],
                PROJECT_TERMS: input_row[PROJECT_TERMS],
                PROJECT_TITLE: input_row[PROJECT_TITLE],
                SERIAL_NUMBER: input_row[SERIAL_NUMBER],
                STUDY_SECTION: input_row[STUDY_SECTION],
                STUDY_SECTION_NAME: input_row[STUDY_SECTION_NAME],
                SUBPROJECT_ID: input_row[SUBPROJECT_ID],
                SUFFIX: input_row[SUFFIX],
                SUPPORT_YEAR: input_row[SUPPORT_YEAR],
                DIRECT_COST_AMT: input_row[DIRECT_COST_AMT],
                INDIRECT_COST_AMT: input_row[INDIRECT_COST_AMT],
                TOTAL_COST: input_row[TOTAL_COST],
                TOTAL_COST_SUB_PROJECT: input_row[TOTAL_COST_SUB_PROJECT]
            }
            writer.writerow(dict_row)

### QA on enhanced output

In [16]:
dataset = pd.read_csv('/content/enhanced-output.csv')
dataset.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,CORE_PROJECT_NUM,ED_INST_TYPE,FOA_NUMBER,FULL_PROJECT_NUM,FUNDING_ICS,FUNDING_MECHANISM,FY,IC_NAME,NIH_SPENDING_CATS,ORG_CITY,ORG_COUNTRY,ORG_DEPT,ORG_DISTRICT,ORG_DUNS,ORG_FIPS,ORG_IPF_CODE,ORG_NAME,ORG_STATE,ORG_ZIPCODE,PHR,PI_IDS,PI_NAMES,NB_PI_IDS,NB_PI_NAMES,CONTACT_PI_ID,CONTACT_PI_NAME,CONTACT_PI_ID_STATUS,CONTACT_PI_NAME_STATUS,CONTACT_PI_FIRST_NAME,CONTACT_PI_GENDER,PROGRAM_OFFICER_NAME,PROJECT_START,PROJECT_END,PROJECT_TERMS,PROJECT_TITLE,SERIAL_NUMBER,STUDY_SECTION,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT
0,3000011,A03,AH,1.0,,,7/1/1985,6/30/1986,,A03AH000859,SCHOOLS OF PUBLIC HEALTH,,1A03AH000859-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,BIRMINGHAM,UNITED STATES,,7.0,4514360.0,US,,UNIVERSITY OF ALABAMA AT BIRMINGHAM,AL,35294,,3700006;,"BRIDGERS, WILLIAM F;",1,1,3700010.0,"BRIDGERS, WILLIAM F",single_value,single_value,WILLIAM,M,,7/1/1985,6/30/1986,,PUBLIC HEALTH TRAINEESHIPS,859,STC,,,,1,,,,
1,3000012,A03,AH,1.0,,,7/1/1985,6/30/1986,,A03AH000860,SCHOOLS OF PUBLIC HEALTH,,1A03AH000860-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,BERKELEY,UNITED STATES,,9.0,94878337.0,US,,UNIVERSITY OF CALIFORNIA BERKELEY,CA,947045940,,2407264;,"LASHOF, JOYCE C.;",1,1,2407260.0,"LASHOF, JOYCE C.",single_value,single_value,JOYCE,F,,7/1/1985,6/30/1986,,PUBLIC HEALTH TRAINEESHIPS,860,STC,,,,1,,,,
2,3000013,A03,AH,1.0,,,7/1/1985,6/30/1986,,A03AH000861,SCHOOLS OF PUBLIC HEALTH,,1A03AH000861-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,LOS ANGELES,UNITED STATES,,30.0,119132785.0,US,,UNIVERSITY OF CALIFORNIA LOS ANGELES,CA,90095,,1871887;,"DETELS, ROGER;",1,1,1871890.0,"DETELS, ROGER",single_value,single_value,ROGER,M,,7/1/1985,6/30/1986,,PUBLIC HEALTH TRAINEESHIPS,861,STC,,,,1,,,,
3,3000014,A03,AH,1.0,,,7/1/1985,6/30/1986,,A03AH000862,SCHOOLS OF PUBLIC HEALTH,,1A03AH000862-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,LOMA LINDA,UNITED STATES,,41.0,,US,,LOMA LINDA UNIVERSITY,CA,92350,,1877259;,"CRAWFORD, JAMES MACKINNON;",1,1,1877260.0,"CRAWFORD, JAMES MACKINNON",single_value,single_value,JAMES MACKINNON,,,7/1/1985,6/30/1986,,PUBLIC HEALTH TRAINEESHIPS,862,STC,,,,1,,,,
4,3000015,A03,AH,1.0,,,7/1/1985,6/30/1986,,A03AH000863,OTHER SPECIALIZED SCHOOLS,,1A03AH000863-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,SAN DIEGO,UNITED STATES,,53.0,73371346.0,US,,SAN DIEGO STATE UNIVERSITY,CA,92182,,1957769;,"SCUTCHFIELD, F DOUGLAS;",1,1,1957770.0,"SCUTCHFIELD, F DOUGLAS",single_value,single_value,DOUGLAS,M,,7/1/1985,6/30/1986,,PUBLIC HEALTH TRAINEESHIPS,863,STC,,,,1,,,,


In [17]:
# count number of successful gender assignment

with codecs.open('enhanced-output.csv', 'r', encoding='UTF-8') as csv_file_descriptor:
    dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
    
    count_f = 0
    count_m = 0
    count_null = 0

    for row in dataset:
        if row[CONTACT_PI_GENDER] == 'F':
            count_f += 1
        elif row[CONTACT_PI_GENDER] == 'M':
            count_m += 1
        else:
            count_null += 1

    print("count_f: ", count_f)
    print("count_m: ", count_m)
    print("count_null: ", count_null)
    print("sum: ", count_f + count_m + count_null)

count_f:  603453
count_m:  1627218
count_null:  308878
sum:  2539549


### Results

| gender    	| nb of rows   	|% of rows   	|
|:-------------	|-----------:	|-----------:	|
| count_f:    	| 603 453   	|23,76 %   	   	|
| count_m:    	| 1 627 218   	|64,08 %   	   	|
| count_null: 	| 308 878   	|12,16 %   	   	|
| sum w/ null:  | 2 539 549 	|100,00 % 	   	|


| gender    	| nb of rows   	|% of rows   	|
|:-------------	|-----------:	|-----------:	|
| count_f:    	| 603 453   	|27,05 %   	   	|
| count_m:    	| 1 627 218   	|72,95 %   	   	|
| sum w/o null: | 2 230 671 	|100,00 % 	   	|


Those preliminary matching results are consistent with literature on the composition of the scientific workforce:

> "At the global level, women account for less than a third of scientists and engineers (1); a percentage that is similar to their proportion of scientific authorships (2). In the United States, women represent 28.4% of the scientific workforce, and this percentage varies by domain, with a high of 72.8% in psychology and a low of 14.5% in engineering (3)." https://doi.org/10.1073/pnas.2113067119

The recall rate could be improved in future iterations (by combining with another dataset or by manually assigning gender). 