# Context

This project aims at exploring gender disparities in science based on [NIH RePORTER](https://exporter.nih.gov/ExPORTER_Catalog.aspx?sid=1&index=0) project data from 1985 to 2020.

It is supported by the [Canada Research Chair on the Transformations of Scholarly Communication](https://crctcs.openum.ca/en) (Prof. Vincent Larivière).

# Imports

In [52]:
from urllib.request import urlretrieve
import zipfile
import tempfile
import csv
import os
import codecs
import pandas as pd
import itertools
import re
from datetime import datetime
from utils.csv_headers import *
import random

# Data preprocessing

## Download CSV project files

In [None]:
LOCAL_SOURCE_DIR = '/content/downloads'
# used to download zip files
TMP_DIR = tempfile.gettempdir()
BASE_URL = 'https://exporter.nih.gov/CSVs/final'


for file_name in [f'RePORTER_PRJ_C_FY{year}.zip' for year in range(1985, 2021)]:
    url = f'{BASE_URL}/{file_name}'

    # download file in local
    zip_path = f'{TMP_DIR}/{file_name}'
    urlretrieve(url, zip_path)

    # unzip
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_SOURCE_DIR)

## CSV files merge

In [None]:
SOURCE_DIR = '/content/downloads'
TARGET_FILE = '/content/output.csv'


def is_integer(string):
    try:
        int(string)
        return True
    except ValueError as a:
        return False


def is_corrupted(dict_row):
    empty_column = None in dict_row
    is_application_id_not_integer = not is_integer(dict_row[APPLICATION_ID])   
    return empty_column or is_application_id_not_integer


with codecs.open(TARGET_FILE, 'w', encoding='utf-8') as output_file:
    writer = csv.DictWriter(output_file, fieldnames=ORDERED_HEADERS, dialect=csv.excel)
    writer.writeheader()

    csv_files = sorted(os.listdir(SOURCE_DIR))

    for csv_file_name in csv_files:
        print(csv_file_name)

        with codecs.open(SOURCE_DIR + '/' + csv_file_name, 'r', encoding='ISO-8859-1') as csv_file_descriptor:
            reader = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
            for index, dict_row in enumerate(reader):
                if is_corrupted(dict_row):
                    print(csv_file_name, index, dict_row[APPLICATION_ID])
                else:
                    upper_dict = dict()
                    for key, value in dict_row.items():
                        try:
                            # letter case uniformization
                            upper_dict[key.upper()] = value.replace("\n", "")
                        except Exception as e:
                            print(e)
                            print(dict_row)
                            print(csv_file_name)
                            #raise e
                    writer.writerow(upper_dict)

RePORTER_PRJ_C_FY1985.csv
RePORTER_PRJ_C_FY1986.csv
RePORTER_PRJ_C_FY1987.csv
RePORTER_PRJ_C_FY1988.csv
RePORTER_PRJ_C_FY1989.csv
RePORTER_PRJ_C_FY1990.csv
RePORTER_PRJ_C_FY1991.csv
RePORTER_PRJ_C_FY1992.csv
RePORTER_PRJ_C_FY1993.csv
RePORTER_PRJ_C_FY1994.csv
RePORTER_PRJ_C_FY1995.csv
RePORTER_PRJ_C_FY1996.csv
RePORTER_PRJ_C_FY1997.csv
RePORTER_PRJ_C_FY1998.csv
RePORTER_PRJ_C_FY1999.csv
RePORTER_PRJ_C_FY2000.csv
RePORTER_PRJ_C_FY2001.csv
RePORTER_PRJ_C_FY2002.csv
RePORTER_PRJ_C_FY2003.csv
RePORTER_PRJ_C_FY2004.csv
RePORTER_PRJ_C_FY2005.csv
RePORTER_PRJ_C_FY2006.csv
RePORTER_PRJ_C_FY2007.csv
RePORTER_PRJ_C_FY2008.csv
RePORTER_PRJ_C_FY2009.csv
RePORTER_PRJ_C_FY2010.csv
RePORTER_PRJ_C_FY2011.csv
RePORTER_PRJ_C_FY2012.csv
RePORTER_PRJ_C_FY2013.csv
RePORTER_PRJ_C_FY2014.csv
RePORTER_PRJ_C_FY2015.csv
RePORTER_PRJ_C_FY2016_new.csv
RePORTER_PRJ_C_FY2017_new.csv
RePORTER_PRJ_C_FY2017_new.csv 46320 9350897
RePORTER_PRJ_C_FY2018_new.csv
RePORTER_PRJ_C_FY2018_new.csv 55939 9473820
RePORTER_PRJ_C_F

## Contact PIs first name extraction & gender assignment

In [None]:
SOURCE_FILE = '/content/output.csv'
TARGET_FILE = '/content/enhanced-output.csv'
ENCODING = 'utf-8'


def string_to_list(string):
    pi_list = []
    for string_part in string.rstrip('; ').split(';'):
        string_part = string_part.strip(', ').replace('"', '')
        if string_part != '':
            pi_list.append(string_part)
    return pi_list


def filter_contact_pi(pi_ids_or_names_list):
    filtered_list = []
    for item in pi_ids_or_names_list: 
         if '(contact)' in item:
            filtered_list.append(item)
    return filtered_list


def extract_contact_pi(pi_ids_or_names_list):
    if len(pi_ids_or_names_list) == 0:
        return None, "no_value"
    elif len(pi_ids_or_names_list) == 1:
        return pi_ids_or_names_list[0], "single_value"
    else:              
        filtered_pi_ids_or_names = filter_contact_pi(pi_ids_or_names_list)
        if len(filtered_pi_ids_or_names) == 0:
            return pi_ids_or_names_list[0], "multiple_values_but_no_explicit_contact_first_chosen"
        elif len(filtered_pi_ids_or_names) == 1:
            return filtered_pi_ids_or_names[0], "multiple_values_and_single_explicit_contact"
        else:
            return filtered_pi_ids_or_names[0], "multiple_values_and_multiple_explicit_contacts_first_chosen"


def normalize_first_name(first_name):
    first_name = re.sub('[\.\-\;\,]', '', first_name)
    first_name = re.sub(' +', ' ', first_name)
    first_name_parts = first_name.strip(' ').split(' ')
    l = []
    for part in first_name_parts:
        if len(part) > 1:
            l.append(part)
    return ' '.join(l).upper()


def extract_pi_first_name(full_name):
    if full_name is None:
        return None
    else:
        full_name_list = full_name.split(',')
        if len(full_name_list) <= 1:
            return None
        else:
            return normalize_first_name(full_name_list[1].replace('(contact)', ''))

In [None]:
# upload and clean gender_US_names.csv list

with codecs.open('/content/gender_US_names.csv', 'r', encoding='UTF-8-sig') as csv_file_descriptor:
    gender_dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')

    gender_dict = {}

    for item in gender_dataset:
        normalized_first_name = normalize_first_name(item['Name'])
        gender_dict[normalized_first_name] = item['Gender']

In [None]:
# enhance output.csv file with contact PIs extraction and gender assignment
       
with codecs.open(TARGET_FILE, 'w', encoding=ENCODING) as output_file:
    writer = csv.DictWriter(output_file, fieldnames=[
        APPLICATION_ID,
        ACTIVITY,
        ADMINISTERING_IC,
        APPLICATION_TYPE,
        ARRA_FUNDED,
        AWARD_NOTICE_DATE,
        BUDGET_START,
        BUDGET_END,
        CFDA_CODE,
        CORE_PROJECT_NUM,
        ED_INST_TYPE,
        FOA_NUMBER,
        FULL_PROJECT_NUM,
        FUNDING_ICS,
        FUNDING_MECHANISM,
        FY,
        IC_NAME,
        NIH_SPENDING_CATS,
        ORG_CITY,
        ORG_COUNTRY,
        ORG_DEPT,
        ORG_DISTRICT,
        ORG_DUNS,
        ORG_FIPS,
        ORG_IPF_CODE,
        ORG_NAME,
        ORG_STATE,
        ORG_ZIPCODE,
        PHR,
        PI_IDS,
        PI_NAMES,

        NB_PI_IDS,
        NB_PI_NAMES,
        CONTACT_PI_ID, 
        CONTACT_PI_NAME, 
        CONTACT_PI_ID_STATUS, 
        CONTACT_PI_NAME_STATUS,
        CONTACT_PI_FIRST_NAME,
        CONTACT_PI_GENDER,

        PROGRAM_OFFICER_NAME,
        PROJECT_START,
        PROJECT_END,
        PROJECT_TERMS,
        PROJECT_TITLE,
        SERIAL_NUMBER,
        STUDY_SECTION,
        STUDY_SECTION_NAME,
        SUBPROJECT_ID,
        SUFFIX,
        SUPPORT_YEAR,
        DIRECT_COST_AMT,
        INDIRECT_COST_AMT,
        TOTAL_COST,
        TOTAL_COST_SUB_PROJECT
    ], dialect=csv.excel)
    writer.writeheader()


    with codecs.open(SOURCE_FILE, 'r', encoding=ENCODING) as input_file:
        reader = csv.DictReader(input_file, dialect=csv.excel)

        for index, input_row in enumerate(reader):

            pi_ids_list = string_to_list(input_row[PI_IDS])
            pi_names_list = string_to_list(input_row[PI_NAMES])

            try:
                contact_pi_id, pi_id_status = extract_contact_pi(pi_ids_list)
                contact_pi_name, pi_name_status = extract_contact_pi(pi_names_list)
            except Exception as e:
                print('APPLICATION_ID', input_row[APPLICATION_ID])
                print('PI_IDS', input_row[PI_IDS])
                print('PI_NAMES', input_row[PI_NAMES])
                print('+++')
                print(pi_ids_list)
                print(pi_names_list)
                print(e)
                print('----------------')

            contact_pi_first_name = extract_pi_first_name(contact_pi_name)

            dict_row = {
                APPLICATION_ID: input_row[APPLICATION_ID],
                ACTIVITY: input_row[ACTIVITY],
                ADMINISTERING_IC: input_row[ADMINISTERING_IC],
                APPLICATION_TYPE: input_row[APPLICATION_TYPE],
                ARRA_FUNDED: input_row[ARRA_FUNDED],
                AWARD_NOTICE_DATE: input_row[AWARD_NOTICE_DATE],
                BUDGET_START: input_row[BUDGET_START],
                BUDGET_END: input_row[BUDGET_END],
                CFDA_CODE: input_row[CFDA_CODE],
                CORE_PROJECT_NUM: input_row[CORE_PROJECT_NUM],
                ED_INST_TYPE: input_row[ED_INST_TYPE],
                FOA_NUMBER: input_row[FOA_NUMBER],
                FULL_PROJECT_NUM: input_row[FULL_PROJECT_NUM],
                FUNDING_ICS: input_row[FUNDING_ICS],
                FUNDING_MECHANISM: input_row[FUNDING_MECHANISM],
                FY: input_row[FY],
                IC_NAME: input_row[IC_NAME],
                NIH_SPENDING_CATS: input_row[NIH_SPENDING_CATS],
                ORG_CITY: input_row[ORG_CITY],
                ORG_COUNTRY: input_row[ORG_COUNTRY],
                ORG_DEPT: input_row[ORG_DEPT],
                ORG_DISTRICT: input_row[ORG_DISTRICT],
                ORG_DUNS: input_row[ORG_DUNS],
                ORG_FIPS: input_row[ORG_FIPS],
                ORG_IPF_CODE: input_row[ORG_IPF_CODE],
                ORG_NAME: input_row[ORG_NAME],
                ORG_STATE: input_row[ORG_STATE],
                ORG_ZIPCODE: input_row[ORG_ZIPCODE],
                PHR: input_row[PHR],
                PI_IDS: input_row[PI_IDS],
                PI_NAMES: input_row[PI_NAMES],

                NB_PI_IDS: len(pi_ids_list),
                NB_PI_NAMES: len(pi_names_list),
                CONTACT_PI_ID: contact_pi_id, 
                CONTACT_PI_NAME: contact_pi_name, 
                CONTACT_PI_ID_STATUS: pi_id_status, 
                CONTACT_PI_NAME_STATUS: pi_name_status,
                CONTACT_PI_FIRST_NAME: contact_pi_first_name,
                CONTACT_PI_GENDER: gender_dict.get(contact_pi_first_name),

                PROGRAM_OFFICER_NAME: input_row[PROGRAM_OFFICER_NAME],
                PROJECT_START: input_row[PROJECT_START],
                PROJECT_END: input_row[PROJECT_END],
                PROJECT_TERMS: input_row[PROJECT_TERMS],
                PROJECT_TITLE: input_row[PROJECT_TITLE],
                SERIAL_NUMBER: input_row[SERIAL_NUMBER],
                STUDY_SECTION: input_row[STUDY_SECTION],
                STUDY_SECTION_NAME: input_row[STUDY_SECTION_NAME],
                SUBPROJECT_ID: input_row[SUBPROJECT_ID],
                SUFFIX: input_row[SUFFIX],
                SUPPORT_YEAR: input_row[SUPPORT_YEAR],
                DIRECT_COST_AMT: input_row[DIRECT_COST_AMT],
                INDIRECT_COST_AMT: input_row[INDIRECT_COST_AMT],
                TOTAL_COST: input_row[TOTAL_COST],
                TOTAL_COST_SUB_PROJECT: input_row[TOTAL_COST_SUB_PROJECT]
            }
            writer.writerow(dict_row)

## QA on enhanced output

In [None]:
dataset = pd.read_csv('/content/enhanced-output.csv')
dataset.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,CORE_PROJECT_NUM,ED_INST_TYPE,FOA_NUMBER,FULL_PROJECT_NUM,FUNDING_ICS,FUNDING_MECHANISM,FY,IC_NAME,NIH_SPENDING_CATS,ORG_CITY,ORG_COUNTRY,ORG_DEPT,ORG_DISTRICT,ORG_DUNS,ORG_FIPS,ORG_IPF_CODE,ORG_NAME,ORG_STATE,ORG_ZIPCODE,PHR,PI_IDS,PI_NAMES,NB_PI_IDS,NB_PI_NAMES,CONTACT_PI_ID,CONTACT_PI_NAME,CONTACT_PI_ID_STATUS,CONTACT_PI_NAME_STATUS,CONTACT_PI_FIRST_NAME,CONTACT_PI_GENDER,PROGRAM_OFFICER_NAME,PROJECT_START,PROJECT_END,PROJECT_TERMS,PROJECT_TITLE,SERIAL_NUMBER,STUDY_SECTION,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT
0,3000011,A03,AH,1.0,,,7/1/1985,6/30/1986,,A03AH000859,SCHOOLS OF PUBLIC HEALTH,,1A03AH000859-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,BIRMINGHAM,UNITED STATES,,7.0,4514360.0,US,,UNIVERSITY OF ALABAMA AT BIRMINGHAM,AL,35294,,3700006;,"BRIDGERS, WILLIAM F;",1,1,3700010.0,"BRIDGERS, WILLIAM F",single_value,single_value,WILLIAM,M,,7/1/1985,6/30/1986,,PUBLIC HEALTH TRAINEESHIPS,859,STC,,,,1,,,,
1,3000012,A03,AH,1.0,,,7/1/1985,6/30/1986,,A03AH000860,SCHOOLS OF PUBLIC HEALTH,,1A03AH000860-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,BERKELEY,UNITED STATES,,9.0,94878337.0,US,,UNIVERSITY OF CALIFORNIA BERKELEY,CA,947045940,,2407264;,"LASHOF, JOYCE C.;",1,1,2407260.0,"LASHOF, JOYCE C.",single_value,single_value,JOYCE,F,,7/1/1985,6/30/1986,,PUBLIC HEALTH TRAINEESHIPS,860,STC,,,,1,,,,
2,3000013,A03,AH,1.0,,,7/1/1985,6/30/1986,,A03AH000861,SCHOOLS OF PUBLIC HEALTH,,1A03AH000861-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,LOS ANGELES,UNITED STATES,,30.0,119132785.0,US,,UNIVERSITY OF CALIFORNIA LOS ANGELES,CA,90095,,1871887;,"DETELS, ROGER;",1,1,1871890.0,"DETELS, ROGER",single_value,single_value,ROGER,M,,7/1/1985,6/30/1986,,PUBLIC HEALTH TRAINEESHIPS,861,STC,,,,1,,,,
3,3000014,A03,AH,1.0,,,7/1/1985,6/30/1986,,A03AH000862,SCHOOLS OF PUBLIC HEALTH,,1A03AH000862-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,LOMA LINDA,UNITED STATES,,41.0,,US,,LOMA LINDA UNIVERSITY,CA,92350,,1877259;,"CRAWFORD, JAMES MACKINNON;",1,1,1877260.0,"CRAWFORD, JAMES MACKINNON",single_value,single_value,JAMES MACKINNON,,,7/1/1985,6/30/1986,,PUBLIC HEALTH TRAINEESHIPS,862,STC,,,,1,,,,
4,3000015,A03,AH,1.0,,,7/1/1985,6/30/1986,,A03AH000863,OTHER SPECIALIZED SCHOOLS,,1A03AH000863-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,SAN DIEGO,UNITED STATES,,53.0,73371346.0,US,,SAN DIEGO STATE UNIVERSITY,CA,92182,,1957769;,"SCUTCHFIELD, F DOUGLAS;",1,1,1957770.0,"SCUTCHFIELD, F DOUGLAS",single_value,single_value,DOUGLAS,M,,7/1/1985,6/30/1986,,PUBLIC HEALTH TRAINEESHIPS,863,STC,,,,1,,,,


# High-level exploration

In [None]:
# count number of successful gender assignment

with codecs.open('enhanced-output.csv', 'r', encoding='UTF-8') as csv_file_descriptor:
    dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
    
    count_f = 0
    count_m = 0
    count_null = 0

    for row in dataset:
        if row[CONTACT_PI_GENDER] == 'F':
            count_f += 1
        elif row[CONTACT_PI_GENDER] == 'M':
            count_m += 1
        else:
            count_null += 1

    print("count_f: ", count_f)
    print("count_m: ", count_m)
    print("count_null: ", count_null)
    print("sum: ", count_f + count_m + count_null)

count_f:  603453
count_m:  1627218
count_null:  308878
sum:  2539549


| gender    	| nb of rows   	|% of rows   	|
|:-------------	|-----------:	|-----------:	|
| count_f:    	| 603 453   	|23,76 %   	   	|
| count_m:    	| 1 627 218   	|64,08 %   	   	|
| count_null: 	| 308 878   	|12,16 %   	   	|
| sum w/ null:  | 2 539 549 	|100,00 % 	   	|


| gender    	| nb of rows   	|% of rows   	|
|:-------------	|-----------:	|-----------:	|
| count_f:    	| 603 453   	|27,05 %   	   	|
| count_m:    	| 1 627 218   	|72,95 %   	   	|
| sum w/o null: | 2 230 671 	|100,00 % 	   	|


Those preliminary matching results are consistent with literature on the composition of the scientific workforce:

> "At the global level, women account for less than a third of scientists and engineers (1); a percentage that is similar to their proportion of scientific authorships (2). In the United States, women represent 28.4% of the scientific workforce, and this percentage varies by domain, with a high of 72.8% in psychology and a low of 14.5% in engineering (3)." https://doi.org/10.1073/pnas.2113067119

The recall rate could be improved in future iterations (by combining with another dataset or by manually assigning gender). 

# Data Subsetting

In [24]:
# check consistency of gender % for a specific period of time

def str_to_int(string):
    try:
        return int(string)
    except ValueError as a:
        return None


with codecs.open('sample.csv', 'r', encoding='UTF-8') as csv_file_descriptor:
    dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
    
    count_f = 0
    count_m = 0
    count_null = 0

    for row in dataset:
        FY_int = str_to_int(row[FY])
        if FY_int in range(2015, 2021):
            if row[CONTACT_PI_GENDER] == 'F':
                count_f += 1
            elif row[CONTACT_PI_GENDER] == 'M':
                count_m += 1
            else:
                count_null += 1

    print("count_f: ", count_f)
    print("count_m: ", count_m)
    print("count_null: ", count_null)
    print("sum: ", count_f + count_m + count_null)

count_f:  3
count_m:  10
count_null:  2
sum:  15


In [14]:
# create a random subset with X% rows

SAMPLE_PERCENTAGE = 10

def str_to_int(string):
    try:
        return int(string)
    except ValueError as a:
        return None

with codecs.open('enhanced-output.csv', 'r', encoding='UTF-8') as csv_file_descriptor:
    dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')

    with codecs.open('random_subset.csv', 'w', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=dataset.fieldnames, dialect=csv.excel)
        writer.writeheader()

        for row in dataset:
            if random.randrange(0, 100) < SAMPLE_PERCENTAGE:
                 writer.writerow(row)

In [17]:
# QA: count number of successful gender assignment for the random-subset

with codecs.open('random_subset.csv', 'r', encoding='UTF-8') as csv_file_descriptor:
    dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
    
    count_f = 0
    count_m = 0
    count_null = 0

    for row in dataset:
        if row[CONTACT_PI_GENDER] == 'F':
            count_f += 1
        elif row[CONTACT_PI_GENDER] == 'M':
            count_m += 1
        else:
            count_null += 1

    print("count_f: ", count_f)
    print("count_m: ", count_m)
    print("count_null: ", count_null)
    print("sum: ", count_f + count_m + count_null)

count_f:  60638
count_m:  163014
count_null:  31053
sum:  254705


**Résultats pour 1%** :

| gender    	| nb of rows   	|% of rows   	|
|:-------------	|-----------:	|-----------:	|
| count_f:    	|6 066    	|23,78 %   	   	|
| count_m:    	|16 368    	|64,17 %   	   	|
| count_null: 	|3 073    	|12,05 %   	   	|
| sum w/ null:  |25 507  	|100,00 % 	   	|


| gender    	| nb of rows   	|% of rows   	|
|:-------------	|-----------:	|-----------:	|
| count_f:    	|6 066    	|27,04 %   	   	|
| count_m:    	|16 368    	|72,96 %   	   	|
| sum w/o null: |22 434  	|100,00 % 	   	|

These results are really close to the results obtained with the full dataset (enhanced-output.csv). The random subsetting method used thus seems appropriate for our purpose.

In [4]:
dataset = pd.read_csv('random_subset.csv')
dataset.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,CORE_PROJECT_NUM,...,SERIAL_NUMBER,STUDY_SECTION,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT
0,3001993,A19,AH,1.0,,,7/1/1985,6/30/1986,,A19AH000173,...,173,STC,,,,1.0,,,,
1,3002019,A19,AH,1.0,,,7/1/1985,6/30/1986,,A19AH000200,...,200,STC,,,,1.0,,,,
2,3003935,D10,NU,1.0,,,2/1/1985,1/31/1988,,D10NU024219,...,24219,PGR,,,,1.0,,,,
3,3004120,D10,NU,5.0,,,8/1/1985,7/31/1986,,D10NU025143,...,25143,PGR,,,,3.0,,,,
4,3005815,D15,PE,5.0,,,7/1/1985,6/30/1986,,D15PE014261,...,14261,STC,,,,2.0,,,,


# Prochaines étapes : circonscrire la portée de l'analyse

## Préambule

L'étude de [Hoppe et al. (2019)](https://www.science.org/doi/abs/10.1126/sciadv.aaw7238) a montré que le taux de financement plus faible des chercheur.e.s Afro-américain.e.s/noir.e.s (AA/B) s'explique par trois étapes du processus décisionnel de la part des évaluateur.trice.s :

> all three of the factors that underlie the funding gap—preference for some topics over others, assignment of poorer scores, and decision to discuss an application—revolve around decisions made by reviewers

De plus, l'étude de [Kozlowskia et al. (2019)](https://www.pnas.org/content/119/2/e2113067119) a montré que :

>  minoritized authors tend to publish in scientific disciplines and on research topics that reflect their gendered and racialized social identities

Autrement dit, que :

> there is a privilege of choice in scientific knowledge production, wherein research on a particular topic is influenced by scientist’s race and gender

On peut donc penser que des mécanismes similaires à ceux identifiés par Hoppe et al. (2019) et par [Magua et al. (2017)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5446598/) sont à l'oeuvre pour expliquer le sous-financement des femmes en recherche. Notre jeu de données ne nous permet cependant pas d'en étudier les causes sous-jacentes comme l'ont fait Hoppe et al. (2019), ni de mener une analyse intersectionnelle comme l'ont fait Kozlowskia et al. (2019). Il nous permet toutefois de faire l'état des lieux et d'explorer les questions ci-dessous.

## Exemples de questions exploratoires

* Pour étudier la concentration du financement :
    * Les femmes sont-elles moins financées que les hommes par les NIH?
    * Les femmes sont-elles financées aussi souvent que les hommes, i.e. quelle est la proportion de femmes vs hommes qui ont obtenu plus d'un financement? Combien en ont-elles/ils obtenus?
    * Quelle est la durée des projets financés pour les femmes vs les hommes?
* Et son évolution dans le temps :
    * Quelle est l'évolution du financement des femmes vs des hommes dans le temps? Le "funding gap" est-il stable, se réduit-il, augmente-t-il?
    * **RÉGRESSION :** Si le "funding gap" se réduit, dans combien d'années peut-on estimer que la parité sera atteinte si la tendance se maintient? (voir [Holman et al., 2018](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.2004956))
* Pour étudier l'effet de la collaboration scientifique :
    * Les femmes ont-elles davantage de co-PIs que les hommes ou l'inverse? Combien en ont-elles/ils?
    * Les équipes plus nombreuses sont-elles plus financées, plus souvent, plus longtemps?

## Autres questions pour aller plus loin...

* NLP : s'inspirer de la méthode de Hoppe et al. (2019) pour identifier des champs de recherche à partir du titre des projets et des termes associés
    * Quels sont les champs de recherche les plus et les moins financés?
    * Quels sont les champs de recherche où oeuvrent davantage les femmes vs les hommes? 
* Inclure les autres types de financement des NIH dans l'analyse à des fins de comparaison (p. ex. subvention d’infrastructure vs subvention de projet)

## Et en ajoutant d'autres données...

* D'autres données des NIH :
    * Les résumés des projets financés, pour améliorer l'identification des champs de recherche et mieux répondre aux questions ci-dessus
    * Les publications associées aux projets financés, pour étudier la concentration de la production scientifique :
        * À financement égal et en normalisant par champ de recherche, les femmes publient-elles autant que les hommes? (voir [Larivière et al., 2013](https://www.nature.com/articles/504211a))
* Des données externes :
    * Taille / prestige des universités d'attache des PIs contacts

## Problèmes, interrogations, bloquants, limitations

* Restreindre aux subventions de type [R01](https://grants.nih.gov/grants/funding/r01.htm) dans un premier temps, comme l'ont fait Magua et al. (2017)
* Attribuer un genre seulement aux PIs contacts lorsqu'il y a plusieurs PIs
* Essayer de comprendre les "application type" et voir si je restreins à certains types seulement
* Comment gérer les projets vs sous-projets?
    * Il y a une hiérarchie dans les données : des projets, des sous-projets. Pour un projet parent, le montant total est dans la colonne TOTAL_COST, mais pour un sous-projet, le montant sera dans TOTAL_COST_SUB_PROJECT et il n'y aura rien dans TOTAL_COST.
    * Pour voir si une ligne réfère à un projet parent ou à un sous-projet, il faut regarder dans la colonne SUBPROJECT_ID. Quand c'est vide, on parle d'un projet parent ou d'un projet qui n'a pas de sous-projets, et quand il y a un ID, il s'agit d'un sous-projet. Dans ce dernier cas, l'ID du projet parent se trouve dans la colonne FULL_PROJECT_NUMBER.
    * Pour les années plus récentes, on a aussi de nouvelles colonnes DIRECT_COST_AMT et INDIRECT_COST_AMT. En général, DIRECT_COST_AMT +  INDIRECT_COST_AMT = TOTAL_COST, mais parfois ce n'est pas le cas (par exemple, pour le APPLICATION_ID 8857469 qui n'a pourtant pas de sous-projet).

## ~~To do~~

* ~~Re-rouler le subsetting en ajoutant un seed() à des fins de reproductibilité~~
* ~~Enlever les colonnes que je n'utiliserai pas~~
* ~~Enlever les lignes qui ne correspondent pas à une subvention R01~~
* ~~Au besoin, enlever les lignes qui ne correspondent pas à un application type que je ne souhaite pas analyser~~
* ~~Enlever les lignes pour lesquelles le genre du PI contact n'a pas été attribué (y revenir dans une itération future, coder à la main et/ou utiliser d'autres listes)~~
* ~~Enlever les autres lignes où un champ important est vide (à définir)~~
* ~~Compter le nombre de lignes restantes et vérifier si le subset est encore représentatif -- autrement, travailler à partir du enhanced-output complet, puis re-générer un nouveau subset~~
* ~~Trouver comment gérer les projets / sous-projets~~
* ~~Finir la revue de littérature, puis ajuster mes questions au besoin~~

In [16]:
# compter le nombre de lignes pour les subventions de type R01

with codecs.open('random_subset.csv', 'r', encoding='UTF-8') as csv_file_descriptor:
    dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
    
    count_r01 = 0

    for row in dataset:
        if row[ACTIVITY] == 'R01':
            count_r01 += 1

    print("count_r01: ", count_r01)

count_r01:  89841


In [18]:
# compter la répartition F-H-NULL pour les subventions R01

with codecs.open('random_subset.csv', 'r', encoding='UTF-8') as csv_file_descriptor:
    dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
    
    count_r01 = 0
    count_r01_f = 0
    count_r01_m = 0
    count_r01_null = 0
    
    for row in dataset:
        if row[ACTIVITY] == 'R01':
            count_r01 += 1
            if row[CONTACT_PI_GENDER] == 'F':
                count_r01_f += 1
            elif row[CONTACT_PI_GENDER] == 'M':
                count_r01_m += 1
            else:
                count_r01_null += 1

    print("count_r01_f: ", count_r01_f)
    print("count_r01_m: ", count_r01_m)
    print("count_r01_null: ", count_r01_null)
    print("sum_r01: ", count_r01_f + count_r01_m + count_r01_null) # should be the same as count_r01
    print("count_r01: ", count_r01)

count_r01_f:  19876
count_r01_m:  61419
count_r01_null:  8546
sum_r01:  89841
count_r01:  89841


Les résultats avec 1% s'éloignent un peu des proportions du dataset au global... Est-ce un effet du trop petit échantillonnage ou bien d'une sous-représentation encore plus importante des femmes pour ce type de subvention?

Quand on compare avec un subset de 10% des lignes, on obtient un résultat très proche de celui du dataset entier, voir ci-dessous :

Donc on y va avec un subset de 10%.

In [12]:
# compter la répartition F-H-NULL pour les subventions R01, pour tout le dataset entier

with codecs.open('enhanced-output.csv', 'r', encoding='UTF-8') as csv_file_descriptor:
    dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
    
    count_r01 = 0
    count_r01_f = 0
    count_r01_m = 0
    count_r01_null = 0
    
    for row in dataset:
        if row[ACTIVITY] == 'R01':
            count_r01 += 1
            if row[CONTACT_PI_GENDER] == 'F':
                count_r01_f += 1
            elif row[CONTACT_PI_GENDER] == 'M':
                count_r01_m += 1
            else:
                count_r01_null += 1

    print("count_r01_f: ", count_r01_f)
    print("count_r01_m: ", count_r01_m)
    print("count_r01_null: ", count_r01_null)
    print("sum_r01: ", count_r01_f + count_r01_m + count_r01_null) # should be the same as count_r01
    print("count_r01: ", count_r01)

count_r01_f:  197157
count_r01_m:  612168
count_r01_null:  85977
sum_r01:  895302
count_r01:  895302


# Subset Cleaning

In [102]:
# Removing lines and columns to reduce the scope of the project

subset_headers = [APPLICATION_ID,
                  CONTACT_PI_NAME, 
                  CONTACT_PI_GENDER, 
                  NB_PI_NAMES, 
                  ORG_COUNTRY, 
                  ORG_STATE, 
                  ORG_NAME, 
                  PROJECT_START, 
                  PROJECT_END, 
                  PROJECT_TITLE, 
                  PROJECT_TERMS, 
                  TOTAL_COST]

subset_headers_set = set(subset_headers) #on crée un set parce que c'est indexé et python va parcourir plus vite que lire une liste


with codecs.open('enhanced-output.csv', 'r', encoding='UTF-8') as csv_file_descriptor:
    dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')

    with codecs.open('cleaned_subset.csv', 'w', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=subset_headers, dialect=csv.excel)
        writer.writeheader()

        for row in dataset:
            if row[ACTIVITY] == 'R01' and \
                row[APPLICATION_TYPE] == '1' and \
                row[FY] == '2020' and \
                row[SUPPORT_YEAR] == '01' and \
                row[TOTAL_COST] != '' and (
                row[CONTACT_PI_GENDER] == 'F' or
                row[CONTACT_PI_GENDER] == 'M'
            ):
                subset_row = {}
                for key, value in row.items():
                    if key in subset_headers_set:
                        subset_row[key] = value
                writer.writerow(subset_row)

In [22]:
# QA on the cleaned subset

with codecs.open('cleaned_subset.csv', 'r', encoding='UTF-8') as csv_file_descriptor:
    dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
    
    count_r01 = 0
    count_r01_f = 0
    count_r01_m = 0
    count_r01_null = 0
    count_other_activities = 0
    
    for row in dataset:
        if row[ACTIVITY] == 'R01':
            count_r01 += 1
            if row[CONTACT_PI_GENDER] == 'F':
                count_r01_f += 1
            elif row[CONTACT_PI_GENDER] == 'M':
                count_r01_m += 1
            else:
                count_r01_null += 1
        else:
            count_other_activities += 1

    print("count_r01_f: ", count_r01_f)
    print("count_r01_m: ", count_r01_m)
    print("count_r01_null: ", count_r01_null)
    print("sum_r01: ", count_r01_f + count_r01_m + count_r01_null) # should be the same as count_r01
    print("count_r01: ", count_r01)
    print("count_other_activities: ", count_other_activities)

count_r01_f:  19876
count_r01_m:  61419
count_r01_null:  0
sum_r01:  81295
count_r01:  81295
count_other_activities:  0
