# Imports

In [1]:
import csv
import itertools
import os
import re
import tempfile
import zipfile
from datetime import datetime
from urllib.request import urlretrieve

import pandas as pd

from utils.csv_headers import *

# Data preprocessing

## Download CSV project files

In [None]:
# Directory the yearly archives are extracted into.
LOCAL_SOURCE_DIR = 'content/downloads'
# Scratch directory that receives the downloaded zip files.
TMP_DIR = tempfile.gettempdir()
BASE_URL = 'https://exporter.nih.gov/CSVs/final'


# One archive per fiscal year, 1985 through 2020 inclusive.
archive_names = (f'RePORTER_PRJ_C_FY{year}.zip' for year in range(1985, 2021))

for file_name in archive_names:
    zip_path = f'{TMP_DIR}/{file_name}'

    # Fetch the archive into the scratch directory.
    urlretrieve(f'{BASE_URL}/{file_name}', zip_path)

    # Unpack every member of the archive into the download directory.
    with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
        zip_ref.extractall(path=LOCAL_SOURCE_DIR)

## CSV files merge

In [None]:
# Directory whose files are listed and read as CSV inputs by the merge step.
SOURCE_DIR = 'content/downloads'
# Path of the single merged CSV file written by the merge step.
TARGET_FILE = 'content/output.csv'


def is_integer(string):
    """Return True if ``string`` can be converted by ``int()``, else False.

    Values that ``int()`` rejects with ``TypeError`` (for example ``None``,
    which ``csv.DictReader`` yields for columns missing from a short row)
    are reported as not-an-integer instead of raising.
    """
    try:
        int(string)
    except (ValueError, TypeError):
        return False
    return True


def is_corrupted(dict_row):
    """Tell whether a parsed CSV row must be rejected.

    A row is rejected when it contains the key ``None`` or when its
    APPLICATION_ID value is not parseable as an integer.
    """
    # With csv.DictReader's default restkey, fields beyond the header are
    # stored under the key None, so its presence marks a malformed row.
    has_none_key = None in dict_row
    has_integer_application_id = is_integer(dict_row[APPLICATION_ID])
    return has_none_key or not has_integer_application_id


# Merge every file found in SOURCE_DIR into TARGET_FILE, skipping corrupted
# rows, upper-casing the column names and removing line breaks from values.
# Files handed to the csv module are opened with newline='' so the module
# controls line endings itself (no doubled '\r' on write, embedded line
# breaks in quoted fields parsed correctly on read).
with open(TARGET_FILE, 'w', encoding='utf-8', newline='') as output_file:
    writer = csv.DictWriter(output_file, fieldnames=ORDERED_HEADERS, dialect=csv.excel)
    writer.writeheader()

    csv_files = sorted(os.listdir(SOURCE_DIR))

    for csv_file_name in csv_files:
        print(csv_file_name)

        csv_path = os.path.join(SOURCE_DIR, csv_file_name)
        with open(csv_path, 'r', encoding='ISO-8859-1', newline='') as csv_file_descriptor:
            reader = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')
            for index, dict_row in enumerate(reader):
                if is_corrupted(dict_row):
                    # Report the rejected row and move on to the next one.
                    print(csv_file_name, index, dict_row[APPLICATION_ID])
                    continue

                upper_dict = {}
                for key, value in dict_row.items():
                    try:
                        # letter case uniformization; with newline='' a line
                        # break inside a field may be '\r\n', so drop both
                        # characters
                        upper_dict[key.upper()] = value.replace("\r", "").replace("\n", "")
                    except AttributeError as e:
                        # A row shorter than the header yields None values:
                        # report it and leave the column out, so the writer
                        # emits an empty cell for it.
                        print(e)
                        print(dict_row)
                        print(csv_file_name)
                writer.writerow(upper_dict)

RePORTER_PRJ_C_FY1985.csv
RePORTER_PRJ_C_FY1986.csv
RePORTER_PRJ_C_FY1987.csv
RePORTER_PRJ_C_FY1988.csv
RePORTER_PRJ_C_FY1989.csv
RePORTER_PRJ_C_FY1990.csv
RePORTER_PRJ_C_FY1991.csv
RePORTER_PRJ_C_FY1992.csv
RePORTER_PRJ_C_FY1993.csv
RePORTER_PRJ_C_FY1994.csv
RePORTER_PRJ_C_FY1995.csv
RePORTER_PRJ_C_FY1996.csv
RePORTER_PRJ_C_FY1997.csv
RePORTER_PRJ_C_FY1998.csv
RePORTER_PRJ_C_FY1999.csv
RePORTER_PRJ_C_FY2000.csv
RePORTER_PRJ_C_FY2001.csv
RePORTER_PRJ_C_FY2002.csv
RePORTER_PRJ_C_FY2003.csv
RePORTER_PRJ_C_FY2004.csv
RePORTER_PRJ_C_FY2005.csv
RePORTER_PRJ_C_FY2006.csv
RePORTER_PRJ_C_FY2007.csv
RePORTER_PRJ_C_FY2008.csv
RePORTER_PRJ_C_FY2009.csv
RePORTER_PRJ_C_FY2010.csv
RePORTER_PRJ_C_FY2011.csv
RePORTER_PRJ_C_FY2012.csv
RePORTER_PRJ_C_FY2013.csv
RePORTER_PRJ_C_FY2014.csv
RePORTER_PRJ_C_FY2015.csv
RePORTER_PRJ_C_FY2016_new.csv
RePORTER_PRJ_C_FY2017_new.csv
RePORTER_PRJ_C_FY2017_new.csv 46320 9350897
RePORTER_PRJ_C_FY2018_new.csv
RePORTER_PRJ_C_FY2018_new.csv 55939 9473820
RePORTER_PRJ_C_F

## Contact PIs first name extraction & gender assignment

In [None]:
# Merged CSV file read as input by the enhancement step.
SOURCE_FILE = 'content/output.csv'
# CSV file written by the enhancement step (rebinds the TARGET_FILE name
# already used by the merge step).
TARGET_FILE = 'content/enhanced-output.csv'
# Text encoding used both to read SOURCE_FILE and to write TARGET_FILE.
ENCODING = 'utf-8'


def string_to_list(string):
    """Split a ';'-separated string into a list of cleaned, non-empty items.

    Trailing ';' and spaces are removed from the whole string first. Each
    piece then has leading/trailing commas and spaces stripped and every
    double quote removed; pieces that end up empty are dropped.
    """
    raw_parts = string.rstrip('; ').split(';')
    cleaned_parts = (part.strip(', ').replace('"', '') for part in raw_parts)
    return [part for part in cleaned_parts if part != '']


def filter_contact_pi(pi_ids_or_names_list):
    """Return, in their original order, the items containing '(contact)'."""
    return [entry for entry in pi_ids_or_names_list if '(contact)' in entry]


def extract_contact_pi(pi_ids_or_names_list):
    """Choose the contact PI entry from a list of PI ids or names.

    Returns a ``(entry, status)`` pair. ``entry`` is ``None`` for an empty
    list, and ``status`` is a string describing how the choice was made:
    no value, a single value, or several values with zero, one or many
    entries explicitly flagged '(contact)'. When the choice is ambiguous
    the first candidate is returned.
    """
    count = len(pi_ids_or_names_list)
    if count == 0:
        return None, "no_value"
    if count == 1:
        return pi_ids_or_names_list[0], "single_value"

    # Several entries: prefer those explicitly flagged as contact.
    explicit_contacts = filter_contact_pi(pi_ids_or_names_list)
    if not explicit_contacts:
        return pi_ids_or_names_list[0], "multiple_values_but_no_explicit_contact_first_chosen"

    if len(explicit_contacts) == 1:
        status = "multiple_values_and_single_explicit_contact"
    else:
        status = "multiple_values_and_multiple_explicit_contacts_first_chosen"
    return explicit_contacts[0], status


def normalize_first_name(first_name):
    """Return an upper-cased, punctuation-free version of ``first_name``.

    Periods, hyphens, semicolons and commas are removed (not replaced by a
    space, so 'Mary-Ann' becomes 'MARYANN'), runs of spaces are collapsed,
    and space-separated parts of one character or less (e.g. initials) are
    dropped. The remaining parts are joined with single spaces.
    """
    # Raw-string patterns: the original non-raw literal relied on invalid
    # escape sequences such as '\.' which newer Python versions warn about.
    first_name = re.sub(r'[.\-;,]', '', first_name)
    first_name = re.sub(r' +', ' ', first_name)
    name_parts = first_name.strip(' ').split(' ')
    return ' '.join(part for part in name_parts if len(part) > 1).upper()


def extract_pi_first_name(full_name):
    """Return the normalized first name found in a comma-separated full name.

    The first name is taken from the text between the first and second
    comma (or the end of the string), with any '(contact)' marker removed
    before normalization. Returns ``None`` when ``full_name`` is ``None``
    or contains no comma.
    """
    if full_name is None:
        return None

    name_fields = full_name.split(',')
    if len(name_fields) < 2:
        return None

    first_name_field = name_fields[1].replace('(contact)', '')
    return normalize_first_name(first_name_field)

In [None]:
# load gender_US_names.csv and index its genders by normalized first name

with open('content/gender_US_names.csv', 'r', encoding='UTF-8-sig') as csv_file_descriptor:
    gender_dataset = csv.DictReader(csv_file_descriptor, delimiter=',', quotechar='"')

    # Maps normalized 'Name' -> 'Gender'; when several rows normalize to the
    # same name, the last one read wins.
    gender_dict = {
        normalize_first_name(record['Name']): record['Gender']
        for record in gender_dataset
    }

In [None]:
# enhance output.csv file with contact PIs extraction and gender assignment

# Columns of the enhanced file, in output order.
OUTPUT_FIELDNAMES = [
    APPLICATION_ID,
    ACTIVITY,
    ADMINISTERING_IC,
    APPLICATION_TYPE,
    ARRA_FUNDED,
    AWARD_NOTICE_DATE,
    BUDGET_START,
    BUDGET_END,
    CFDA_CODE,
    CORE_PROJECT_NUM,
    ED_INST_TYPE,
    FOA_NUMBER,
    FULL_PROJECT_NUM,
    FUNDING_ICS,
    FUNDING_MECHANISM,
    FY,
    IC_NAME,
    NIH_SPENDING_CATS,
    ORG_CITY,
    ORG_COUNTRY,
    ORG_DEPT,
    ORG_DISTRICT,
    ORG_DUNS,
    ORG_FIPS,
    ORG_IPF_CODE,
    ORG_NAME,
    ORG_STATE,
    ORG_ZIPCODE,
    PHR,
    PI_IDS,
    PI_NAMES,

    NB_PI_IDS,
    NB_PI_NAMES,
    CONTACT_PI_ID,
    CONTACT_PI_NAME,
    CONTACT_PI_ID_STATUS,
    CONTACT_PI_NAME_STATUS,
    CONTACT_PI_FIRST_NAME,
    CONTACT_PI_GENDER,

    PROGRAM_OFFICER_NAME,
    PROJECT_START,
    PROJECT_END,
    PROJECT_TERMS,
    PROJECT_TITLE,
    SERIAL_NUMBER,
    STUDY_SECTION,
    STUDY_SECTION_NAME,
    SUBPROJECT_ID,
    SUFFIX,
    SUPPORT_YEAR,
    DIRECT_COST_AMT,
    INDIRECT_COST_AMT,
    TOTAL_COST,
    TOTAL_COST_SUB_PROJECT,
]

# Columns computed in this step rather than copied from the input row.
DERIVED_FIELDNAMES = {
    NB_PI_IDS,
    NB_PI_NAMES,
    CONTACT_PI_ID,
    CONTACT_PI_NAME,
    CONTACT_PI_ID_STATUS,
    CONTACT_PI_NAME_STATUS,
    CONTACT_PI_FIRST_NAME,
    CONTACT_PI_GENDER,
}

# Columns copied unchanged from the input row.
PASSTHROUGH_FIELDNAMES = [
    name for name in OUTPUT_FIELDNAMES if name not in DERIVED_FIELDNAMES
]

# Files handed to the csv module are opened with newline='' so that the
# module handles line endings itself.
with open(TARGET_FILE, 'w', encoding=ENCODING, newline='') as output_file:
    writer = csv.DictWriter(output_file, fieldnames=OUTPUT_FIELDNAMES, dialect=csv.excel)
    writer.writeheader()

    with open(SOURCE_FILE, 'r', encoding=ENCODING, newline='') as input_file:
        reader = csv.DictReader(input_file, dialect=csv.excel)

        for input_row in reader:

            pi_ids_list = string_to_list(input_row[PI_IDS])
            pi_names_list = string_to_list(input_row[PI_NAMES])

            # Start from empty values so that a failed extraction can never
            # leak the previous row's contact PI into the current row.
            contact_pi_id = pi_id_status = None
            contact_pi_name = pi_name_status = None
            try:
                contact_pi_id, pi_id_status = extract_contact_pi(pi_ids_list)
                contact_pi_name, pi_name_status = extract_contact_pi(pi_names_list)
            except Exception as e:
                # Best effort: report the offending row and still write it,
                # with whatever contact fields could not be extracted left
                # empty.
                print('APPLICATION_ID', input_row[APPLICATION_ID])
                print('PI_IDS', input_row[PI_IDS])
                print('PI_NAMES', input_row[PI_NAMES])
                print('+++')
                print(pi_ids_list)
                print(pi_names_list)
                print(e)
                print('----------------')

            contact_pi_first_name = extract_pi_first_name(contact_pi_name)

            dict_row = {name: input_row[name] for name in PASSTHROUGH_FIELDNAMES}
            dict_row.update({
                NB_PI_IDS: len(pi_ids_list),
                NB_PI_NAMES: len(pi_names_list),
                CONTACT_PI_ID: contact_pi_id,
                CONTACT_PI_NAME: contact_pi_name,
                CONTACT_PI_ID_STATUS: pi_id_status,
                CONTACT_PI_NAME_STATUS: pi_name_status,
                CONTACT_PI_FIRST_NAME: contact_pi_first_name,
                # No gender is assigned when the first name is missing or
                # absent from the gender dictionary.
                CONTACT_PI_GENDER: gender_dict.get(contact_pi_first_name),
            })
            writer.writerow(dict_row)