This jupyter notebook will develp the SOP for updating the Monday.com studies board consistently.

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import re
from datetime import datetime

In [2]:
input_dir = Path("/Users/hinashah/Documents/HEAL/MondayFolderUpdate_202407/")

In [3]:
def get_unique_values(df:pd.DataFrame, col_name:str='appl_id'):
    if col_name in df.columns:
        return df[ ~pd.isna(df[col_name])][col_name].drop_duplicates()
    return None

def get_na_count(df:pd.DataFrame, col_name:str='appl_id'):
    if col_name in df.columns:
        return len(df[pd.isna(df[col_name])])
    return -1

In [4]:
gt_file = pd.read_csv(input_dir/"study_lookup_table.csv", dtype=str)
gt_file.replace("0", np.NaN, inplace=True)

print(len(gt_file))
print(gt_file.columns)
print(gt_file.dtypes)
### QC the file:
for k in gt_file.columns:
    print(f"Number of distinct values in --{k}--: {len(get_unique_values(gt_file, k))}")
    print(f"---- NA count: {get_na_count(gt_file, k)}")
    ## Look for patterns?
    if 'appl' in k:
        d = gt_file[[ (not pd.isna(l)) and (not l.isdigit()) for l in gt_file[k] ]]
        print(f"Number of funky looking appl_ids: {len(d)}")
    elif k == 'study_hdp_id':
        d = gt_file[ [ (not pd.isna(l)) and (re.match(r'HDP[\d]+', l) is None) for l in gt_file[k]]]
        print(f" Number of funky looking HDPIDs: {len(d)}")

1477
Index(['appl_id', 'xstudy_id', 'study_most_recent_appl', 'study_hdp_id',
       'study_hdp_id_appl'],
      dtype='object')
appl_id                   object
xstudy_id                 object
study_most_recent_appl    object
study_hdp_id              object
study_hdp_id_appl         object
dtype: object
Number of distinct values in --appl_id--: 1468
---- NA count: 0
Number of funky looking appl_ids: 0
Number of distinct values in --xstudy_id--: 1200
---- NA count: 0
Number of distinct values in --study_most_recent_appl--: 1191
---- NA count: 0
Number of funky looking appl_ids: 0
Number of distinct values in --study_hdp_id--: 1162
---- NA count: 51
 Number of funky looking HDPIDs: 0
Number of distinct values in --study_hdp_id_appl--: 1153
---- NA count: 51
Number of funky looking appl_ids: 0


In [5]:
## Import Monday Board 
## TODO: change to read in all groups
board_files_list = list(input_dir.glob("HEAL_Studies_*.xlsx"))
print(board_files_list)
monday_board = pd.DataFrame()
for file_path in board_files_list:
    tmp_df = pd.read_excel(file_path, skiprows=4, dtype={"Most Recent Appl_ID":str, "Name":str}, skipfooter=1)
    group_name = ' '.join(file_path.name.split('_')[2:-1])
    if group_name == 'Studies never added to the Platform legacy':
        study_type = 'APPLIDONLY'
    elif group_name == 'HEAL Studies in the Platform':
        study_type = "HDP"
    elif group_name == "CTN Protocols":
        study_type == "CTN"
    else:
        study_type = "Unknown"  
    tmp_df['study_type'] = [study_type]*len(tmp_df)
    monday_board = pd.concat([monday_board, tmp_df])
    print(len(monday_board))

# monday_board = pd.read_excel(input_dir/"HEAL_Studies_1719341536.xlsx", skiprows=4, dtype={"Most Recent Appl_ID":str}, skipfooter=1)
# print(monday_board["Name"].describe())
# print(monday_board.columns)

[PosixPath('/Users/hinashah/Documents/HEAL/MondayFolderUpdate_202407/HEAL_Studies_Studies_under_investigation_1723180673.xlsx'), PosixPath('/Users/hinashah/Documents/HEAL/MondayFolderUpdate_202407/HEAL_Studies_Studies_never_added_to_the_Platform_legacy_1723180718.xlsx'), PosixPath('/Users/hinashah/Documents/HEAL/MondayFolderUpdate_202407/HEAL_Studies_CTN_Protocols_1723180574.xlsx'), PosixPath('/Users/hinashah/Documents/HEAL/MondayFolderUpdate_202407/HEAL_Studies_HEAL_Studies_in_the_Platform_1723180621.xlsx')]
8
45
84
1244


  monday_board = pd.concat([monday_board, tmp_df])
  monday_board = pd.concat([monday_board, tmp_df])


In [6]:
### Steps for updating Monday board:

## From Study lookup table, get unique set of most_recent_appl, study_hdp_id, and study_hdp_id_appl
lookup_fields = gt_file[['study_hdp_id', 'study_most_recent_appl', 'study_hdp_id_appl']].copy(deep=True).drop_duplicates()
## Create a column "Name" or "Key" that will either have study_hdp_id OR most_recent_appl when study_hdp_id is empty
lookup_fields['key'] = [m if pd.isna(h) else h for (h, m) in lookup_fields[['study_hdp_id', 'study_most_recent_appl']].values ]

### A few checks:
## How many of the "keys" from Monday board are in lookup fields?
print(f"Number records from Monday already in lookup table: {len(monday_board[monday_board.Name.isin(lookup_fields.key)])}")
## How many of the keys from MOnday board are not there in looup fields
mondayboard_missingin_looup = monday_board[~monday_board.Name.isin(lookup_fields.key)]
print(f"Number records from Monday that are not in lookup table: {len(mondayboard_missingin_looup)}")
## How many of the keys from lookup fields are not there in Monday??
lookup_missingin_mondayboard = lookup_fields[~lookup_fields.key.isin(monday_board.Name)]
print(f"Number records from l0ookup that are not in Monday: {len(lookup_missingin_mondayboard)}")


Number records from Monday already in lookup table: 1160
Number records from Monday that are not in lookup table: 84
Number records from l0ookup that are not in Monday: 40


In [7]:
mondayboard_missingin_looup

Unnamed: 0,Name,Most Recent Appl_ID,HDP appl_ID,Project #,Archived,HEAL-Related,Research Focus,Research Program,Title,Contact PI,...,Repo per Platform,Platform Reg Time,CEDAR Form %,Repo Mapping,repo_22_2,repo_22_3,Creation Log,study_type,link to Draft Data Dictionary Tracker,link to Draft Data Dictionary Tracker.1
0,HDP01285,,,CTN-0130,,,,,,,...,,NaT,0.0,,,,"Hina Shah Jun 26, 2024 2:05 PM",Unknown,,
1,HDP01286,,,CTN-0133,,,,,,,...,,NaT,0.0,,,,"Hina Shah Jun 26, 2024 2:05 PM",Unknown,,
2,HDP01287,,,CTN-0095-A-2,,,,,,,...,NIDA Data Share,NaT,0.0,,,,"Hina Shah Jun 26, 2024 2:05 PM",Unknown,,CTN0095A2-Data-Dictionary.xlsx
3,HDP01288,,,CTN-0093,,,,,,,...,NIDA Data Share,NaT,0.0,,,,"Hina Shah Jun 26, 2024 2:05 PM",Unknown,,CTN0093-DataDictionary.xlx
4,HDP01289,,,CTN-0147,,,,,,,...,,NaT,0.0,,,,"Hina Shah Jun 26, 2024 2:05 PM",Unknown,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,HDP01319,,,CTN-0097,,,,,,,...,,NaT,0.0,,,,"Hina Shah Jun 26, 2024 2:01 PM",APPLIDONLY,,
35,HDP01320,,,CTN-0110,,,,,,,...,,NaT,0.0,,,,"Hina Shah Jun 26, 2024 2:01 PM",APPLIDONLY,,
36,HDP01321,,,CTN-0096,,,,,,,...,,NaT,0.0,,,,"Hina Shah Jun 26, 2024 2:01 PM",APPLIDONLY,,
37,HDP01322,,,CTN-0084-A-2,,,,,,,...,,NaT,0.0,,,,"Hina Shah Jun 26, 2024 2:01 PM",APPLIDONLY,,


In [8]:
# Get rest of the tables
convert_dict = {'appl_id':str}

awards_df = pd.read_csv(input_dir/"awards.csv", low_memory=False, dtype=convert_dict)
awards_df = awards_df.dropna(how='all')
print(f"Awards table has: {len(awards_df)} entries, with {len(get_unique_values(awards_df))} appl_ids")
reporter_df = pd.read_csv(input_dir/"reporter.csv", low_memory=False, dtype=convert_dict)
reporter_df = reporter_df.dropna(how='all')
print(f"Reporter table has: {len(reporter_df)} entries, with {len(get_unique_values(reporter_df))} appl_ids")
progress_tracker_df = pd.read_csv(input_dir/"progress_tracker.csv", low_memory=False, dtype=convert_dict)
print(f"Platform generated table has: {len(progress_tracker_df)} entries, with {len(get_unique_values(progress_tracker_df))} appl_ids")
print(f"Platform table has {len(get_unique_values(progress_tracker_df))} unique HDP IDs")
repo_maping_df = pd.read_csv(input_dir/"repo_mapping.csv", low_memory=False, dtype=convert_dict)
print(f"Repo mapping table has: {len(repo_maping_df)} entrie, with {len(get_unique_values(repo_maping_df))} appl_ids")
pi_emails_df = pd.read_csv(input_dir/"pi_emails.csv", low_memory=False, dtype=convert_dict)
print(f"Repo mapping table has: {len(pi_emails_df)} entrie, with {len(get_unique_values(pi_emails_df))} appl_ids")
resnet_df = pd.read_csv(input_dir/"research_networks.csv", low_memory=False, dtype=convert_dict)
print(f"Research Network table has: {len(resnet_df)} entrie, with {len(get_unique_values(resnet_df))} appl_ids")


Awards table has: 1618 entries, with 1618 appl_ids
Reporter table has: 1617 entries, with 1617 appl_ids
Platform generated table has: 1322 entries, with 1313 appl_ids
Platform table has 1313 unique HDP IDs
Repo mapping table has: 1323 entrie, with 1323 appl_ids
Repo mapping table has: 1059 entrie, with 1059 appl_ids
Research Network table has: 1662 entrie, with 1662 appl_ids


In [9]:
## Manipulate emails to carry forward emails from a previous appl_id to the most recent one according to the lookup table and email table
appl_ids = gt_file[['appl_id', 'study_most_recent_appl']].drop_duplicates()
print(len(appl_ids))
appl_ids_emails = pd.merge(appl_ids, pi_emails_df, how='left', on='appl_id')

most_recent_emails = appl_ids_emails[ ~pd.isna(appl_ids_emails.pi_email)][['study_most_recent_appl', 'pi_email']].drop_duplicates()
most_recent_emails.rename(columns={'pi_email':'pi_email_latest'}, inplace=True)
print(most_recent_emails)
email_counts = most_recent_emails.groupby('study_most_recent_appl').size()
appl_ids_counts = appl_ids_emails.groupby('study_most_recent_appl').size()

print(email_counts.describe())
appl_ids_emails['email_count'] = [email_counts[k] if k in email_counts else 0 for k in appl_ids_emails['study_most_recent_appl']]
appl_ids_emails['applid_count'] = [appl_ids_counts[k] if k in appl_ids_counts else 0 for k in appl_ids_emails['study_most_recent_appl']]
appl_ids_emails['pi_email'].fillna('', inplace=True)
appl_ids_emails['keep'] = [1 if (c==0 or (c==1 and len(e)>0) or (c>1 and a==m)) else 0 for (c,a,m,e) in appl_ids_emails[['email_count', 'appl_id', 'study_most_recent_appl', 'pi_email' ]].values]
print(len(appl_ids_emails))

pi_emails_df_updated = appl_ids_emails[appl_ids_emails['keep']==1][['study_most_recent_appl', 'pi_email']].drop_duplicates()
pi_emails_df_updated['pi_email'] = [k.strip() for k in pi_emails_df_updated['pi_email']]
print(pi_emails_df_updated)

## Get Monday board emails, and fill in any that are different from mysql..
pi_emails_df_updated_monday = pd.merge(pi_emails_df_updated, monday_board[['Most Recent Appl_ID', 'Contact Email']].drop_duplicates(), how='left', left_on='study_most_recent_appl', right_on='Most Recent Appl_ID').drop(columns='Most Recent Appl_ID')
pi_emails_df_updated_monday['Contact Email'].replace('-', '', inplace=True)
pi_emails_df_updated_monday['Contact Email'].fillna('-', inplace=True)
pi_emails_df_updated_monday['pi_email_updated'] = [me if (len(e)==0 and len(me) > 1) else e for (e,me) in pi_emails_df_updated_monday[['pi_email', 'Contact Email']].values]
print(pi_emails_df_updated_monday)
pi_emails_df_updated_monday.to_csv(input_dir/"email_updates.csv", index=False)
appl_ids_emails.to_csv(input_dir/"email_counts.csv", index=False)

pi_emails_df_updated = pi_emails_df_updated_monday[['study_most_recent_appl', 'pi_email_updated']].rename(columns={'pi_email_updated':'pi_email'})

1468
     study_most_recent_appl                   pi_email_latest
1                   9755001                 kwatkins@rand.org
2                   9850412                   damico@rand.org
4                  10478911                 LYNN.DEBAR@KP.ORG
8                  10468778          cheville.andrea@mayo.edu
11                 10054792                   xcao11@jhmi.edu
...                     ...                               ...
1455               10167785               bahmedani@yahoo.com
1456               10331849                  tbrocki1@JHU.EDU
1458               10197811                  kzivin@UMICH.EDU
1459               10197809            Gregory.E.Simon@kp.org
1461                9823898  d-mencihella@md.northwestern.edu

[876 rows x 2 columns]
count    875.000000
mean       1.001143
std        0.033806
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
dtype: float64
1468
     study_most_recent_appl                  pi

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  appl_ids_emails['pi_email'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  pi_emails_df_updated_monday['Contact Email'].replace('-', '', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate ob

In [10]:
## Collect fields from report/awards tables that are required by Monday Board
rename_dict = {'proj_num':'Project #', 
               'proj_title':'Title',
                'rfa':'Research Focus',
                'res_prg':'Research Program',
                'ctc_pi_nm':'Contact PI',
                'pi_email':'Contact Email',
                'adm_ic':'Administering IC',
                'prg_ofc':'NIH PO',
                'org_nm': 'Institution(s)',
                'pi':'PI(s)',
                'org_cy':'City',
                'org_st':'State',
                'act_code':'Activity Code',
                'awd_ty':'Award Type',
                'fisc_yr':'Award Year',
                'tot_fund':'Total Funded',
                'proj_abs':'Summary',
                'fund_mech': 'SBIR/STTR',
                'dai_res':'DAI Import Status',
                'proj_strt_date':'Project Start',
                'proj_end_date':'Project End',
                'proj_url':'Reporter Link',
                'res_net':'Research Network',
                'repo_22_1':'Repo Mapping',
                'repo_22_2':'repo_22_2',
                'repo_22_3':'repo_22_3',
                'time_of_registration':'Platform Reg Time',
                'overall_percent_complete':'CEDAR Form %',
                'repository_name' : 'Repo per Platform',
                'archived':'Archived',
                'heal_funded':'HEAL-Related'
                }

def create_mysql_subset(in_df:pd.DataFrame, extra_fields = ['appl_id']):
    subset = in_df[[k for k in rename_dict.keys() if k in in_df.columns] + extra_fields].copy(deep=True)
    subset.rename(columns={k:v for k,v in rename_dict.items() if k in in_df.columns}, inplace=True)
    return subset
    
mysql_fields_reporter = create_mysql_subset(awards_df)
mysql_fields_awards = create_mysql_subset(reporter_df)
myql_fields_repomapping = create_mysql_subset(repo_maping_df)
mysql_fields_platform = create_mysql_subset(progress_tracker_df, extra_fields=['hdp_id'])
mysql_fields_piemails = create_mysql_subset(pi_emails_df_updated, extra_fields=['study_most_recent_appl'])
mysql_fields_resnet = create_mysql_subset(resnet_df)

In [11]:
print(len(lookup_fields))
data_merge_1 = pd.merge(lookup_fields, mysql_fields_reporter, how='left', left_on='study_most_recent_appl', right_on='appl_id').drop(columns='appl_id')
print(len(data_merge_1))
data_merge_2 = pd.merge(data_merge_1, mysql_fields_awards, how='left', left_on='study_most_recent_appl', right_on='appl_id').drop(columns='appl_id')
print(len(data_merge_2))
data_merge_1 = pd.merge(data_merge_2, myql_fields_repomapping, how='left', left_on='study_most_recent_appl', right_on='appl_id').drop(columns='appl_id')
print(len(data_merge_1))
data_merge_2 = pd.merge(data_merge_1, mysql_fields_platform, how='left', left_on='study_hdp_id', right_on='hdp_id')
print(len(data_merge_2))
data_merge_1 = pd.merge(data_merge_2, mysql_fields_resnet, how='left', left_on='study_most_recent_appl', right_on='appl_id').drop(columns='appl_id')
print(len(data_merge_1))
combined_data_ph1 = pd.merge(data_merge_1, mysql_fields_piemails, how='left', on='study_most_recent_appl')
print(len(combined_data_ph1))
print(len(combined_data_ph1.drop_duplicates()))


1200
1200
1200
1200
1200
1200
1200
1200


In [12]:
## Find out which columns have NA values, and investigate for incompletemess?
print("Number of empty values for each of the fields gathered:")
for k in rename_dict.values():
    print(f"{k} : {get_na_count(combined_data_ph1, k)}")

Number of empty values for each of the fields gathered:
Project # : 1
Title : 1
Research Focus : 14
Research Program : 78
Contact PI : 1
Contact Email : 0
Administering IC : 1
NIH PO : 59
Institution(s) : 1
PI(s) : 1
City : 6
State : 8
Activity Code : 1
Award Type : 1
Award Year : 1
Total Funded : 1
Summary : 9
SBIR/STTR : 1
DAI Import Status : 709
Project Start : 1
Project End : 1
Reporter Link : 1
Research Network : 775
Repo Mapping : 733
repo_22_2 : 1092
repo_22_3 : 1192
Platform Reg Time : 844
CEDAR Form % : 38
Repo per Platform : 1036
Archived : 38
HEAL-Related : 22


In [13]:
# This cell is meant to fill in holes in the mysql data from the progress tracker a.k.a platform MDS data.

rename_dict = {'project_num':'Project #', 
               'project_title':'Title',
                'investigators_name':'PI(s)',
                'award_type':'Award Type',
                'year_awarded':'Award Year',
                'award_amount':'Total Funded',
                'study_name':'Summary',
                'project_end_date':'Project End',
                'nih_reporter_link':'Reporter Link',
                'time_of_registration':'Platform Reg Time',
                'overall_percent_complete':'CEDAR Form %',
                'repository_name' : 'Repo per Platform',
                'archived':'Archived',
                }
progress_tracker_data = progress_tracker_df.copy(deep=True)
progress_tracker_data['project_title'].replace('0', '', inplace=True)
progress_tracker_data = create_mysql_subset(progress_tracker_data, extra_fields=['hdp_id'])

progress_tracker_data['PI(s)'].fillna('', inplace=True)
progress_tracker_data['PI(s)'] = [ k.translate(str.maketrans(',', ';', "[]\'")) for k in  progress_tracker_data['PI(s)']]

progress_tracker_data['key'] = progress_tracker_data['hdp_id']
progress_tracker_data['study_hdp_id'] = progress_tracker_data['hdp_id']
progress_tracker_data['Research Network'] = [ 'CTN' if k.startswith('CTN') else '' for k in progress_tracker_data['Project #']]

fill_in_data = pd.merge(combined_data_ph1, progress_tracker_data, how='left', on='study_hdp_id')
fill_in_data.to_csv(input_dir/"tmp.csv", index=False)


columns_to_compare = list(rename_dict.values())
columns_to_compare.extend(['key', 'Research Network'])

for k in columns_to_compare:
    k_x = k+'_x'
    k_y = k+'_y'
    fill_in_data[k] = [v_y if pd.isna(v_x) else v_x for (v_x, v_y) in fill_in_data[[k_x, k_y]].values]
    fill_in_data.drop(columns=[k_x, k_y], inplace=True)

columns = fill_in_data.columns.sort_values()
fill_in_data = fill_in_data[columns]
fill_in_data.to_csv(input_dir/"tmp.csv", index=False)
combined_data_ph1 = fill_in_data.copy(deep=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  progress_tracker_data['project_title'].replace('0', '', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  progress_tracker_data['PI(s)'].fillna('', inplace=True)


In [14]:
## Add CTN  data 
ctn_data = progress_tracker_df[[k.startswith('CTN') for k in progress_tracker_df['project_num']]]
ctn_data['project_title'].replace('0', '', inplace=True)

print(len(ctn_data))
rename_dict = {'project_num':'Project #', 
               'project_title':'Title',
                'investigators_name':'PI(s)',
                'award_type':'Award Type',
                'year_awarded':'Award Year',
                'award_amount':'Total Funded',
                'study_name':'Summary',
                'proj_end_date':'Project End',
                'nih_reporter_link':'Reporter Link',
                'time_of_registration':'Platform Reg Time',
                'overall_percent_complete':'CEDAR Form %',
                'repository_name' : 'Repo per Platform',
                'archived':'Archived',
                }
ctn_fields_platform = create_mysql_subset(ctn_data, extra_fields=['hdp_id'])
## Edit pi name
ctn_fields_platform['PI(s)'].fillna('', inplace=True)
ctn_fields_platform['PI(s)'] = [ k.translate(str.maketrans(',', ';', "[]\'")) for k in  ctn_fields_platform['PI(s)']]

ctn_fields_platform['key'] = ctn_fields_platform['hdp_id']
ctn_fields_platform['study_hdp_id'] = ctn_fields_platform['hdp_id']
ctn_fields_platform['Research Network'] = ['CTN']*len(ctn_fields_platform)

## Combine the data to the other data set
all_data = pd.concat([combined_data_ph1, ctn_fields_platform])
all_data


40


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ctn_data['project_title'].replace('0', '', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctn_data['project_title'].replace('0', '', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df

Unnamed: 0,Activity Code,Administering IC,Archived,Award Type,Award Year,CEDAR Form %,City,Contact Email,Contact PI,DAI Import Status,...,Total Funded,hdp_id_x,hdp_id_y,key,repo_22_2,repo_22_3,study_hdp_id,study_hdp_id_appl,study_most_recent_appl,hdp_id
0,U01,NIAAA,live,3,2019.0,0.0,PORTLAND,,"NAGEL, BONNIE J",NO,...,93429.0,HDP00632,HDP00632,HDP00632,Vivli,,HDP00632,9860408,9860408,
1,R34,NIAAA,live,3,2019.0,0.0,SANTA MONICA,kwatkins@rand.org,"WATKINS, KATHERINE E",NO,...,99988.0,HDP00696,HDP00696,HDP00696,,,HDP00696,9755001,9755001,
2,R01,NIAAA,live,3,2019.0,0.0,SANTA MONICA,damico@rand.org,"D'AMICO, ELIZABETH J.",NO,...,182670.0,HDP00509,HDP00509,HDP00509,Vivli,,HDP00509,9850412,9850412,
3,UH3,NIA,live,5,2022.0,80.8,OAKLAND,LYNN.DEBAR@KP.ORG,"DEBAR, LYNN L.",,...,2200655.0,HDP00242,HDP00242,HDP00242,,,HDP00242,9869480,10478911,
4,UH3,NIA,live,5,2022.0,25.0,ROCHESTER,cheville.andrea@mayo.edu,"CHEVILLE, ANDREA LYNNE",,...,1506116.0,HDP00391,HDP00391,HDP00391,,,HDP00391,9876435,10468778,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1314,,,live,0,,0.0,,,,,...,0.0,,,HDP01320,,,HDP01320,,,HDP01320
1315,,,live,0,,76.9,,,,,...,0.0,,,HDP01321,,,HDP01321,,,HDP01321
1316,,,live,0,,0.0,,,,,...,0.0,,,HDP01322,,,HDP01322,,,HDP01322
1317,,,live,0,,0.0,,,,,...,0.0,,,HDP01323,,,HDP01323,,,HDP01323


In [15]:
mysql_fields_resnet['Research Network'].value_counts()

Research Network
CTN                    188
HBCD                   132
JCOIN                   79
BACPAC                  66
PAIN ERN                39
EPPIC-NET               36
ACT NOW                 32
HPC                     28
PRISM                   27
HEALING COMMUNITIES     23
DATA 2 ACTION           18
IMPOWR                  16
HOPE                    14
HARM REDUCTION          11
MFMU                     5
A2CPS                    5
PRECISION                5
RE-JOIN                  5
Name: count, dtype: int64

In [16]:
combined_data = all_data.copy(deep=True)

In [17]:
## Create a column named "Location"dd
from datetime import datetime
combined_data['study_type'] = [ 'CTN' if m.startswith('CTN') else ('APPLIDONLY' if pd.isna(k) else 'HDP') for (m,k) in combined_data[['Project #', 'study_hdp_id_appl']].values]

combined_data['City'] = combined_data[['City']].fillna('-')
combined_data['State'] = combined_data[['State']].fillna('-')
combined_data['Location'] = [c+","+s for (c,s) in combined_data[['City', "State"]].values]

combined_data['Project Start'] = pd.to_datetime(combined_data['Project Start'], format='%Y-%m-%d', errors='coerce').dt.date
combined_data['Project End'] = pd.to_datetime(combined_data['Project End'], format='%Y-%m-%d', errors='coerce').dt.date
combined_data['Platform Reg Time'] = pd.to_datetime(combined_data['Platform Reg Time'], utc=True).dt.date

combined_data['Archived'] = [a if a=='archived' else 'n' for a in combined_data['Archived']]

combined_data['HEAL-Related'] = ['Y' if ((p != 'CTN' ) and (pd.isna(a))) else 'N' for (p,a) in combined_data[['study_type', 'HEAL-Related']].values]
combined_data['SBIR/STTR'] = ['Y' if 'SBIR/STTR'==t else 'N' for t in combined_data['SBIR/STTR']]

print(combined_data.study_type.value_counts())

## Rename a few of the other columns:
combined_data.rename(columns={'study_most_recent_appl':'Most Recent Appl_ID', 'study_hdp_id_appl':'HDP appl_ID'}, inplace=True)
combined_data.drop(columns=['study_hdp_id', 'hdp_id', 'hdp_id_x', 'hdp_id_y'], inplace=True)


study_type
HDP           1162
CTN             40
APPLIDONLY      38
Name: count, dtype: int64


In [18]:
handled_columns = ['study_type', 'City', 'State', 'Location', 'Project Start', 'Project End', 'Platform Reg Time', 'Archived', 'HEAL-Related', 'SBIR/STTR']
rest_obj_columns = [k for k in combined_data.columns if k not in handled_columns and combined_data[k].dtype in ['object', 'str']]

#combined_data[rest_obj_columns].fillna('-')
print(len(combined_data.columns))
print(len(rest_obj_columns))
print(len(handled_columns))
print(rest_obj_columns)

for k in rest_obj_columns:
    combined_data[k] = ['-' if (t is np.NaN) or (t=='') else t for t in combined_data[k]]
    

36
23
10
['Activity Code', 'Administering IC', 'Award Type', 'Contact Email', 'Contact PI', 'DAI Import Status', 'Institution(s)', 'NIH PO', 'PI(s)', 'Project #', 'Repo Mapping', 'Repo per Platform', 'Reporter Link', 'Research Focus', 'Research Network', 'Research Program', 'Summary', 'Title', 'key', 'repo_22_2', 'repo_22_3', 'HDP appl_ID', 'Most Recent Appl_ID']


In [19]:
#TODO: Find what's in Monday.com board, but not in mysql extract
# Mark these entries for deletion, and these would have to be deleted manually on Monday.com

In [20]:
# combined_data_subset = combined_data[ combined_data['Most Recent Appl_ID'].isin(monday_board['Most Recent Appl_ID']) | combined_data['key'].isin(monday_board['Name'])].drop_duplicates()
# combined_data_subset = combined_data_subset[ ~pd.isna(combined_data_subset['Most Recent Appl_ID']) ]
# print(len(combined_data_subset))
# combined_data_subset.to_excel(input_dir/"MondayBoard_Update_Step2.xlsx", index=False)

In [21]:
combined_data.index.name = 'index'
combined_data.to_excel(input_dir/"MondayBoard_Update.xlsx")

print("Making sure uniqueness of key values")
key_counts = combined_data.groupby('key').size()
key_counts.describe()

Making sure uniqueness of key values


count    1240.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
dtype: float64

*******************
*******************
DEBUG CODE BELOW
*******************
*******************

In [18]:
combined_data[combined_data['key']=='HDP00889'].drop_duplicates()

Unnamed: 0_level_0,Most Recent Appl_ID,HDP appl_ID,key,Research Focus,Research Program,DAI Import Status,HEAL-Related,Project #,Title,Contact PI,...,repo_22_2,repo_22_3,Platform Reg Time,CEDAR Form %,Repo per Platform,Archived,Network,Contact Email,Location,study_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
660,10593312,10601172,HDP00889,,,,Y,3R24DA055306-02S1,Wake Forest IMPOWR Dissemination Education and...,"ADAMS, MEREDITH C. B.",...,,,2022-07-26,5.8,,,,meradams@wakehealth.edu,"WINSTON-SALEM,NC",HDP


In [19]:
comparison_cols = ['Most Recent Appl_ID', 'HDP appl_ID', 'Contact Email', 'Network']

comparison_df = pd.merge(monday_board[['Name'] +comparison_cols ], combined_data[['key'] + comparison_cols], left_on = 'Name', right_on='key').drop_duplicates()
comparison_df.to_csv(input_dir/"comparison.csv", index=False)
comparison_df.to_excel(input_dir/"comparison.xlsx", index=False)

In [21]:
pi_emails_df_updated[ pi_emails_df_updated.study_most_recent_appl=='9901704']

Unnamed: 0,study_most_recent_appl,pi_email
881,9901704,jhambm@upmc.edu


In [22]:
resnet_df[resnet_df.appl_id=='9908734']

Unnamed: 0,appl_id,res_net,res_net_override_flag
1486,9908734,,0
