In [1]:
## If running on Google Colab, run this cell to mount Google Drive to access files on Google Drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


This jupyter notebook will develp the SOP for updating the Monday.com studies board consistently.

In [2]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.0


In [3]:
import pandas as pd
from pathlib import Path
import numpy as np
import re

In [4]:
## Set this to the directory where:
## 1- Every group on Monday Studies board has been exported as an individual group to
## 2- All relevant tables from MySql database for HEAL have been exported as a csv to.
input_dir = Path("/content/drive/MyDrive/Jan2025MySQLtoMonday")

In [6]:
def get_unique_values(df:pd.DataFrame, col_name:str='appl_id'):
    if col_name in df.columns:
        return df[ ~pd.isna(df[col_name])][col_name].drop_duplicates()
    return None

def get_na_count(df:pd.DataFrame, col_name:str='appl_id'):
    if col_name in df.columns:
        return len(df[pd.isna(df[col_name])])
    return -1

In [7]:
gt_file = pd.read_csv(input_dir/"study_lookup_table.csv", dtype=str)
gt_file.replace("0", np.nan, inplace=True)

print(len(gt_file))
print(gt_file.columns)
print(gt_file.dtypes)
### QC the lookup table:
for k in gt_file.columns:
    print(f"Number of distinct values in --{k}--: {len(get_unique_values(gt_file, k))}")
    print(f"---- NA count: {get_na_count(gt_file, k)}")
    ## Look for patterns?
    if 'appl' in k:
        d = gt_file[[ (not pd.isna(l)) and (not l.isdigit()) for l in gt_file[k] ]]
        print(f"Number of funky looking appl_ids: {len(d)}")
    elif k == 'study_hdp_id':
        d = gt_file[ [ (not pd.isna(l)) and (re.match(r'HDP[\d]+', l) is None) for l in gt_file[k]]]
        print(f" Number of funky looking HDPIDs: {len(d)}")

1821
Index(['appl_id', 'xstudy_id', 'study_most_recent_appl', 'study_hdp_id',
       'study_hdp_id_appl'],
      dtype='object')
appl_id                   object
xstudy_id                 object
study_most_recent_appl    object
study_hdp_id              object
study_hdp_id_appl         object
dtype: object
Number of distinct values in --appl_id--: 1808
---- NA count: 0
Number of funky looking appl_ids: 0
Number of distinct values in --xstudy_id--: 1419
---- NA count: 0
Number of distinct values in --study_most_recent_appl--: 1406
---- NA count: 0
Number of funky looking appl_ids: 0
Number of distinct values in --study_hdp_id--: 1315
---- NA count: 118
 Number of funky looking HDPIDs: 0
Number of distinct values in --study_hdp_id_appl--: 1302
---- NA count: 118
Number of funky looking appl_ids: 0


In [8]:
## Import Monday Board
## TODO: change to read in all groups
study_type = ""
board_files_list = list(input_dir.glob("HEAL_Studies_*.xlsx"))
print(board_files_list)
monday_board = pd.DataFrame()
for file_path in board_files_list:
    tmp_df = pd.read_excel(file_path, skiprows=4, dtype={"Most Recent Appl_ID":str, "Name":str}, skipfooter=1)
    group_name = ' '.join(file_path.name.split('_')[2:-1])
    if group_name == 'Studies Not Added to Platform':
        study_type = 'APPLIDONLY'
    elif group_name == 'HEAL Studies in the Platform':
        study_type = "HDP"
    elif group_name == "CTN Protocols":
        study_type == "CTN"
    else:
        study_type = "Unknown"
    tmp_df.dropna(how='all', inplace=True)
    if len(tmp_df) >  0:
        tmp_df['study_type'] = [study_type]*len(tmp_df)
        monday_board = pd.concat([monday_board, tmp_df])
        print(len(monday_board))


[PosixPath('/content/drive/MyDrive/Jan2025MySQLtoMonday/HEAL_Studies_CTN_Protocols_1734968640.xlsx'), PosixPath('/content/drive/MyDrive/Jan2025MySQLtoMonday/HEAL_Studies_Studies_under_investigation_1734968652.xlsx'), PosixPath('/content/drive/MyDrive/Jan2025MySQLtoMonday/HEAL_Studies_Studies_Not_Added_to_Platform_1734968659.xlsx'), PosixPath('/content/drive/MyDrive/Jan2025MySQLtoMonday/HEAL_Studies_HEAL_Studies_in_the_Platform_1734968672.xlsx')]
40


  monday_board = pd.concat([monday_board, tmp_df])
  monday_board = pd.concat([monday_board, tmp_df])


44


  monday_board = pd.concat([monday_board, tmp_df])


82
1251


In [9]:
### Steps for updating Monday board:

## From Study lookup table, get unique set of most_recent_appl, study_hdp_id, and study_hdp_id_appl
lookup_fields = gt_file[['study_hdp_id', 'study_most_recent_appl', 'study_hdp_id_appl']].copy(deep=True).drop_duplicates()
## Create a column "Key" that will either have study_hdp_id OR most_recent_appl when study_hdp_id is empty
lookup_fields['key'] = [m if pd.isna(h) else h for (h, m) in lookup_fields[['study_hdp_id', 'study_most_recent_appl']].values ]

### A few checks:
## How many of the "keys" from Monday board are in lookup fields?
print(f"Number records from Monday already in lookup table: {len(monday_board[monday_board.Name.isin(lookup_fields.key)])}")
## How many of the keys from MOnday board are not there in looup fields
mondayboard_missingin_lookup = monday_board[~monday_board.Name.isin(lookup_fields.key)]
print(f"Number records from Monday that are not in lookup table (Consider these as discrepancies **Investigate**): {len(mondayboard_missingin_lookup)}")
## How many of the keys from lookup fields are not there in Monday??
lookup_missingin_mondayboard = lookup_fields[~lookup_fields.key.isin(monday_board.Name)]
print(f"Number records from lookup table that are not on Monday (Potentially new entries): {len(lookup_missingin_mondayboard)}")


Number records from Monday already in lookup table: 1203
Number records from Monday that are not in lookup table (Consider these as discrepancies **Investigate**): 48
Number records from lookup table that are not on Monday (Potentially new entries): 216


In [10]:
mondayboard_missingin_lookup

Unnamed: 0,Name,Most Recent Appl_ID,HDP appl_ID,Project #,Archived,HEAL-Related,Research Focus,Research Program,Title,Contact PI,...,CEDAR Form %,Repo Mapping,repo_22_2,repo_22_3,Creation Log,study_type,link to Data Dictionary Tracker,HEAL Individuals,Pubs Tracker Item,Repo & Tool Engagement_HEAL
0,HDP01285,-,-,CTN-0130,n,N,-,-,-,-,...,0.0,-,-,-,"Hina Shah Jun 26, 2024 2:05 PM",,,,,
1,HDP01286,-,-,CTN-0133,n,N,-,-,-,-,...,0.0,-,-,-,"Hina Shah Jun 26, 2024 2:05 PM",,,,,
2,HDP01287,-,-,CTN-0095-A-2,n,N,-,-,-,-,...,0.0,-,-,-,"Hina Shah Jun 26, 2024 2:05 PM",,CTN0095A2-Data-Dictionary.xlsx,,36774521,
3,HDP01288,-,-,CTN-0093,n,N,-,-,-,-,...,0.0,-,-,-,"Hina Shah Jun 26, 2024 2:05 PM",,CTN0093-DataDictionary.xlx,,,
4,HDP01289,-,-,CTN-0147,n,N,-,-,-,-,...,0.0,-,-,-,"Hina Shah Jun 26, 2024 2:05 PM",,,,,
5,HDP01290,-,-,CTN-0099,n,N,-,-,-,-,...,0.0,-,-,-,"Hina Shah Jun 26, 2024 2:05 PM",,,,"37501652, 37716904",
6,HDP01291,-,-,CTN-0088,n,N,-,-,-,-,...,0.0,-,-,-,"Hina Shah Jun 26, 2024 2:05 PM",,,,"37583120, 37163263",
7,HDP01292,-,-,CTN-0144,n,N,-,-,-,-,...,0.0,-,-,-,"Hina Shah Jun 26, 2024 2:05 PM",,,,,
8,HDP01293,-,-,CTN-0099-A-1,n,N,-,-,-,-,...,0.0,-,-,-,"Hina Shah Jun 26, 2024 2:05 PM",,,,,
9,HDP01294,-,-,CTN-0095,n,N,-,-,-,-,...,0.0,-,-,-,"Hina Shah Jun 26, 2024 2:05 PM",,,,,


In [11]:
# Get rest of the tables
convert_dict = {'appl_id':str}

awards_df = pd.read_csv(input_dir/"awards.csv", low_memory=False, dtype=convert_dict)
awards_df = awards_df.dropna(how='all')
print(f"Awards table has: {len(awards_df)} entries, with {len(get_unique_values(awards_df))} appl_ids")
reporter_df = pd.read_csv(input_dir/"reporter.csv", low_memory=False, dtype=convert_dict)
reporter_df = reporter_df.dropna(how='all')
print(f"Reporter table has: {len(reporter_df)} entries, with {len(get_unique_values(reporter_df))} appl_ids")
progress_tracker_df = pd.read_csv(input_dir/"progress_tracker.csv", low_memory=False, dtype=convert_dict)
print(f"Platform generated table has: {len(progress_tracker_df)} entries, with {len(get_unique_values(progress_tracker_df))} appl_ids")
print(f"Platform table has {len(get_unique_values(progress_tracker_df))} unique HDP IDs")
pi_emails_df = pd.read_csv(input_dir/"pi_emails.csv", low_memory=False, dtype=convert_dict)
print(f"Repo mapping table has: {len(pi_emails_df)} entrie, with {len(get_unique_values(pi_emails_df))} appl_ids")
resnet_df = pd.read_csv(input_dir/"research_networks.csv", low_memory=False, dtype=convert_dict)
print(f"Research Network table has: {len(resnet_df)} entrie, with {len(get_unique_values(resnet_df))} appl_ids")
engagement_flags_df = pd.read_csv(input_dir/"engagement_flags.csv", low_memory=False, dtype=convert_dict)
print(f"Engagment Flags table has: {len(resnet_df)} entrie, with {len(get_unique_values(engagement_flags_df))} appl_ids")


Awards table has: 2015 entries, with 2015 appl_ids
Reporter table has: 2015 entries, with 2015 appl_ids
Platform generated table has: 1480 entries, with 1426 appl_ids
Platform table has 1426 unique HDP IDs
Repo mapping table has: 1326 entrie, with 1326 appl_ids
Research Network table has: 2015 entrie, with 2015 appl_ids
Engagment Flags table has: 2015 entrie, with 1807 appl_ids


In [12]:
## Manipulate emails to carry forward emails from a previous appl_id to the most recent one according to the lookup table and email table
appl_ids = gt_file[['appl_id', 'study_most_recent_appl']].drop_duplicates()
appl_ids_emails = pd.merge(appl_ids, pi_emails_df, how='left', on='appl_id')

most_recent_emails = appl_ids_emails[ ~pd.isna(appl_ids_emails.pi_email)][['study_most_recent_appl', 'pi_email']].drop_duplicates()
most_recent_emails.rename(columns={'pi_email':'pi_email_latest'}, inplace=True)
print(f"ALL PI emails associated with a project (identified by most_recent_appl)\n {most_recent_emails}")
email_counts = most_recent_emails.groupby('study_most_recent_appl').size()
appl_ids_counts = appl_ids_emails.groupby('study_most_recent_appl').size()
print("Statistics on number of emails per project:: ")
print(email_counts.describe())

appl_ids_emails['email_count'] = [email_counts[k] if k in email_counts else 0 for k in appl_ids_emails['study_most_recent_appl']]
appl_ids_emails['applid_count'] = [appl_ids_counts[k] if k in appl_ids_counts else 0 for k in appl_ids_emails['study_most_recent_appl']]
appl_ids_emails['pi_email'] = appl_ids_emails['pi_email'].fillna('')
appl_ids_emails['keep'] = [1 if (c==0 or (c==1 and len(e)>0) or (c>1 and a==m)) else 0 for (c,a,m,e) in appl_ids_emails[['email_count', 'appl_id', 'study_most_recent_appl', 'pi_email' ]].values]

pi_emails_df_updated = appl_ids_emails[appl_ids_emails['keep']==1][['study_most_recent_appl', 'pi_email']].drop_duplicates()
pi_emails_df_updated['pi_email'] = [k.strip() for k in pi_emails_df_updated['pi_email']]
print("Emails that were kept for each project when there's only one email available, or when email is available for the most recent award (study_most_recent_appl)")
print(pi_emails_df_updated)

## Get Monday board emails, and fill in any that are different from mysql..
pi_emails_df_updated_monday = pd.merge(pi_emails_df_updated, monday_board[['Most Recent Appl_ID', 'Contact Email']].drop_duplicates(), how='left', left_on='study_most_recent_appl', right_on='Most Recent Appl_ID').drop(columns='Most Recent Appl_ID')
pi_emails_df_updated_monday['Contact Email'] = pi_emails_df_updated_monday['Contact Email'].replace('-', '')
pi_emails_df_updated_monday['Contact Email'] = pi_emails_df_updated_monday['Contact Email'].fillna('-')
pi_emails_df_updated_monday['pi_email_updated'] = [me if (len(e)==0 and len(me) > 1) else e for (e,me) in pi_emails_df_updated_monday[['pi_email', 'Contact Email']].values]
print("Adding any Monday Board emails when emails missing from MySql")
print(pi_emails_df_updated_monday)
pi_emails_df_updated_monday.to_csv(input_dir/"email_updates.csv", index=False)
appl_ids_emails.to_csv(input_dir/"email_counts.csv", index=False)

pi_emails_df_updated = pi_emails_df_updated_monday[['study_most_recent_appl', 'pi_email_updated']].rename(columns={'pi_email_updated':'pi_email'})

ALL PI emails associated with a project (identified by most_recent_appl)
      study_most_recent_appl               pi_email_latest
1                   9755001             kwatkins@rand.org
2                   9850412               damico@rand.org
3                  10996795         acfernan@med.umich.ed
4                  11001407               fqeadan@luc.edu
6                  10478911             LYNN.DEBAR@KP.ORG
...                     ...                           ...
1799               11141362        rredwards@partners.org
1802               11193588                sreis@pitt.edu
1803               11193636         monica.kraft@mssm.edu
1804               11195200  lucila.ohno-machado@yale.edu
1805               11193592        szuchner@med.miami.edu

[1114 rows x 2 columns]
Statistics on number of emails per project:: 
count    1090.000000
mean        1.022018
std         0.146810
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         

In [19]:
## Collect fields from report/awards tables that are required by Monday Board
rename_dict = {'proj_num':'Project #',
               'proj_title':'Title',
                'rfa':'Research Focus',
                'res_prg':'Research Program',
                'ctc_pi_nm':'Contact PI',
                'pi_email':'Contact Email',
                'adm_ic':'Administering IC',
                'prg_ofc':'NIH PO',
                'org_nm': 'Institution(s)',
                'pi':'PI(s)',
                'org_cy':'City',
                'org_st':'State',
                'act_code':'Activity Code',
                'awd_ty':'Award Type',
                'fisc_yr':'Award Year',
                'tot_fund':'Total Funded',
                'proj_abs':'Summary',
                'fund_mech': 'SBIR/STTR',
                'proj_strt_date':'Project Start',
                'proj_end_date':'Project End',
                'proj_url':'Reporter Link',
                'res_net':'Research Network',
                'time_of_registration':'Platform Reg Time',
                'overall_percent_complete':'CEDAR Form %',
                'repository_name' : 'Repo per Platform',
                'archived':'Archived',
                'heal_funded':'HEAL-Related',
                'do_not_engage':'Do not Engage',
                'checklist_exempt_all':'Checklist Exempt'
                }

def create_mysql_subset(in_df:pd.DataFrame, extra_fields = ['appl_id']):
    subset = in_df[[k for k in rename_dict.keys() if k in in_df.columns] + extra_fields].copy(deep=True)
    subset.rename(columns={k:v for k,v in rename_dict.items() if k in in_df.columns}, inplace=True)
    return subset

mysql_fields_reporter = create_mysql_subset(awards_df)
mysql_fields_awards = create_mysql_subset(reporter_df)
mysql_fields_platform = create_mysql_subset(progress_tracker_df, extra_fields=['hdp_id'])
mysql_fields_piemails = create_mysql_subset(pi_emails_df_updated, extra_fields=['study_most_recent_appl'])
mysql_fields_resnet = create_mysql_subset(resnet_df)
mysql_fields_resnet['Research Network'] = [k.upper() if not pd.isna(k) else '' for k in mysql_fields_resnet['Research Network']]
mysql_fields_enagementflags = create_mysql_subset(engagement_flags_df)

In [20]:
## Combine all the fields into one table using "Most Recent Appl_ID" as the key. Monday Board will display information from the most recent appl_id for a project, which is available in mysql's study lookup table.
print(len(lookup_fields))
data_merge_1 = pd.merge(lookup_fields, mysql_fields_reporter, how='left', left_on='study_most_recent_appl', right_on='appl_id').drop(columns='appl_id')
print(len(data_merge_1))
data_merge_2 = pd.merge(data_merge_1, mysql_fields_awards, how='left', left_on='study_most_recent_appl', right_on='appl_id').drop(columns='appl_id')
print(len(data_merge_2))
data_merge_1 = pd.merge(data_merge_2, mysql_fields_platform, how='left', left_on='study_hdp_id', right_on='hdp_id')
print(len(data_merge_1))
data_merge_2 = pd.merge(data_merge_1, mysql_fields_resnet, how='left', left_on='study_most_recent_appl', right_on='appl_id').drop(columns='appl_id')
print(len(data_merge_2))
data_merge_1 = pd.merge(data_merge_2, mysql_fields_enagementflags, how='left', left_on='study_most_recent_appl', right_on='appl_id').drop(columns='appl_id')
print(len(data_merge_1))
combined_data_ph1 = pd.merge(data_merge_1, mysql_fields_piemails, how='left', on='study_most_recent_appl')
print(len(combined_data_ph1))
print(len(combined_data_ph1.drop_duplicates()))


1419
1419
1419
1419
1419
1419
1419
1419


In [21]:
# Fill in holes in the mysql data using the progress tracker data a.k.a platform MDS data.
rename_dict = {'project_num':'Project #',
               'project_title':'Title',
                'investigators_name':'PI(s)',
                'award_type':'Award Type',
                'year_awarded':'Award Year',
                'award_amount':'Total Funded',
                'study_name':'Summary',
                'project_end_date':'Project End',
                'nih_reporter_link':'Reporter Link',
                'time_of_registration':'Platform Reg Time',
                'overall_percent_complete':'CEDAR Form %',
                'repository_name' : 'Repo per Platform',
                'archived':'Archived',
                }
progress_tracker_data = progress_tracker_df.copy(deep=True)
progress_tracker_data['project_title'] = progress_tracker_data['project_title'].replace('0', '')
progress_tracker_data = create_mysql_subset(progress_tracker_data, extra_fields=['hdp_id'])

progress_tracker_data['PI(s)'] = progress_tracker_data['PI(s)'].fillna('')
progress_tracker_data['PI(s)'] = [ k.translate(str.maketrans(',', ';', "[]\'")) for k in  progress_tracker_data['PI(s)']]

progress_tracker_data['key'] = progress_tracker_data['hdp_id']
progress_tracker_data['study_hdp_id'] = progress_tracker_data['hdp_id']
progress_tracker_data['Research Network'] = [ 'CTN' if k.startswith('CTN') else '' for k in progress_tracker_data['Project #']]

fill_in_data = pd.merge(combined_data_ph1, progress_tracker_data, how='left', on='study_hdp_id')
fill_in_data.to_csv(input_dir/"tmp.csv", index=False)

columns_to_compare = list(rename_dict.values())
columns_to_compare.extend(['key', 'Research Network'])

for k in columns_to_compare:
    k_x = k+'_x'
    k_y = k+'_y'
    fill_in_data[k] = [v_y if pd.isna(v_x) else v_x for (v_x, v_y) in fill_in_data[[k_x, k_y]].values]
    fill_in_data.drop(columns=[k_x, k_y], inplace=True)

columns = fill_in_data.columns.sort_values()
fill_in_data = fill_in_data[columns]
fill_in_data.to_csv(input_dir/"tmp.csv", index=False)
combined_data_ph1 = fill_in_data.copy(deep=True)


In [22]:
## Add CTN  data
ctn_data = progress_tracker_df[[k.startswith('CTN') for k in progress_tracker_df['project_num']]]
ctn_data['project_title'] = ctn_data['project_title'].replace('0', '')
print(f"Number of CTN entries found in Platform MDS {len(ctn_data)}")
rename_dict = {'project_num':'Project #',
               'project_title':'Title',
                'investigators_name':'PI(s)',
                'award_type':'Award Type',
                'year_awarded':'Award Year',
                'award_amount':'Total Funded',
                'study_name':'Summary',
                'proj_end_date':'Project End',
                'nih_reporter_link':'Reporter Link',
                'time_of_registration':'Platform Reg Time',
                'overall_percent_complete':'CEDAR Form %',
                'repository_name' : 'Repo per Platform',
                'archived':'Archived',
                }
ctn_fields_platform = create_mysql_subset(ctn_data, extra_fields=['hdp_id'])
## Edit pi name
ctn_fields_platform['PI(s)'] = ctn_fields_platform['PI(s)'].fillna('')
ctn_fields_platform['PI(s)'] = [ k.translate(str.maketrans(',', ';', "[]\'")) for k in  ctn_fields_platform['PI(s)']]

ctn_fields_platform['key'] = ctn_fields_platform['hdp_id']
ctn_fields_platform['study_hdp_id'] = ctn_fields_platform['hdp_id']
ctn_fields_platform['Research Network'] = ['CTN']*len(ctn_fields_platform)

## Combine the data to the other data set
all_data = pd.concat([combined_data_ph1, ctn_fields_platform])

Number of CTN entries found in Platform MDS 40


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctn_data['project_title'] = ctn_data['project_title'].replace('0', '')


In [23]:
print("------------ Preview of the final combined dataset ---------------")
all_data

------------ Preview of the final combined dataset ---------------


Unnamed: 0,Activity Code,Administering IC,Archived,Award Type,Award Year,CEDAR Form %,Checklist Exempt,City,Contact Email,Contact PI,...,Summary,Title,Total Funded,hdp_id_x,hdp_id_y,key,study_hdp_id,study_hdp_id_appl,study_most_recent_appl,hdp_id
0,U01,NIAAA,live,3,2019.0,11.5,0.0,PORTLAND,,"NAGEL, BONNIE J",...,"PROJECT SUMMARY. During young adulthood, drink...",National Consortium on Alcohol and Neurodevelo...,93429.0,HDP00632,HDP00632,HDP00632,HDP00632,9860408,9860408,
1,R34,NIAAA,live,3,2019.0,11.5,0.0,SANTA MONICA,kwatkins@rand.org,"WATKINS, KATHERINE E",...,PROJECT SUMMARY/ABSTRACT. Substance use disord...,Implementing Medication-Assisted Therapy for S...,99988.0,HDP00696,HDP00696,HDP00696,HDP00696,9755001,9755001,
2,R01,NIAAA,live,3,2019.0,11.5,0.0,SANTA MONICA,damico@rand.org,"D'AMICO, ELIZABETH J.",...,Summary/Abstract. A recent review of trajector...,AOD Use Trajectories from Age 10 to 24: Multi-...,182670.0,HDP00509,HDP00509,HDP00509,HDP00509,9850412,9850412,
3,R34,NIAAA,live,1,2024.0,11.5,0.0,ANN ARBOR,acfernan@med.umich.ed,"FERNANDEZ, ANNE CHRISTIE",...,Project Summary/Abstract. Opioid agonist thera...,Cultivating Recovery: A Pilot Study of Digital...,390000.0,HDP01356,HDP01356,HDP01356,HDP01356,10996795,10996795,
4,R01,NIAAA,live,1,2024.0,11.5,0.0,MAYWOOD,fqeadan@luc.edu,"QEADAN, FARES",...,Project Summary/Abstract. The intersection of ...,Advancing Integrated Treatment for Co-Occurrin...,385000.0,HDP01380,HDP01380,HDP01380,HDP01380,11001407,11001407,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1314,,,live,0,,7.7,,,,,...,"Randomized, Double-Blind, Placebo-Controlled T...",,0.0,,,HDP01320,HDP01320,,,HDP01320
1315,,,live,0,,76.9,,,,,...,Culturally Centered MAT for OUD Implementation...,,0.0,,,HDP01321,HDP01321,,,HDP01321
1316,,,live,0,,28.8,,,,,...,Harnessing Digital Health to Understand Clinic...,,0.0,,,HDP01322,HDP01322,,,HDP01322
1317,,,live,0,,7.7,,,,,...,Individual Level Predictive Modeling of Opioid...,,0.0,,,HDP01323,HDP01323,,,HDP01323


In [24]:
print("==== Frequencies of several research networks in the combined dataset ========")
mysql_fields_resnet['Research Network'].value_counts()



Unnamed: 0_level_0,count
Research Network,Unnamed: 1_level_1
,1236
CTN,199
HBCD,159
JCOIN,84
BACPAC,67
ACT NOW,45
EPPIC-NET,42
PRISM,31
HPC,28
DATA 2 ACTION,27


In [25]:
## Find out which columns have NA values, and investigate for incompletemess?
print("Number of empty values for each of the fields gathered in the combined dataset:")
for k in list(all_data.columns):
    print(f"{k} : {get_na_count(all_data, k)}")

Number of empty values for each of the fields gathered in the combined dataset:
Activity Code : 41
Administering IC : 41
Archived : 104
Award Type : 0
Award Year : 40
CEDAR Form % : 104
Checklist Exempt : 41
City : 46
Contact Email : 40
Contact PI : 41
Do not Engage : 41
HEAL-Related : 43
Institution(s) : 41
NIH PO : 99
PI(s) : 0
Platform Reg Time : 1062
Project # : 0
Project End : 40
Project Start : 41
Repo per Platform : 1235
Reporter Link : 40
Research Focus : 61
Research Network : 0
Research Program : 173
SBIR/STTR : 41
State : 48
Summary : 0
Title : 0
Total Funded : 0
hdp_id_x : 144
hdp_id_y : 144
key : 0
study_hdp_id : 104
study_hdp_id_appl : 144
study_most_recent_appl : 40
hdp_id : 1419


In [26]:
combined_data = all_data.copy(deep=True)

In [27]:
### Final manipulation of the combined data to make it Monday board ready

## Adding a study_type column to indicate types of entries. This will be used to put rows into their respective categories on Monday Board
combined_data['study_type'] = [ 'CTN' if m.startswith('CTN') else ('APPLIDONLY' if pd.isna(k) else 'HDP') for (m,k) in combined_data[['Project #', 'study_hdp_id_appl']].values]
print("Counts for study types in the final dataset")
print(combined_data.study_type.value_counts())

## Create a column named "Location"
from datetime import datetime
combined_data['City'] = combined_data[['City']].fillna('-')
combined_data['State'] = combined_data[['State']].fillna('-')
combined_data['Location'] = [c+","+s for (c,s) in combined_data[['City', "State"]].values]

## Convert dates to ISO format
combined_data['Project Start'] = pd.to_datetime(combined_data['Project Start'], format='%Y-%m-%d', errors='coerce').dt.date
combined_data['Project End'] = pd.to_datetime(combined_data['Project End'], format='%Y-%m-%d', errors='coerce').dt.date
combined_data['Platform Reg Time'] = pd.to_datetime(combined_data['Platform Reg Time'], utc=True).dt.date

## Change archived column to have "archived/n" values and Y/N type values in HEAL-related and SBIR/STTR columns
combined_data['Archived'] = [a if a=='archived' else 'n' for a in combined_data['Archived']]
combined_data['HEAL-Related'] = ['Y' if ((p != 'CTN' ) and (pd.isna(a))) else 'N' for (p,a) in combined_data[['study_type', 'HEAL-Related']].values]
combined_data['SBIR/STTR'] = ['Y' if 'SBIR/STTR'==t else 'N' for t in combined_data['SBIR/STTR']]
combined_data['Checklist Exempt'] = ['Y' if 1==t else 'N' for t in combined_data['Checklist Exempt']]
combined_data['Do not Engage'] = ['Y' if 1==t else 'N' for t in combined_data['Do not Engage']]

## Rename a few of the other columns:
combined_data.rename(columns={'study_most_recent_appl':'Most Recent Appl_ID', 'study_hdp_id_appl':'HDP appl_ID'}, inplace=True)
combined_data.drop(columns=['study_hdp_id', 'hdp_id', 'hdp_id_x', 'hdp_id_y'], inplace=True)


Counts for study types in the final dataset
study_type
HDP           1315
APPLIDONLY     104
CTN             40
Name: count, dtype: int64


In [28]:
handled_columns = ['study_type', 'City', 'State', 'Location', 'Project Start', 'Project End', 'Platform Reg Time', 'Archived', 'HEAL-Related', 'SBIR/STTR', 'Checklist Exempt', 'Do not Engage']
rest_obj_columns = [k for k in combined_data.columns if k not in handled_columns and combined_data[k].dtype in ['object', 'str']]

#combined_data[rest_obj_columns].fillna('-')
print("Setting empty cells to  '-' in the following colulmns:")
print(rest_obj_columns)

for k in rest_obj_columns:
    combined_data[k] = ['-' if (t is np.nan) or (t=='') else t for t in combined_data[k]]


Setting empty cells to  '-' in the following colulmns:
['Activity Code', 'Administering IC', 'Award Type', 'Contact Email', 'Contact PI', 'Institution(s)', 'NIH PO', 'PI(s)', 'Project #', 'Repo per Platform', 'Reporter Link', 'Research Focus', 'Research Network', 'Research Program', 'Summary', 'Title', 'key', 'HDP appl_ID', 'Most Recent Appl_ID']


In [29]:
#TODO: Find what's in Monday.com board, but not in mysql extract
# Mark these entries for deletion, and these would have to be deleted manually on Monday.com
### A few checks:
## How many of the "keys" from Monday board are in lookup fields?
print(f"Number records from Monday already in final dataset: {len(monday_board[monday_board.Name.isin(combined_data.key)])}")
## How many of the keys from MOnday board are not there in looup fields
mondayboard_missing_in_data = monday_board[~monday_board.Name.isin(combined_data.key)]
print(f"Number records from Monday that are not in lookup table (Consider these as discrepancies **Investigate**): {len(mondayboard_missing_in_data)}")
## How many of the keys from lookup fields are not there in Monday??
data_missing_in_mondayboard = combined_data[~combined_data.key.isin(monday_board.Name)]
print(f"Number records from lookup table that are not on Monday (Potentially new entries): {len(data_missing_in_mondayboard)}")


Number records from Monday already in final dataset: 1243
Number records from Monday that are not in lookup table (Consider these as discrepancies **Investigate**): 8
Number records from lookup table that are not on Monday (Potentially new entries): 216


In [30]:
print("****** Investigate/Delete the following entries on Monday that are not in there in the new Monday Excel upload")
mondayboard_missing_in_data

****** Investigate/Delete the following entries on Monday that are not in there in the new Monday Excel upload


Unnamed: 0,Name,Most Recent Appl_ID,HDP appl_ID,Project #,Archived,HEAL-Related,Research Focus,Research Program,Title,Contact PI,...,CEDAR Form %,Repo Mapping,repo_22_2,repo_22_3,Creation Log,study_type,link to Data Dictionary Tracker,HEAL Individuals,Pubs Tracker Item,Repo & Tool Engagement_HEAL
0,10428343_HDP00882,,,75N95020P00589-P00001-0-1,,,,,MEDICAL WRITING CONSULTING AND DRUG DOSSIER DE...,"Sigman, Caroline",...,,,,,"Kathy Jooss May 4, 2023 11:25 AM",Unknown,,,,
1,10488140_HDP00883,,,75N95019D00026-P00003-759501900088-1,,,,,SCREENING OF INVESTIGATIONAL AGENTS THROUGH TH...,"Leahy, Emer",...,,,,,"Kathy Jooss May 4, 2023 11:25 AM",Unknown,,,,
2,9673173_none,,,5U24HD095254-02,,,Enhanced Outcomes for Infants and Children Exp...,,ACT NOW Clinical Trials: ESC and Weaning Proto...,"DAS, ABHIK",...,,,,,"Kathy Jooss Nov 29, 2021 8:34 AM",Unknown,,Abhik Das,,
3,9769689_none,,,5R01DE027454-02,,,Preclinical and Translational Research in Pain...,,Modeling temporomandibular joint disorders pai...,"CHEN, YONG",...,,,,,"Kathy Jooss Nov 29, 2021 7:59 AM",Unknown,,Yong Chen,,
3,10569775,10569775.0,-,4R33AT010619-02,n,N,Translation of Research to Practice for the Tr...,Behavioral Research to Improve Medication-Base...,Adapting Web-based CBT to improve adherence an...,"HEAPY, ALICIA",...,,-,-,-,"Hina Shah Jun 19, 2024 1:20 PM",APPLIDONLY,,,,
4,10532462,10532462.0,-,4R44AT011593-02,n,N,Translation of Research to Practice for the Tr...,Behavioral Research to Improve Medication-Base...,HEAL - Development and implementation of a pr...,"MASTERSON, JO",...,,-,-,-,"Hina Shah Jun 19, 2024 1:20 PM",APPLIDONLY,,,,
14,9894780,9894780.0,-,5UG3DA048338-02,n,N,Novel Therapeutic Options for Opioid Use Disor...,Focusing Medication Development to Prevent and...,A Long-Acting Bioabsorbable Naltrexone Subcuta...,"COHEN, STEVEN M",...,,-,-,-,"Hina Shah Jun 19, 2024 1:20 PM",APPLIDONLY,,,,
19,10588504,10588504.0,-,4R33DA057747-02,n,N,Translation of Research to Practice for the Tr...,Behavioral Research to Improve Medication-Base...,Peer-Delivered Behavioral Activation Intervent...,"MAGIDSON, JESSICA F",...,,-,-,-,"Hina Shah Jun 19, 2024 1:20 PM",APPLIDONLY,,,,


In [31]:

#having an index column is created as a temporary column and is integral to the QA process. See SOP for more specific notes.
combined_data.index.name = 'index'

outfile = input_dir/"MondayBoard_Update.xlsx"
print(f"Exporting data to excel file at {outfile}")
combined_data.to_excel(outfile, engine='xlsxwriter')

print("Making sure uniqueness of key values. Min and Max should both be 1")
key_counts = combined_data.groupby('key').size()
key_counts.describe()

Exporting data to excel file at /content/drive/MyDrive/Jan2025MySQLtoMonday/MondayBoard_Update.xlsx
Making sure uniqueness of key values. Min and Max should both be 1


Unnamed: 0,0
count,1459.0
mean,1.0
std,0.0
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


*******************
*******************
DEBUG CODE BELOW
*******************
*******************

In [None]:
combined_data[combined_data['key']=='HDP00889'].drop_duplicates()

Unnamed: 0_level_0,Most Recent Appl_ID,HDP appl_ID,key,Research Focus,Research Program,DAI Import Status,HEAL-Related,Project #,Title,Contact PI,...,repo_22_2,repo_22_3,Platform Reg Time,CEDAR Form %,Repo per Platform,Archived,Network,Contact Email,Location,study_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
660,10593312,10601172,HDP00889,,,,Y,3R24DA055306-02S1,Wake Forest IMPOWR Dissemination Education and...,"ADAMS, MEREDITH C. B.",...,,,2022-07-26,5.8,,,,meradams@wakehealth.edu,"WINSTON-SALEM,NC",HDP


In [None]:
comparison_cols = ['Most Recent Appl_ID', 'HDP appl_ID', 'Contact Email', 'Network']

comparison_df = pd.merge(monday_board[['Name'] +comparison_cols ], combined_data[['key'] + comparison_cols], left_on = 'Name', right_on='key').drop_duplicates()
comparison_df.to_csv(input_dir/"comparison.csv", index=False)
comparison_df.to_excel(input_dir/"comparison.xlsx", index=False)

In [None]:
pi_emails_df_updated[ pi_emails_df_updated.study_most_recent_appl=='9901704']

Unnamed: 0,study_most_recent_appl,pi_email
881,9901704,jhambm@upmc.edu


In [None]:
resnet_df[resnet_df.appl_id=='9908734']

Unnamed: 0,appl_id,res_net,res_net_override_flag
1486,9908734,,0
