This Jupyter notebook will develop the SOP for updating the Monday.com studies board consistently.

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import re
from datetime import datetime

In [10]:
# Directory used by this notebook for reading input CSVs and writing the export.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_dir = Path("/Users/hinashah/Documents/HEAL/TablesForSarah")

In [4]:
def get_unique_values(df:pd.DataFrame, col_name:str='appl_id'):
    """Return the distinct non-null values of one column.

    Parameters
    ----------
    df : DataFrame to inspect.
    col_name : name of the column to read (defaults to ``'appl_id'``).

    Returns
    -------
    A Series of the column's non-null values with duplicates removed (first
    occurrence kept, original index labels preserved), or ``None`` when
    ``col_name`` is not a column of ``df``.
    """
    # Guard clause: a missing column is signalled with None.
    if col_name not in df.columns:
        return None
    non_null_values = df[col_name].dropna()
    return non_null_values.drop_duplicates()

def get_na_count(df:pd.DataFrame, col_name:str='appl_id'):
    """Count the rows whose value in ``col_name`` is missing.

    Parameters
    ----------
    df : DataFrame to inspect.
    col_name : name of the column to read (defaults to ``'appl_id'``).

    Returns
    -------
    The number of null entries in the column, or ``-1`` when ``col_name`` is
    not a column of ``df``.
    """
    # Guard clause: -1 is the sentinel for "column not present".
    if col_name not in df.columns:
        return -1
    missing_mask = df[col_name].isna()
    return int(missing_mask.sum())

In [11]:
# Column names to collect from the source tables into the combined export.
# Each name appears exactly once (the original list repeated 'proj_num' three
# times, which would make any missing-term report list it repeatedly).
fields_to_gather = [
    'hdp_id',
    'ctc_pi_nm',
    'proj_num',
    'proj_title',
    'rfa',
    'res_net',
    'res_prg',
    'proj_abs',
    'adm_ic',
    'proj_num_spl_ty_code',
    'fisc_yr',
    'org_nm',
    'prg_ofc',
    'proj_end_date',
    'proj_num_spl_act_code',
    'tot_fund',
    'proj_ser_num',
    'pi_email',
    'clinical_trials_study_ID',
    'repository_name',
]

In [12]:
# Load every source table from input_dir, forcing appl_id to be read as a
# string, and print a row count / unique appl_id count for each one.
convert_dict = {'appl_id':str}

awards_df = pd.read_csv(input_dir/"awards.csv", low_memory=False, dtype=convert_dict)
awards_df = awards_df.dropna(how='all')  # discard rows that are entirely empty
print(f"Awards table has: {len(awards_df)} entries, with {len(get_unique_values(awards_df))} appl_ids")

reporter_df = pd.read_csv(input_dir/"reporter.csv", low_memory=False, dtype=convert_dict)
reporter_df = reporter_df.dropna(how='all')  # discard rows that are entirely empty
print(f"Reporter table has: {len(reporter_df)} entries, with {len(get_unique_values(reporter_df))} appl_ids")

# NOTE(review): the remaining tables are not passed through dropna(how='all');
# confirm whether that difference from awards/reporter is intentional.
progress_tracker_df = pd.read_csv(input_dir/"progress_tracker.csv", low_memory=False, dtype=convert_dict)
print(f"Platform generated table has: {len(progress_tracker_df)} entries, with {len(get_unique_values(progress_tracker_df))} appl_ids")
# Count HDP IDs from the hdp_id column; the previous code re-counted appl_id
# here, so the "unique HDP IDs" figure was really the appl_id figure.
platform_hdp_ids = get_unique_values(progress_tracker_df, col_name='hdp_id')
if platform_hdp_ids is None:
    print("Platform table has no hdp_id column")
else:
    print(f"Platform table has {len(platform_hdp_ids)} unique HDP IDs")

repo_maping_df = pd.read_csv(input_dir/"repo_mapping.csv", low_memory=False, dtype=convert_dict)
print(f"Repo mapping table has: {len(repo_maping_df)} entries, with {len(get_unique_values(repo_maping_df))} appl_ids")

pi_emails_df = pd.read_csv(input_dir/"pi_emails.csv", low_memory=False, dtype=convert_dict)
# Label fixed: this summary used to be printed as "Repo mapping table".
print(f"PI emails table has: {len(pi_emails_df)} entries, with {len(get_unique_values(pi_emails_df))} appl_ids")

resnet_df = pd.read_csv(input_dir/"research_networks.csv", low_memory=False, dtype=convert_dict)
print(f"Research Network table has: {len(resnet_df)} entries, with {len(get_unique_values(resnet_df))} appl_ids")


Awards table has: 1618 entries, with 1618 appl_ids
Reporter table has: 1617 entries, with 1617 appl_ids
Platform generated table has: 1320 entries, with 1311 appl_ids
Platform table has 1311 unique HDP IDs
Repo mapping table has: 1323 entrie, with 1323 appl_ids
Repo mapping table has: 1059 entrie, with 1059 appl_ids
Research Network table has: 1669 entrie, with 1669 appl_ids


In [25]:
def combine_data(df1:pd.DataFrame, df2:pd.DataFrame, term_list:list, on_term='appl_id'):
    """Outer-merge selected columns of ``df2`` onto ``df1``.

    Only columns of ``df2`` that are named in ``term_list`` and are not
    already columns of ``df1`` are brought over, so values already present in
    ``df1`` are never replaced. Because the merge is an outer join, keys found
    in only one of the frames are kept, and the result can have more rows than
    ``df1``.

    Parameters
    ----------
    df1 : left DataFrame; must contain ``on_term``.
    df2 : right DataFrame; must contain ``on_term``.
    term_list : column names that may be pulled from ``df2``.
    on_term : name of the key column to merge on (defaults to ``'appl_id'``).

    Returns
    -------
    The merged DataFrame, or ``None`` (after printing a message) when
    ``on_term`` is missing from either input.
    """
    # Both frames need the key; previously a missing key in df2 surfaced as a
    # raw KeyError from the column selection below.
    for position, frame in (("first", df1), ("second", df2)):
        if on_term not in frame.columns:
            print(f"Could not find on_term {on_term} in {position} dataframe, quitting")
            return None

    df2_include_columns = [on_term] + [k for k in df2.columns if k in term_list and k not in df1.columns]
    new_df = pd.merge(df1, df2[df2_include_columns], on=on_term, how='outer')

    # If suffixed copies of the key exist, collapse them into on_term,
    # preferring the "_1" value and falling back to "_2" when "_1" is null.
    # NOTE(review): merge(on=...) does not itself create these columns, so
    # this only fires when an input frame already carries them.
    key_first, key_second = on_term + "_1", on_term + "_2"
    if key_first in new_df.columns and key_second in new_df.columns:
        print("Need to merge on term columns")
        # Fixed: the old code iterated over the DataFrame (which yields column
        # names, not rows) and then called .delete on an undefined `new_term`.
        new_df[on_term] = new_df[key_first].where(new_df[key_first].notna(), new_df[key_second])
        new_df = new_df.drop(columns=[key_first, key_second])
    return new_df

def check_termlist(df:pd.DataFrame, term_list:list):
    """Print which entries of ``term_list`` are not columns of ``df``.

    Parameters
    ----------
    df : DataFrame whose columns are checked.
    term_list : column names expected to be present; a name listed more than
        once is reported once per occurrence.

    Returns
    -------
    None. The count and the list of missing names are written to stdout.
    """
    present_columns = df.columns
    missing_terms = []
    for term in term_list:
        if term not in present_columns:
            missing_terms.append(term)
    print(f"Input Data Frame is missing {len(missing_terms)} terms: \n **** {missing_terms}")

In [26]:
# Preview: appl_id plus every awards_df column that is named in fields_to_gather.
['appl_id'] + [column for column in awards_df.columns if column in fields_to_gather]

['appl_id', 'rfa', 'res_prg']

In [27]:
# Seed the combined table with appl_id plus the wanted columns found in the
# awards table, dropping exact duplicate rows.
awards_wanted_columns = ['appl_id'] + [k for k in awards_df.columns if k in fields_to_gather]
start_df = awards_df[awards_wanted_columns].drop_duplicates()
print(len(start_df))

# Fold each remaining table in, in this fixed order, printing the row count
# after every merge so growth from the outer joins is visible.
df_next = start_df
for source_table in (reporter_df, progress_tracker_df, repo_maping_df, pi_emails_df, resnet_df):
    df_next = combine_data(df_next, source_table, fields_to_gather)
    print(len(df_next))

# Report any requested fields that none of the tables supplied.
check_termlist(df_next, fields_to_gather)


1618
1618
1668
1670
1684
1735
Input Data Frame is missing 3 terms: 
 **** ['proj_yr_end', 'proj_yr_end', 'spl_act_code']


In [28]:
# Write the combined table to a CSV in input_dir, omitting the index column.
export_path = input_dir/"MySqlExportForSarah.csv"
df_next.to_csv(export_path, index=False)