# Creating Nodes and Edges as .csv Files
Mary Kate Montgomery

December 8 2021

In [167]:
# Import libraries
import pandas as pd
import numpy as np
import os
pd.options.mode.chained_assignment = None  # default='warn'

In [168]:
def drop_unmatched_data(df,dtype):
    '''Restrict nodes/edges to those pertaining to data that matched between data sources.

    Parameters
    ----------
    df : pandas.DataFrame
        Full table for one data source. Rows are selected by integer position.
    dtype : str
        Which source `df` holds: 'pubmed', 'nih' or 'clintrial'.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose positions are listed in the entity matching
        results, each position kept once.

    Raises
    ------
    ValueError
        If `dtype` is not one of the supported source names.
    '''
    clin_match_file = 'ENTITY_MATCH_OUTPUT/data_matching_Project_IMP_columnsOnly_pubmedclinicTrial.csv'
    nih_match_file = 'ENTITY_MATCH_OUTPUT/data_matching_Project_IMP_columnsOnly_pubmedNIH.csv'

    if dtype == 'pubmed':
        # PubMed rows may have matched in either of the two result files
        matched = pd.read_csv(clin_match_file)['pubMed'].to_list() + pd.read_csv(nih_match_file)['pubMed'].to_list()
        ids = list(np.unique(matched))
    elif dtype == 'nih':
        # NOTE(review): the NIH side is read from a column named 'clinicTrial'
        # in the pubmed/NIH result file - presumably that is the real header; confirm.
        ids = pd.read_csv(nih_match_file)['clinicTrial'].to_list()
    elif dtype == 'clintrial':
        ids = pd.read_csv(clin_match_file)['clinicTrial'].to_list()
    else:
        # Fail with a clear message instead of an accidental UnboundLocalError
        raise ValueError("dtype must be 'pubmed', 'nih' or 'clintrial', got %r" % (dtype,))

    # A row matched several times must still be returned only once, otherwise
    # duplicate nodes/edges are produced. Order of first appearance is kept.
    ids = list(dict.fromkeys(ids))
    return df.iloc[ids]

### Create nodes and edges for subproperties of data sources

In [169]:
def get_nodes_edges(dtype,data_source,colname,save_nodes,save_edges):
    '''Create nodes and edges for subproperties of data sources.

    Parameters
    ----------
    dtype : str
        Source name passed on to `drop_unmatched_data`.
    data_source : str
        Path of the csv file holding the data source.
    colname : str
        Column whose distinct values become the nodes.
    save_nodes : str
        Output csv path for the node table (columns: id, label).
    save_edges : str
        Output csv path for the edge table (columns: id_source, id_target).

    Returns
    -------
    None
        The two csv files are the only output.
    '''
    # Load data
    df = pd.read_csv(data_source)
    df = drop_unmatched_data(df,dtype)

    # Rows with no value in this column have nothing to link to, so they
    # contribute neither a node nor an edge
    values = df[colname].dropna()

    # Create table of nodes with ids (one node per distinct value, sorted)
    node_vals = list(np.unique(values.to_list()))
    nodes = pd.DataFrame({'label':node_vals})
    nodes['id'] = nodes.index.astype(int)

    # Create table of edges from source ids (row labels of the data source) to targets
    edges = pd.DataFrame({'id':values.index.astype(int),colname:values.to_numpy()})
    edges = edges.join(nodes.set_index('label'),on=colname,lsuffix = '_source',rsuffix = '_target').drop(columns=colname)

    # Make sure the output folders exist before writing
    for path in (save_nodes,save_edges):
        folder = os.path.dirname(path)
        if folder:
            os.makedirs(folder,exist_ok=True)

    # Write to csv
    nodes[['id','label']].to_csv(save_nodes,index=False)
    edges[['id_source','id_target']].to_csv(save_edges,index=False)

    return

# Input files and output folder shared by the cells below
save_dir = 'nodes_and_edges'
pubmed_file = 'new_data/data_with_ontology_terms/pubMed.csv'
clintrials_file = 'new_data/data_with_ontology_terms/clinTrials.csv'
nih_file = 'new_data/data_with_ontology_terms/NIH.csv'

# One entry per subproperty:
# (source type, input file, column, node file name, edge file name)
subproperty_jobs = [
    # Pubmed Authors
    ('pubmed',pubmed_file,'First Author','author_nodes.csv','authoredby_edges.csv'),
    # Pubmed Journals
    ('pubmed',pubmed_file,'Journal/Book','journal_nodes.csv','publishedin_edges.csv'),
    # Clinical Trials Sponsors
    ('clintrial',clintrials_file,'Sponsor/Collaborators','sponsor_nodes.csv','sponsoredby_edges.csv'),
    # Clinical Trials Status
    ('clintrial',clintrials_file,'Status','status_nodes.csv','hasstatus_edges.csv'),
    # Clinical Trials Gender
    ('clintrial',clintrials_file,'Gender','gender_nodes.csv','haseligiblegender_edges.csv'),
    # Clinical Trials Age
    ('clintrial',clintrials_file,'Age','agegroup_nodes.csv','hasagegroup_edges.csv'),
    # NIH States
    ('nih',nih_file,'ORG_STATE','state_nodes.csv','instate_edges.csv'),
    # NIH Years
    ('nih',nih_file,'FY','year_nodes.csv','inyear_edges.csv'),
]

# Run the jobs in the listed order, writing every file into save_dir
for source_type,source_file,column,node_name,edge_name in subproperty_jobs:
    get_nodes_edges(source_type,source_file,column,os.path.join(save_dir,node_name),os.path.join(save_dir,edge_name))

### Create nodes for main data sources

In [170]:
def get_datasource_node(dtype,data_source,savename,attr):
    '''Write the node table of one main data source to csv.

    The matched rows of `data_source` are reduced to an 'id' column (the row
    label) followed by the columns listed in `attr`, whose names are
    lower-cased in the output file `savename`.
    '''
    # Load the source and keep only the rows that matched another source
    matched = drop_unmatched_data(pd.read_csv(data_source),dtype)

    # id first, then the requested attributes
    table = matched[attr].assign(id=matched.index.astype(int))
    table = table[['id']+attr]

    # Attribute headers are written in lower case
    table = table.rename(columns={name:name.lower() for name in attr})

    table.to_csv(savename,index=False)
    return

# One node table per main data source:
# (source type, input file, output file name, attribute columns)
datasource_jobs = [
    # NIH
    ('nih',nih_file,'nih_nodes.csv',['PROJECT_TITLE','TOTAL_COST']),
    # PubMed
    ('pubmed',pubmed_file,'pubmed_nodes.csv',['Title']),
    # Clinical Trials
    ('clintrial',clintrials_file,'clintrials_nodes.csv',['Title']),
]
for source_type,source_file,out_name,attributes in datasource_jobs:
    get_datasource_node(source_type,source_file,os.path.join(save_dir,out_name),attributes)

### Create nodes and edges for pacemaker subclasses and heart conditions

In [171]:
# Read each source and keep only the rows that matched another source
pubmed_raw = pd.read_csv(pubmed_file)
# PubMed calls the column 'indication'; rename it to 'condition' like the other sources use below
df_pub = drop_unmatched_data(pubmed_raw,'pubmed').rename(columns={'indication':'condition'})

clintrials_raw = pd.read_csv(clintrials_file)
df_clin = drop_unmatched_data(clintrials_raw,'clintrial')

nih_raw = pd.read_csv(nih_file)
df_nih = drop_unmatched_data(nih_raw,'nih')

def separate_edges(df,colname):
    '''Expand a list-like text column into one row per listed value.

    Cells of `colname` are expected to look like "['a', 'b']". Brackets and
    single quotes are stripped and the remainder is split on ', '.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. It is not modified.
    colname : str
        Name of the column holding the list-like text.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with an added 'id' column (the original row label) and
        one row per value of `colname`. Rows whose cell is empty or missing
        are dropped. The first value of every row comes first, in the
        original row order, followed by the rows for any further values.
    '''
    # Work on a copy so the caller's frame is left untouched
    out = df.copy()
    out['id'] = out.index

    # Strip the list punctuation; a missing cell is treated like an empty list
    cleaned = out[colname].fillna('').astype(str).str.translate(str.maketrans('','',"[]'"))
    has_values = cleaned != ''
    tokens = cleaned[has_values].str.split(', ')

    # Original rows keep the first value
    first = out[has_values].copy()
    first[colname] = tokens.str[0]

    # Every further value gets exactly one extra row carrying the same id.
    # Rows with a single value explode to NaN and are removed again.
    extra = first.copy()
    extra[colname] = tokens.str[1:]
    extra = extra.explode(colname).dropna(subset=[colname])

    return pd.concat([first,extra])
# Split the list-valued columns of every source: condition first, then subclass
edge_columns = ('condition','subclass')
# Pubmed
pub_condition_edges,pub_subclass_edges = [separate_edges(df_pub,column) for column in edge_columns]
# ClinTrials
clin_condition_edges,clin_subclass_edges = [separate_edges(df_clin,column) for column in edge_columns]
# NIH
nih_condition_edges,nih_subclass_edges = [separate_edges(df_nih,column) for column in edge_columns]

### With separate edges, create nodes and edges like before
### First for subclass

In [172]:
colname = 'subclass'
dfs = [pub_subclass_edges,clin_subclass_edges,nih_subclass_edges]
names = ['pubmed','clin','nih']

# The three sources share one node table: collect every value, then take the
# sorted distinct ones
all_vals = [val for frame in dfs for val in frame[colname].to_list()]
node_vals = list(np.unique(all_vals))
# Create table of nodes with ids
nodes = pd.DataFrame({'label':node_vals})
nodes['id'] = nodes.index.astype(int)
nodes[['id','label']].to_csv(os.path.join(save_dir,colname+'_nodes.csv'),index=False)

# Create table of edges for each data source
for name,df in zip(names,dfs):
    # Create table of edges from source ids to targets (node ids looked up by label)
    edges = df[['id',colname]]
    edges = edges.join(nodes.set_index('label'),on=colname,lsuffix = '_source',rsuffix = '_target').drop(columns=colname)

    # Write to csv
    edges[['id_source','id_target']].to_csv(os.path.join(save_dir,name+'_'+colname+'_edges.csv'),index=False)

### Then for heart conditions

In [173]:
# Repeat for conditions
colname = 'condition'
dfs = [pub_condition_edges,clin_condition_edges,nih_condition_edges]
names = ['pubmed','clin','nih']

# The three sources share one node table: collect every value, then take the
# sorted distinct ones
all_vals = [val for frame in dfs for val in frame[colname].to_list()]
node_vals = list(np.unique(all_vals))
# Create table of nodes with ids
nodes = pd.DataFrame({'label':node_vals})
nodes['id'] = nodes.index.astype(int)
nodes[['id','label']].to_csv(os.path.join(save_dir,colname+'_nodes.csv'),index=False)

# Create table of edges for each data source
for name,df in zip(names,dfs):
    # Create table of edges from source ids to targets (node ids looked up by label)
    edges = df[['id',colname]]
    edges = edges.join(nodes.set_index('label'),on=colname,lsuffix = '_source',rsuffix = '_target').drop(columns=colname)

    # Write to csv
    edges[['id_source','id_target']].to_csv(os.path.join(save_dir,name+'_'+colname+'_edges.csv'),index=False)

### Finally, draw edges between data sources

In [174]:
# NOTE(review): unlike the other edge files, these two are written with the
# DataFrame index as an extra first column and with capitalised headers
# ('id_Source'/'id_Target') - confirm that the consumer expects this.

# Pubmed to Clinical Trials
pb_ct = pd.read_csv('ENTITY_MATCH_OUTPUT/data_matching_Project_IMP_columnsOnly_pubmedclinicTrial.csv')
pb_ct = pb_ct.rename(columns={'pubMed':'id_Source','clinicTrial':'id_Target'})
pb_ct[['id_Source','id_Target']].to_csv(os.path.join(save_dir,'pubmed_clintrials_edges.csv'))

# Pubmed to NIH
pb_nih = pd.read_csv('ENTITY_MATCH_OUTPUT/data_matching_Project_IMP_columnsOnly_pubmedNIH.csv')
pb_nih = pb_nih.rename(columns={'pubMed':'id_Source','clinicTrial':'id_Target'})
# Write the pubmed/NIH matches here, not the clinical trial ones loaded above
pb_nih[['id_Source','id_Target']].to_csv(os.path.join(save_dir,'pubmed_nih_edges.csv'))