In [4]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import pandas as pd
from tqdm import tqdm

In [5]:
direct_map_data = pd.read_csv('trial_mapped_pubmed_total_12thJune.csv')
citation_data = pd.read_csv('Citation_data_15thJune.csv')
citation_data.head()

Unnamed: 0,pmid,ref_pmid,nct_id,reference_type
0,6322,13493614,NCT00117845,citation
1,6322,4770788,NCT00117845,citation
2,6322,13387693,NCT00117845,citation
3,6322,13628564,NCT00117845,citation
4,6322,14370191,NCT00117845,citation


In [6]:
direct_map_data.head()

Unnamed: 0,nct_id,pmid,reference_type
0,NCT02971111,28950719,one-to-one
1,NCT02974283,28931617,one-to-one
2,NCT02976376,28939546,one-to-one
3,NCT02979782,28988183,one-to-one
4,NCT02972164,28403865,one-to-one


We have to add node attributes and edge attributes to the graph

In [7]:
new_graph = nx.Graph()

In [8]:
new_graph.number_of_nodes()

0

In [9]:
edge_list = list()
edge_label_list = list()
for row_id in range(direct_map_data.shape[0]):
    node = str(direct_map_data.iloc[row_id, 0])
    node = node.replace('\r', '')
    node = node.replace('\n', '')
    new_graph.add_edge(node, str(direct_map_data.iloc[row_id, 1]), edge_type = direct_map_data.iloc[row_id, 2])

In [10]:
for row_id in range(citation_data.shape[0]):
    node = str(citation_data.iloc[row_id, 2])
    node = node.replace('\r', '')
    node = node.replace('\n', '')
    new_graph.add_edge(node, str(citation_data.iloc[row_id, 1]), edge_type= citation_data.iloc[row_id, 3])

In [11]:
node_dict = defaultdict()

In [12]:
ct_count = 0
pm_count = 0

In [13]:
for node in new_graph.nodes:
    if str(node)[0:3] == 'NCT':
        node_dict[str(node)] = 'CT'
        ct_count = ct_count + 1
    else:
        node_dict[str(node)] = 'PM'
        pm_count = pm_count + 1
    

In [15]:
nx.set_node_attributes(new_graph, name='node_type', values= node_dict)

In [14]:
print(ct_count, pm_count)

62951 690267


In [16]:
print(nx.info(new_graph))

Name: 
Type: Graph
Number of nodes: 753218
Number of edges: 1198607
Average degree:   3.1826


In [15]:
#nx.write_gexf(new_graph, 'new_graph_15thJune.gexf')

For simplicity, we say two clinical trials are related if they have a metapath between them. For this we compute the connected components

In [17]:
conn_comp = list(nx.connected_components(new_graph))

In [18]:
len(conn_comp)

14674

In [17]:
largest_cc = max(nx.connected_components(new_graph), key=len)

In [18]:
type(largest_cc)

set

In [19]:
largest_subgraph = new_graph.subgraph(list(largest_cc))

In [20]:
print(nx.info(largest_subgraph))

Name: 
Type: Graph
Number of nodes: 703393
Number of edges: 1162998
Average degree:   3.3068


In [21]:
#degree_GCC = list(largest_subgraph.degree())

In [22]:
#deg_hist_GCC = degree_histogram(largest_subgraph)

In [23]:
len(conn_comp)

14674

In [24]:
conn_comp_len = [len(list(comp)) for comp in conn_comp]

From the connected components, extract the nct ids only. For the largest component, we will try to break it down smaller parts

In [25]:
conn_comp_trials = list()
for comp in conn_comp:
    node_list = list(comp)
    trial_list = list()
    for node in node_list:
        node = str(node)
        node = node.replace('\r', '')
        node = node.replace('\n', '')
        if str(node)[0:3] == 'NCT':
            trial_list.append(node)
    if len(trial_list) > 1:
        conn_comp_trials.append(trial_list)

In [None]:
conn_comp_trials

In [None]:
#import pickle as pkl
#conn_comp_trials = pkl.load(open('trial_connected_components_13thJune.p', 'rb'))

In [26]:
conn_comp_len = [len(list(comp)) for comp in conn_comp_trials]

In [27]:
print(Counter(conn_comp_len))

Counter({2: 543, 3: 72, 4: 14, 5: 9, 6: 2, 47486: 1, 8: 1, 11: 1})


In [28]:
print(sum(conn_comp_len))

48920


Now, working only on the largest_subgraph. Since, edge types have different weights we cannot apply PathSim, SimRank and Personalized-Page Rank algorithms

In [29]:
node_type_large = nx.get_node_attributes(largest_subgraph, name='node_type')
edge_type_large = nx.get_edge_attributes(largest_subgraph, name='edge_type')

In [30]:
node_cts_large = list()
node_cts1_dict = defaultdict(list)
node_cts1_dict_len = list()

In [31]:
for node in largest_subgraph.nodes:
    node_type = node_dict[str(node)]
    if node_type == 'CT':
        node_ct_list = list()
        # Traverse all the neighbors of the node. Currently, we consider all the edge-types to be the same
        for pmid in nx.all_neighbors(largest_subgraph, node):
            for ct in nx.all_neighbors(largest_subgraph, pmid):
                if node_dict[str(ct)] == 'CT' and ct != node:
                    node_ct_list.append(ct)
        
        if len(node_ct_list) > 5:
            node_ct_list = list()
            # Traverse all the neighbors of the node. Currently, we consider all the edge-types to be the same
            for pmid in nx.all_neighbors(largest_subgraph, node):
                
                if (node, pmid) in edge_type_large:
                    edge_type1 = edge_type_large[(node, pmid)]
                elif (pmid, node) in edge_type_large:
                    edge_type1 = edge_type_large[(pmid, node)]
                else:
                    continue
                
                if edge_type1 != 'citation':
                    for ct in nx.all_neighbors(largest_subgraph, pmid):
                        if node_dict[str(ct)] == 'CT' and ct != node:
                            node_ct_list.append(ct)
                
            if len(node_ct_list) > 5:
                node_ct_list = list()
                # Traverse all the neighbors of the node. Currently, we consider all the edge-types to be the same
                for pmid in nx.all_neighbors(largest_subgraph, node):

                    if (node, pmid) in edge_type_large:
                        edge_type1 = edge_type_large[(node, pmid)]
                    elif (pmid, node) in edge_type_large:
                        edge_type1 = edge_type_large[(pmid, node)]
                    else:
                        continue

                    if edge_type1 != 'citation':
                        for ct in nx.all_neighbors(largest_subgraph, pmid):
                            if node_dict[str(ct)] == 'CT' and ct != node:
                                if (ct, pmid) in edge_type_large:
                                    edge_type2 = edge_type_large[(ct, pmid)]
                                elif (pmid, ct) in edge_type_large:
                                    edge_type2 = edge_type_large[(pmid, ct)]
                                else:
                                    continue
                                if edge_type2 != 'citation':
                                    node_ct_list.append(ct)
                node_cts1_dict[node] = node_ct_list
                node_cts1_dict_len.append(len(node_ct_list))
            else:
                node_cts1_dict[node] = node_ct_list
                node_cts1_dict_len.append(len(node_ct_list))
        else:        
            node_cts1_dict[node] = node_ct_list
            node_cts1_dict_len.append(len(node_ct_list))
        
        '''
        for edge in largest_subgraph.edges(node):
            print(edge_type_large[edge])

    counter = counter + 1
    if counter > 20:
        break
        '''

In [32]:
print(Counter(node_cts1_dict_len))

Counter({0: 14846, 1: 9888, 2: 6303, 3: 4439, 4: 3424, 5: 2681, 6: 844, 7: 618, 8: 509, 10: 426, 9: 405, 11: 344, 12: 258, 13: 258, 15: 226, 14: 213, 16: 171, 17: 136, 19: 127, 18: 113, 20: 107, 21: 85, 22: 80, 23: 75, 24: 63, 27: 56, 25: 50, 26: 49, 28: 42, 34: 38, 31: 37, 32: 35, 29: 32, 30: 30, 35: 28, 33: 27, 40: 25, 37: 22, 38: 21, 39: 21, 36: 19, 41: 18, 51: 15, 50: 14, 43: 14, 47: 14, 67: 11, 46: 11, 44: 11, 48: 10, 56: 9, 60: 9, 54: 9, 45: 9, 49: 9, 57: 8, 53: 8, 66: 8, 42: 8, 59: 7, 64: 6, 76: 5, 68: 5, 72: 5, 63: 5, 52: 5, 62: 5, 91: 4, 78: 4, 58: 4, 75: 4, 74: 4, 71: 3, 69: 3, 84: 3, 61: 3, 70: 3, 80: 3, 55: 3, 82: 3, 73: 3, 65: 2, 102: 2, 86: 2, 79: 2, 106: 1, 133: 1, 98: 1, 81: 1, 185: 1, 125: 1, 144: 1, 88: 1, 95: 1, 147: 1, 96: 1, 94: 1, 89: 1, 114: 1, 103: 1, 100: 1, 90: 1, 92: 1, 77: 1, 161: 1, 87: 1, 181: 1})


In [39]:
node_cts2_dict = defaultdict(list)

In [40]:
for key in node_cts1_dict.keys():
    if len(node_cts1_dict[key]) > 0 and len(node_cts1_dict[key]) < 6:
        node_cts2_dict[key] = node_cts1_dict[key]

In [42]:
len(node_cts2_dict.keys())

26735

In [43]:
node_cts2_dict

defaultdict(list,
            {'NCT02974283': ['NCT03745092'],
             'NCT02979002': ['NCT03208543',
              'NCT00947362',
              'NCT01301924',
              'NCT03435419',
              'NCT02025699'],
             'NCT02971332': ['NCT02236507'],
             'NCT01431040': ['NCT01522261'],
             'NCT01436487': ['NCT02961504', 'NCT01468064'],
             'NCT01433874': ['NCT02004470'],
             'NCT01438853': ['NCT00879606'],
             'NCT01432899': ['NCT02488265',
              'NCT01815281',
              'NCT00132990',
              'NCT01361373'],
             'NCT01430156': ['NCT02314780'],
             'NCT01436344': ['NCT02541175'],
             'NCT01434121': ['NCT04153487',
              'NCT04029675',
              'NCT03756220',
              'NCT03509662',
              'NCT03148236'],
             'NCT01437319': ['NCT01616056',
              'NCT00727402',
              'NCT00727402',
              'NCT02147509',
              'NCT0072

In [46]:
max_comp_val = max(conn_comp_len)

for comp in conn_comp_trials:
    if len(comp) != max_comp_val:
        for elem in comp:
            comp_cp = comp
            comp_cp.remove(elem)
            if elem not in node_cts2_dict.keys():
                node_cts2_dict[elem] = comp_cp

In [47]:
print(len(node_cts2_dict.keys()))

27493


In [49]:
import pickle as pkl
pkl.dump(node_cts2_dict, open('sim_search_het_graph_18thJune.pkl', 'wb'))