# Import Libraries

In [24]:
import os
import pickle
import pandas as pd
import networkx as nx
from collections import Counter
import random

# Get Explanation Paths

In [25]:
# Datasets to analyse and the embedding method whose explanations are loaded.
dataset_nrs = [1, 2]
embedding_method = 'e2v'

# Sub-folder (per dataset) that holds the explanation files.
expl_folders = {1: 'expl_5', 2: 'expl_all'}
explanations_per_dataset = {}
explanation_pairs_per_dataset = {}

curr_working_dir = os.getcwd()
curr_output_dir = os.path.join(curr_working_dir, 'output')

chosen_explanations_per_dataset = {}

# Every complete explanation is stored as '<name>_graph.gpickle' next to a
# '<name>_pair.pkl' file describing the drug-symptom pair.
graph_suffix = '_graph.gpickle'

for dataset_nr in dataset_nrs:
    nodes = pd.read_csv(f'output/indexed_nodes_{dataset_nr}.csv')

    dataset_output_dir = os.path.join(curr_output_dir, f'g{dataset_nr}_{embedding_method}', expl_folders[dataset_nr])
    print(dataset_output_dir)

    all_explanations = []
    all_graphs = []
    all_pairs = []
    # os.listdir returns entries in arbitrary order; sorting keeps the order of
    # the collected lists (and therefore the seeded random pick below)
    # reproducible across machines and file systems.
    for item in sorted(os.listdir(dataset_output_dir)):
        # Only complete explanation graphs; matching on the full suffix avoids
        # picking up other '*.gpickle*' files that have no matching pair file.
        if 'incomplete' in item or not item.endswith(graph_suffix):
            continue

        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were produced by this project.
        with open(os.path.join(dataset_output_dir, item), 'rb') as f:
            G = pickle.load(f)
            all_graphs.append(G)

        file_name_explanation = item[:-len(graph_suffix)]
        all_explanations.append(file_name_explanation)
        pair_file_name = f'{file_name_explanation}_pair.pkl'

        with open(os.path.join(dataset_output_dir, pair_file_name), 'rb') as f:
            loaded_info = pickle.load(f)

        # Look each node up once and read both its row index and its label.
        symptom_row = nodes.loc[nodes['id'] == loaded_info['symptom_id']]
        symptom_index = symptom_row.index[0]
        symptom_label = symptom_row['label'].values[0]

        drug_row = nodes.loc[nodes['id'] == loaded_info['drug_id']]
        drug_index = drug_row.index[0]
        drug_label = drug_row['label'].values[0]

        # Pairs are stored as [symptom, drug], each formatted '<label> <index>'.
        all_pairs.append([f'{symptom_label} {symptom_index}', f'{drug_label} {drug_index}'])

    explanations_per_dataset[dataset_nr] = all_graphs
    explanation_pairs_per_dataset[dataset_nr] = all_pairs

    # Pick three explanations per dataset with a fixed seed.
    # NOTE(review): random.choices samples WITH replacement, so the same
    # explanation can be picked more than once; switch to random.sample if
    # three distinct explanations are required (this changes the selection).
    random.seed(111)
    chosen_explanations = []
    for expl_name in random.choices(all_explanations, k=3):
        with open(os.path.join(dataset_output_dir, f'{expl_name}_graph.gpickle'), 'rb') as f:
            G = pickle.load(f)

        with open(os.path.join(dataset_output_dir, f'{expl_name}_pair.pkl'), 'rb') as f:
            loaded_info = pickle.load(f)

        chosen_explanations.append({'filename': expl_name, 'G': G, 'pair': loaded_info})
    chosen_explanations_per_dataset[dataset_nr] = chosen_explanations

c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v\expl_5
c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g2_e2v\expl_all


In [26]:
# Display the explanations picked per dataset (three draws each, made with
# random.choices, so a draw may repeat): filename, graph and drug-symptom pair.
chosen_explanations_per_dataset

{1: [{'filename': 'explanation_2_run_007',
   'G': <networkx.classes.graph.Graph at 0x1cbf599a020>,
   'pair': {'drug_id': '5252',
    'drug_name': 'neratinib',
    'symptom_id': 'HP:0000750',
    'symptom_name': 'Delayed speech and language development',
    'complete': True}},
  {'filename': 'explanation_1_run_005',
   'G': <networkx.classes.graph.Graph at 0x1cbf59985e0>,
   'pair': {'drug_id': '5252',
    'drug_name': 'neratinib',
    'symptom_id': 'HP:0001263',
    'symptom_name': 'Global developmental delay',
    'complete': True}},
  {'filename': 'explanation_2_run_010',
   'G': <networkx.classes.graph.Graph at 0x1cbf599bf70>,
   'pair': {'drug_id': '5252',
    'drug_name': 'neratinib',
    'symptom_id': 'HP:0000750',
    'symptom_name': 'Delayed speech and language development',
    'complete': True}}],
 2: [{'filename': 'explanation_2_run_006',
   'G': <networkx.classes.graph.Graph at 0x1cbf59984f0>,
   'pair': {'drug_id': '1576',
    'drug_name': 'levosimendan',
    'symptom_i

# Information for Questionnaires

In [27]:
# Print, for every chosen explanation, each node with its identifier and a
# reference URL (where one can be derived), followed by the explanation's edges.
for dataset_nr in dataset_nrs:
    print(f'For dataset {dataset_nr}')

    nodes = pd.read_csv(f'output/indexed_nodes_{dataset_nr}.csv')

    explanations = chosen_explanations_per_dataset[dataset_nr]
    for expl in explanations:
        print(f"For explanation {expl['filename']}")

        for node in expl['G'].nodes:
            node_label = node
            # The last space-separated token of the node name is used as the
            # row position in the nodes table. Convert it to int explicitly:
            # passing the string to .iloc only worked through an implicit
            # string-to-integer cast inside NumPy.
            node_index = int(node_label.rsplit(' ', 1)[-1])
            node_id = str(nodes.iloc[node_index]['id'])

            # Match on the CURIE prefix rather than on a substring, so that
            # identifiers that merely contain 'HP' (e.g. some protein
            # accessions) are not given an ontology URL by mistake.
            if node_id.startswith(('HP:', 'MONDO:')):
                url = f"http://purl.obolibrary.org/obo/{node_id.replace(':', '_')}"
            elif node_id.startswith('HGNC:'):
                url = f"https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{node_id}"
            else:
                url = ''

            print(node_label, node_id, url)

        print('\n')

        print(expl['G'].edges(data=True))

    print('\n')

For dataset 1
For explanation explanation_2_run_007
ARHGEF6 1320 HGNC:685 https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:685
neratinib 1772 5252 
XYLT1 2112 HGNC:15516 https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:15516
SPEF2 2661 HGNC:26293 https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:26293
PRNP 4016 HGNC:9449 https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:9449
non-syndromic X-linked intellectual disability 4911 MONDO:0019181 http://purl.obolibrary.org/obo/MONDO_0019181
LTBP4 5487 HGNC:6717 https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:6717
DAG1 5616 HGNC:2666 https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:2666
Duchenne muscular dystrophy 6315 MONDO:0010679 http://purl.obolibrary.org/obo/MONDO_0010679
LAMA1 6406 HGNC:6481 https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:6481
PAK1 6478 HGNC:8590 https://www.genenames.org/data/gene-symbol-report/#!/hgnc_

In [28]:
# Load the indexed edge list of one dataset and display it.
# NOTE(review): `dataset_nr` is not assigned in this cell; it holds whatever
# value the most recent `for dataset_nr in dataset_nrs` loop left behind
# (the last entry of dataset_nrs after a top-to-bottom run). Confirm that this
# is the dataset meant to be inspected here.
edges = pd.read_csv(f'output/indexed_edges_{dataset_nr}.csv')
edges

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,WormBase:WBGene00006787,unc-52,5,304,interacts with,WormBase:WBGene00003929,pat-2,5,1542,0
1,WormBase:WBGene00006787,unc-52,5,304,interacts with,WormBase:WBGene00006789,unc-54,5,6544,0
2,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSSSCG00000015555,LAMC1,5,9268,1
3,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ZFIN:ZDB-GENE-021226-3,lamc1,5,5387,1
4,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSOANG00000001050,ENSEMBL:ENSOANG00000001050,5,2204,1
...,...,...,...,...,...,...,...,...,...,...
85987,458,scopolamine butylbromide,4,5945,targets,P11229,Muscarinic acetylcholine receptor M1,6,5919,17
85988,OMIM:300377.0080,"DMD, IVS62, A-G, -285",11,1578,is allele of,HGNC:2928,DMD,5,3310,15
85989,5297,dacomitinib,4,8798,targets,P12931,Proto-oncogene tyrosine-protein kinase Src,6,2379,17
85990,ClinVarVariant:981988,NC_000023.11:g.(31875374_31929595)_(31968515_3...,11,8189,has affected feature,HGNC:2928,DMD,5,3310,11


In [29]:
# Show every edge whose tail node is labelled 'Cognitive impairment'.
edges.loc[edges['label_tail'] == 'Cognitive impairment']

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
2805,HGNC:11427,STUB1,5,2304,causes condition,HP:0100543,Cognitive impairment,9,9574,2
5708,HGNC:17997,FKRP,5,6985,causes condition,HP:0100543,Cognitive impairment,9,9574,2
12988,HGNC:11138,SNCA,5,9864,causes condition,HP:0100543,Cognitive impairment,9,9574,2
14534,HGNC:7573,MYH3,5,3388,causes condition,HP:0100543,Cognitive impairment,9,9574,2
28110,HGNC:2928,DMD,5,3310,causes condition,HP:0100543,Cognitive impairment,9,9574,2
32448,HGNC:18028,OSGEP,5,3981,causes condition,HP:0100543,Cognitive impairment,9,9574,2
36920,HGNC:6717,LTBP4,5,4082,causes condition,HP:0100543,Cognitive impairment,9,9574,2
41398,MONDO:0010679,Duchenne muscular dystrophy,3,1913,associated with phenotype,HP:0100543,Cognitive impairment,9,9574,18
43273,HGNC:9449,PRNP,5,8674,causes condition,HP:0100543,Cognitive impairment,9,9574,2
44932,HGNC:6481,LAMA1,5,9080,causes condition,HP:0100543,Cognitive impairment,9,9574,2


# Number of Types of Edges, Nodes, Metapaths

In [30]:
def get_occurrence_df(c, label_name):
    """Turn a Counter into a table of absolute and relative frequencies.

    Parameters
    ----------
    c : collections.Counter
        Maps each item to the number of times it appears.
    label_name : str
        Name of the column that holds the counted items.

    Returns
    -------
    pandas.DataFrame
        Columns ``label_name``, ``'Percentage'`` (string such as ``'75.00%'``)
        and ``'Appearances'`` (the raw count), sorted by ``'Appearances'`` in
        descending order. An empty counter yields an empty frame with the
        same three columns.
    """
    columns = [label_name, 'Percentage', 'Appearances']

    # An empty counter would otherwise produce a frame without a 'Percentage'
    # column and fail on the column access.
    if not c:
        return pd.DataFrame(columns=columns)

    # Computed once instead of once per item; sum(c.values()) equals
    # Counter.total() and also works on Python versions before 3.10.
    total = sum(c.values())

    records = []
    for item, count in c.items():
        # Guard against a counter whose counts sum to zero.
        percentage = count / total * 100.0 if total else 0.0
        records.append({
            label_name: item,
            'Percentage': '{:,.2f}%'.format(percentage),
            'Appearances': count,
        })

    c_df = pd.DataFrame(records, columns=columns)
    return c_df.sort_values(by=['Appearances'], ascending=False)

def count_occurrences(G):
    """Summarise the node types, edge types and metapaths found in a graph.

    Each node must carry a ``'type'`` attribute and each edge a ``'label'``
    attribute. A metapath is the triplet
    ``(type of first endpoint, edge label, type of second endpoint)``, with the
    endpoints taken in the order in which ``G.edges`` yields them.

    Returns three frequency tables built by ``get_occurrence_df``: one for node
    types, one for edge types and one for metapaths.
    """
    # Type of every node in the graph.
    kinds_of_nodes = [data['type'] for _, data in G.nodes(data=True)]

    # Label of every edge, plus the typed triplet it forms with its endpoints.
    labels_of_edges = []
    typed_triplets = []
    for source, target, data in G.edges(data=True):
        relation = data['label']
        labels_of_edges.append(relation)
        typed_triplets.append((G.nodes[source]['type'], relation, G.nodes[target]['type']))

    return (
        get_occurrence_df(Counter(kinds_of_nodes), 'Node Type'),
        get_occurrence_df(Counter(labels_of_edges), 'Edge Type'),
        get_occurrence_df(Counter(typed_triplets), 'Metapath'),
    )

def get_shortest_path_len_drug_symptom(n1, n2, G):
    """Return the length of the shortest path between nodes ``n1`` and ``n2`` in ``G``.

    Thin wrapper around ``nx.shortest_path_length``; any exception raised by
    that call is passed on to the caller unchanged.
    """
    return nx.shortest_path_length(G, n1, n2)

In [31]:
# Per dataset: average number of distinct node types, edge types and metapath
# triplets per explanation, and the average shortest path length between the
# two nodes of each explanation's pair.
for dataset_nr in dataset_nrs:
    node_types_per_expl = []
    edge_types_per_expl = []
    triplet_types_per_expl = []
    path_len_per_expl = []

    for expl_g, pair in zip(explanations_per_dataset[dataset_nr], explanation_pairs_per_dataset[dataset_nr]):
        node_types_df, edge_types_df, metapaths_df = count_occurrences(expl_g)

        nr_node_types = node_types_df['Node Type'].nunique()
        node_types_per_expl.append(nr_node_types)

        nr_edge_types = edge_types_df['Edge Type'].nunique()
        edge_types_per_expl.append(nr_edge_types)

        nr_triplet_types = metapaths_df['Metapath'].nunique()
        triplet_types_per_expl.append(nr_triplet_types)

        # The pair holds the two node names of the explanation.
        node_1, node_2 = pair
        shortest_path_len = get_shortest_path_len_drug_symptom(node_1, node_2, expl_g)
        path_len_per_expl.append(shortest_path_len)

    # One entry was appended per explanation, so the list length is the count.
    nr_expl = len(path_len_per_expl)

    # Without explanations the averages below would divide by zero.
    if nr_expl == 0:
        print(f'No explanations found for dataset {dataset_nr}')
        continue

    print(f'For the {nr_expl} explanations generated from dataset {dataset_nr}')
    print(f'Average number of node types: {(sum(node_types_per_expl)/len(node_types_per_expl))}')
    print(f'Average number of edge types: {(sum(edge_types_per_expl)/len(edge_types_per_expl))}')
    print(f'Average number of triplets: {(sum(triplet_types_per_expl)/len(triplet_types_per_expl))}')
    print(f'Average shortest path length between drug and symptom pair: {(sum(path_len_per_expl)/len(path_len_per_expl))}')

For the 10 explanations generated from dataset 1
Average number of node types: 3.0
Average number of edge types: 4.5
Average number of triplets: 5.0
Average shortest path length between drug and symptom pair: 2.6
For the 17 explanations generated from dataset 2
Average number of node types: 4.647058823529412
Average number of edge types: 5.529411764705882
Average number of triplets: 8.058823529411764
Average shortest path length between drug and symptom pair: 3.2941176470588234
