# Import Libraries

In [102]:
import os
import pickle
import pandas as pd
import networkx as nx
from collections import Counter
import random

# Get Explanation Paths

In [107]:
dataset_nrs = [1, 2]
embedding_method = 'e2v'

expl_folders = {1: 'expl_5', 2: 'expl_all'}
explanations_per_dataset = {}
explanation_pairs_per_dataset = {}

curr_working_dir = os.getcwd()
curr_output_dir = os.path.join(curr_working_dir, 'output')

for dataset_nr in dataset_nrs:
    nodes = pd.read_csv(f'output/indexed_nodes_{dataset_nr}.csv')
    
    dataset_output_dir = os.path.join(curr_output_dir, f'g{dataset_nr}_{embedding_method}', expl_folders[dataset_nr])
    print(dataset_output_dir)
    
    all_explanations = []
    all_graphs = []
    all_pairs = []
    for item in os.listdir(dataset_output_dir):
        if 'incomplete' not in item and '.gpickle' in item:
            
            with open(os.path.join(dataset_output_dir, item), 'rb') as f:
                G = pickle.load(f)
                all_graphs.append(G)
                
            file_name_explanation = item.split('_graph.gpickle')[0]
            all_explanations.append(file_name_explanation)
            pair_file_name = f'{file_name_explanation}_pair.pkl'
            
            with open(os.path.join(dataset_output_dir, pair_file_name), 'rb') as f:
                loaded_info = pickle.load(f)
                
                symptom_index = nodes.loc[nodes['id'] == loaded_info['symptom_id']].index[0]
                symptom_label = nodes.loc[nodes['id'] == loaded_info['symptom_id']]['label'].values[0]
                
                drug_index = nodes.loc[nodes['id'] == loaded_info['drug_id']].index[0]
                drug_label = nodes.loc[nodes['id'] == loaded_info['drug_id']]['label'].values[0]
                
                all_pairs.append([f'{symptom_label} {symptom_index}', f'{drug_label} {drug_index}'])
            
    explanations_per_dataset[dataset_nr] = all_graphs
    explanation_pairs_per_dataset[dataset_nr] = all_pairs
    
    print(f'For dataset {dataset_nr}, include the following explanations in the questionnaires:')
    random.seed(11)
    for expl_name in random.choices(all_explanations, k=3):
        print(expl_name)

c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v\expl_5
For dataset 1, include the following explanations in the questionnaires:
explanation_1_run_007
explanation_1_run_009
explanation_2_run_010
c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g2_e2v\expl_all
For dataset 2, include the following explanations in the questionnaires:
explanation_2_run_009
explanation_3_run_003
explanation_4_run_007


# Number of Types of Edges, Nodes, Metapaths

In [95]:
def get_occurrence_df(c, label_name):
    c_perc = [{label_name: i, 'Percentage': c[i] / c.total() * 100.0, 'Appearances': c[i]} for i in c]
    c_df = pd.DataFrame.from_dict(c_perc)
    c_df['Percentage'] = c_df['Percentage'].transform(lambda x: '{:,.2f}%'.format(x))
    c_df = c_df.sort_values(by = ['Appearances'], ascending = False)
    return c_df

def count_occurrences(G):    
    node_types = []
    for n, attr in G.nodes(data=True):
        node_types.append(attr['type'])
        
    edge_types = []
    triplets = []
    for n1, n2, attr in G.edges(data=True):
        edge_type = attr['label']
        edge_types.append(edge_type)
        
        n1_type = G.nodes[n1]['type']
        n2_type = G.nodes[n2]['type']
        
        triplet = tuple([n1_type, edge_type, n2_type])
        triplets.append(triplet)
    
    node_types_df = get_occurrence_df(Counter(node_types), 'Node Type')
    
    edge_types_df = get_occurrence_df(Counter(edge_types), 'Edge Type')
    
    metapaths_df = get_occurrence_df(Counter(triplets), 'Metapath')
    
    return node_types_df, edge_types_df, metapaths_df

def get_shortest_path_drug_symptom(n1, n2, G):
    path_len = nx.shortest_path_length(G, n1, n2)
    return path_len

In [96]:
for dataset_nr in dataset_nrs:
    print(f'For explanations generated from dataset {dataset_nr}')
    
    node_types_per_expl = []
    edge_types_per_expl = []
    triplet_types_per_expl = []
    path_len_per_expl = []
    for expl_g, pair in zip(explanations_per_dataset[dataset_nr], explanation_pairs_per_dataset[dataset_nr]):
        
        node_types_df, edge_types_df, metapaths_df = count_occurrences(expl_g)
        
        nr_node_types = node_types_df['Node Type'].nunique()
        node_types_per_expl.append(nr_node_types)
        
        nr_edge_types = edge_types_df['Edge Type'].nunique()
        edge_types_per_expl.append(nr_edge_types)
        
        nr_triplet_types = metapaths_df['Metapath'].nunique()
        triplet_types_per_expl.append(nr_triplet_types)
        
        node_1, node_2 = pair
        path_len_per_expl.append(get_shortest_path_drug_symptom(node_1, node_2, expl_g))
    
    print(f'Average number of node types: {(sum(node_types_per_expl)/len(node_types_per_expl))}')
    print(f'Average number of edge types: {(sum(edge_types_per_expl)/len(edge_types_per_expl))}')
    print(f'Average number of triplets: {(sum(triplet_types_per_expl)/len(triplet_types_per_expl))}')
    print(f'Average shortest path length between drug and symptom pair: {(sum(path_len_per_expl)/len(path_len_per_expl))}')

For explanations generated from dataset 1
Average number of node types: 3.0
Average number of edge types: 4.5
Average number of triplets: 5.25
Average shortest path length between drug and symptom pair: 2.625
For explanations generated from dataset 2
Average number of node types: 3.825
Average number of edge types: 4.65
Average number of triplets: 6.225
Average shortest path length between drug and symptom pair: 2.75
