# Import Libraries

In [195]:
import os
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Set Parameters

In [196]:
dataset_nr = 1
assert dataset_nr == 1 or 2

embedding_method = 'e2v'
assert embedding_method == 'e2v' or 'm2v'

seeded_emb = False

if seeded_emb:
    fixed_emb = '_seeded'
else:
    fixed_emb = ''
    
if seeded_emb:
    title_seeded = ' with fixed node embeddings'
else:
    title_seeded = ''

# Get Result Paths

In [197]:
curr_working_dir = os.getcwd()
curr_output_dir = os.path.join(curr_working_dir, 'output')
dataset_output_dir = os.path.join(curr_output_dir, f'g{dataset_nr}_{embedding_method}{fixed_emb}')

if not os.path.exists(dataset_output_dir):
    print('First, run the edge2vec embedding script. Then, run this script.')
else:
    print(f'Output folder for dataset {dataset_nr} exists and will be loaded: {dataset_output_dir}')
    
run_folders_list = []
for item in os.listdir(dataset_output_dir):
    curr_path = os.path.join(dataset_output_dir, item)
    if os.path.isdir(curr_path) and 'run' in item:
        run_folders_list.append(item)

print(f'A total of {len(run_folders_list)} runs will be included in the analysis.')

run_folders_paths = []
pred_folders_paths = []
for run_folder in run_folders_list:
    run_path = os.path.join(dataset_output_dir, run_folder)
    run_folders_paths.append(run_path)
    pred_run_path = os.path.join(run_path, 'pred')
    pred_folders_paths.append(pred_run_path)
    print(pred_run_path)

Output folder for dataset 1 exists and will be loaded: c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v
A total of 10 runs will be included in the analysis.
c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v\run_001\pred
c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v\run_002\pred
c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v\run_003\pred
c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v\run_004\pred
c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v\run_005\pred
c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v\run_006\pred
c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v\run_007\pred
c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v\run_008\p

In [198]:
auc_scores_all_runs = []
auc_loss_scores_all_runs = []

for run_name, pred_folder in zip(run_folders_list, pred_folders_paths):
    with open(f'{pred_folder}/performance_scores_{dataset_nr}_{embedding_method}.pkl', 'rb') as f:
        loaded_info = pickle.load(f)
        
    keys = ['AUC Train', 'AUC Validation', 'AUC Test']
    for key in keys:
        auc_scores = loaded_info[key]
        for index, auc_score in enumerate(auc_scores):
            auc_scores_per_run = {'run': run_name, 'name': key, 'iteration': index, 'score': auc_score}
            auc_scores_all_runs.append(auc_scores_per_run)
            auc_loss_scores_all_runs.append(auc_scores_per_run)

    loss_scores = loaded_info['Loss']
    for index, loss_score in enumerate(loss_scores):
        formatted_loss_score = float(np.log10(loss_score))
        loss_scores_per_run = {'run': run_name, 'name': 'Cross-Entropy Loss', 'iteration': index, 'score': formatted_loss_score}
        auc_loss_scores_all_runs.append(loss_scores_per_run)

In [199]:
all_settings = [
    {'embedding_method': 'e2v',
     'fixed_emb': '',
     'dataset_nr': 1},
    {'embedding_method': 'e2v',
     'fixed_emb': '_seeded',
     'dataset_nr': 1},
    {'embedding_method': 'e2v',
     'fixed_emb': '',
     'dataset_nr': 2},
    {'embedding_method': 'e2v',
     'fixed_emb': '_seeded',
     'dataset_nr': 2}
]

curr_working_dir = os.getcwd()
curr_output_dir = os.path.join(curr_working_dir, 'output')

final_test_auc_roc_scores_all_runs_all_models = []
roc_curve_all_runs_all_models = []
f1_scores_all_runs_all_models = []

for setting in all_settings:
    curr_dataset_output_dir = os.path.join(curr_output_dir, f'g{setting["dataset_nr"]}_{setting["embedding_method"]}{setting["fixed_emb"]}')

    if not os.path.exists(curr_dataset_output_dir):
        print('First, run the edge2vec embedding script. Then, run this script.')
    else:
        print(f'Output folder for dataset {dataset_nr} exists and will be loaded: {curr_dataset_output_dir}')
            
        run_folders_list = []
        for item in os.listdir(curr_dataset_output_dir):
            curr_path = os.path.join(curr_dataset_output_dir, item)
            if os.path.isdir(curr_path) and 'run' in item:
                run_folders_list.append(item)

        print(f'A total of {len(run_folders_list)} runs will be included in the analysis.')

        run_folders_paths = []
        curr_pred_folders_paths = []
        for run_folder in run_folders_list:
            run_path = os.path.join(curr_dataset_output_dir, run_folder)
            run_folders_paths.append(run_path)
            pred_run_path = os.path.join(run_path, 'pred')
            curr_pred_folders_paths.append(pred_run_path)
            
        for run_name, pred_folder in zip(run_folders_list, curr_pred_folders_paths):
            with open(f'{pred_folder}/performance_scores_{setting["dataset_nr"]}_{setting["embedding_method"]}.pkl', 'rb') as f:
                loaded_info = pickle.load(f)

            auc_roc_score = loaded_info['ROC AUC Score']
            formatted_auc_roc_score = float(auc_roc_score)
            auc_roc_score_per_run = {'Model': f'g{setting["dataset_nr"]}_{setting["embedding_method"]}{setting["fixed_emb"]}', 
                                        'ROC AUC Score': formatted_auc_roc_score}
            final_test_auc_roc_scores_all_runs_all_models.append(auc_roc_score_per_run)

            roc_fpr_scores = loaded_info['ROC FPR']
            roc_tpr_scores = loaded_info['ROC TPR']
            
            for fpr, tpr in zip(roc_fpr_scores, roc_tpr_scores):
                auc_per_threshold_per_run = {'Model': f'g{setting["dataset_nr"]}_{setting["embedding_method"]}{setting["fixed_emb"]}',
                                             'ROC FPR': fpr, 'ROC TPR': tpr}
                roc_curve_all_runs_all_models.append(auc_per_threshold_per_run)

            f1_score = loaded_info['F1 Score']
            formatted_f1_score = float(f1_score)
            f1_score_per_run = {'Model': f'g{setting["dataset_nr"]}_{setting["embedding_method"]}{setting["fixed_emb"]}', 
                                'F1 Score': formatted_f1_score}
            f1_scores_all_runs_all_models.append(f1_score_per_run)

Output folder for dataset 1 exists and will be loaded: c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v
A total of 10 runs will be included in the analysis.
First, run the edge2vec embedding script. Then, run this script.
Output folder for dataset 1 exists and will be loaded: c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g2_e2v
A total of 10 runs will be included in the analysis.
First, run the edge2vec embedding script. Then, run this script.


# Plot ROC Curves for Each Model

In [200]:
roc_curve_all_runs_all_models = pd.DataFrame(roc_curve_all_runs_all_models)
roc_curve_all_runs_all_models

Unnamed: 0,Model,ROC FPR,ROC TPR
0,g1_e2v,0.000000,0.000000
1,g1_e2v,0.000000,0.000192
2,g1_e2v,0.000000,0.006531
3,g1_e2v,0.000000,0.006915
4,g1_e2v,0.000000,0.010949
...,...,...,...
36427,g2_e2v,0.875091,0.999637
36428,g2_e2v,0.875091,0.999818
36429,g2_e2v,0.934502,0.999818
36430,g2_e2v,0.934502,1.000000


In [201]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title(f'ROC TPR/ROC FPR Distribution over Each Run for Each Model')
sns.scatterplot(data=roc_curve_all_runs_all_models, x="ROC FPR", y="ROC TPR", hue="Model", s=1)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(markerscale=10)

fig.savefig(f'{curr_output_dir}/roc_curves.png', bbox_inches='tight')
fig.clear()

<Figure size 800x600 with 0 Axes>

# Plot F1 Score for Each Model

In [202]:
f1_scores_all_runs_all_models = pd.DataFrame(f1_scores_all_runs_all_models)
f1_scores_all_runs_all_models

Unnamed: 0,Model,F1 Score
0,g1_e2v,0.92396
1,g1_e2v,0.918695
2,g1_e2v,0.921543
3,g1_e2v,0.926259
4,g1_e2v,0.9243
5,g1_e2v,0.925021
6,g1_e2v,0.923388
7,g1_e2v,0.926945
8,g1_e2v,0.920598
9,g1_e2v,0.926584


In [203]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title(f'F1 Scores Overview for Each Model')
sns.barplot(f1_scores_all_runs_all_models, x="Model", y="F1 Score", errorbar="sd", color='cornflowerblue')
ax.bar_label(ax.containers[0], fontsize=10)
ax.set_xlabel('Model Variant')
ax.set_ylabel('F1 Score')

fig.savefig(f'{curr_output_dir}/f1_scores.png', bbox_inches='tight')
fig.clear()

<Figure size 800x600 with 0 Axes>

# Plot Final AUC-ROC Score for Each Model

In [204]:
final_test_auc_roc_scores_all_runs_all_models = pd.DataFrame(final_test_auc_roc_scores_all_runs_all_models)
final_test_auc_roc_scores_all_runs_all_models

Unnamed: 0,Model,ROC AUC Score
0,g1_e2v,0.981466
1,g1_e2v,0.978083
2,g1_e2v,0.977793
3,g1_e2v,0.979621
4,g1_e2v,0.97719
5,g1_e2v,0.978637
6,g1_e2v,0.976998
7,g1_e2v,0.979838
8,g1_e2v,0.976336
9,g1_e2v,0.978401


In [205]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title(f'AUC-ROC Scores Overview for Each Model')
sns.barplot(final_test_auc_roc_scores_all_runs_all_models, x="Model", y="ROC AUC Score", errorbar="sd")
ax.bar_label(ax.containers[0], fontsize=10)
ax.set_xlabel('Model Variant')
ax.set_ylabel('AUC-ROC Score')

fig.savefig(f'{curr_output_dir}/auc_roc_scores.png', bbox_inches='tight')
fig.clear()

<Figure size 800x600 with 0 Axes>

# Plot Training Curve

In [206]:
auc_scores_all_runs = pd.DataFrame(auc_scores_all_runs)
auc_scores_all_runs

Unnamed: 0,run,name,iteration,score
0,run_001,AUC Train,0,0.938449
1,run_001,AUC Train,1,0.924796
2,run_001,AUC Train,2,0.902691
3,run_001,AUC Train,3,0.931373
4,run_001,AUC Train,4,0.950517
...,...,...,...,...
4465,run_010,AUC Test,144,0.978238
4466,run_010,AUC Test,145,0.978276
4467,run_010,AUC Test,146,0.978309
4468,run_010,AUC Test,147,0.978363


In [207]:
auc_loss_scores_all_runs = pd.DataFrame(auc_loss_scores_all_runs)
auc_loss_scores_all_runs

Unnamed: 0,run,name,iteration,score
0,run_001,AUC Train,0,0.938449
1,run_001,AUC Train,1,0.924796
2,run_001,AUC Train,2,0.902691
3,run_001,AUC Train,3,0.931373
4,run_001,AUC Train,4,0.950517
...,...,...,...,...
5955,run_010,Cross-Entropy Loss,144,-0.401013
5956,run_010,Cross-Entropy Loss,145,-0.403343
5957,run_010,Cross-Entropy Loss,146,-0.403418
5958,run_010,Cross-Entropy Loss,147,-0.403079


In [208]:
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title(f'Training curve on dataset {dataset_nr} with method {embedding_method}{title_seeded}')
sns.lineplot(data=auc_scores_all_runs, x='iteration', y='score', hue='name')
ax.set_ylim(0.85,1)
ax.set_xlabel('Iteration')
ax.set_ylabel('AUC-ROC Score')

print(dataset_output_dir)
fig.savefig(f'{dataset_output_dir}/training_curve.png', bbox_inches='tight')
fig.clear()

c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v


<Figure size 800x800 with 0 Axes>

In [209]:
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title(f'Training curve on dataset {dataset_nr} with method {embedding_method}{title_seeded}')
sns.lineplot(data=auc_loss_scores_all_runs, x='iteration', y='score', hue='name')
ax.set_ylim(top=1)
ax.set_xlabel('Iteration')
ax.set_ylabel('AUC-ROC/log10(Loss)')

fig.savefig(f'{dataset_output_dir}/training_curve_with_loss.png', bbox_inches='tight')
fig.clear()

<Figure size 800x800 with 0 Axes>

# Similarity between top scoring symptom-drug pairs

In [210]:
drug_symptom_pairs_per_run = []

for index, pred_path in enumerate(pred_folders_paths):
    with open(f'{pred_path}/candidates_per_symptom_{dataset_nr}_{embedding_method}.pkl', 'rb') as f:
        loaded_list = pickle.load(f)
        
        drug_symptom_pairs = []
        
        for _, row in loaded_list.iterrows():
            symptom_id = row['Symptom']
            candidates = row['Candidates']
            
            for candidate in candidates:
                drug_symptom_pairs.append(tuple([symptom_id, candidate]))
                
        total_drug_symptom_pairs = len(drug_symptom_pairs)
    
    drug_symptom_pairs_per_run.append(drug_symptom_pairs)

In [211]:
similarity_matrix = {}
ratios_non_diagonals = []

for index1, pairs1 in enumerate(drug_symptom_pairs_per_run):
    
    similarities = {}
    
    for index2, pairs2 in enumerate(drug_symptom_pairs_per_run):
        overlap = set([tuple(sorted(ele)) for ele in pairs1]) & set([tuple(sorted(ele)) for ele in pairs2])
        ratio_overlap = len(overlap) / total_drug_symptom_pairs * 100
        
        similarities[f'run {index2+1}'] = ratio_overlap
        
        if index1 != index2:
            ratios_non_diagonals.append(ratio_overlap)
        
    similarity_matrix[f'run {index1+1}'] = similarities
    
similarity_matrix_df = pd.DataFrame(similarity_matrix)
similarity_matrix_df

Unnamed: 0,run 1,run 2,run 3,run 4,run 5,run 6,run 7,run 8,run 9,run 10
run 1,100.0,34.567901,51.851852,39.506173,29.62963,38.271605,28.395062,37.037037,54.320988,54.320988
run 2,34.567901,100.0,32.098765,51.851852,53.08642,19.753086,62.962963,23.45679,23.45679,30.864198
run 3,51.851852,32.098765,100.0,50.617284,33.333333,43.209877,27.160494,37.037037,51.851852,61.728395
run 4,39.506173,51.851852,50.617284,100.0,51.851852,35.802469,53.08642,28.395062,43.209877,46.91358
run 5,29.62963,53.08642,33.333333,51.851852,100.0,19.753086,44.444444,40.740741,24.691358,41.975309
run 6,38.271605,19.753086,43.209877,35.802469,19.753086,100.0,27.160494,24.691358,48.148148,30.864198
run 7,28.395062,62.962963,27.160494,53.08642,44.444444,27.160494,100.0,17.283951,32.098765,25.925926
run 8,37.037037,23.45679,37.037037,28.395062,40.740741,24.691358,17.283951,100.0,33.333333,32.098765
run 9,54.320988,23.45679,51.851852,43.209877,24.691358,48.148148,32.098765,33.333333,100.0,53.08642
run 10,54.320988,30.864198,61.728395,46.91358,41.975309,30.864198,25.925926,32.098765,53.08642,100.0


In [212]:
mean_overlap_ratio = np.mean(ratios_non_diagonals)
median_overlap_ratio = np.median(ratios_non_diagonals)
print('Mean:', mean_overlap_ratio)
print('Median:', median_overlap_ratio)

Mean: 38.35390946502058
Median: 37.03703703703704


In [213]:
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title(f'Overlap ratio (Mean: {mean_overlap_ratio}, Median: {median_overlap_ratio}) between list of predicted symptom-drug pairs per run on dataset {dataset_nr} with method {embedding_method}{title_seeded}')
sns.heatmap(similarity_matrix_df, annot=True, fmt='.1f', linewidths=0.5, ax=ax, cmap='RdYlGn')
ax.collections[0].set_clim(0,100)

fig.savefig(f'{dataset_output_dir}/overlap_between_runs.png', bbox_inches='tight')
fig.clear()

<Figure size 800x800 with 0 Axes>

In [214]:
for i in range(0, len(drug_symptom_pairs_per_run)):
    if i == 0:
        overlapping_pairs_all_runs = set(drug_symptom_pairs_per_run[i])
    else:
        overlapping_pairs_all_runs = overlapping_pairs_all_runs & set(drug_symptom_pairs_per_run[i])
            
print(f'There are {len(overlapping_pairs_all_runs)} symptom-drug pairs that are found in the top list of drug candidates in {len(drug_symptom_pairs_per_run)} runs: \n {overlapping_pairs_all_runs}')

with open(f'{dataset_output_dir}/symptom_drug_pair_overlapping_all_runs_{dataset_nr}_{embedding_method}{fixed_emb}.pkl', 'wb') as f:
    pickle.dump(overlapping_pairs_all_runs, f)

There are 4 symptom-drug pairs that are found in the top list of drug candidates in 10 runs: 
 {('HP:0011675', '1576'), ('HP:0002650', '5345'), ('HP:0003115', '231'), ('HP:0001635', '231')}


In [215]:
same_drug_symptom_pairs_thresholded = set()

threshold = 0.5
total_runs = len(drug_symptom_pairs_per_run)
min_nr_runs = int(threshold * total_runs)

for i in range(0, len(drug_symptom_pairs_per_run)):
    for pair in drug_symptom_pairs_per_run[i]:
        same_pairs = 0
        for j in range(0, len(drug_symptom_pairs_per_run)):
            for pair_to_compare in drug_symptom_pairs_per_run[j]:
                if pair == pair_to_compare:
                    same_pairs += 1
                        
        #print(f'For pair {pair} from run {i}, there are {same_pairs} same pairs found in list of all runs.')
        if same_pairs >= min_nr_runs:
            same_drug_symptom_pairs_thresholded.add(pair)
            
print(f'There are {len(same_drug_symptom_pairs_thresholded)} symptom-drug pairs that are found in the top list of drug candidates in at least {min_nr_runs} of the {total_runs} runs: \n {same_drug_symptom_pairs_thresholded}')

with open(f'{dataset_output_dir}/symptom_drug_pair_overlapping_{min_nr_runs}_runs_{dataset_nr}_{embedding_method}{fixed_emb}.pkl', 'wb') as f:
    pickle.dump(same_drug_symptom_pairs_thresholded, f)

There are 52 symptom-drug pairs that are found in the top list of drug candidates in at least 5 of the 10 runs: 
 {('HP:0002093', '5431'), ('HP:0001263', '5431'), ('HP:0001256', '5345'), ('HP:0011675', '522'), ('HP:0001328', '5345'), ('HP:0001265', '5345'), ('HP:0001644', '5345'), ('HP:0002093', '4225'), ('HP:0001644', '231'), ('HP:0001290', '5252'), ('HP:0100543', '5345'), ('HP:0001270', '5252'), ('HP:0003236', '1576'), ('HP:0001638', '522'), ('HP:0002650', '5345'), ('HP:0000750', '4225'), ('HP:0003323', '522'), ('HP:0008981', '5345'), ('HP:0001265', '5252'), ('HP:0008981', '231'), ('HP:0003323', '1576'), ('HP:0001644', '1576'), ('HP:0003307', '5345'), ('HP:0001290', '4225'), ('HP:0002093', '5345'), ('HP:0003115', '231'), ('HP:0001263', '5345'), ('HP:0100543', '5252'), ('HP:0003560', '522'), ('HP:0003236', '522'), ('HP:0003236', '231'), ('HP:0001638', '231'), ('HP:0002791', '926'), ('HP:0000750', '5345'), ('HP:0003707', '231'), ('HP:0003323', '231'), ('HP:0002515', '5345'), ('HP:00116

# Check overlap between each setting

In [216]:
nodes = pd.read_csv(f'output/indexed_nodes_{dataset_nr}.csv')
nodes.drop('index_id', axis=1, inplace=True)
nodes['semantic'] = nodes['semantic'].astype('category')
nodes

Unnamed: 0,id,semantic,label,semantic_id
0,WormBase:WBGene00000389,ORTH,cdc-25.4,5
1,ZP:0018675,DISO,right side lateral plate mesoderm mislocalised...,1
2,ZFIN:ZDB-GENE-040426-1197,ORTH,tbc1d5,5
3,5,DRUG,(S)-nicardipine,2
4,RGD:3443,ORTH,Ptk2,5
...,...,...,...,...
10029,MP:0009763,DISO,increased sensitivity to induced morbidity/mor...,1
10030,MP:0011057,DISO,absent brain ependyma motile cilia,1
10031,MP:0001412,DISO,excessive scratching,1
10032,WBPhenotype:0004023,DISO,frequency of body bend variant,1


In [217]:
all_settings = [
    {'embedding_method': 'e2v',
     'fixed_emb': '',
     'dataset_nr': 1},
    {'embedding_method': 'e2v',
     'fixed_emb': '_seeded',
     'dataset_nr': 1},
    {'embedding_method': 'e2v',
     'fixed_emb': '',
     'dataset_nr': 2},
    {'embedding_method': 'e2v',
     'fixed_emb': '_seeded',
     'dataset_nr': 2}
]

curr_working_dir = os.getcwd()
curr_output_dir = os.path.join(curr_working_dir, 'output')

overlapping_pairs = []

for setting in all_settings:
    dataset_output_dir = os.path.join(curr_output_dir, f'g{setting["dataset_nr"]}_{setting["embedding_method"]}{setting["fixed_emb"]}')

    if not os.path.exists(dataset_output_dir):
        print('First, run the edge2vec embedding script. Then, run this script.')
    else:
        print(f'Output folder for dataset {dataset_nr} exists and will be loaded: {dataset_output_dir}')
        
        with open(f'{dataset_output_dir}/symptom_drug_pair_overlapping_all_runs_{setting["dataset_nr"]}_{setting["embedding_method"]}{setting["fixed_emb"]}.pkl', 'rb') as f:
            loaded_list = pickle.load(f)
            print(loaded_list)
            overlapping_pairs.append(loaded_list)

Output folder for dataset 1 exists and will be loaded: c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g1_e2v
{('HP:0011675', '1576'), ('HP:0002650', '5345'), ('HP:0003115', '231'), ('HP:0001635', '231')}
First, run the edge2vec embedding script. Then, run this script.
Output folder for dataset 1 exists and will be loaded: c:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g2_e2v
{('HP:0003115', '1576'), ('HP:0001638', '1576'), ('HP:0001290', '269'), ('HP:0011675', '1576'), ('HP:0001644', '1576'), ('HP:0003236', '1576')}
First, run the edge2vec embedding script. Then, run this script.


In [218]:
non_fixed_emb_overlap = overlapping_pairs[0].intersection(overlapping_pairs[2])
for pair in non_fixed_emb_overlap:
    symptom_id, drug_id = pair
    symptom_name = nodes.loc[nodes['id'] == symptom_id]['label'].iloc[0]
    drug_name = nodes.loc[nodes['id'] == drug_id]['label'].iloc[0]
    print(drug_name, 'treats', symptom_name)

IndexError: list index out of range

In [None]:
fixed_emb_overlap = overlapping_pairs[1].intersection(overlapping_pairs[3])
for pair in fixed_emb_overlap:
    symptom_id, drug_id = pair
    symptom_name = nodes.loc[nodes['id'] == symptom_id]['label'].iloc[0]
    drug_name = nodes.loc[nodes['id'] == drug_id]['label'].iloc[0]
    print(drug_name, 'treats', symptom_name)

IndexError: list index out of range

In [None]:
dataset1_emb_overlap = overlapping_pairs[0].intersection(overlapping_pairs[1])
for pair in dataset1_emb_overlap:
    symptom_id, drug_id = pair
    symptom_name = nodes.loc[nodes['id'] == symptom_id]['label'].iloc[0]
    drug_name = nodes.loc[nodes['id'] == drug_id]['label'].iloc[0]
    print(drug_name, 'treats', symptom_name)

In [None]:
dataset2_emb_overlap = overlapping_pairs[2].intersection(overlapping_pairs[3])
for pair in dataset2_emb_overlap:
    symptom_id, drug_id = pair
    symptom_name = nodes.loc[nodes['id'] == symptom_id]['label'].iloc[0]
    drug_name = nodes.loc[nodes['id'] == drug_id]['label'].iloc[0]
    print(drug_name, 'treats', symptom_name)

levosimendan treats Congestive heart failure
aprindine treats Arrhythmia


# How often do predicted edges in the knowledge graph appear

In [None]:
edges = pd.read_csv(f'output/indexed_edges_{dataset_nr}.csv')
edges

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,FlyBase:FBgn0085464,CG34435,5,6825,0
1,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,HGNC:7585,MYL4,3,27,0
2,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,FlyBase:FBgn0002772,Mlc1,5,8901,0
3,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,NCBIGene:396472,MYL4,3,9508,0
4,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSECAG00000020967,ENSEMBL:ENSECAG00000020967,5,8807,1
...,...,...,...,...,...,...,...,...,...,...
82939,ClinVarVariant:659584,NC_000023.11:g.(?_32491267)_(32849830_?)del,7,3184,pathogenic for condition,MONDO:0010679,Duchenne muscular dystrophy,1,6315,12
82940,4810,ibrutinib,2,1618,targets,HGNC:11283,SRC,3,3279,14
82941,OMIM:300377.0013,"DMD, EX18DEL",1,2822,is allele of,HGNC:2928,DMD,3,6612,16
82942,Coriell:GM05113,NIGMS-GM05113,4,8105,has role in modeling,MONDO:0010679,Duchenne muscular dystrophy,1,6315,15


In [None]:
_, relation_labels = pd.factorize(edges['relation'])
relation_labels

Index(['in orthology relationship with',
       'in 1 to 1 orthology relationship with', 'expressed in', 'is part of',
       'has phenotype', 'enables', 'interacts with', 'involved in',
       'colocalizes with', 'is causal germline mutation in',
       'contributes to condition', 'has affected feature',
       'pathogenic for condition', 'contributes to', 'targets',
       'has role in modeling', 'is allele of',
       'likely pathogenic for condition', 'causes condition',
       'is substance that treats', 'source', 'has genotype',
       'is causal germline mutation partially giving rise to',
       'is marker for'],
      dtype='object')

In [None]:
if dataset_nr == 1:
    pheno_rel = 'has phenotype'
else:
    pheno_rel = 'associated with phenotype'

disease_ID = 'MONDO:0010679'
relation_index = list(relation_labels).index(pheno_rel)

symptoms = edges[(edges['head'] == 'MONDO:0010679') & (edges['type'] == relation_index)]

print(f'A total of {symptoms.shape[0]} symptoms found that are associated with {disease_ID}')
symptoms.head(symptoms.shape[0])

A total of 27 symptoms found that are associated with MONDO:0010679


Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
37674,MONDO:0010679,Duchenne muscular dystrophy,1,6315,has phenotype,HP:0011675,Arrhythmia,1,9512,4
37675,MONDO:0010679,Duchenne muscular dystrophy,1,6315,has phenotype,HP:0002515,Waddling gait,1,552,4
37677,MONDO:0010679,Duchenne muscular dystrophy,1,6315,has phenotype,HP:0003236,Elevated serum creatine kinase,1,1990,4
37678,MONDO:0010679,Duchenne muscular dystrophy,1,6315,has phenotype,HP:0002093,Respiratory insufficiency,1,4440,4
37679,MONDO:0010679,Duchenne muscular dystrophy,1,6315,has phenotype,HP:0003707,Calf muscle pseudohypertrophy,1,357,4
37680,MONDO:0010679,Duchenne muscular dystrophy,1,6315,has phenotype,HP:0001256,"Intellectual disability, mild",1,4247,4
37681,MONDO:0010679,Duchenne muscular dystrophy,1,6315,has phenotype,HP:0003701,Proximal muscle weakness,1,5299,4
37682,MONDO:0010679,Duchenne muscular dystrophy,1,6315,has phenotype,HP:0003202,Skeletal muscle atrophy,1,4453,4
37683,MONDO:0010679,Duchenne muscular dystrophy,1,6315,has phenotype,HP:0003560,Muscular dystrophy,1,2788,4
37684,MONDO:0010679,Duchenne muscular dystrophy,1,6315,has phenotype,HP:0003391,Gowers sign,1,3055,4


In [None]:
for index, row in symptoms.iterrows():
    print(row['tail'])
    edges[(edges['head'] == 'MONDO:0010679') & (edges['type'] == relation_index)]

HP:0011675
HP:0002515
HP:0003236
HP:0002093
HP:0003707
HP:0001256
HP:0003701
HP:0003202
HP:0003560
HP:0003391
HP:0001635
HP:0001328
HP:0003323
HP:0001371
HP:0002650
HP:0003115
HP:0001263
HP:0008981
HP:0001638
HP:0003307
HP:0000750
HP:0001265
HP:0001644
HP:0001270
HP:0001290
HP:0100543
HP:0002791


In [None]:
drug_symptom_pairs_per_run

[[('HP:0011675', '231'),
  ('HP:0011675', '1576'),
  ('HP:0011675', '522'),
  ('HP:0002515', '4225'),
  ('HP:0002515', '522'),
  ('HP:0002515', '5345'),
  ('HP:0003236', '231'),
  ('HP:0003236', '1576'),
  ('HP:0003236', '522'),
  ('HP:0002093', '4225'),
  ('HP:0002093', '5345'),
  ('HP:0002093', '522'),
  ('HP:0003707', '231'),
  ('HP:0003707', '1576'),
  ('HP:0003707', '522'),
  ('HP:0001256', '4225'),
  ('HP:0001256', '5345'),
  ('HP:0001256', '522'),
  ('HP:0003701', '4225'),
  ('HP:0003701', '522'),
  ('HP:0003701', '5345'),
  ('HP:0003202', '522'),
  ('HP:0003202', '4225'),
  ('HP:0003202', '231'),
  ('HP:0003560', '231'),
  ('HP:0003560', '522'),
  ('HP:0003560', '1576'),
  ('HP:0003391', '4225'),
  ('HP:0003391', '522'),
  ('HP:0003391', '5345'),
  ('HP:0001635', '1576'),
  ('HP:0001635', '231'),
  ('HP:0001635', '522'),
  ('HP:0001328', '4225'),
  ('HP:0001328', '522'),
  ('HP:0001328', '5345'),
  ('HP:0003323', '231'),
  ('HP:0003323', '522'),
  ('HP:0003323', '1576'),
  ('HP